Mirror of https://github.com/paboyle/Grid.git (synced 2025-06-20 16:56:55 +01:00)

Compare commits: 311e2aab3f ... develop (111 commits)
SHA1
7aa06329d0
9d6a38c44c
6ec5cee368
f2e9a68825
d88750e6b6
821358eda7
fce6e1f135
8f0bb3e676
262c70d967
da43ef7c2d
7b60ab5df1
f6b961a64e
f1ed988aa3
eea51bb604
9203126aa5
f90ba4712a
3737a24096
d418f78352
25163998a0
dc546aaa4b
5364d580c9
2a9a6347e3
cfdb56f314
b517e88db3
bb317aba8d
644cc6647e
72397ce23b
d60a80c098
bb8b6d9d73
677b4cc5b0
be565ffab6
df6120e5f6
21de6f7da8
dbe39f9ce0
ab3de50d5e
c545bd2139
6a1c64fbdd
b75809ed61
ecaf228e5c
6d015ae8fc
233150d93f
7af8c77a52
a957e7bfa1
cee4c8ce8c
96bf814d8c
7ddc422788
e652fc2825
a49fa3f8d0
cd452a2f91
4f89f603ae
11dc2c5e1d
6fec3c15ca
938c47480f
3811d19298
83a3ab6b6f
d66a9af6a3
adc90d3a86
ebbd015c5c
4ab73b36b2
130e07a422
8f47bb367e
0c3cb60135
9eae8fca5d
882a217074
e465fce201
d41542c64b
199818bd6c
fe66c7ca30
e9177e4af3
d15a6c5933
25ab9325e7
19f9378b98
785bc7a14f
1a1fe85428
0000d2e558
9ffd1ed4ce
3d014864e2
1d22841811
a1cdda833f
ad6db92690
e8ff9d8e50
795769c636
267a39d943
3624bd3d22
bc12dbbb38
eb8a008a8f
c4d9aa1a21
6ae809ed40
b1ba209696
cb3e529b1e
717f647418
98e7418187
fe05bf48b1
d2dd8f54e2
7726ee4b16
8729c46169
09f81fe7c3
1876e5b7c0
355ec76257
84cab5e6e7
4f17c8d081
aaab753982
570b72a47b
a5798a89ed
f7e2f9a401
2848a9b558
d4868991af
e99d42404e
3ba019c747
47429218bb
8d305df0db
@@ -51,11 +51,13 @@ directory
 #pragma nv_diag_suppress cast_to_qualified_type
 //disables nvcc specific warning in many files
 #pragma nv_diag_suppress esa_on_defaulted_function_ignored
+#pragma nv_diag_suppress declared_but_not_referenced
 #pragma nv_diag_suppress extra_semicolon
 #else
 //disables nvcc specific warning in json.hpp
 #pragma diag_suppress unsigned_compare_with_zero
 #pragma diag_suppress cast_to_qualified_type
+#pragma diag_suppress declared_but_not_referenced
 //disables nvcc specific warning in many files
 #pragma diag_suppress esa_on_defaulted_function_ignored
 #pragma diag_suppress extra_semicolon
@@ -191,7 +191,7 @@ public:
 
 Lattice<sobj> pgbuf(&pencil_g);
 autoView(pgbuf_v , pgbuf, CpuWrite);
-std::cout << "CPU view" << std::endl;
+//std::cout << "CPU view" << std::endl;
 
 typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
 typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@@ -215,7 +215,7 @@ public:
 else if ( sign == forward ) div = 1.0;
 else assert(0);
 
-std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
+//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
 FFTW_plan p;
 {
 FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -229,7 +229,7 @@ public:
 }
 
 // Barrel shift and collect global pencil
-std::cout << GridLogPerformance<<"Making pencil" << std::endl;
+//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
 Coordinate lcoor(Nd), gcoor(Nd);
 result = source;
 int pc = processor_coor[dim];
@@ -251,7 +251,7 @@ public:
 }
 }
 
-std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
+//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
 // Loop over orthog coords
 int NN=pencil_g.lSites();
 GridStopWatch timer;
@@ -274,7 +274,7 @@ public:
 usec += timer.useconds();
 flops+= flops_call*NN;
 
-std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
+//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
 // writing out result
 {
 autoView(pgbuf_v,pgbuf,CpuRead);
@@ -291,7 +291,7 @@ public:
 }
 result = result*div;
 
-std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
+//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
 // destroying plan
 FFTW<scalar>::fftw_destroy_plan(p);
 #endif
@@ -277,6 +277,38 @@ public:
 assert(0);
 }
 };
+template<class Matrix,class Field>
+class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
+Matrix &_Mat;
+RealD shift;
+public:
+ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
+// Support for coarsening to a multigrid
+void OpDiag (const Field &in, Field &out) {
+_Mat.Mdiag(in,out);
+out = out + shift*in;
+}
+void OpDir (const Field &in, Field &out,int dir,int disp) {
+_Mat.Mdir(in,out,dir,disp);
+}
+void OpDirAll (const Field &in, std::vector<Field> &out){
+_Mat.MdirAll(in,out);
+};
+void Op (const Field &in, Field &out){
+_Mat.M(in,out);
+out = out + shift * in;
+}
+void AdjOp (const Field &in, Field &out){
+_Mat.Mdag(in,out);
+out = out + shift * in;
+}
+void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+assert(0);
+}
+void HermOp(const Field &in, Field &out){
+assert(0);
+}
+};
 
 //////////////////////////////////////////////////////////
 // Even Odd Schur decomp operators; there are several
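The block above adds ShiftedNonHermitianLinearOperator, which wraps a matrix M and applies a real shift s after every call, leaving the Hermitian entry points disabled (assert(0)) because M + s is not assumed Hermitian. Restated as equations (no new behaviour, just the code above in operator form):

\[
\mathrm{Op}(x) = M\,x + s\,x,\qquad
\mathrm{AdjOp}(x) = M^{\dagger}x + s\,x,\qquad
\mathrm{OpDiag}(x) = \mathrm{diag}(M)\,x + s\,x .
\]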
@@ -269,7 +269,9 @@ public:
 RealD xscale = 2.0/(hi-lo);
 RealD mscale = -(hi+lo)/(hi-lo);
 Linop.HermOp(T0,y);
+grid->Barrier();
 axpby(T1,xscale,mscale,y,in);
+grid->Barrier();
 
 // sum = .5 c[0] T0 + c[1] T1
 // out = ()*T0 + Coeffs[1]*T1;
@@ -208,8 +208,8 @@ public:
 assert(Bkn.size()==batchCount);
 assert(Cmn.size()==batchCount);
 
-assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-assert(OpB!=GridBLAS_OP_T);
+//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+//assert(OpB!=GridBLAS_OP_T);
 
 int lda = m; // m x k column major
 int ldb = k; // k x n column major
@@ -367,28 +367,67 @@ public:
 Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+else
+eCmn = alpha * eAmk * eBkn ;
 });
 } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
 Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+else
+eCmn = alpha * eAmk.adjoint() * eBkn ;
+});
+} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+thread_for (p, batchCount, {
+Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
+Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+else
+eCmn = alpha * eAmk.transpose() * eBkn ;
 });
 } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
 Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+else
+eCmn = alpha * eAmk * eBkn.adjoint() ;
+});
+} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+thread_for (p, batchCount, {
+Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
+Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
 });
 } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
 Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
 Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+else
+eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
+} );
+} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+thread_for (p, batchCount, {
+Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
+Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
+Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+else
+eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 } );
 } else {
 assert(0);
@@ -414,8 +453,8 @@ public:
 RealD t2=usecond();
 int32_t batchCount = Amk.size();
 
-assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-assert(OpB!=GridBLAS_OP_T);
+//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
+//assert(OpB!=GridBLAS_OP_T);
 
 int lda = m; // m x k column major
 int ldb = k; // k x n column major
@@ -514,28 +553,70 @@ public:
 Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+else
+eCmn = alpha * eAmk * eBkn ;
 });
 } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
 Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
+else
+eCmn = alpha * eAmk.adjoint() * eBkn ;
+});
+} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+thread_for (p, batchCount, {
+Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
+Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+else
+eCmn = alpha * eAmk.transpose() * eBkn ;
 });
 } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
 Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
+else
+eCmn = alpha * eAmk * eBkn.adjoint() ;
+});
+} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+thread_for (p, batchCount, {
+Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
+Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+else
+eCmn = alpha * eAmk * eBkn.transpose() ;
 });
 } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
 Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
 Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
+else
+eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
+} );
+} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+thread_for (p, batchCount, {
+Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
+Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
+Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+else
+eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 } );
 } else {
 assert(0);
@@ -661,29 +742,41 @@ public:
 Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+else
+eCmn = alpha * eAmk * eBkn ;
 });
 } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
 Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+else
+eCmn = alpha * eAmk.transpose() * eBkn ;
 });
 } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
 Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+else
+eCmn = alpha * eAmk * eBkn.transpose() ;
 });
 } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
 Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
 Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
-} );
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+else
+eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
+});
 } else {
 assert(0);
 }
@@ -809,28 +902,40 @@ public:
 Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+else
+eCmn = alpha * eAmk * eBkn ;
 });
 } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
 Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+else
+eCmn = alpha * eAmk.transpose() * eBkn ;
 });
 } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
 Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+else
+eCmn = alpha * eAmk * eBkn.transpose() ;
 });
 } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
 thread_for (p, batchCount, {
 Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
 Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
 Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
-eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+if (std::abs(beta) != 0.0)
+eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+else
+eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
 });
 } else {
 assert(0);
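All four precisions above now branch on beta before forming `beta * eCmn + alpha * ...`: with beta equal to zero the old expression still reads the output matrix, so an uninitialised C holding NaN or Inf would contaminate the product (IEEE arithmetic gives 0*NaN = NaN), while the guarded form only writes C. This mirrors standard BLAS xGEMM semantics, where beta = 0 means C is not referenced on input. A small self-contained illustration of the difference, using plain doubles rather than the Eigen maps above:

#include <cmath>
#include <cstdio>

int main(){
  double c = std::nan("");                 // stand-in for an uninitialised C entry
  double a = 2.0, b = 3.0, alpha = 1.0, beta = 0.0;

  double fused   = beta*c + alpha*a*b;     // 0*NaN propagates: result is NaN
  double guarded = (beta != 0.0) ? beta*c + alpha*a*b
                                 : alpha*a*b;   // C is only written: result is 6
  std::printf("fused=%f guarded=%f\n", fused, guarded);
  return 0;
}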
@@ -245,9 +245,10 @@ until convergence
 _HermOp(src_n,tmp);
 // std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
 // std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
-RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
+// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
+RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
 RealD vden = norm2(src_n);
-RealD na = vnum/vden;
+RealD na = std::sqrt(vnum/vden);
 if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
 i=_MAX_ITER_IRL_MEVAPP_;
 evalMaxApprox = na;
@@ -255,6 +256,7 @@ until convergence
 src_n = tmp;
 }
 }
+std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
 
 std::vector<RealD> lme(Nm);
 std::vector<RealD> lme2(Nm);
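The power-iteration change above swaps the Rayleigh-quotient estimate of the extreme eigenvalue for a norm-based one. With tmp = H·src_n, the two estimates are

\[
\text{old:}\quad \lambda \approx \frac{\langle x, Hx\rangle}{\langle x, x\rangle},
\qquad
\text{new:}\quad \lambda \approx \sqrt{\frac{\langle Hx, Hx\rangle}{\langle x, x\rangle}} \;=\; \frac{\lVert Hx\rVert}{\lVert x\rVert},
\]

and for a Hermitian H both converge to the magnitude of the dominant eigenvalue as the iteration proceeds; the added GridLogIRL line simply reports the converged estimate.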
@@ -97,7 +97,7 @@ public:
 
 RealD scale;
 
-ConjugateGradient<FineField> CG(1.0e-2,100,false);
+ConjugateGradient<FineField> CG(1.0e-3,400,false);
 FineField noise(FineGrid);
 FineField Mn(FineGrid);
 
@@ -110,7 +110,7 @@ public:
 
 hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
 
-for(int i=0;i<1;i++){
+for(int i=0;i<4;i++){
 
 CG(hermop,noise,subspace[b]);
 
@@ -146,7 +146,7 @@ public:
 
 DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
 
-for(int i=0;i<3;i++){
+for(int i=0;i<2;i++){
 // void operator() (const Field &src, Field &psi){
 #if 1
 std::cout << GridLogMessage << " inverting on noise "<<std::endl;
@@ -441,8 +441,20 @@ public:
 std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
 }
 #else
+//////////////////////////////////////////////////////////////////////
+// Galerkin projection of matrix
+//////////////////////////////////////////////////////////////////////
 void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
 Aggregation<Fobj,CComplex,nbasis> & Subspace)
+{
+CoarsenOperator(linop,Subspace,Subspace);
+}
+//////////////////////////////////////////////////////////////////////
+// Petrov - Galerkin projection of matrix
+//////////////////////////////////////////////////////////////////////
+void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
+Aggregation<Fobj,CComplex,nbasis> & U,
+Aggregation<Fobj,CComplex,nbasis> & V)
 {
 std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
 GridBase *grid = FineGrid();
@@ -458,11 +470,9 @@ public:
 // Orthogonalise the subblocks over the basis
 /////////////////////////////////////////////////////////////
 CoarseScalar InnerProd(CoarseGrid());
-blockOrthogonalise(InnerProd,Subspace.subspace);
+blockOrthogonalise(InnerProd,V.subspace);
+blockOrthogonalise(InnerProd,U.subspace);
 
-// for(int s=0;s<Subspace.subspace.size();s++){
-// std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
-// }
 const int npoint = geom.npoint;
 
 Coordinate clatt = CoarseGrid()->GlobalDimensions();
@@ -542,7 +552,7 @@ public:
 std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
 for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
 tphaseBZ-=usecond();
-phaV = phaF[p]*Subspace.subspace[i];
+phaV = phaF[p]*V.subspace[i];
 tphaseBZ+=usecond();
 
 /////////////////////////////////////////////////////////////////////
@@ -555,7 +565,7 @@ public:
 // std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
 
 tproj-=usecond();
-blockProject(coarseInner,MphaV,Subspace.subspace);
+blockProject(coarseInner,MphaV,U.subspace);
 coarseInner = conjugate(pha[p]) * coarseInner;
 
 ComputeProj[p] = coarseInner;
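The restructuring above exposes both projections explicitly: the single-Aggregation overload forwards to the two-Aggregation version with U = V, the momentum phases are applied to the V (right) basis, and the result is block-projected onto the U (left) basis. Schematically, with P_U and P_V the block-projection operators built from U.subspace and V.subspace,

\[
\text{Galerkin:}\quad A_c = P^{\dagger} A\, P \quad (U = V = P),
\qquad
\text{Petrov--Galerkin:}\quad A_c = P_U^{\dagger} A\, P_V .
\]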
@@ -69,7 +69,7 @@ public:
 }
 
 // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
-void construct(pointer __p, const _Tp& __val) { assert(0);};
+void construct(pointer __p, const _Tp& __val) { };
 void construct(pointer __p) { };
 void destroy(pointer __p) { };
 };
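Relaxing construct(p, val) from assert(0) to an empty body means a container copy that routes through this allocator no longer aborts; as the FIXME above notes, the per-element copy-construction loop is deliberately skipped. A standalone sketch of the same behaviour with a cut-down, hypothetical allocator (not the Grid class itself):

#include <cstdlib>
#include <vector>

// Minimal allocator whose construct(p,val) is a no-op, mirroring the change above.
// Copying a vector that uses it allocates storage but skips element copy-construction,
// which is only sensible for trivially-copyable payloads.
template<typename T>
struct MiniAllocator {
  using value_type = T;
  using pointer    = T*;
  T*   allocate(std::size_t n)          { return static_cast<T*>(std::malloc(n*sizeof(T))); }
  void deallocate(T* p, std::size_t)    { std::free(p); }
  void construct(pointer, const T&)     { }   // was assert(0); now silently does nothing
  void construct(pointer)               { }
  void destroy(pointer)                 { }
  bool operator==(const MiniAllocator&) const { return true;  }
  bool operator!=(const MiniAllocator&) const { return false; }
};

int main(){
  std::vector<int, MiniAllocator<int>> a(4);   // elements left unconstructed by design
  auto b = a;                                  // copy no longer trips an assert
  (void)b;
  return 0;
}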
@@ -234,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 }
 void MemoryManager::EvictVictims(uint64_t bytes)
 {
+if(bytes>=DeviceMaxBytes) {
+printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
+}
 assert(bytes<DeviceMaxBytes);
 while(bytes+DeviceLRUBytes > DeviceMaxBytes){
 if ( DeviceLRUBytes > 0){
@@ -149,7 +149,8 @@ public:
 sizeof(obj),d*100+p);
 
 }
-CommsComplete(list);
+if (!list.empty()) // avoid triggering assert in comms == none
+CommsComplete(list);
 for(int p=1;p<_processors[d];p++){
 accum = accum + column[p];
 }
@@ -182,6 +183,7 @@ public:
 int recv_from_rank,
 int bytes);
 
+int IsOffNode(int rank);
 double StencilSendToRecvFrom(void *xmit,
 int xmit_to_rank,int do_xmit,
 void *recv,
@@ -200,9 +202,9 @@ public:
 void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
 
 double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-void *xmit,
+void *xmit,void *xmit_comp,
 int xmit_to_rank,int do_xmit,
-void *recv,
+void *recv,void *recv_comp,
 int recv_from_rank,int do_recv,
 int xbytes,int rbytes,int dir);
 
@@ -260,32 +260,39 @@ CartesianCommunicator::~CartesianCommunicator()
 }
 #ifdef USE_GRID_REDUCTION
 void CartesianCommunicator::GlobalSum(float &f){
+FlightRecorder::StepLog("GlobalSumP2P");
 CartesianCommunicator::GlobalSumP2P(f);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
+FlightRecorder::StepLog("GlobalSumP2P");
 CartesianCommunicator::GlobalSumP2P(d);
 }
 #else
 void CartesianCommunicator::GlobalSum(float &f){
+FlightRecorder::StepLog("AllReduce");
 int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
 assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
+FlightRecorder::StepLog("AllReduce");
 int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
 assert(ierr==0);
 }
 #endif
 void CartesianCommunicator::GlobalSum(uint32_t &u){
+FlightRecorder::StepLog("AllReduce");
 int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
 assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(uint64_t &u){
+FlightRecorder::StepLog("AllReduce");
 int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
 assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
+FlightRecorder::StepLog("AllReduceVector");
 int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
 assert(ierr==0);
 }
@@ -388,11 +395,16 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 {
 std::vector<CommsRequest_t> list;
 double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
-offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+offbytes += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
 StencilSendToRecvFromComplete(list,dir);
 return offbytes;
 }
+int CartesianCommunicator::IsOffNode(int rank)
+{
+int grank = ShmRanks[rank];
+if ( grank == MPI_UNDEFINED ) return true;
+else return false;
+}
 
 #ifdef ACCELERATOR_AWARE_MPI
 void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
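IsOffNode, added above, just tests whether the target rank has a translation into the node-local shared-memory communicator (ShmRanks[rank] == MPI_UNDEFINED means it lives on another node); the comms == none build later in this changeset always answers false. A toy standalone version of the same predicate, using a made-up rank table:

#include <cassert>
#include <vector>

constexpr int UNDEFINED = -1;   // stands in for MPI_UNDEFINED in this sketch

// Same logic as the new CartesianCommunicator::IsOffNode: a rank is off-node
// when it has no entry in the node-local shared-memory rank map.
bool IsOffNode(const std::vector<int> &ShmRanks, int rank) {
  return ShmRanks[rank] == UNDEFINED;
}

int main(){
  // Hypothetical 4-rank job: ranks 0,1 share this node; ranks 2,3 are remote.
  std::vector<int> ShmRanks = {0, 1, UNDEFINED, UNDEFINED};
  assert(!IsOffNode(ShmRanks, 1));
  assert( IsOffNode(ShmRanks, 3));
  return 0;
}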
@@ -407,9 +419,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
 return 0.0; // Do nothing -- no preparation required
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-void *xmit,
+void *xmit,void *xmit_comp,
 int dest,int dox,
-void *recv,
+void *recv,void *recv_comp,
 int from,int dor,
 int xbytes,int rbytes,int dir)
 {
@@ -433,24 +445,35 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 if ( dor ) {
 if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
 tag= dir+from*32;
-ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+// std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
+ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
 assert(ierr==0);
 list.push_back(rrq);
 off_node_bytes+=rbytes;
 }
+#ifdef NVLINK_GET
+else {
+void *shm = (void *) this->ShmBufferTranslate(from,xmit);
+assert(shm!=NULL);
+// std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
+acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
+}
+#endif
 }
+// This is a NVLINK PUT
 if (dox) {
 if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
 tag= dir+_processor*32;
-ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
 assert(ierr==0);
 list.push_back(xrq);
 off_node_bytes+=xbytes;
 } else {
+#ifndef NVLINK_GET
 void *shm = (void *) this->ShmBufferTranslate(dest,recv);
 assert(shm!=NULL);
 acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+#endif
 }
 }
 return off_node_bytes;
@@ -459,7 +482,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
 int nreq=list.size();
+/*finishes Get/Put*/
 acceleratorCopySynchronise();
 
 if (nreq==0) return;
@@ -660,9 +683,9 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
 }
 
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-void *xmit,
+void *xmit,void *xmit_comp,
 int dest,int dox,
-void *recv,
+void *recv,void *recv_comp,
 int from,int dor,
 int xbytes,int rbytes,int dir)
 {
@@ -746,26 +769,31 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
-// int nreq=list.size();
+acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
 
-// if (nreq==0) return;
-// std::vector<MPI_Status> status(nreq);
-// std::vector<MPI_Request> MpiRequests(nreq);
+std::vector<MPI_Status> status;
+std::vector<MPI_Request> MpiRequests;
+for(int r=0;r<list.size();r++){
+// Must check each Send buf is clear to reuse
+if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
+// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
+}
 
-// for(int r=0;r<nreq;r++){
-// MpiRequests[r] = list[r].req;
-// }
+int nreq=MpiRequests.size();
+if (nreq>0) {
+status.resize(MpiRequests.size());
+int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
+assert(ierr==0);
+}
 
-// int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
-// assert(ierr==0);
 
 // for(int r=0;r<nreq;r++){
 // if ( list[r].PacketType==InterNodeRecv ) {
 // acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
 // }
 // }
 
-acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
 
 list.resize(0); // Delete the list
 this->HostBufferFreeAll(); // Clean up the buffer allocs
@@ -780,6 +808,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
 
 void CartesianCommunicator::StencilBarrier(void)
 {
+FlightRecorder::StepLog("NodeBarrier");
 MPI_Barrier (ShmComm);
 }
 //void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -787,11 +816,13 @@ void CartesianCommunicator::StencilBarrier(void)
 //}
 void CartesianCommunicator::Barrier(void)
 {
+FlightRecorder::StepLog("GridBarrier");
 int ierr = MPI_Barrier(communicator);
 assert(ierr==0);
 }
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
+FlightRecorder::StepLog("Broadcast");
 int ierr=MPI_Bcast(data,
 bytes,
 MPI_BYTE,
@@ -810,6 +841,7 @@ void CartesianCommunicator::BarrierWorld(void){
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
+FlightRecorder::StepLog("BroadcastWorld");
 int ierr= MPI_Bcast(data,
 bytes,
 MPI_BYTE,
@@ -832,6 +864,7 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
 }
 void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
 {
+FlightRecorder::StepLog("AllToAll");
 // MPI is a pain and uses "int" arguments
 // 64*64*64*128*16 == 500Million elements of data.
 // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
@@ -124,6 +124,8 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
 dest=0;
 }
 
+int CartesianCommunicator::IsOffNode(int rank) { return false; }
+
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 int xmit_to_rank,int dox,
 void *recv,
@@ -137,7 +137,7 @@ public:
 ///////////////////////////////////////////////////
 static void SharedMemoryAllocate(uint64_t bytes, int flags);
 static void SharedMemoryFree(void);
-static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
+// static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
 static void SharedMemoryZero(void *dest,size_t bytes);
 
 };
@@ -542,38 +542,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 // Each MPI rank should allocate our own buffer
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
-printf("Host buffer allocate for GPU non-aware MPI\n");
-#if 0
-HostCommBuf= acceleratorAllocHost(bytes);
-#else
+// printf("Host buffer allocate for GPU non-aware MPI\n");
 HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
-#ifdef HAVE_NUMAIF_H
-#warning "Moving host buffers to specific NUMA domain"
-int numa;
-char *numa_name=(char *)getenv("MPI_BUF_NUMA");
-if(numa_name) {
-unsigned long page_size = sysconf(_SC_PAGESIZE);
-numa = atoi(numa_name);
-unsigned long page_count = bytes/page_size;
-std::vector<void *> pages(page_count);
-std::vector<int> nodes(page_count,numa);
-std::vector<int> status(page_count,-1);
-for(unsigned long p=0;p<page_count;p++){
-pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
-}
-int ret = move_pages(0,
-page_count,
-&pages[0],
-&nodes[0],
-&status[0],
-MPOL_MF_MOVE);
-printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
-if (ret) perror(" move_pages failed for reason:");
-}
-#endif
-acceleratorPin(HostCommBuf,bytes);
-#endif
-
+// acceleratorPin(HostCommBuf,bytes);
 #endif
 ShmCommBuf = acceleratorAllocDevice(bytes);
 if (ShmCommBuf == (void *)NULL ) {
@@ -916,14 +887,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
 bzero(dest,bytes);
 #endif
 }
-void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
-{
-#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
-acceleratorCopyToDevice(src,dest,bytes);
-#else
-bcopy(src,dest,bytes);
-#endif
-}
+//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
+//{
+//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
+// acceleratorCopyToDevice(src,dest,bytes);
+//#else
+// bcopy(src,dest,bytes);
+//#endif
+//}
 ////////////////////////////////////////////////////////
 // Global shared functionality finished
 // Now move to per communicator functionality
@@ -959,6 +930,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
 MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
 
 ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
+// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
 }
 ShmBufferFreeAll();
 
@@ -989,7 +961,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
 }
 #endif
 
-//SharedMemoryTest();
+// SharedMemoryTest();
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier
@@ -1011,19 +983,18 @@ void SharedMemory::SharedMemoryTest(void)
 check[0]=GlobalSharedMemory::WorldNode;
 check[1]=r;
 check[2]=magic;
-GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
+acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
 }
 }
 ShmBarrier();
 for(uint64_t r=0;r<ShmSize;r++){
-ShmBarrier();
-GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
-ShmBarrier();
+acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
 assert(check[0]==GlobalSharedMemory::WorldNode);
 assert(check[1]==r);
 assert(check[2]==magic);
-ShmBarrier();
 }
+ShmBarrier();
+std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
 }
 
 void *SharedMemory::ShmBuffer(int rank)
@@ -1039,11 +1010,13 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
 int gpeer = ShmRanks[rank];
 assert(gpeer!=ShmRank); // never send to self
+// std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
 if (gpeer == MPI_UNDEFINED){
 return NULL;
 } else {
 uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
 uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
+// std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
 return (void *) remote;
 }
 }
@@ -122,10 +122,10 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
 {
 acceleratorMemSet(dest,0,bytes);
 }
-void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
-{
-acceleratorCopyToDevice(src,dest,bytes);
-}
+//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
+//{
+// acceleratorCopyToDevice(src,dest,bytes);
+//}
 ////////////////////////////////////////////////////////
 // Global shared functionality finished
 // Now move to per communicator functionality
@@ -126,8 +126,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
 static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
 #ifndef ACCELERATOR_AWARE_MPI
 static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
 static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
 #endif
 
 int cb= (cbmask==0x2)? Odd : Even;
@@ -244,7 +244,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 scalar_object * recv_buf_extract_mpi;
 scalar_object * send_buf_extract_mpi;
 
-
 for(int s=0;s<Nsimd;s++){
 send_buf_extract[s].resize(buffer_size);
 recv_buf_extract[s].resize(buffer_size);
@@ -236,7 +236,7 @@ public:
 template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
 vobj vtmp;
 vtmp = r;
-#if 0
+#if 1
 deviceVector<vobj> vvtmp(1);
 acceleratorPut(vvtmp[0],vtmp);
 vobj *vvtmp_p = & vvtmp[0];
@@ -55,7 +55,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
 d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
 
 //copy offsets to device
-acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
+acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
 
 
 gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@@ -88,7 +88,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
 exit(EXIT_FAILURE);
 }
 
-acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
+acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
 
 //sync after copy
 accelerator_barrier();
@ -510,7 +510,6 @@ public:
grid->SendToRecvFromBegin(fwd_req,
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
#endif
t_comms+=usecond()-t;
}
@ -531,7 +530,6 @@ public:
grid->SendToRecvFromBegin(bwd_req,
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
#endif
t_comms+=usecond()-t;
}
@ -555,8 +553,13 @@ public:

t=usecond();
grid->CommsComplete(fwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t;

t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
@ -565,6 +568,11 @@ public:

t=usecond();
grid->CommsComplete(bwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t;

t=usecond();
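Taken together, the four hunks above change the non-accelerator-aware-MPI path so that the host-to-device copy of the received halo is issued only after CommsComplete, rather than immediately after SendToRecvFromBegin posts the transfers. The sketch below shows the same ordering in plain MPI; it is illustrative only, with a memcpy standing in for the device copy that Grid performs with acceleratorCopyToDevice:

#include <mpi.h>
#include <vector>
#include <cstring>

// Post comms on host buffers, wait for completion, and only then stage the
// received data onto the device -- the ordering enforced by the hunks above.
void exchange_halo(std::vector<double>& send_host, std::vector<double>& recv_host,
                   double* recv_device, int to, int from, MPI_Comm comm)
{
  MPI_Request reqs[2];
  MPI_Irecv(recv_host.data(), (int)recv_host.size(), MPI_DOUBLE, from, 0, comm, &reqs[0]);
  MPI_Isend(send_host.data(), (int)send_host.size(), MPI_DOUBLE, to,   0, comm, &reqs[1]);

  MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);   // analogue of grid->CommsComplete(...)

  // Only now is the host receive buffer valid; stage it onto the device.
  // In Grid: acceleratorCopyToDevice(&hrecv_buf[...], &recv_buf[...], bytes);
  std::memcpy(recv_device, recv_host.data(), recv_host.size() * sizeof(double));
}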
Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h (new file, 196 lines)
@ -0,0 +1,196 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5D.h
|
||||||
|
|
||||||
|
Copyright (C) 2020 - 2025
|
||||||
|
|
||||||
|
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de>
|
||||||
|
Author: Christoph Lehner <christoph@lhnr.de>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
|
||||||
|
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
|
||||||
|
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
|
||||||
|
#include <Grid/qcd/action/fermion/CloverHelpers.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
// see Grid/qcd/action/fermion/CompactWilsonCloverFermion.h for description
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
class CompactWilsonCloverFermion5D : public WilsonFermion5D<Impl>,
|
||||||
|
public WilsonCloverHelpers<Impl>,
|
||||||
|
public CompactWilsonCloverHelpers<Impl> {
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
// Sizes
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
INHERIT_COMPACT_CLOVER_SIZES(Impl);
|
||||||
|
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
// Type definitions
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
|
INHERIT_CLOVER_TYPES(Impl);
|
||||||
|
INHERIT_COMPACT_CLOVER_TYPES(Impl);
|
||||||
|
|
||||||
|
typedef WilsonFermion5D<Impl> WilsonBase;
|
||||||
|
typedef WilsonCloverHelpers<Impl> Helpers;
|
||||||
|
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
|
||||||
|
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
// Constructors
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
CompactWilsonCloverFermion5D(GaugeField& _Umu,
|
||||||
|
GridCartesian &FiveDimGrid,
|
||||||
|
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||||
|
GridCartesian &FourDimGrid,
|
||||||
|
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||||
|
const RealD _mass,
|
||||||
|
const RealD _csw_r = 0.0,
|
||||||
|
const RealD _csw_t = 0.0,
|
||||||
|
const RealD _cF = 1.0,
|
||||||
|
const ImplParams& impl_p = ImplParams());
|
||||||
|
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
// Member functions (implementing interface)
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
virtual void Instantiatable() {};
|
||||||
|
int ConstEE() override { return 0; };
|
||||||
|
int isTrivialEE() override { return 0; };
|
||||||
|
|
||||||
|
void Dhop(const FermionField& in, FermionField& out, int dag) override;
|
||||||
|
|
||||||
|
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
|
||||||
|
|
||||||
|
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
|
||||||
|
|
||||||
|
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
|
||||||
|
|
||||||
|
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
|
||||||
|
|
||||||
|
void M(const FermionField& in, FermionField& out) override;
|
||||||
|
|
||||||
|
void Mdag(const FermionField& in, FermionField& out) override;
|
||||||
|
|
||||||
|
void Meooe(const FermionField& in, FermionField& out) override;
|
||||||
|
|
||||||
|
void MeooeDag(const FermionField& in, FermionField& out) override;
|
||||||
|
|
||||||
|
void Mooee(const FermionField& in, FermionField& out) override;
|
||||||
|
|
||||||
|
void MooeeDag(const FermionField& in, FermionField& out) override;
|
||||||
|
|
||||||
|
void MooeeInv(const FermionField& in, FermionField& out) override;
|
||||||
|
|
||||||
|
void MooeeInvDag(const FermionField& in, FermionField& out) override;
|
||||||
|
|
||||||
|
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
|
||||||
|
|
||||||
|
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
|
||||||
|
|
||||||
|
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
|
||||||
|
|
||||||
|
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
|
||||||
|
|
||||||
|
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
|
||||||
|
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
// Member functions (internals)
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
|
||||||
|
void MooeeInternal(const FermionField& in,
|
||||||
|
FermionField& out,
|
||||||
|
const CloverDiagonalField& diagonal,
|
||||||
|
const CloverTriangleField& triangle);
|
||||||
|
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
// Helpers
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
|
||||||
|
void ImportGauge(const GaugeField& _Umu) override;
|
||||||
|
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
// Helpers
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
|
||||||
|
private:
|
||||||
|
|
||||||
|
template<class Field>
|
||||||
|
const MaskField* getCorrectMaskField(const Field &in) const {
|
||||||
|
if(in.Grid()->_isCheckerBoarded) {
|
||||||
|
if(in.Checkerboard() == Odd) {
|
||||||
|
return &this->BoundaryMaskOdd;
|
||||||
|
} else {
|
||||||
|
return &this->BoundaryMaskEven;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return &this->BoundaryMask;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Field>
|
||||||
|
void ApplyBoundaryMask(Field& f) {
|
||||||
|
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
|
||||||
|
assert(m != nullptr);
|
||||||
|
CompactHelpers::ApplyBoundaryMask(f, *m);
|
||||||
|
}
|
||||||
|
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
// Member Data
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
RealD csw_r;
|
||||||
|
RealD csw_t;
|
||||||
|
RealD cF;
|
||||||
|
int n_rhs;
|
||||||
|
|
||||||
|
bool fixedBoundaries;
|
||||||
|
|
||||||
|
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
|
||||||
|
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
|
||||||
|
|
||||||
|
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
|
||||||
|
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
|
||||||
|
|
||||||
|
FermionField Tmp;
|
||||||
|
|
||||||
|
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -55,6 +55,7 @@ NAMESPACE_CHECK(Wilson);
NAMESPACE_CHECK(WilsonTM);
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h> // 5d compact wilson clover fermions
NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D);
@ -164,12 +165,17 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS

// Compact Clover fermions
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonClover5D = CompactWilsonCloverFermion5D<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;

typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;

typedef CompactWilsonClover5D<WilsonImplD2> CompactWilsonCloverFermion5DD2;
typedef CompactWilsonClover5D<WilsonImplF> CompactWilsonCloverFermion5DF;
typedef CompactWilsonClover5D<WilsonImplD> CompactWilsonCloverFermion5DD;

typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
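A hedged sketch of how the new 5D compact clover operator might be constructed and applied, using the typedefs above and the constructor signature declared in the new header; the grid-setup, RNG and gauge-initialisation helpers are the usual Grid ones and are assumptions of this sketch rather than part of the change:

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char** argv)
{
  Grid_init(&argc, &argv);
  const int Ls = 8;

  // 4d and 5d grids plus their red-black halves
  GridCartesian*         UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                     GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian*         FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian* FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);

  // Random gauge field, test-style initialisation
  GridParallelRNG pRNG(UGrid);
  pRNG.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));
  LatticeGaugeFieldD Umu(UGrid);
  SU<Nc>::HotConfiguration(pRNG, Umu);

  // Constructor arguments follow the declaration in CompactWilsonCloverFermion5D.h above
  RealD mass = 0.1, csw_r = 1.0, csw_t = 1.0, cF = 1.0;
  CompactWilsonCloverFermion5DD Dcc(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid,
                                    mass, csw_r, csw_t, cF);

  GridParallelRNG RNG5(FGrid);
  RNG5.SeedFixedIntegers(std::vector<int>({5, 6, 7, 8}));
  LatticeFermionD src(FGrid); gaussian(RNG5, src);
  LatticeFermionD res(FGrid); res = Zero();
  Dcc.M(src, res);                 // apply the full operator once

  Grid_finalize();
  return 0;
}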
@ -154,6 +154,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}

// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
@ -179,6 +179,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}

// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
@ -146,6 +146,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}

// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
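The three identical hunks above (and the further copies later in this diff) add the same forwarding setter to each fermion-action class. The stub below is illustrative only, not Grid code; it just captures the invariant they establish, namely that the full, even and odd stencils of one operator always carry the same sloppy-comms flag:

// Minimal stand-alone sketch of the forwarding pattern added above.
struct StencilStub {
  int sloppy = 0;
  void SetSloppyComms(int s) { sloppy = s; }
};

struct FermionOperatorStub {
  StencilStub Stencil, StencilEven, StencilOdd;
  void SloppyComms(int sloppy) {          // one call keeps all three stencils consistent
    Stencil.SetSloppyComms(sloppy);
    StencilEven.SetSloppyComms(sloppy);
    StencilOdd.SetSloppyComms(sloppy);
  }
};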
@ -32,209 +32,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
|
||||||
// Wilson compressor will need FaceGather policies for:
|
|
||||||
// Periodic, Dirichlet, and partial Dirichlet for DWF
|
|
||||||
///////////////////////////////////////////////////////////////
|
|
||||||
const int dwf_compressor_depth=2;
|
|
||||||
#define DWF_COMPRESS
|
|
||||||
class FaceGatherPartialDWF
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
#ifdef DWF_COMPRESS
|
|
||||||
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
|
|
||||||
#else
|
|
||||||
static int PartialCompressionFactor(GridBase *grid) { return 1;}
|
|
||||||
#endif
|
|
||||||
template<class vobj,class cobj,class compressor>
|
|
||||||
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
|
|
||||||
const Lattice<vobj> &rhs,
|
|
||||||
cobj *buffer,
|
|
||||||
compressor &compress,
|
|
||||||
int off,int so,int partial)
|
|
||||||
{
|
|
||||||
//DWF only hack: If a direction that is OFF node we use Partial Dirichlet
|
|
||||||
// Shrinks local and remote comms buffers
|
|
||||||
GridBase *Grid = rhs.Grid();
|
|
||||||
int Ls = Grid->_rdimensions[0];
|
|
||||||
#ifdef DWF_COMPRESS
|
|
||||||
int depth=dwf_compressor_depth;
|
|
||||||
#else
|
|
||||||
int depth=Ls/2;
|
|
||||||
#endif
|
|
||||||
std::pair<int,int> *table_v = & table[0];
|
|
||||||
auto rhs_v = rhs.View(AcceleratorRead);
|
|
||||||
int vol=table.size()/Ls;
|
|
||||||
accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
|
|
||||||
Integer i=idx/Ls;
|
|
||||||
Integer s=idx%Ls;
|
|
||||||
Integer sc=depth+s-(Ls-depth);
|
|
||||||
if(s<depth) compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
|
|
||||||
if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
|
|
||||||
});
|
|
||||||
rhs_v.ViewClose();
|
|
||||||
}
|
|
||||||
template<class decompressor,class Decompression>
|
|
||||||
static void DecompressFace(decompressor decompress,Decompression &dd)
|
|
||||||
{
|
|
||||||
auto Ls = dd.dims[0];
|
|
||||||
#ifdef DWF_COMPRESS
|
|
||||||
int depth=dwf_compressor_depth;
|
|
||||||
#else
|
|
||||||
int depth=Ls/2;
|
|
||||||
#endif
|
|
||||||
// Just pass in the Grid
|
|
||||||
auto kp = dd.kernel_p;
|
|
||||||
auto mp = dd.mpi_p;
|
|
||||||
int size= dd.buffer_size;
|
|
||||||
int vol= size/Ls;
|
|
||||||
accelerator_forNB(o,size,1,{
|
|
||||||
int idx=o/Ls;
|
|
||||||
int s=o%Ls;
|
|
||||||
if ( s < depth ) {
|
|
||||||
int oo=s*vol+idx;
|
|
||||||
kp[o]=mp[oo];
|
|
||||||
} else if ( s >= Ls-depth ) {
|
|
||||||
int sc = depth + s - (Ls-depth);
|
|
||||||
int oo=sc*vol+idx;
|
|
||||||
kp[o]=mp[oo];
|
|
||||||
} else {
|
|
||||||
kp[o] = Zero();//fill rest with zero if partial dirichlet
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Need to gather *interior portions* for ALL s-slices in simd directions
|
|
||||||
// Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side
|
|
||||||
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
template<class vobj,class cobj,class compressor>
|
|
||||||
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
|
||||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
|
||||||
compressor &compress,int type,int partial)
|
|
||||||
{
|
|
||||||
GridBase *Grid = rhs.Grid();
|
|
||||||
int Ls = Grid->_rdimensions[0];
|
|
||||||
#ifdef DWF_COMPRESS
|
|
||||||
int depth=dwf_compressor_depth;
|
|
||||||
#else
|
|
||||||
int depth = Ls/2;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// insertion of zeroes...
|
|
||||||
assert( (table.size()&0x1)==0);
|
|
||||||
int num=table.size()/2;
|
|
||||||
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
|
|
||||||
|
|
||||||
auto rhs_v = rhs.View(AcceleratorRead);
|
|
||||||
auto p0=&pointers[0][0];
|
|
||||||
auto p1=&pointers[1][0];
|
|
||||||
auto tp=&table[0];
|
|
||||||
int nnum=num/Ls;
|
|
||||||
accelerator_forNB(j, num, vobj::Nsimd(), {
|
|
||||||
// Reorders both local and remote comms buffers
|
|
||||||
//
|
|
||||||
int s = j % Ls;
|
|
||||||
int sp1 = (s+depth)%Ls; // peri incremented s slice
|
|
||||||
|
|
||||||
int hxyz= j/Ls;
|
|
||||||
|
|
||||||
int xyz0= hxyz*2; // xyzt part of coor
|
|
||||||
int xyz1= hxyz*2+1;
|
|
||||||
|
|
||||||
int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
|
|
||||||
|
|
||||||
int kk0= xyz0*Ls + s ; // s=0 goes to s=1
|
|
||||||
int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
|
|
||||||
compress.CompressExchange(p0[jj],p1[jj],
|
|
||||||
rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
|
|
||||||
rhs_v[so+tp[kk1 ].second],
|
|
||||||
type);
|
|
||||||
});
|
|
||||||
rhs_v.ViewClose();
|
|
||||||
}
|
|
||||||
// Merge routine is for SIMD faces
|
|
||||||
template<class decompressor,class Merger>
|
|
||||||
static void MergeFace(decompressor decompress,Merger &mm)
|
|
||||||
{
|
|
||||||
auto Ls = mm.dims[0];
|
|
||||||
#ifdef DWF_COMPRESS
|
|
||||||
int depth=dwf_compressor_depth;
|
|
||||||
#else
|
|
||||||
int depth = Ls/2;
|
|
||||||
#endif
|
|
||||||
int num= mm.buffer_size/2; // relate vol and Ls to buffer size
|
|
||||||
auto mp = &mm.mpointer[0];
|
|
||||||
auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
|
|
||||||
auto vp1= &mm.vpointers[1][0];
|
|
||||||
auto type= mm.type;
|
|
||||||
int nnum = num/Ls;
|
|
||||||
accelerator_forNB(o,num,Merger::Nsimd,{
|
|
||||||
|
|
||||||
int s=o%Ls;
|
|
||||||
int hxyz=o/Ls; // xyzt related component
|
|
||||||
int xyz0=hxyz*2;
|
|
||||||
int xyz1=hxyz*2+1;
|
|
||||||
|
|
||||||
int sp = (s+depth)%Ls;
|
|
||||||
int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
|
|
||||||
|
|
||||||
int oo0= s+xyz0*Ls;
|
|
||||||
int oo1= s+xyz1*Ls;
|
|
||||||
|
|
||||||
// same ss0, ss1 pair goes to new layout
|
|
||||||
decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
};
|
|
||||||
class FaceGatherDWFMixedBCs
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
#ifdef DWF_COMPRESS
|
|
||||||
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
|
|
||||||
#else
|
|
||||||
static int PartialCompressionFactor(GridBase *grid) {return 1;}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
template<class vobj,class cobj,class compressor>
|
|
||||||
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
|
|
||||||
const Lattice<vobj> &rhs,
|
|
||||||
cobj *buffer,
|
|
||||||
compressor &compress,
|
|
||||||
int off,int so,int partial)
|
|
||||||
{
|
|
||||||
// std::cout << " face gather simple DWF partial "<<partial <<std::endl;
|
|
||||||
if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
|
|
||||||
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
|
|
||||||
}
|
|
||||||
template<class vobj,class cobj,class compressor>
|
|
||||||
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
|
||||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
|
||||||
compressor &compress,int type,int partial)
|
|
||||||
{
|
|
||||||
// std::cout << " face gather exch DWF partial "<<partial <<std::endl;
|
|
||||||
if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
|
|
||||||
else FaceGatherSimple::Gather_plane_exchange (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
|
|
||||||
}
|
|
||||||
template<class decompressor,class Merger>
|
|
||||||
static void MergeFace(decompressor decompress,Merger &mm)
|
|
||||||
{
|
|
||||||
int partial = mm.partial;
|
|
||||||
// std::cout << " merge DWF partial "<<partial <<std::endl;
|
|
||||||
if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
|
|
||||||
else FaceGatherSimple::MergeFace(decompress,mm);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class decompressor,class Decompression>
|
|
||||||
static void DecompressFace(decompressor decompress,Decompression &dd)
|
|
||||||
{
|
|
||||||
int partial = dd.partial;
|
|
||||||
// std::cout << " decompress DWF partial "<<partial <<std::endl;
|
|
||||||
if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
|
|
||||||
else FaceGatherSimple::DecompressFace(decompress,dd);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// optimised versions supporting half precision too??? Deprecate
|
// optimised versions supporting half precision too??? Deprecate
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -242,8 +39,7 @@ public:

//Could make FaceGather a template param, but then behaviour is runtime not compile time
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs
class WilsonCompressorTemplate : public FaceGatherSimple
// : public FaceGatherSimple
{
public:

@ -485,7 +281,6 @@ public:
assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier();
#ifdef NVLINK_GET
#warning "NVLINK_GET"
this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
// Or issue barrier AFTER the DMA is running
@ -165,6 +165,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}

// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
@ -91,13 +91,13 @@ public:
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};

// half checkerboard operations; leave unimplemented as abstract for now
virtual void Meooe (const FermionField &in, FermionField &out){assert(0);};
virtual void Meooe (const FermionField &in, FermionField &out);
virtual void Mooee (const FermionField &in, FermionField &out){assert(0);};
virtual void Mooee (const FermionField &in, FermionField &out);
virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeInv (const FermionField &in, FermionField &out);

virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MeooeDag (const FermionField &in, FermionField &out);
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeDag (const FermionField &in, FermionField &out);
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeInvDag (const FermionField &in, FermionField &out);
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac

@ -204,7 +204,14 @@ public:
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;


void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Comms buffer
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

@ -0,0 +1,376 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5DImplementation.h
|
||||||
|
|
||||||
|
Copyright (C) 2017 - 2025
|
||||||
|
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||||
|
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
|
||||||
|
Author: Christoph Lehner <christoph@lhnr.de>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#include <Grid/Grid.h>
|
||||||
|
#include <Grid/qcd/spin/Dirac.h>
|
||||||
|
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
|
||||||
|
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
CompactWilsonCloverFermion5D<Impl, CloverHelpers>::CompactWilsonCloverFermion5D(GaugeField& _Umu,
|
||||||
|
GridCartesian &FiveDimGrid,
|
||||||
|
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||||
|
GridCartesian &FourDimGrid,
|
||||||
|
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||||
|
const RealD _mass,
|
||||||
|
const RealD _csw_r,
|
||||||
|
const RealD _csw_t,
|
||||||
|
const RealD _cF,
|
||||||
|
const ImplParams& impl_p)
|
||||||
|
: WilsonBase(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid, _mass, impl_p)
|
||||||
|
, csw_r(_csw_r)
|
||||||
|
, csw_t(_csw_t)
|
||||||
|
, cF(_cF)
|
||||||
|
, fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
|
||||||
|
, Diagonal(&FourDimGrid), Triangle(&FourDimGrid)
|
||||||
|
, DiagonalEven(&FourDimRedBlackGrid), TriangleEven(&FourDimRedBlackGrid)
|
||||||
|
, DiagonalOdd(&FourDimRedBlackGrid), TriangleOdd(&FourDimRedBlackGrid)
|
||||||
|
, DiagonalInv(&FourDimGrid), TriangleInv(&FourDimGrid)
|
||||||
|
, DiagonalInvEven(&FourDimRedBlackGrid), TriangleInvEven(&FourDimRedBlackGrid)
|
||||||
|
, DiagonalInvOdd(&FourDimRedBlackGrid), TriangleInvOdd(&FourDimRedBlackGrid)
|
||||||
|
, Tmp(&FiveDimGrid)
|
||||||
|
, BoundaryMask(&FiveDimGrid)
|
||||||
|
, BoundaryMaskEven(&FiveDimRedBlackGrid), BoundaryMaskOdd(&FiveDimRedBlackGrid)
|
||||||
|
{
|
||||||
|
assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
|
||||||
|
|
||||||
|
csw_r *= 0.5;
|
||||||
|
csw_t *= 0.5;
|
||||||
|
//if (clover_anisotropy.isAnisotropic)
|
||||||
|
// csw_r /= clover_anisotropy.xi_0;
|
||||||
|
|
||||||
|
ImportGauge(_Umu);
|
||||||
|
if (fixedBoundaries) {
|
||||||
|
this->BoundaryMaskEven.Checkerboard() = Even;
|
||||||
|
this->BoundaryMaskOdd.Checkerboard() = Odd;
|
||||||
|
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
|
||||||
|
WilsonBase::Dhop(in, out, dag);
|
||||||
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
|
||||||
|
WilsonBase::DhopOE(in, out, dag);
|
||||||
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
|
||||||
|
WilsonBase::DhopEO(in, out, dag);
|
||||||
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
|
||||||
|
WilsonBase::DhopDir(in, out, dir, disp);
|
||||||
|
if(this->fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
|
||||||
|
WilsonBase::DhopDirAll(in, out);
|
||||||
|
if(this->fixedBoundaries) {
|
||||||
|
for(auto& o : out) ApplyBoundaryMask(o);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
|
||||||
|
Mooee(in, Tmp);
|
||||||
|
axpy(out, 1.0, out, Tmp);
|
||||||
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
|
||||||
|
MooeeDag(in, Tmp);
|
||||||
|
axpy(out, 1.0, out, Tmp);
|
||||||
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
|
||||||
|
WilsonBase::Meooe(in, out);
|
||||||
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
|
||||||
|
WilsonBase::MeooeDag(in, out);
|
||||||
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
|
||||||
|
if(in.Grid()->_isCheckerBoarded) {
|
||||||
|
if(in.Checkerboard() == Odd) {
|
||||||
|
MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
|
||||||
|
} else {
|
||||||
|
MooeeInternal(in, out, DiagonalEven, TriangleEven);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
MooeeInternal(in, out, Diagonal, Triangle);
|
||||||
|
}
|
||||||
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
|
||||||
|
Mooee(in, out); // blocks are hermitian
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
|
||||||
|
if(in.Grid()->_isCheckerBoarded) {
|
||||||
|
if(in.Checkerboard() == Odd) {
|
||||||
|
MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
|
||||||
|
} else {
|
||||||
|
MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
MooeeInternal(in, out, DiagonalInv, TriangleInv);
|
||||||
|
}
|
||||||
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
|
||||||
|
MooeeInv(in, out); // blocks are hermitian
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
|
||||||
|
DhopDir(in, out, dir, disp);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
|
||||||
|
DhopDirAll(in, out);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
|
||||||
|
assert(!fixedBoundaries); // TODO check for changes required for open bc
|
||||||
|
|
||||||
|
// NOTE: code copied from original clover term
|
||||||
|
conformable(X.Grid(), Y.Grid());
|
||||||
|
conformable(X.Grid(), force.Grid());
|
||||||
|
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
|
||||||
|
GaugeField clover_force(force.Grid());
|
||||||
|
PropagatorField Lambda(force.Grid());
|
||||||
|
|
||||||
|
// Guido: Here we are hitting some performance issues:
|
||||||
|
// need to extract the components of the DoubledGaugeField
|
||||||
|
// for each call
|
||||||
|
// Possible solution
|
||||||
|
// Create a vector object to store them? (cons: wasting space)
|
||||||
|
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
|
||||||
|
|
||||||
|
Impl::extractLinkField(U, this->Umu);
|
||||||
|
|
||||||
|
force = Zero();
|
||||||
|
// Derivative of the Wilson hopping term
|
||||||
|
this->DhopDeriv(force, X, Y, dag);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
// Clover term derivative
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
Impl::outerProductImpl(Lambda, X, Y);
|
||||||
|
//std::cout << "Lambda:" << Lambda << std::endl;
|
||||||
|
|
||||||
|
Gamma::Algebra sigma[] = {
|
||||||
|
Gamma::Algebra::SigmaXY,
|
||||||
|
Gamma::Algebra::SigmaXZ,
|
||||||
|
Gamma::Algebra::SigmaXT,
|
||||||
|
Gamma::Algebra::MinusSigmaXY,
|
||||||
|
Gamma::Algebra::SigmaYZ,
|
||||||
|
Gamma::Algebra::SigmaYT,
|
||||||
|
Gamma::Algebra::MinusSigmaXZ,
|
||||||
|
Gamma::Algebra::MinusSigmaYZ,
|
||||||
|
Gamma::Algebra::SigmaZT,
|
||||||
|
Gamma::Algebra::MinusSigmaXT,
|
||||||
|
Gamma::Algebra::MinusSigmaYT,
|
||||||
|
Gamma::Algebra::MinusSigmaZT};
|
||||||
|
|
||||||
|
/*
|
||||||
|
sigma_{\mu \nu}=
|
||||||
|
| 0 sigma[0] sigma[1] sigma[2] |
|
||||||
|
| sigma[3] 0 sigma[4] sigma[5] |
|
||||||
|
| sigma[6] sigma[7] 0 sigma[8] |
|
||||||
|
| sigma[9] sigma[10] sigma[11] 0 |
|
||||||
|
*/
|
||||||
|
|
||||||
|
int count = 0;
|
||||||
|
clover_force = Zero();
|
||||||
|
for (int mu = 0; mu < 4; mu++)
|
||||||
|
{
|
||||||
|
force_mu = Zero();
|
||||||
|
for (int nu = 0; nu < 4; nu++)
|
||||||
|
{
|
||||||
|
if (mu == nu)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
RealD factor;
|
||||||
|
if (nu == 4 || mu == 4)
|
||||||
|
{
|
||||||
|
factor = 2.0 * csw_t;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
factor = 2.0 * csw_r;
|
||||||
|
}
|
||||||
|
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
|
||||||
|
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
|
||||||
|
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
pokeLorentz(clover_force, U[mu] * force_mu, mu);
|
||||||
|
}
|
||||||
|
//clover_force *= csw;
|
||||||
|
force += clover_force;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInternal(const FermionField& in,
|
||||||
|
FermionField& out,
|
||||||
|
const CloverDiagonalField& diagonal,
|
||||||
|
const CloverTriangleField& triangle) {
|
||||||
|
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
conformable(in, out);
|
||||||
|
CompactHelpers::MooeeKernel(diagonal.oSites(), this->Ls, in, out, diagonal, triangle);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl, class CloverHelpers>
|
||||||
|
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
|
||||||
|
// NOTE: parts copied from original implementation
|
||||||
|
|
||||||
|
// Import gauge into base class
|
||||||
|
double t0 = usecond();
|
||||||
|
WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
|
||||||
|
|
||||||
|
// Initialize temporary variables
|
||||||
|
double t1 = usecond();
|
||||||
|
conformable(_Umu.Grid(), this->GaugeGrid());
|
||||||
|
GridBase* grid = _Umu.Grid();
|
||||||
|
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
|
||||||
|
CloverField TmpOriginal(grid);
|
||||||
|
CloverField TmpInverse(grid);
|
||||||
|
|
||||||
|
// Compute the field strength terms mu>nu
|
||||||
|
double t2 = usecond();
|
||||||
|
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
|
||||||
|
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
|
||||||
|
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
|
||||||
|
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
|
||||||
|
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
|
||||||
|
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
|
||||||
|
|
||||||
|
// Compute the Clover Operator acting on Colour and Spin
|
||||||
|
// multiply here by the clover coefficients for the anisotropy
|
||||||
|
double t3 = usecond();
|
||||||
|
TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r;
|
||||||
|
TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
|
||||||
|
TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
|
||||||
|
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
|
||||||
|
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
|
||||||
|
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
|
||||||
|
|
||||||
|
// Instantiate the clover term
|
||||||
|
// - In case of the standard clover the mass term is added
|
||||||
|
// - In case of the exponential clover the clover term is exponentiated
|
||||||
|
double t4 = usecond();
|
||||||
|
CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, 4.0 + this->M5 /*this->diag_mass*/);
|
||||||
|
|
||||||
|
// Convert the data layout of the clover term
|
||||||
|
double t5 = usecond();
|
||||||
|
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
|
||||||
|
|
||||||
|
// Modify the clover term at the temporal boundaries in case of open boundary conditions
|
||||||
|
double t6 = usecond();
|
||||||
|
if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, 4.0 + this->M5 /*this->diag_mass*/);
|
||||||
|
|
||||||
|
// Invert the Clover term
|
||||||
|
// In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
|
||||||
|
// in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
|
||||||
|
// TODO: For now this inversion is explictly done on the CPU
|
||||||
|
double t7 = usecond();
|
||||||
|
CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
|
||||||
|
|
||||||
|
// Fill the remaining clover fields
|
||||||
|
double t8 = usecond();
|
||||||
|
pickCheckerboard(Even, DiagonalEven, Diagonal);
|
||||||
|
pickCheckerboard(Even, TriangleEven, Triangle);
|
||||||
|
pickCheckerboard(Odd, DiagonalOdd, Diagonal);
|
||||||
|
pickCheckerboard(Odd, TriangleOdd, Triangle);
|
||||||
|
pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
|
||||||
|
pickCheckerboard(Even, TriangleInvEven, TriangleInv);
|
||||||
|
pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv);
|
||||||
|
pickCheckerboard(Odd, TriangleInvOdd, TriangleInv);
|
||||||
|
|
||||||
|
// Report timings
|
||||||
|
double t9 = usecond();
|
||||||
|
|
||||||
|
std::cout << GridLogDebug << "CompactWilsonCloverFermion5D::ImportGauge timings:" << std::endl;
|
||||||
|
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
|
||||||
|
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
|
||||||
|
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
|
||||||
|
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
|
||||||
|
std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl;
|
||||||
|
std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl;
|
||||||
|
std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl;
|
||||||
|
std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl;
|
||||||
|
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
|
||||||
|
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -14,6 +14,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -484,6 +485,54 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
Dhop(in,out,dag); // -0.5 is included
axpy(out,4.0-M5,in,out);
}
template <class Impl>
void WilsonFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}

template <class Impl>
void WilsonFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}

template <class Impl>
void WilsonFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(4.0 + M5);
out = scal * in;
}

template <class Impl>
void WilsonFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
Mooee(in, out);
}

template<class Impl>
void WilsonFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
out = (1.0/(4.0 + M5))*in;
}

template<class Impl>
void WilsonFermion5D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
MooeeInv(in,out);
}

template<class Impl>
void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in, RealD mass,std::vector<double> twist)
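For context, the new bodies above are the standard ingredients of red-black (even-odd) preconditioning for this 5d Wilson operator; in the usual textbook notation (stated here for reference, not copied from Grid):

\[
M=\begin{pmatrix} M_{ee} & M_{eo}\\ M_{oe} & M_{oo}\end{pmatrix},\qquad
M_{ee}=M_{oo}=(4+M_5)\,\mathbb{1},\qquad
\hat M = M_{oo}-M_{oe}\,M_{ee}^{-1}M_{eo},
\]

so Mooee/MooeeDag reduce to the scalar \(4+M_5\), MooeeInv/MooeeInvDag to \(1/(4+M_5)\), and Meooe/MeooeDag apply the off-diagonal hopping term, choosing DhopEO or DhopOE from the checkerboard of the input field, which is exactly what the added code does.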
@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi);

@ -504,7 +504,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
autoView(st_v , st,AcceleratorRead);

if( interior && exterior ) {
// acceleratorFenceComputeStream();
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
#ifndef GRID_CUDA
@ -0,0 +1,45 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/ qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation5D.cc.master
|
||||||
|
|
||||||
|
Copyright (C) 2017 - 2025
|
||||||
|
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||||
|
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
|
||||||
|
Author: Mattia Bruno <mattia.bruno@cern.ch>
|
||||||
|
Author: Christoph Lehner <christoph@lhnr.de>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#include <Grid/Grid.h>
|
||||||
|
#include <Grid/qcd/spin/Dirac.h>
|
||||||
|
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h>
|
||||||
|
#include <Grid/qcd/action/fermion/CloverHelpers.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
#include "impl.h"
|
||||||
|
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>;
|
||||||
|
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -0,0 +1 @@
../CompactWilsonCloverFermion5DInstantiation.cc.master
@ -0,0 +1 @@
../CompactWilsonCloverFermion5DInstantiation.cc.master
@ -62,7 +62,7 @@ do
done
done

CC_LIST="CompactWilsonCloverFermionInstantiation"
CC_LIST="CompactWilsonCloverFermionInstantiation CompactWilsonCloverFermion5DInstantiation"

for impl in $COMPACT_WILSON_IMPL_LIST
do
@ -76,27 +76,27 @@ public:
return action;
};

virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) {
virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
//extend Ta to include Lorentz indexes
RealD factor_p = c_plaq/RealD(Nc)*0.5;
RealD factor_r = c_rect/RealD(Nc)*0.5;

GridBase *grid = Umu.Grid();
GridBase *grid = U.Grid();

std::vector<GaugeLinkField> U (Nd,grid);
std::vector<GaugeLinkField> Umu (Nd,grid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
Umu[mu] = PeekIndex<LorentzIndex>(U,mu);
}
std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, Umu, workspace);

GaugeLinkField dSdU_mu(grid);
GaugeLinkField staple(grid);

for (int mu=0; mu < Nd; mu++){
dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
dSdU_mu = Ta(Umu[mu]*Staple[mu])*factor_p;
dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
dSdU_mu = dSdU_mu + Ta(Umu[mu]*RectStaple[mu])*factor_r;

PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}

@ -73,20 +73,23 @@ public:
// extend Ta to include Lorentz indexes

RealD factor = 0.5 * beta / RealD(Nc);
GridBase *grid = U.Grid();

GaugeLinkField Umu(U.Grid());
GaugeLinkField dSdU_mu(grid);
GaugeLinkField dSdU_mu(U.Grid());
std::vector<GaugeLinkField> Umu(Nd, grid);
for (int mu = 0; mu < Nd; mu++) {
Umu[mu] = PeekIndex<LorentzIndex>(U, mu);
}

Umu = PeekIndex<LorentzIndex>(U, mu);
for (int mu = 0; mu < Nd; mu++) {

// Staple in direction mu
WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
WilsonLoops<Gimpl>::Staple(dSdU_mu, Umu, mu);
dSdU_mu = Ta(Umu * dSdU_mu) * factor;
dSdU_mu = Ta(Umu[mu] * dSdU_mu) * factor;

PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}
}

private:
RealD beta;
};
@@ -111,8 +111,8 @@ public:
   };

   void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
-    std::string config, rng;
-    this->build_filenames(traj, Params, config, rng);
+    std::string config, rng, smr;
+    this->build_filenames(traj, Params, config, smr, rng);
     this->check_filename(rng);
     this->check_filename(config);

@@ -75,7 +75,7 @@ public:
                          GridParallelRNG &pRNG) {
     if ((traj % Params.saveInterval) == 0) {
       std::string config, rng, smr;
-      this->build_filenames(traj, Params, config, rng);
+      this->build_filenames(traj, Params, config, smr, rng);
       GridBase *grid = SmartConfig.get_U(false).Grid();
       uint32_t nersc_csum,scidac_csuma,scidac_csumb;
       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
@@ -102,7 +102,7 @@ public:
       if ( Params.saveSmeared ) {
        IldgWriter _IldgWriter(grid->IsBoss());
        _IldgWriter.open(smr);
-       _IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, config, config);
+       _IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, smr, smr);
        _IldgWriter.close();

        std::cout << GridLogMessage << "Written ILDG Configuration on " << smr
@@ -118,8 +118,8 @@ public:

   void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
                          GridParallelRNG &pRNG) {
-    std::string config, rng;
-    this->build_filenames(traj, Params, config, rng);
+    std::string config, rng, smr;
+    this->build_filenames(traj, Params, config, smr, rng);
     this->check_filename(rng);
     this->check_filename(config);

@@ -107,8 +107,8 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {

   void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
                          GridParallelRNG &pRNG) {
-    std::string config, rng;
-    this->build_filenames(traj, Params, config, rng);
+    std::string config, rng, smr;
+    this->build_filenames(traj, Params, config, smr, rng);
     this->check_filename(rng);
     this->check_filename(config);

@@ -62,15 +62,15 @@ accelerator_inline int stencilIndex(int mu, int nu) {


 /*!  @brief structure holding the link treatment */
-struct SmearingParameters{
-    SmearingParameters(){}
+struct HISQSmearingParameters{
+    HISQSmearingParameters(){}
     Real c_1;    // 1 link
     Real c_naik; // Naik term
     Real c_3;    // 3 link
     Real c_5;    // 5 link
     Real c_7;    // 7 link
     Real c_lp;   // 5 link Lepage
-    SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
+    HISQSmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
        : c_1(c1),
          c_naik(cnaik),
          c_3(c3),
@@ -86,7 +86,7 @@ class Smear_HISQ : public Gimpl {

 private:
     GridCartesian* const _grid;
-    SmearingParameters _linkTreatment;
+    HISQSmearingParameters _linkTreatment;

 public:

@@ -117,7 +117,7 @@ public:
     //            IN--u_thin
     void smear(GF& u_smr, GF& u_naik, GF& u_thin) const {

-        SmearingParameters lt = this->_linkTreatment;
+        HISQSmearingParameters lt = this->_linkTreatment;
         auto grid = this->_grid;

         // Create a padded cell of extra padding depth=1 and fill the padding.
@@ -207,11 +207,14 @@ std::vector<RealD> WilsonFlowBase<Gimpl>::flowMeasureEnergyDensityCloverleaf(con
 }

 template <class Gimpl>
-void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
-  addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){
+void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int meas_interval){
+  addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
     std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " << t << " " << energyDensityPlaquette(t,U) << std::endl;
   });
-  addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
+  addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
+    std::cout << GridLogMessage << "[WilsonFlow] Energy density (cloverleaf) : " << step << " " << t << " " << energyDensityCloverleaf(t,U) << std::endl;
+  });
+  addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
     std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : " << step << " " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
   });
 }
@@ -249,6 +252,11 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{

   out = in;
   RealD taus = 0.;
+
+  // Perform initial t=0 measurements
+  for(auto const &meas : this->functions)
+    meas.second(0,taus,out);
+
   for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
     auto start = std::chrono::high_resolution_clock::now();
     evolve_step(out, taus);
@@ -333,6 +341,11 @@ void WilsonFlowAdaptive<Gimpl>::smear(GaugeField& out, const GaugeField& in) con
   RealD taus = 0.;
   RealD eps = init_epsilon;
   unsigned int step = 0;
+
+  // Perform initial t=0 measurements
+  for(auto const &meas : this->functions)
+    meas.second(step,taus,out);
+
   do{
     int step_success = evolve_step_adaptive(out, taus, eps);
     step += step_success; //step will not be incremented if the integration step fails
@@ -292,19 +292,21 @@ public:
   //////////////////////////////////////////////////
   // the sum over all nu-oriented staples for nu != mu on each site
   //////////////////////////////////////////////////
-  static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
+  static void Staple(GaugeMat &staple, const GaugeLorentz &U, int mu) {

-    GridBase *grid = Umu.Grid();
+    std::vector<GaugeMat> Umu(Nd, U.Grid());

-    std::vector<GaugeMat> U(Nd, grid);
     for (int d = 0; d < Nd; d++) {
-      U[d] = PeekIndex<LorentzIndex>(Umu, d);
+      Umu[d] = PeekIndex<LorentzIndex>(U, d);
     }
-    Staple(staple, U, mu);
+    Staple(staple, Umu, mu);
   }

-  static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
-    staple = Zero();
+  static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &Umu, int mu) {
+
+    autoView(staple_v, staple, AcceleratorWrite);
+    accelerator_for(i, staple.Grid()->oSites(), Simd::Nsimd(), {
+      staple_v[i] = Zero();
+    });

     for (int nu = 0; nu < Nd; nu++) {

|
|||||||
// |
|
// |
|
||||||
// __|
|
// __|
|
||||||
//
|
//
|
||||||
|
|
||||||
staple += Gimpl::ShiftStaple(
|
staple += Gimpl::ShiftStaple(
|
||||||
Gimpl::CovShiftForward(
|
Gimpl::CovShiftForward(
|
||||||
U[nu], nu,
|
Umu[nu], nu,
|
||||||
Gimpl::CovShiftBackward(
|
Gimpl::CovShiftBackward(
|
||||||
U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
|
Umu[mu], mu, Gimpl::CovShiftIdentityBackward(Umu[nu], nu))),
|
||||||
mu);
|
mu);
|
||||||
|
|
||||||
// __
|
// __
|
||||||
@ -333,8 +335,8 @@ public:
|
|||||||
//
|
//
|
||||||
|
|
||||||
staple += Gimpl::ShiftStaple(
|
staple += Gimpl::ShiftStaple(
|
||||||
Gimpl::CovShiftBackward(U[nu], nu,
|
Gimpl::CovShiftBackward(Umu[nu], nu,
|
||||||
Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
|
Gimpl::CovShiftBackward(Umu[mu], mu, Umu[nu])), mu);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -30,25 +30,26 @@
|
|||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
uint64_t DslashFullCount;
|
uint64_t DslashFullCount;
|
||||||
uint64_t DslashPartialCount;
|
//uint64_t DslashPartialCount;
|
||||||
uint64_t DslashDirichletCount;
|
uint64_t DslashDirichletCount;
|
||||||
|
|
||||||
void DslashResetCounts(void)
|
void DslashResetCounts(void)
|
||||||
{
|
{
|
||||||
DslashFullCount=0;
|
DslashFullCount=0;
|
||||||
DslashPartialCount=0;
|
// DslashPartialCount=0;
|
||||||
DslashDirichletCount=0;
|
DslashDirichletCount=0;
|
||||||
}
|
}
|
||||||
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
|
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
|
||||||
{
|
{
|
||||||
dirichlet = DslashDirichletCount;
|
dirichlet = DslashDirichletCount;
|
||||||
partial = DslashPartialCount;
|
partial = 0;
|
||||||
full = DslashFullCount;
|
full = DslashFullCount;
|
||||||
}
|
}
|
||||||
void DslashLogFull(void) { DslashFullCount++;}
|
void DslashLogFull(void) { DslashFullCount++;}
|
||||||
void DslashLogPartial(void) { DslashPartialCount++;}
|
//void DslashLogPartial(void) { DslashPartialCount++;}
|
||||||
void DslashLogDirichlet(void){ DslashDirichletCount++;}
|
void DslashLogDirichlet(void){ DslashDirichletCount++;}
|
||||||
|
|
||||||
|
deviceVector<unsigned char> StencilBuffer::DeviceCommBuf;
|
||||||
|
|
||||||
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
|
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
|
||||||
int off,std::vector<std::pair<int,int> > & table)
|
int off,std::vector<std::pair<int,int> > & table)
|
||||||
|
@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
// These can move into a params header and be given MacroMagic serialisation
|
// These can move into a params header and be given MacroMagic serialisation
|
||||||
struct DefaultImplParams {
|
struct DefaultImplParams {
|
||||||
Coordinate dirichlet; // Blocksize of dirichlet BCs
|
Coordinate dirichlet; // Blocksize of dirichlet BCs
|
||||||
int partialDirichlet;
|
// int partialDirichlet;
|
||||||
DefaultImplParams() {
|
DefaultImplParams() {
|
||||||
dirichlet.resize(0);
|
dirichlet.resize(0);
|
||||||
partialDirichlet=0;
|
// partialDirichlet=0;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -69,6 +69,12 @@ struct DefaultImplParams {
|
|||||||
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
|
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
|
||||||
int off,std::vector<std::pair<int,int> > & table);
|
int off,std::vector<std::pair<int,int> > & table);
|
||||||
|
|
||||||
|
class StencilBuffer
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
static deviceVector<unsigned char> DeviceCommBuf; // placed in Stencil.cc
|
||||||
|
};
|
||||||
|
|
||||||
void DslashResetCounts(void);
|
void DslashResetCounts(void);
|
||||||
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
|
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
|
||||||
void DslashLogFull(void);
|
void DslashLogFull(void);
|
||||||
@ -113,8 +119,8 @@ class CartesianStencilAccelerator {
|
|||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
// If true, this is partially communicated per face
|
// If true, this is partially communicated per face
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
StencilVector _comms_partial_send;
|
// StencilVector _comms_partial_send;
|
||||||
StencilVector _comms_partial_recv;
|
// StencilVector _comms_partial_recv;
|
||||||
//
|
//
|
||||||
StencilVector _comm_buf_size;
|
StencilVector _comm_buf_size;
|
||||||
StencilVector _permute_type;
|
StencilVector _permute_type;
|
||||||
@ -205,16 +211,16 @@ public:
|
|||||||
struct Packet {
|
struct Packet {
|
||||||
void * send_buf;
|
void * send_buf;
|
||||||
void * recv_buf;
|
void * recv_buf;
|
||||||
#ifndef ACCELERATOR_AWARE_MPI
|
void * compressed_send_buf;
|
||||||
void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
|
void * compressed_recv_buf;
|
||||||
void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
|
|
||||||
#endif
|
|
||||||
Integer to_rank;
|
Integer to_rank;
|
||||||
Integer from_rank;
|
Integer from_rank;
|
||||||
Integer do_send;
|
Integer do_send;
|
||||||
Integer do_recv;
|
Integer do_recv;
|
||||||
Integer xbytes;
|
Integer xbytes;
|
||||||
Integer rbytes;
|
Integer rbytes;
|
||||||
|
Integer xbytes_compressed;
|
||||||
|
Integer rbytes_compressed;
|
||||||
};
|
};
|
||||||
struct Merge {
|
struct Merge {
|
||||||
static constexpr int Nsimd = vobj::Nsimd();
|
static constexpr int Nsimd = vobj::Nsimd();
|
||||||
@ -223,7 +229,7 @@ public:
|
|||||||
std::vector<cobj *> vpointers;
|
std::vector<cobj *> vpointers;
|
||||||
Integer buffer_size;
|
Integer buffer_size;
|
||||||
Integer type;
|
Integer type;
|
||||||
Integer partial; // partial dirichlet BCs
|
// Integer partial; // partial dirichlet BCs
|
||||||
Coordinate dims;
|
Coordinate dims;
|
||||||
};
|
};
|
||||||
struct Decompress {
|
struct Decompress {
|
||||||
@ -231,7 +237,7 @@ public:
|
|||||||
cobj * kernel_p;
|
cobj * kernel_p;
|
||||||
cobj * mpi_p;
|
cobj * mpi_p;
|
||||||
Integer buffer_size;
|
Integer buffer_size;
|
||||||
Integer partial; // partial dirichlet BCs
|
// Integer partial; // partial dirichlet BCs
|
||||||
Coordinate dims;
|
Coordinate dims;
|
||||||
};
|
};
|
||||||
struct CopyReceiveBuffer {
|
struct CopyReceiveBuffer {
|
||||||
@ -252,9 +258,45 @@ public:
|
|||||||
|
|
||||||
protected:
|
protected:
|
||||||
GridBase * _grid;
|
GridBase * _grid;
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
// Sloppy comms will make a second buffer upon comms
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
size_t device_heap_top; //
|
||||||
|
size_t device_heap_bytes;//
|
||||||
|
size_t device_heap_size; //
|
||||||
|
void *DeviceBufferMalloc(size_t bytes)
|
||||||
|
{
|
||||||
|
void *ptr = (void *)device_heap_top;
|
||||||
|
device_heap_top += bytes;
|
||||||
|
device_heap_bytes+= bytes;
|
||||||
|
if ( device_heap_bytes > device_heap_size ) {
|
||||||
|
std::cout << "DeviceBufferMalloc overflow bytes "<<bytes<<" heap bytes "<<device_heap_bytes<<" heap size "<<device_heap_size<<std::endl;
|
||||||
|
assert (device_heap_bytes <= device_heap_size);
|
||||||
|
}
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
void DeviceBufferFreeAll(void)
|
||||||
|
{
|
||||||
|
device_heap_size = _unified_buffer_size*sizeof(cobj);
|
||||||
|
// Resize up if necessary, never down
|
||||||
|
if ( StencilBuffer::DeviceCommBuf.size() < device_heap_size ) {
|
||||||
|
StencilBuffer::DeviceCommBuf.resize(device_heap_size);
|
||||||
|
}
|
||||||
|
device_heap_top =(size_t) &StencilBuffer::DeviceCommBuf[0];
|
||||||
|
device_heap_size = StencilBuffer::DeviceCommBuf.size();
|
||||||
|
device_heap_bytes=0;
|
||||||
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
GridBase *Grid(void) const { return _grid; }
|
GridBase *Grid(void) const { return _grid; }
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////
|
||||||
|
// Control reduced precision comms
|
||||||
|
/////////////////////////////////////////////////////////
|
||||||
|
int SloppyComms;
|
||||||
|
void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Needed to conveniently communicate gparity parameters into GPU memory
|
// Needed to conveniently communicate gparity parameters into GPU memory
|
||||||
// without adding parameters. Perhaps a template parameter to StenciView is
|
// without adding parameters. Perhaps a template parameter to StenciView is
|
||||||
@ -268,7 +310,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
int face_table_computed;
|
int face_table_computed;
|
||||||
int partialDirichlet;
|
// int partialDirichlet;
|
||||||
int fullDirichlet;
|
int fullDirichlet;
|
||||||
std::vector<deviceVector<std::pair<int,int> > > face_table ;
|
std::vector<deviceVector<std::pair<int,int> > > face_table ;
|
||||||
deviceVector<int> surface_list;
|
deviceVector<int> surface_list;
|
||||||
@ -361,24 +403,145 @@ public:
|
|||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Non blocking send and receive. Necessarily parallel.
|
// Non blocking send and receive. Necessarily parallel.
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
void DecompressPacket(Packet &packet)
|
||||||
|
{
|
||||||
|
if ( !SloppyComms ) return;
|
||||||
|
|
||||||
|
if ( packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
|
||||||
|
|
||||||
|
typedef typename getPrecision<cobj>::real_scalar_type word;
|
||||||
|
uint64_t words = packet.rbytes/sizeof(word);
|
||||||
|
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
|
||||||
|
const uint64_t outer = words/nsimd;
|
||||||
|
|
||||||
|
if(sizeof(word)==8) {
|
||||||
|
|
||||||
|
// Can either choose to represent as float vs double and prec change
|
||||||
|
// OR
|
||||||
|
// truncate the mantissa bfp16 style
|
||||||
|
double *dbuf =(double *) packet.recv_buf;
|
||||||
|
float *fbuf =(float *) packet.compressed_recv_buf;
|
||||||
|
|
||||||
|
accelerator_forNB(ss,outer,nsimd,{
|
||||||
|
int lane = acceleratorSIMTlane(nsimd);
|
||||||
|
dbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]; //conversion
|
||||||
|
});
|
||||||
|
|
||||||
|
} else if ( sizeof(word)==4){
|
||||||
|
// Can either choose to represent as half vs float and prec change
|
||||||
|
// OR
|
||||||
|
// truncate the mantissa bfp16 style
|
||||||
|
|
||||||
|
uint32_t *fbuf =(uint32_t *) packet.recv_buf;
|
||||||
|
uint16_t *hbuf =(uint16_t *) packet.compressed_recv_buf;
|
||||||
|
|
||||||
|
accelerator_forNB(ss,outer,nsimd,{
|
||||||
|
int lane = acceleratorSIMTlane(nsimd);
|
||||||
|
fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
|
||||||
|
});
|
||||||
|
|
||||||
|
} else {
|
||||||
|
assert(0 && "unknown floating point precision");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void CompressPacket(Packet &packet)
|
||||||
|
{
|
||||||
|
packet.xbytes_compressed = packet.xbytes;
|
||||||
|
packet.compressed_send_buf = packet.send_buf;
|
||||||
|
|
||||||
|
packet.rbytes_compressed = packet.rbytes;
|
||||||
|
packet.compressed_recv_buf = packet.recv_buf;
|
||||||
|
|
||||||
|
if ( !SloppyComms ) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
typedef typename getPrecision<cobj>::real_scalar_type word;
|
||||||
|
uint64_t words = packet.xbytes/sizeof(word);
|
||||||
|
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
|
||||||
|
const uint64_t outer = words/nsimd;
|
||||||
|
|
||||||
|
if (packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
|
||||||
|
|
||||||
|
packet.rbytes_compressed = packet.rbytes/2;
|
||||||
|
packet.compressed_recv_buf = DeviceBufferMalloc(packet.rbytes_compressed);
|
||||||
|
// std::cout << " CompressPacket recv from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
//else {
|
||||||
|
// std::cout << " CompressPacket recv is uncompressed from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
|
||||||
|
// }
|
||||||
|
|
||||||
|
if (packet.do_send && _grid->IsOffNode(packet.to_rank) ) {
|
||||||
|
|
||||||
|
packet.xbytes_compressed = packet.xbytes/2;
|
||||||
|
packet.compressed_send_buf = DeviceBufferMalloc(packet.xbytes_compressed);
|
||||||
|
// std::cout << " CompressPacket send to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
|
||||||
|
|
||||||
|
if(sizeof(word)==8) {
|
||||||
|
|
||||||
|
double *dbuf =(double *) packet.send_buf;
|
||||||
|
float *fbuf =(float *) packet.compressed_send_buf;
|
||||||
|
|
||||||
|
accelerator_forNB(ss,outer,nsimd,{
|
||||||
|
int lane = acceleratorSIMTlane(nsimd);
|
||||||
|
fbuf[ss*nsimd+lane] = dbuf[ss*nsimd+lane]; // convert fp64 to fp32
|
||||||
|
});
|
||||||
|
|
||||||
|
} else if ( sizeof(word)==4){
|
||||||
|
|
||||||
|
uint32_t *fbuf =(uint32_t *) packet.send_buf;
|
||||||
|
uint16_t *hbuf =(uint16_t *) packet.compressed_send_buf;
|
||||||
|
|
||||||
|
accelerator_forNB(ss,outer,nsimd,{
|
||||||
|
int lane = acceleratorSIMTlane(nsimd);
|
||||||
|
hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // convert as in Bagel/BFM ; bfloat16 ; s7e8 Intel patent
|
||||||
|
});
|
||||||
|
|
||||||
|
} else {
|
||||||
|
assert(0 && "unknown floating point precision");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
// else {
|
||||||
|
// std::cout << " CompressPacket send is uncompressed to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
|
||||||
|
// }
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
// std::cout << "Communicate Begin "<<std::endl;
|
|
||||||
// _grid->Barrier();
|
|
||||||
FlightRecorder::StepLog("Communicate begin");
|
FlightRecorder::StepLog("Communicate begin");
|
||||||
|
///////////////////////////////////////////////
|
||||||
// All GPU kernel tasks must complete
|
// All GPU kernel tasks must complete
|
||||||
// accelerator_barrier(); // All kernels should ALREADY be complete
|
// accelerator_barrier(); All kernels should ALREADY be complete
|
||||||
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
//Everyone is here, so noone running slow and still using receive buffer
|
||||||
// But the HaloGather had a barrier too.
|
_grid->StencilBarrier();
|
||||||
|
// But the HaloGather had a barrier too.
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
if (SloppyComms) {
|
||||||
|
DeviceBufferFreeAll();
|
||||||
|
}
|
||||||
|
for(int i=0;i<Packets.size();i++){
|
||||||
|
this->CompressPacket(Packets[i]);
|
||||||
|
}
|
||||||
|
if (SloppyComms) {
|
||||||
|
accelerator_barrier();
|
||||||
|
#ifdef NVLINK_GET
|
||||||
|
_grid->StencilBarrier();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
// std::cout << "Communicate prepare "<<i<<std::endl;
|
// std::cout << "Communicate prepare "<<i<<std::endl;
|
||||||
// _grid->Barrier();
|
// _grid->Barrier();
|
||||||
_grid->StencilSendToRecvFromPrepare(MpiReqs,
|
_grid->StencilSendToRecvFromPrepare(MpiReqs,
|
||||||
Packets[i].send_buf,
|
Packets[i].compressed_send_buf,
|
||||||
Packets[i].to_rank,Packets[i].do_send,
|
Packets[i].to_rank,Packets[i].do_send,
|
||||||
Packets[i].recv_buf,
|
Packets[i].compressed_recv_buf,
|
||||||
Packets[i].from_rank,Packets[i].do_recv,
|
Packets[i].from_rank,Packets[i].do_recv,
|
||||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
|
||||||
}
|
}
|
||||||
// std::cout << "Communicate PollDtoH "<<std::endl;
|
// std::cout << "Communicate PollDtoH "<<std::endl;
|
||||||
// _grid->Barrier();
|
// _grid->Barrier();
|
||||||
@ -389,18 +552,22 @@ public:
|
|||||||
// Starts intranode
|
// Starts intranode
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
// std::cout << "Communicate Begin "<<i<<std::endl;
|
// std::cout << "Communicate Begin "<<i<<std::endl;
|
||||||
|
// _grid->Barrier();
|
||||||
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
||||||
Packets[i].send_buf,
|
Packets[i].send_buf,Packets[i].compressed_send_buf,
|
||||||
Packets[i].to_rank,Packets[i].do_send,
|
Packets[i].to_rank,Packets[i].do_send,
|
||||||
Packets[i].recv_buf,
|
Packets[i].recv_buf,Packets[i].compressed_recv_buf,
|
||||||
Packets[i].from_rank,Packets[i].do_recv,
|
Packets[i].from_rank,Packets[i].do_recv,
|
||||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
|
||||||
|
// std::cout << "Communicate Begin started "<<i<<std::endl;
|
||||||
|
// _grid->Barrier();
|
||||||
}
|
}
|
||||||
|
FlightRecorder::StepLog("Communicate begin has finished");
|
||||||
// Get comms started then run checksums
|
// Get comms started then run checksums
|
||||||
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
|
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
if ( Packets[i].do_send )
|
if ( Packets[i].do_send )
|
||||||
FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
|
FlightRecorder::xmitLog(Packets[i].compressed_send_buf,Packets[i].xbytes_compressed);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -415,14 +582,15 @@ public:
|
|||||||
// std::cout << "Communicate Complete Complete "<<std::endl;
|
// std::cout << "Communicate Complete Complete "<<std::endl;
|
||||||
// _grid->Barrier();
|
// _grid->Barrier();
|
||||||
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
|
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
|
||||||
if ( this->partialDirichlet ) DslashLogPartial();
|
// if ( this->partialDirichlet ) DslashLogPartial();
|
||||||
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||||
else DslashLogFull();
|
else DslashLogFull();
|
||||||
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
|
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
|
||||||
// accelerator_barrier();
|
// accelerator_barrier();
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
|
this->DecompressPacket(Packets[i]);
|
||||||
if ( Packets[i].do_recv )
|
if ( Packets[i].do_recv )
|
||||||
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
|
FlightRecorder::recvLog(Packets[i].compressed_recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank);
|
||||||
}
|
}
|
||||||
FlightRecorder::StepLog("Finish communicate complete");
|
FlightRecorder::StepLog("Finish communicate complete");
|
||||||
}
|
}
|
||||||
@ -446,6 +614,7 @@ public:
|
|||||||
Communicate();
|
Communicate();
|
||||||
CommsMergeSHM(compress);
|
CommsMergeSHM(compress);
|
||||||
CommsMerge(compress);
|
CommsMerge(compress);
|
||||||
|
accelerator_barrier();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
|
template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
|
||||||
@ -518,7 +687,6 @@ public:
|
|||||||
}
|
}
|
||||||
accelerator_barrier(); // All my local gathers are complete
|
accelerator_barrier(); // All my local gathers are complete
|
||||||
#ifdef NVLINK_GET
|
#ifdef NVLINK_GET
|
||||||
#warning "NVLINK_GET"
|
|
||||||
_grid->StencilBarrier(); // He can now get mu local gather, I can get his
|
_grid->StencilBarrier(); // He can now get mu local gather, I can get his
|
||||||
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
|
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
|
||||||
// Or issue barrier AFTER the DMA is running
|
// Or issue barrier AFTER the DMA is running
|
||||||
@ -617,7 +785,7 @@ public:
|
|||||||
}
|
}
|
||||||
void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
|
void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
|
||||||
Decompress d;
|
Decompress d;
|
||||||
d.partial = this->partialDirichlet;
|
// d.partial = this->partialDirichlet;
|
||||||
d.dims = _grid->_fdimensions;
|
d.dims = _grid->_fdimensions;
|
||||||
d.kernel_p = k_p;
|
d.kernel_p = k_p;
|
||||||
d.mpi_p = m_p;
|
d.mpi_p = m_p;
|
||||||
@ -626,7 +794,7 @@ public:
|
|||||||
}
|
}
|
||||||
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
|
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
|
||||||
Merge m;
|
Merge m;
|
||||||
m.partial = this->partialDirichlet;
|
// m.partial = this->partialDirichlet;
|
||||||
m.dims = _grid->_fdimensions;
|
m.dims = _grid->_fdimensions;
|
||||||
m.type = type;
|
m.type = type;
|
||||||
m.mpointer = merge_p;
|
m.mpointer = merge_p;
|
||||||
@ -690,6 +858,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
|
||||||
surface_list.resize(surface_list_size);
|
surface_list.resize(surface_list_size);
|
||||||
std::vector<int> surface_list_host(surface_list_size);
|
std::vector<int> surface_list_host(surface_list_size);
|
||||||
int32_t ss=0;
|
int32_t ss=0;
|
||||||
@ -709,7 +878,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
|
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
|
||||||
std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
|
// std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
|
||||||
}
|
}
|
||||||
/// Introduce a block structure and switch off comms on boundaries
|
/// Introduce a block structure and switch off comms on boundaries
|
||||||
void DirichletBlock(const Coordinate &dirichlet_block)
|
void DirichletBlock(const Coordinate &dirichlet_block)
|
||||||
@ -730,8 +899,8 @@ public:
|
|||||||
int block = dirichlet_block[dimension];
|
int block = dirichlet_block[dimension];
|
||||||
this->_comms_send[ii] = comm_dim;
|
this->_comms_send[ii] = comm_dim;
|
||||||
this->_comms_recv[ii] = comm_dim;
|
this->_comms_recv[ii] = comm_dim;
|
||||||
this->_comms_partial_send[ii] = 0;
|
// this->_comms_partial_send[ii] = 0;
|
||||||
this->_comms_partial_recv[ii] = 0;
|
// this->_comms_partial_recv[ii] = 0;
|
||||||
if ( block && comm_dim ) {
|
if ( block && comm_dim ) {
|
||||||
assert(abs(displacement) < ld );
|
assert(abs(displacement) < ld );
|
||||||
// Quiesce communication across block boundaries
|
// Quiesce communication across block boundaries
|
||||||
@ -752,10 +921,10 @@ public:
|
|||||||
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
|
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
|
||||||
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
|
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
|
||||||
}
|
}
|
||||||
if ( partialDirichlet ) {
|
// if ( partialDirichlet ) {
|
||||||
this->_comms_partial_send[ii] = !this->_comms_send[ii];
|
// this->_comms_partial_send[ii] = !this->_comms_send[ii];
|
||||||
this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
|
// this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
|
||||||
}
|
// }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -767,6 +936,7 @@ public:
|
|||||||
Parameters p=Parameters(),
|
Parameters p=Parameters(),
|
||||||
bool preserve_shm=false)
|
bool preserve_shm=false)
|
||||||
{
|
{
|
||||||
|
SloppyComms = 0;
|
||||||
face_table_computed=0;
|
face_table_computed=0;
|
||||||
_grid = grid;
|
_grid = grid;
|
||||||
this->parameters=p;
|
this->parameters=p;
|
||||||
@ -784,7 +954,7 @@ public:
|
|||||||
this->same_node.resize(npoints);
|
this->same_node.resize(npoints);
|
||||||
|
|
||||||
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
|
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
|
||||||
partialDirichlet = p.partialDirichlet;
|
// partialDirichlet = p.partialDirichlet;
|
||||||
DirichletBlock(p.dirichlet); // comms send/recv set up
|
DirichletBlock(p.dirichlet); // comms send/recv set up
|
||||||
fullDirichlet=0;
|
fullDirichlet=0;
|
||||||
for(int d=0;d<p.dirichlet.size();d++){
|
for(int d=0;d<p.dirichlet.size();d++){
|
||||||
@ -801,8 +971,8 @@ public:
|
|||||||
this->_entries_host_p = &_entries[0];
|
this->_entries_host_p = &_entries[0];
|
||||||
this->_entries_p = &_entries_device[0];
|
this->_entries_p = &_entries_device[0];
|
||||||
|
|
||||||
std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
|
// std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
|
||||||
<<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
|
// <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
|
||||||
|
|
||||||
for(int ii=0;ii<npoints;ii++){
|
for(int ii=0;ii<npoints;ii++){
|
||||||
|
|
||||||
@ -865,7 +1035,7 @@ public:
|
|||||||
/////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////
|
||||||
const int Nsimd = grid->Nsimd();
|
const int Nsimd = grid->Nsimd();
|
||||||
|
|
||||||
// Allow for multiple stencils to exist simultaneously
|
// Allow for multiple stencils to be communicated simultaneously
|
||||||
if (!preserve_shm)
|
if (!preserve_shm)
|
||||||
_grid->ShmBufferFreeAll();
|
_grid->ShmBufferFreeAll();
|
||||||
|
|
||||||
@ -933,7 +1103,8 @@ public:
|
|||||||
GridBase *grid=_grid;
|
GridBase *grid=_grid;
|
||||||
const int Nsimd = grid->Nsimd();
|
const int Nsimd = grid->Nsimd();
|
||||||
|
|
||||||
int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
|
// int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
|
||||||
|
int comms_recv = this->_comms_recv[point];
|
||||||
int fd = _grid->_fdimensions[dimension];
|
int fd = _grid->_fdimensions[dimension];
|
||||||
int ld = _grid->_ldimensions[dimension];
|
int ld = _grid->_ldimensions[dimension];
|
||||||
int rd = _grid->_rdimensions[dimension];
|
int rd = _grid->_rdimensions[dimension];
|
||||||
@ -1122,8 +1293,8 @@ public:
|
|||||||
|
|
||||||
int comms_send = this->_comms_send[point];
|
int comms_send = this->_comms_send[point];
|
||||||
int comms_recv = this->_comms_recv[point];
|
int comms_recv = this->_comms_recv[point];
|
||||||
int comms_partial_send = this->_comms_partial_send[point] ;
|
// int comms_partial_send = this->_comms_partial_send[point] ;
|
||||||
int comms_partial_recv = this->_comms_partial_recv[point] ;
|
// int comms_partial_recv = this->_comms_partial_recv[point] ;
|
||||||
|
|
||||||
assert(rhs.Grid()==_grid);
|
assert(rhs.Grid()==_grid);
|
||||||
// conformable(_grid,rhs.Grid());
|
// conformable(_grid,rhs.Grid());
|
||||||
@ -1158,11 +1329,11 @@ public:
|
|||||||
int rbytes;
|
int rbytes;
|
||||||
|
|
||||||
if ( comms_send ) xbytes = bytes; // Full send
|
if ( comms_send ) xbytes = bytes; // Full send
|
||||||
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||||
else xbytes = 0; // full dirichlet
|
else xbytes = 0; // full dirichlet
|
||||||
|
|
||||||
if ( comms_recv ) rbytes = bytes;
|
if ( comms_recv ) rbytes = bytes;
|
||||||
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||||
else rbytes = 0;
|
else rbytes = 0;
|
||||||
|
|
||||||
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
|
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
|
||||||
@ -1189,7 +1360,8 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
|
// if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
|
||||||
|
if ( compress.DecompressionStep()&&comms_recv) {
|
||||||
recv_buf=u_simd_recv_buf[0];
|
recv_buf=u_simd_recv_buf[0];
|
||||||
} else {
|
} else {
|
||||||
recv_buf=this->u_recv_buf_p;
|
recv_buf=this->u_recv_buf_p;
|
||||||
@ -1223,7 +1395,8 @@ public:
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
|
// std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
|
||||||
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
|
// compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
|
||||||
|
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);
|
||||||
|
|
||||||
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
|
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
|
||||||
if ( !duplicate ) { // Force comms for now
|
if ( !duplicate ) { // Force comms for now
|
||||||
@ -1232,8 +1405,8 @@ public:
|
|||||||
// Build a list of things to do after we synchronise GPUs
|
// Build a list of things to do after we synchronise GPUs
|
||||||
// Start comms now???
|
// Start comms now???
|
||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
int do_send = (comms_send) && (!shm_send );
|
||||||
int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
|
int do_recv = (comms_send) && (!shm_recv );
|
||||||
AddPacket((void *)&send_buf[comm_off],
|
AddPacket((void *)&send_buf[comm_off],
|
||||||
(void *)&recv_buf[comm_off],
|
(void *)&recv_buf[comm_off],
|
||||||
xmit_to_rank, do_send,
|
xmit_to_rank, do_send,
|
||||||
@ -1241,7 +1414,7 @@ public:
|
|||||||
xbytes,rbytes);
|
xbytes,rbytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
|
if ( (compress.DecompressionStep() && comms_recv) ) {
|
||||||
AddDecompress(&this->u_recv_buf_p[comm_off],
|
AddDecompress(&this->u_recv_buf_p[comm_off],
|
||||||
&recv_buf[comm_off],
|
&recv_buf[comm_off],
|
||||||
words,Decompressions);
|
words,Decompressions);
|
||||||
@ -1263,8 +1436,8 @@ public:
|
|||||||
|
|
||||||
int comms_send = this->_comms_send[point];
|
int comms_send = this->_comms_send[point];
|
||||||
int comms_recv = this->_comms_recv[point];
|
int comms_recv = this->_comms_recv[point];
|
||||||
int comms_partial_send = this->_comms_partial_send[point] ;
|
// int comms_partial_send = this->_comms_partial_send[point] ;
|
||||||
int comms_partial_recv = this->_comms_partial_recv[point] ;
|
// int comms_partial_recv = this->_comms_partial_recv[point] ;
|
||||||
|
|
||||||
int fd = _grid->_fdimensions[dimension];
|
int fd = _grid->_fdimensions[dimension];
|
||||||
int rd = _grid->_rdimensions[dimension];
|
int rd = _grid->_rdimensions[dimension];
|
||||||
@ -1339,18 +1512,20 @@ public:
|
|||||||
|
|
||||||
|
|
||||||
if ( comms_send ) xbytes = bytes;
|
if ( comms_send ) xbytes = bytes;
|
||||||
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||||
else xbytes = 0;
|
else xbytes = 0;
|
||||||
|
|
||||||
if ( comms_recv ) rbytes = bytes;
|
if ( comms_recv ) rbytes = bytes;
|
||||||
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||||
else rbytes = 0;
|
else rbytes = 0;
|
||||||
|
|
||||||
// Gathers SIMD lanes for send and merge
|
// Gathers SIMD lanes for send and merge
|
||||||
// Different faces can be full comms or partial comms with multiple ranks per node
|
// Different faces can be full comms or partial comms with multiple ranks per node
|
||||||
if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
|
// if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
|
||||||
|
if ( comms_send || comms_recv ) {
|
||||||
|
|
||||||
int partial = partialDirichlet;
|
// int partial = partialDirichlet;
|
||||||
|
int partial = 0;
|
||||||
compressor::Gather_plane_exchange(face_table[face_idx],rhs,
|
compressor::Gather_plane_exchange(face_table[face_idx],rhs,
|
||||||
spointers,dimension,sx,cbmask,
|
spointers,dimension,sx,cbmask,
|
||||||
compress,permute_type,partial );
|
compress,permute_type,partial );
|
||||||
@ -1416,7 +1591,8 @@ public:
|
|||||||
if ( (bytes != rbytes) && (rbytes!=0) ){
|
if ( (bytes != rbytes) && (rbytes!=0) ){
|
||||||
acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
|
acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
|
||||||
}
|
}
|
||||||
int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
// int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
||||||
|
int do_send = (comms_send) && (!shm_send );
|
||||||
AddPacket((void *)sp,(void *)rp,
|
AddPacket((void *)sp,(void *)rp,
|
||||||
xmit_to_rank,do_send,
|
xmit_to_rank,do_send,
|
||||||
recv_from_rank,do_send,
|
recv_from_rank,do_send,
|
||||||
@ -1430,7 +1606,8 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// rpointer may be doing a remote read in the gather over SHM
|
// rpointer may be doing a remote read in the gather over SHM
|
||||||
if ( comms_recv|comms_partial_recv ) {
|
// if ( comms_recv|comms_partial_recv ) {
|
||||||
|
if ( comms_recv ) {
|
||||||
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
|
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -67,7 +67,7 @@ void acceleratorInit(void)
|
|||||||
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
|
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
|
||||||
|
|
||||||
|
|
||||||
GPU_PROP_FMT(totalGlobalMem,"%lld");
|
GPU_PROP_FMT(totalGlobalMem,"%zu");
|
||||||
GPU_PROP(managedMemory);
|
GPU_PROP(managedMemory);
|
||||||
GPU_PROP(isMultiGpuBoard);
|
GPU_PROP(isMultiGpuBoard);
|
||||||
GPU_PROP(warpSize);
|
GPU_PROP(warpSize);
|
||||||
|
@ -215,7 +215,7 @@ inline void *acceleratorAllocHost(size_t bytes)
|
|||||||
auto err = cudaMallocHost((void **)&ptr,bytes);
|
auto err = cudaMallocHost((void **)&ptr,bytes);
|
||||||
if( err != cudaSuccess ) {
|
if( err != cudaSuccess ) {
|
||||||
ptr = (void *) NULL;
|
ptr = (void *) NULL;
|
||||||
printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
|
printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err));
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
@ -226,7 +226,7 @@ inline void *acceleratorAllocShared(size_t bytes)
|
|||||||
auto err = cudaMallocManaged((void **)&ptr,bytes);
|
auto err = cudaMallocManaged((void **)&ptr,bytes);
|
||||||
if( err != cudaSuccess ) {
|
if( err != cudaSuccess ) {
|
||||||
ptr = (void *) NULL;
|
ptr = (void *) NULL;
|
||||||
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
|
printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err));
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
@ -237,24 +237,38 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
|||||||
auto err = cudaMalloc((void **)&ptr,bytes);
|
auto err = cudaMalloc((void **)&ptr,bytes);
|
||||||
if( err != cudaSuccess ) {
|
if( err != cudaSuccess ) {
|
||||||
ptr = (void *) NULL;
|
ptr = (void *) NULL;
|
||||||
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
|
printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err));
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
typedef int acceleratorEvent_t;
|
||||||
|
|
||||||
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
||||||
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
||||||
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
|
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
|
||||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||||
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
|
|
||||||
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
|
|
||||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
||||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
|
||||||
|
acceleratorCopyToDevice(from,to,bytes);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
|
||||||
|
acceleratorCopyFromDevice(from,to,bytes);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||||
{
|
{
|
||||||
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
|
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
|
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
|
||||||
|
inline void acceleratorEventWait(acceleratorEvent_t ev)
|
||||||
|
{
|
||||||
|
//auto discard=cudaStreamSynchronize(ev);
|
||||||
|
}
|
||||||
|
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
|
||||||
|
|
||||||
|
|
||||||
inline int acceleratorIsCommunicable(void *ptr)
|
inline int acceleratorIsCommunicable(void *ptr)
|
||||||
@ -323,7 +337,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
|||||||
cgh.parallel_for( \
|
cgh.parallel_for( \
|
||||||
sycl::nd_range<3>(global,local), \
|
sycl::nd_range<3>(global,local), \
|
||||||
[=] (sycl::nd_item<3> item) /*mutable*/ \
|
[=] (sycl::nd_item<3> item) /*mutable*/ \
|
||||||
[[intel::reqd_sub_group_size(16)]] \
|
[[sycl::reqd_sub_group_size(16)]] \
|
||||||
{ \
|
{ \
|
||||||
auto iter1 = item.get_global_id(0); \
|
auto iter1 = item.get_global_id(0); \
|
||||||
auto iter2 = item.get_global_id(1); \
|
auto iter2 = item.get_global_id(1); \
|
||||||
@ -363,8 +377,8 @@ inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *t
|
|||||||
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
|
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
|
||||||
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
|
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
|
||||||
|
|
||||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||||
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}

 inline int acceleratorIsCommunicable(void *ptr)
@@ -478,7 +492,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
 inline void *acceleratorAllocHost(size_t bytes)
 {
   void *ptr=NULL;
-  auto err = hipMallocHost((void **)&ptr,bytes);
+  auto err = hipHostMalloc((void **)&ptr,bytes);
   if( err != hipSuccess ) {
     ptr = (void *) NULL;
     fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
@@ -511,23 +525,35 @@ inline void *acceleratorAllocDevice(size_t bytes)
 inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
 inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
+inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
+inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}

 inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}

-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
+typedef int acceleratorEvent_t;
+
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
 {
   auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
+  return 0;
 }
-inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
-  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+  acceleratorCopyToDevice(from,to,bytes);
+  return 0;
 }
-inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
-  auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
+  acceleratorCopyFromDevice(from,to,bytes);
+  return 0;
 }
 inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };

+inline void acceleratorEventWait(acceleratorEvent_t ev)
+{
+  //  auto discard=hipStreamSynchronize(ev);
+}
+inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}

 #endif

 inline void acceleratorPin(void *ptr,unsigned long bytes)
@@ -564,6 +590,8 @@ inline void acceleratorPin(void *ptr,unsigned long bytes)

 #undef GRID_SIMT

+typedef int acceleratorEvent_t;

 inline void acceleratorMem(void)
 {
 /*
@@ -584,8 +612,13 @@ inline void acceleratorMem(void)
 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific

 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
+inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyToDevice(from,to,bytes); return 0; }
+inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyFromDevice(from,to,bytes); return 0; }
+inline void acceleratorEventWait(acceleratorEvent_t ev){}
+inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
+inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); return 0;}

 inline void acceleratorCopySynchronise(void) {};

 inline int acceleratorIsCommunicable(void *ptr){ return 1; }
@@ -676,7 +709,7 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)

 template<class T> void acceleratorPut(T& dev,const T&host)
 {
-  acceleratorCopyToDevice(&host,&dev,sizeof(T));
+  acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
 }
 template<class T> T acceleratorGet(T& dev)
 {
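A minimal sketch of how calling code might use the event-style copy interface introduced above. The function and type names (acceleratorAllocHost, acceleratorCopyToDeviceAsynch, acceleratorEventWait, acceleratorEventIsComplete) are the ones appearing in this diff; buffer size and the example function itself are made up, and this is illustrative only, not a definitive usage pattern.

// Sketch: issue a host-to-device transfer, do other work, then wait on the event.
#include <cassert>
#include <Grid/Grid.h>
using namespace Grid;

void staged_copy_example(void)
{
  const size_t bytes = 1024*1024;
  void *host   = acceleratorAllocHost(bytes);
  void *device = acceleratorAllocDevice(bytes);

  // acceleratorEvent_t is currently a plain int in this change.
  acceleratorEvent_t ev = acceleratorCopyToDeviceAsynch(host,device,bytes);

  // ... unrelated host-side work could overlap here ...

  acceleratorEventWait(ev);                  // block until the copy has completed
  assert(acceleratorEventIsComplete(ev));    // always true once the wait returns

  acceleratorFreeDevice(device);
  acceleratorFreeHost(host);
}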
@@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define thread_critical DO_PRAGMA(omp critical)

 #ifdef GRID_OMP
-inline void thread_bcopy(void *from, void *to,size_t bytes)
+inline void thread_bcopy(const void *from, void *to,size_t bytes)
 {
-  uint64_t *ufrom = (uint64_t *)from;
+  const uint64_t *ufrom = (const uint64_t *)from;
   uint64_t *uto   = (uint64_t *)to;
   assert(bytes%8==0);
   uint64_t words=bytes/8;
@@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
   });
 }
 #else
-inline void thread_bcopy(void *from, void *to,size_t bytes)
+inline void thread_bcopy(const void *from, void *to,size_t bytes)
 {
   bcopy(from,to,bytes);
 }
@@ -509,7 +509,14 @@ void Grid_init(int *argc,char ***argv)
                                       Grid_default_latt,
                                       Grid_default_mpi);

+  if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
+    std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
+    FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
+    FlightRecorder::PrintEntireLog = 1;
+    FlightRecorder::ChecksumComms = 1;
+    FlightRecorder::ChecksumCommsSend=1;
+  }

   if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
     std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
     std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
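The new --flightrecorder switch is parsed with the same GridCmdOptionExists() mechanism as the existing options. A hedged sketch of using that mechanism from an application's own main() follows; "--my-option" is a made-up flag, only GridCmdOptionExists, Grid_init and Grid_finalize are taken from the code above.

// Sketch: gating a user-level option on the command line after Grid_init().
#include <Grid/Grid.h>
using namespace Grid;

int main(int argc,char **argv)
{
  Grid_init(&argc,&argv);
  if( GridCmdOptionExists(argv,argv+argc,"--my-option") ){
    std::cout << GridLogMessage << " --my-option was passed on the command line" << std::endl;
  }
  Grid_finalize();
  return 0;
}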
@ -631,12 +638,11 @@ void Grid_debug_handler_init(void)
|
|||||||
sa.sa_flags = SA_SIGINFO;
|
sa.sa_flags = SA_SIGINFO;
|
||||||
// sigaction(SIGSEGV,&sa,NULL);
|
// sigaction(SIGSEGV,&sa,NULL);
|
||||||
sigaction(SIGTRAP,&sa,NULL);
|
sigaction(SIGTRAP,&sa,NULL);
|
||||||
sigaction(SIGBUS,&sa,NULL);
|
// sigaction(SIGBUS,&sa,NULL);
|
||||||
// sigaction(SIGUSR2,&sa,NULL);
|
// sigaction(SIGUSR2,&sa,NULL);
|
||||||
|
|
||||||
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
|
// feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
|
||||||
|
// sigaction(SIGFPE,&sa,NULL);
|
||||||
sigaction(SIGFPE,&sa,NULL);
|
|
||||||
sigaction(SIGKILL,&sa,NULL);
|
sigaction(SIGKILL,&sa,NULL);
|
||||||
sigaction(SIGILL,&sa,NULL);
|
sigaction(SIGILL,&sa,NULL);
|
||||||
|
|
||||||
@ -651,3 +657,4 @@ void Grid_debug_handler_init(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
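For orientation, a self-contained illustration of the handler wiring being toggled above: install a SA_SIGINFO handler and leave the floating-point trap enabling commented out, mirroring this change. The handler body and function name are made up; only the signal set and the disabled feenableexcept/SIGFPE lines follow the hunk above.

// Sketch: SA_SIGINFO-style handler installation with FPE trapping disabled.
#include <signal.h>
#include <cstdio>
#include <cstdlib>

static void debug_handler(int sig, siginfo_t *si, void *ctx)
{
  std::fprintf(stderr,"Caught signal %d\n",sig);
  std::exit(1);
}

void install_handlers(void)
{
  struct sigaction sa;
  sigemptyset(&sa.sa_mask);
  sa.sa_sigaction = debug_handler;
  sa.sa_flags     = SA_SIGINFO;
  sigaction(SIGTRAP,&sa,NULL);
  //  sigaction(SIGBUS,&sa,NULL);                              // disabled in this change
  //  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO );   // disabled in this change
  //  sigaction(SIGFPE,&sa,NULL);                              // disabled in this change
  sigaction(SIGILL,&sa,NULL);
}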
@@ -50,7 +50,7 @@ namespace Grid{
     int64_t index64;
     IndexFromCoorReversed(coor,index64,dims);
     if ( index64>=2*1024*1024*1024LL ){
-      std::cout << " IndexFromCoorReversed overflow"<<std::endl;
+      //      std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
     }
     assert(index64<2*1024*1024*1024LL);
     index = (int) index64;
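The hunk above keeps the 64-bit overflow guard but silences the diagnostic print. A minimal, self-contained sketch of the same idea (not Grid's exact IndexFromCoorReversed; coordinates and dimensions here are arbitrary): accumulate the lexicographic index in 64 bits and only narrow to int after checking it fits.

// Sketch: lexicographic index with a 32-bit overflow guard.
#include <cassert>
#include <cstdint>
#include <vector>

int IndexFromCoor(const std::vector<int> &coor,const std::vector<int> &dims)
{
  int64_t index64 = 0;
  int64_t stride  = 1;
  for (size_t d = 0; d < dims.size(); d++) {
    index64 += coor[d] * stride;   // accumulate in 64 bits
    stride  *= dims[d];
  }
  assert(index64 < 2*1024*1024*1024LL); // would overflow a 32-bit int otherwise
  return (int) index64;
}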
@@ -66,6 +66,7 @@ namespace Grid{
   };
 }

 template <class T> void writeFile(T& in, std::string const fname){
 #ifdef HAVE_LIME
 // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
@@ -73,7 +74,7 @@ template <class T> void writeFile(T& in, std::string const fname){
   Grid::emptyUserRecord record;
   Grid::ScidacWriter WR(in.Grid()->IsBoss());
   WR.open(fname);
-  WR.writeScidacFieldRecord(in,record,0);
+  WR.writeScidacFieldRecord(in,record,0); // Lexico
   WR.close();
 #endif
 // What is the appropriate way to throw error?
@@ -107,8 +108,18 @@ int main(int argc, char **argv) {

   for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){

+#if 0
   CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG);
+#else
+  // Don't require Grid format RNGs
+  FieldMetaData header;
+  std::string file, filesmr;
+  file = CPar.conf_path + "/" + CPar.conf_prefix + "." + std::to_string(conf);
+  filesmr = CPar.conf_path + "/" + CPar.conf_smr_prefix + "." + std::to_string(conf);
+
+  NerscIO::readConfiguration(Umu,header,file);
+#endif

   std::cout << std::setprecision(15);
   std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;

@@ -116,6 +127,7 @@ int main(int argc, char **argv) {
   std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);

   WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);

   WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){

     typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
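A hedged usage sketch for the writeFile() helper shown a few hunks above (it is local to this application file and only writes when the build has LIME support). The field type and filename here are illustrative choices, not part of the change.

// Sketch: dumping a lattice scalar field through the file-local writeFile() helper.
#include <Grid/Grid.h>
using namespace Grid;

void dump_density(LatticeComplexD &R)
{
  std::string fname("E_dnsty_10.example");   // made-up filename
  writeFile(R, fname);  // forwards to ScidacWriter::writeScidacFieldRecord under HAVE_LIME
}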
@ -165,33 +177,48 @@ int main(int argc, char **argv) {
|
|||||||
//double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
|
//double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
|
||||||
//Plq = coeff * Plq;
|
//Plq = coeff * Plq;
|
||||||
|
|
||||||
int tau = std::round(t);
|
|
||||||
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
|
|
||||||
writeFile(R,efile);
|
|
||||||
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
|
|
||||||
writeFile(qfield,tfile);
|
|
||||||
|
|
||||||
|
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(U);
|
||||||
|
|
||||||
|
int tau = std::round(t);
|
||||||
|
|
||||||
|
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
|
||||||
|
// writeFile(R,efile);
|
||||||
|
|
||||||
|
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
|
||||||
|
// writeFile(qfield,tfile);
|
||||||
|
|
||||||
|
std::string ufile = file_pre + "U_" + std::to_string(tau) + "_" + file_post;
|
||||||
|
{
|
||||||
|
// PeriodicGimplR::GaugeField Ucopy = U;
|
||||||
|
// NerscIO::writeConfiguration(Ucopy,ufile);
|
||||||
|
}
|
||||||
|
|
||||||
RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
|
RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
|
||||||
RealD T = real( sum(qfield) );
|
RealD T = real( sum(qfield) );
|
||||||
Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
|
Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
|
||||||
RealD E0 = real(peekSite(R,scoor));
|
RealD E0 = real(peekSite(R,scoor));
|
||||||
RealD T0 = real(peekSite(qfield,scoor));
|
RealD T0 = real(peekSite(qfield,scoor));
|
||||||
std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: " << conf << " " << step << " " << tau << " "
|
std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: " << conf << " " << step << " " << tau << " "
|
||||||
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl;
|
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << " Q5Li "<< WFlow_TC5Li << std::endl;
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
int t=WFPar.maxTau;
|
int t=WFPar.maxTau;
|
||||||
WF.smear(Uflow, Umu);
|
WF.smear(Uflow, Umu);
|
||||||
|
// NerscIO::writeConfiguration(Uflow,filesmr);
|
||||||
|
|
||||||
|
|
||||||
RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
|
RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
|
||||||
RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
|
RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
|
||||||
|
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(Uflow);
|
||||||
RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow); // t
|
RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow); // t
|
||||||
RealD WFlow_EC = WF.energyDensityCloverleaf(t,Uflow);
|
RealD WFlow_EC = WF.energyDensityCloverleaf(t,Uflow);
|
||||||
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
|
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
|
||||||
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
|
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
|
||||||
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
|
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
|
||||||
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
|
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
|
||||||
|
std::cout << GridLogMessage << "TopologicalCharge5Li "<< conf << " " << WFlow_TC5Li<< std::endl;
|
||||||
|
|
||||||
std::cout<< GridLogMessage << " Admissibility check:\n";
|
std::cout<< GridLogMessage << " Admissibility check:\n";
|
||||||
const double sp_adm = 0.067; // admissible threshold
|
const double sp_adm = 0.067; // admissible threshold
|
||||||
|
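A hedged sketch of the measurement hook pattern used above: attach a per-step observer to the Wilson flow and print the new 5-loop-improved topological charge. WilsonFlow, addMeasurement, smear and TopologicalCharge5Li are the names appearing in this diff; the step size, step count and measurement interval below are arbitrary.

// Sketch: flow a gauge field and print Q5Li every 10 steps.
#include <Grid/Grid.h>
using namespace Grid;

void flow_and_measure(LatticeGaugeField &Umu)
{
  LatticeGaugeField Uflow(Umu.Grid());
  WilsonFlow<PeriodicGimplR> WF(0.01, 100, 10);   // epsilon, steps, default meas interval
  WF.addMeasurement(10, [](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){
    RealD Q5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(U);
    std::cout << GridLogMessage << "flow time " << t << " Q5Li " << Q5Li << std::endl;
  });
  WF.smear(Uflow, Umu);   // run the flow; the lambda fires at the chosen interval
}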
@ -25,13 +25,20 @@ directory
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#include <Grid/Grid.h>
|
#include <Grid/Grid.h>
|
||||||
|
|
||||||
|
#if Nc == 3
|
||||||
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
|
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
|
||||||
#include <Grid/qcd/smearing/JacobianAction.h>
|
#include <Grid/qcd/smearing/JacobianAction.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
using namespace Grid;
|
using namespace Grid;
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
|
#if Nc != 3
|
||||||
|
#warning FTHMC2p1f will not work for Nc != 3
|
||||||
|
std::cout << "This program will currently only work for Nc == 3." << std::endl;
|
||||||
|
#else
|
||||||
std::cout << std::setprecision(12);
|
std::cout << std::setprecision(12);
|
||||||
|
|
||||||
Grid_init(&argc, &argv);
|
Grid_init(&argc, &argv);
|
||||||
@ -220,7 +227,6 @@ int main(int argc, char **argv)
|
|||||||
TheHMC.Run(SmearingPolicy); // for smearing
|
TheHMC.Run(SmearingPolicy); // for smearing
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
|
#endif
|
||||||
} // main
|
} // main
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -24,14 +24,22 @@ See the full license in the file "LICENSE" in the top level distribution
|
|||||||
directory
|
directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#include <Grid/Grid.h>
|
#include <Grid/Grid.h>
|
||||||
|
|
||||||
|
#if Nc == 3
|
||||||
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
|
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
|
||||||
#include <Grid/qcd/smearing/JacobianAction.h>
|
#include <Grid/qcd/smearing/JacobianAction.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
using namespace Grid;
|
using namespace Grid;
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
|
#if Nc != 3
|
||||||
|
#warning FTHMC2p1f_3GeV will not work for Nc != 3
|
||||||
|
std::cout << "This program will currently only work for Nc == 3." << std::endl;
|
||||||
|
#else
|
||||||
std::cout << std::setprecision(12);
|
std::cout << std::setprecision(12);
|
||||||
|
|
||||||
Grid_init(&argc, &argv);
|
Grid_init(&argc, &argv);
|
||||||
@ -220,6 +228,7 @@ int main(int argc, char **argv)
|
|||||||
TheHMC.Run(SmearingPolicy); // for smearing
|
TheHMC.Run(SmearingPolicy); // for smearing
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
|
#endif
|
||||||
} // main
|
} // main
|
||||||
|
|
||||||
|
|
||||||
|
@ -25,13 +25,20 @@ directory
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#include <Grid/Grid.h>
|
#include <Grid/Grid.h>
|
||||||
|
|
||||||
|
#if Nc == 3
|
||||||
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
|
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
|
||||||
#include <Grid/qcd/smearing/JacobianAction.h>
|
#include <Grid/qcd/smearing/JacobianAction.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
using namespace Grid;
|
using namespace Grid;
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
|
#if Nc != 3
|
||||||
|
#warning HMC2p1f_3GeV will not work for Nc != 3
|
||||||
|
std::cout << "This program will currently only work for Nc == 3." << std::endl;
|
||||||
|
#else
|
||||||
std::cout << std::setprecision(12);
|
std::cout << std::setprecision(12);
|
||||||
|
|
||||||
Grid_init(&argc, &argv);
|
Grid_init(&argc, &argv);
|
||||||
@ -220,6 +227,7 @@ int main(int argc, char **argv)
|
|||||||
TheHMC.Run(SmearingPolicy); // for smearing
|
TheHMC.Run(SmearingPolicy); // for smearing
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
|
#endif
|
||||||
} // main
|
} // main
|
||||||
|
|
||||||
|
|
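The three programs above all use the same compile-time guard so that an Nc != 3 build still compiles but degrades to a message instead of instantiating the SU(3)-only smearing machinery. A hedged sketch of the bare pattern (the preprocessor test on Nc is copied from the hunks above; the body is elided):

// Sketch: guard an Nc==3-only main() at compile time.
#include <Grid/Grid.h>

int main(int argc, char **argv)
{
#if Nc != 3
  std::cout << "This program will currently only work for Nc == 3." << std::endl;
#else
  Grid::Grid_init(&argc, &argv);
  // ... SU(3)-specific smearing / HMC setup would go here ...
  Grid::Grid_finalize();
#endif
  return 0;
}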
||||||
|
@ -201,8 +201,7 @@ int main(int argc, char **argv) {
|
   Params.dirichlet=NonDirichlet;
   ParamsDir.dirichlet=Dirichlet;
-  ParamsDir.partialDirichlet=0;
+  //  ParamsDir.partialDirichlet=0;
-  std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;

   //  double StoppingCondition = 1e-14;
   //  double MDStoppingCondition = 1e-9;
@@ -298,11 +297,11 @@ int main(int argc, char **argv) {
     if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
     else                      ParamsDen.dirichlet = NonDirichlet;

-    if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
+    //    if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
-    else                      ParamsNum.partialDirichlet = 0;
+    //    else                      ParamsNum.partialDirichlet = 0;

-    if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
+    //    if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
-    else                      ParamsDen.partialDirichlet = 0;
+    //    else                      ParamsDen.partialDirichlet = 0;

     Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
     Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
@@ -333,9 +333,9 @@ int main(int argc, char **argv) {
   ParamsF.dirichlet=NonDirichlet;
   ParamsDir.dirichlet=Dirichlet;
   ParamsDirF.dirichlet=Dirichlet;
-  ParamsDir.partialDirichlet=1;
+  //  ParamsDir.partialDirichlet=1;
-  ParamsDirF.partialDirichlet=1;
+  //  ParamsDirF.partialDirichlet=1;
-  std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
+  //  std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;

   //  double StoppingCondition = 1e-14;
   //  double MDStoppingCondition = 1e-9;
@@ -481,21 +481,21 @@ int main(int argc, char **argv) {
     if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
     else                      ParamsDen.dirichlet = NonDirichlet;

-    if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
+    //    if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
-    else                      ParamsNum.partialDirichlet = 0;
+    //    else                      ParamsNum.partialDirichlet = 0;

-    if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
+    //    if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
-    else                      ParamsDen.partialDirichlet = 0;
+    //    else                      ParamsDen.partialDirichlet = 0;

     Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
     Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));

     ParamsDenF.dirichlet = ParamsDen.dirichlet;
-    ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
+    //    ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
     DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));

     ParamsNumF.dirichlet = ParamsNum.dirichlet;
-    ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
+    //    ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
     NumeratorsF.push_back  (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));

     LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
||||||
|
@ -166,18 +166,18 @@ int main (int argc, char ** argv)
|
   }

   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   header();

   for(int lat=8;lat<=maxlat;lat+=4){
   for(int Ls=8;Ls<=8;Ls*=2){

     Coordinate latt_size  ({lat*mpi_layout[0],
                             lat*mpi_layout[1],
                             lat*mpi_layout[2],
                             lat*mpi_layout[3]});

     GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
     RealD Nrank = Grid._Nprocessors;
@@ -193,101 +193,6 @@ int main (int argc, char ** argv)
       rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
     }

-    int ncomm;
-    double dbytes;
-    for(int i=0;i<Nloop;i++){
-      double start=usecond();
-
-      dbytes=0;
-      ncomm=0;
-
-      std::vector<CommsRequest_t> requests;
-
-      for(int mu=0;mu<4;mu++){
-
-        if (mpi_layout[mu]>1 ) {
-
-          ncomm++;
-          int comm_proc=1;
-          int xmit_to_rank;
-          int recv_from_rank;
-          Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-          dbytes+=
-            Grid.StencilSendToRecvFromBegin(requests,
-                                            (void *)&xbuf[mu][0],
-                                            xmit_to_rank,1,
-                                            (void *)&rbuf[mu][0],
-                                            recv_from_rank,1,
-                                            bytes,bytes,mu);
-
-          comm_proc = mpi_layout[mu]-1;
-
-          Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-          dbytes+=
-            Grid.StencilSendToRecvFromBegin(requests,
-                                            (void *)&xbuf[mu+4][0],
-                                            xmit_to_rank,1,
-                                            (void *)&rbuf[mu+4][0],
-                                            recv_from_rank,1,
-                                            bytes,bytes,mu+4);
-
-        }
-      }
-      Grid.StencilSendToRecvFromComplete(requests,0);
-      Grid.Barrier();
-      double stop=usecond();
-      t_time[i] = stop-start; // microseconds
-
-    }
-
-    timestat.statistics(t_time);
-
-    dbytes=dbytes*ppn;
-    double xbytes    = dbytes*0.5;
-    // double rbytes    = dbytes*0.5;
-    double bidibytes = dbytes;
-
-    std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
-             <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
-             <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
-             <<xbytes/timestat.max <<" "<< xbytes/timestat.min
-             << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
-             << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
-
-  }
-  }
-
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  header();
-
-  for(int lat=8;lat<=maxlat;lat+=4){
-  for(int Ls=8;Ls<=8;Ls*=2){
-
-    Coordinate latt_size  ({lat*mpi_layout[0],
-                            lat*mpi_layout[1],
-                            lat*mpi_layout[2],
-                            lat*mpi_layout[3]});
-
-    GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-    RealD Nrank = Grid._Nprocessors;
-    RealD Nnode = Grid.NodeCount();
-    RealD ppn = Nrank/Nnode;
-
-    std::vector<HalfSpinColourVectorD *> xbuf(8);
-    std::vector<HalfSpinColourVectorD *> rbuf(8);
-    Grid.ShmBufferFreeAll();
-    uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
-    for(int d=0;d<8;d++){
-      xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
-      rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
-    }
-
     int ncomm;
     double dbytes;
     for(int i=0;i<Nloop;i++){
@@ -296,45 +201,34 @@ int main (int argc, char ** argv)
       std::vector<CommsRequest_t> requests;
      dbytes=0;
      ncomm=0;
-      for(int mu=0;mu<4;mu++){
+      for(int dir=0;dir<8;dir++) {

+        double tbytes;
+        int mu =dir % 4;

        if (mpi_layout[mu]>1 ) {

          ncomm++;
-          int comm_proc=1;
          int xmit_to_rank;
          int recv_from_rank;
-          Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-          dbytes+=
-            Grid.StencilSendToRecvFromBegin(requests,
-                                            (void *)&xbuf[mu][0],
-                                            xmit_to_rank,1,
-                                            (void *)&rbuf[mu][0],
-                                            recv_from_rank,1,
-                                            bytes,bytes,mu);
-          Grid.StencilSendToRecvFromComplete(requests,mu);
-          requests.resize(0);
-
-          comm_proc = mpi_layout[mu]-1;
-
-          Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-          dbytes+=
-            Grid.StencilSendToRecvFromBegin(requests,
-                                            (void *)&xbuf[mu+4][0],
-                                            xmit_to_rank,1,
-                                            (void *)&rbuf[mu+4][0],
-                                            recv_from_rank,1,
-                                            bytes,bytes,mu+4);
-          Grid.StencilSendToRecvFromComplete(requests,mu+4);
-          requests.resize(0);
+          if ( dir == mu ) {
+            int comm_proc=1;
+            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+          } else {
+            int comm_proc = mpi_layout[mu]-1;
+            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+          }
+          int tid = omp_get_thread_num();
+          tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1,
+                                             (void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid);
+
+          dbytes+=tbytes;

        }
      }
      Grid.Barrier();
      double stop=usecond();
      t_time[i] = stop-start; // microseconds

     }

     timestat.statistics(t_time);
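For reference, a self-contained sketch of the rate arithmetic this benchmark table prints: given per-iteration times in microseconds and the bytes moved per iteration, bytes/microsecond is MB/s, and the quoted uncertainty follows from propagating the timing error. The timings and message volume below are made up.

// Sketch: mean bandwidth and a naive error estimate from a timing vector.
#include <cmath>
#include <cstdio>
#include <vector>

int main(void)
{
  std::vector<double> t_usec = {120.0, 118.0, 125.0, 119.0};  // made-up per-iteration timings
  double bytes = 1.5e8;                                       // made-up bytes moved per iteration

  double mean=0, var=0;
  for (double t : t_usec) mean += t;
  mean /= t_usec.size();
  for (double t : t_usec) var += (t-mean)*(t-mean);
  double err = std::sqrt(var/t_usec.size())/std::sqrt((double)t_usec.size());

  // bytes / microseconds == MB/s; the benchmark prints the analogous ratios.
  std::printf("mean %.1f MB/s  +/- %.1f MB/s\n", bytes/mean, bytes*err/(mean*mean));
  return 0;
}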
||||||
|
@ -32,18 +32,18 @@
|
|||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace Grid;
|
using namespace Grid;
|
||||||
|
|
||||||
template<class d>
|
////////////////////////
|
||||||
struct scal {
|
/// Move to domains ////
|
||||||
d internal;
|
////////////////////////
|
||||||
|
|
||||||
|
Gamma::Algebra Gmu [] = {
|
||||||
|
Gamma::Algebra::GammaX,
|
||||||
|
Gamma::Algebra::GammaY,
|
||||||
|
Gamma::Algebra::GammaZ,
|
||||||
|
Gamma::Algebra::GammaT
|
||||||
};
|
};
|
||||||
|
|
||||||
Gamma::Algebra Gmu [] = {
|
void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);
|
||||||
Gamma::Algebra::GammaX,
|
|
||||||
Gamma::Algebra::GammaY,
|
|
||||||
Gamma::Algebra::GammaZ,
|
|
||||||
Gamma::Algebra::GammaT
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
int main (int argc, char ** argv)
|
int main (int argc, char ** argv)
|
||||||
{
|
{
|
||||||
@ -52,39 +52,108 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
int threads = GridThread::GetThreads();
|
int threads = GridThread::GetThreads();
|
||||||
|
|
||||||
Coordinate latt4 = GridDefaultLatt();
|
int Ls=16;
|
||||||
int Ls=8;
|
for(int i=0;i<argc;i++) {
|
||||||
for(int i=0;i<argc;i++)
|
|
||||||
if(std::string(argv[i]) == "-Ls"){
|
if(std::string(argv[i]) == "-Ls"){
|
||||||
std::stringstream ss(argv[i+1]); ss >> Ls;
|
std::stringstream ss(argv[i+1]); ss >> Ls;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////
|
||||||
|
// With comms
|
||||||
|
//////////////////
|
||||||
|
Coordinate Dirichlet(Nd+1,0);
|
||||||
|
|
||||||
|
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||||
|
|
||||||
|
Benchmark(Ls,Dirichlet,false);
|
||||||
|
|
||||||
|
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||||
|
|
||||||
|
Benchmark(Ls,Dirichlet,true);
|
||||||
|
|
||||||
|
//////////////////
|
||||||
|
// Domain decomposed
|
||||||
|
//////////////////
|
||||||
|
/*
|
||||||
|
Coordinate latt4 = GridDefaultLatt();
|
||||||
|
Coordinate mpi = GridDefaultMpi();
|
||||||
|
Coordinate CommDim(Nd);
|
||||||
|
Coordinate shm;
|
||||||
|
GlobalSharedMemory::GetShmDims(mpi,shm);
|
||||||
|
|
||||||
|
|
||||||
|
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||||
|
// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||||
|
|
||||||
|
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
|
||||||
|
Dirichlet[0] = 0;
|
||||||
|
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
|
||||||
|
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
|
||||||
|
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
|
||||||
|
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
|
||||||
|
|
||||||
|
Benchmark(Ls,Dirichlet,false);
|
||||||
|
|
||||||
|
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
|
||||||
|
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||||
|
|
||||||
|
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
|
||||||
|
|
||||||
|
Benchmark(Ls,Dirichlet,true);
|
||||||
|
*/
|
||||||
|
|
||||||
|
Grid_finalize();
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
|
||||||
|
{
|
||||||
|
Coordinate latt4 = GridDefaultLatt();
|
||||||
GridLogLayout();
|
GridLogLayout();
|
||||||
|
|
||||||
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
|
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
|
||||||
|
|
||||||
|
std::vector<int> seeds4({1,2,3,4});
|
||||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
std::vector<int> seeds5({5,6,7,8});
|
||||||
|
#undef SINGLE
|
||||||
|
#ifdef SINGLE
|
||||||
|
typedef vComplexF Simd;
|
||||||
|
typedef LatticeFermionF FermionField;
|
||||||
|
typedef LatticeGaugeFieldF GaugeField;
|
||||||
|
typedef LatticeColourMatrixF ColourMatrixField;
|
||||||
|
typedef DomainWallFermionF FermionAction;
|
||||||
|
#else
|
||||||
|
typedef vComplexD Simd;
|
||||||
|
typedef LatticeFermionD FermionField;
|
||||||
|
typedef LatticeGaugeFieldD GaugeField;
|
||||||
|
typedef LatticeColourMatrixD ColourMatrixField;
|
||||||
|
typedef DomainWallFermionD FermionAction;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
|
||||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
|
|
||||||
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
|
||||||
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
|
|
||||||
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
|
|
||||||
|
|
||||||
std::vector<int> seeds4({1,2,3,4});
|
|
||||||
std::vector<int> seeds5({5,6,7,8});
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
|
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
|
||||||
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
|
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
||||||
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
|
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
|
||||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
|
||||||
|
|
||||||
LatticeFermion src (FGrid); random(RNG5,src);
|
|
||||||
|
FermionField src (FGrid); random(RNG5,src);
|
||||||
#if 0
|
#if 0
|
||||||
src = Zero();
|
src = Zero();
|
||||||
{
|
{
|
||||||
@ -100,46 +169,39 @@ int main (int argc, char ** argv)
|
|||||||
src = src*N2;
|
src = src*N2;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
FermionField result(FGrid); result=Zero();
|
||||||
LatticeFermion result(FGrid); result=Zero();
|
FermionField ref(FGrid); ref=Zero();
|
||||||
LatticeFermion ref(FGrid); ref=Zero();
|
FermionField tmp(FGrid);
|
||||||
LatticeFermion tmp(FGrid);
|
FermionField err(FGrid);
|
||||||
LatticeFermion err(FGrid);
|
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
|
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
|
||||||
LatticeGaugeField Umu(UGrid);
|
GaugeField Umu(UGrid);
|
||||||
|
GaugeField UmuCopy(UGrid);
|
||||||
SU<Nc>::HotConfiguration(RNG4,Umu);
|
SU<Nc>::HotConfiguration(RNG4,Umu);
|
||||||
|
// SU<Nc>::ColdConfiguration(Umu);
|
||||||
|
UmuCopy=Umu;
|
||||||
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
|
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
|
||||||
#if 0
|
|
||||||
Umu=1.0;
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
|
||||||
LatticeColourMatrix ttmp(UGrid);
|
|
||||||
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
|
|
||||||
// if (mu !=2 ) ttmp = 0;
|
|
||||||
// ttmp = ttmp* pow(10.0,mu);
|
|
||||||
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
|
|
||||||
}
|
|
||||||
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
////////////////////////////////////
|
||||||
|
// Apply BCs
|
||||||
|
////////////////////////////////////
|
||||||
|
Coordinate Block(4);
|
||||||
|
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
|
||||||
|
|
||||||
|
DirichletFilter<GaugeField> Filter(Block);
|
||||||
|
Filter.applyFilter(Umu);
|
||||||
|
|
||||||
////////////////////////////////////
|
////////////////////////////////////
|
||||||
// Naive wilson implementation
|
// Naive wilson implementation
|
||||||
////////////////////////////////////
|
////////////////////////////////////
|
||||||
// replicate across fifth dimension
|
std::vector<ColourMatrixField> U(4,UGrid);
|
||||||
LatticeGaugeField Umu5d(FGrid);
|
|
||||||
std::vector<LatticeColourMatrix> U(4,FGrid);
|
|
||||||
{
|
|
||||||
autoView( Umu5d_v, Umu5d, CpuWrite);
|
|
||||||
autoView( Umu_v , Umu , CpuRead);
|
|
||||||
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
|
||||||
for(int s=0;s<Ls;s++){
|
|
||||||
Umu5d_v[Ls*ss+s] = Umu_v[ss];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
for(int mu=0;mu<Nd;mu++){
|
||||||
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
|
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
|
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
|
||||||
|
|
||||||
if (1)
|
if (1)
|
||||||
@ -147,10 +209,28 @@ int main (int argc, char ** argv)
|
|||||||
ref = Zero();
|
ref = Zero();
|
||||||
for(int mu=0;mu<Nd;mu++){
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
|
||||||
tmp = U[mu]*Cshift(src,mu+1,1);
|
tmp = Cshift(src,mu+1,1);
|
||||||
|
{
|
||||||
|
autoView( tmp_v , tmp , CpuWrite);
|
||||||
|
autoView( U_v , U[mu] , CpuRead);
|
||||||
|
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
||||||
|
|
||||||
tmp =adj(U[mu])*src;
|
{
|
||||||
|
autoView( tmp_v , tmp , CpuWrite);
|
||||||
|
autoView( U_v , U[mu] , CpuRead);
|
||||||
|
autoView( src_v, src , CpuRead);
|
||||||
|
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
tmp =Cshift(tmp,mu+1,-1);
|
tmp =Cshift(tmp,mu+1,-1);
|
||||||
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
||||||
}
|
}
|
||||||
@ -167,11 +247,9 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
|
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
|
||||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::Dhop "<<std::endl;
|
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
|
||||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
|
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
|
||||||
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(vComplex)<< " B"<<std::endl;
|
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
|
||||||
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
|
||||||
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||||
@ -181,9 +259,15 @@ int main (int argc, char ** argv)
|
|||||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||||
|
|
||||||
DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
FermionAction::ImplParams p;
|
||||||
int ncall =1000;
|
p.dirichlet=Dirichlet;
|
||||||
|
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
|
||||||
|
Dw.SloppyComms(sloppy);
|
||||||
|
Dw.ImportGauge(Umu);
|
||||||
|
|
||||||
|
int ncall =300;
|
||||||
|
RealD n2e;
|
||||||
|
|
||||||
if (1) {
|
if (1) {
|
||||||
FGrid->Barrier();
|
FGrid->Barrier();
|
||||||
Dw.Dhop(src,result,0);
|
Dw.Dhop(src,result,0);
|
||||||
@ -198,8 +282,8 @@ int main (int argc, char ** argv)
|
|||||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
double flops=single_site_flops*volume*ncall;
|
double flops=single_site_flops*volume*ncall;
|
||||||
|
|
||||||
auto nsimd = vComplex::Nsimd();
|
auto nsimd = Simd::Nsimd();
|
||||||
auto simdwidth = sizeof(vComplex);
|
auto simdwidth = sizeof(Simd);
|
||||||
|
|
||||||
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
|
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
|
||||||
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
|
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
|
||||||
@ -208,28 +292,27 @@ int main (int argc, char ** argv)
|
|||||||
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
|
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
||||||
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
|
||||||
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
|
||||||
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
|
|
||||||
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
|
|
||||||
err = ref-result;
|
err = ref-result;
|
||||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
n2e = norm2(err);
|
||||||
//exit(0);
|
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
|
||||||
|
|
||||||
if(( norm2(err)>1.0e-4) ) {
|
if(( n2e>1.0e-4) ) {
|
||||||
/*
|
|
||||||
std::cout << "RESULT\n " << result<<std::endl;
|
|
||||||
std::cout << "REF \n " << ref <<std::endl;
|
|
||||||
std::cout << "ERR \n " << err <<std::endl;
|
|
||||||
*/
|
|
||||||
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
|
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
|
||||||
FGrid->Barrier();
|
FGrid->Barrier();
|
||||||
|
std::cout<<GridLogMessage << "RESULT" << std::endl;
|
||||||
|
// std::cout << result<<std::endl;
|
||||||
|
std::cout << norm2(result)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "REF" << std::endl;
|
||||||
|
std::cout << norm2(ref)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "ERR" << std::endl;
|
||||||
|
std::cout << norm2(err)<<std::endl;
|
||||||
|
FGrid->Barrier();
|
||||||
exit(-1);
|
exit(-1);
|
||||||
}
|
}
|
||||||
assert (norm2(err)< 1.0e-4 );
|
assert (n2e< 1.0e-4 );
|
||||||
}
|
}
|
||||||
|
|
||||||
if (1)
|
if (1)
|
||||||
@ -238,16 +321,30 @@ int main (int argc, char ** argv)
|
|||||||
for(int mu=0;mu<Nd;mu++){
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
|
||||||
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
|
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
|
||||||
tmp = U[mu]*Cshift(src,mu+1,1);
|
tmp = Cshift(src,mu+1,1);
|
||||||
{
|
{
|
||||||
autoView( ref_v, ref, CpuWrite);
|
autoView( ref_v, ref, CpuWrite);
|
||||||
autoView( tmp_v, tmp, CpuRead);
|
autoView( tmp_v, tmp, CpuRead);
|
||||||
for(int i=0;i<ref_v.size();i++){
|
autoView( U_v , U[mu] , CpuRead);
|
||||||
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
|
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
int i=s+Ls*ss;
|
||||||
|
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
tmp =adj(U[mu])*src;
|
{
|
||||||
|
autoView( tmp_v , tmp , CpuWrite);
|
||||||
|
autoView( U_v , U[mu] , CpuRead);
|
||||||
|
autoView( src_v, src , CpuRead);
|
||||||
|
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// tmp =adj(U[mu])*src;
|
||||||
tmp =Cshift(tmp,mu+1,-1);
|
tmp =Cshift(tmp,mu+1,-1);
|
||||||
{
|
{
|
||||||
autoView( ref_v, ref, CpuWrite);
|
autoView( ref_v, ref, CpuWrite);
|
||||||
@ -259,27 +356,27 @@ int main (int argc, char ** argv)
|
|||||||
}
|
}
|
||||||
ref = -0.5*ref;
|
ref = -0.5*ref;
|
||||||
}
|
}
|
||||||
// dump=1;
|
|
||||||
Dw.Dhop(src,result,1);
|
Dw.Dhop(src,result,DaggerYes);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
|
||||||
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
|
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
|
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
|
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
|
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
|
||||||
err = ref-result;
|
err = ref-result;
|
||||||
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
|
n2e= norm2(err);
|
||||||
if((norm2(err)>1.0e-4)){
|
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
|
||||||
/*
|
|
||||||
std::cout<< "DAG RESULT\n " <<ref << std::endl;
|
|
||||||
std::cout<< "DAG sRESULT\n " <<result << std::endl;
|
|
||||||
std::cout<< "DAG ERR \n " << err <<std::endl;
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
LatticeFermion src_e (FrbGrid);
|
|
||||||
LatticeFermion src_o (FrbGrid);
|
|
||||||
LatticeFermion r_e (FrbGrid);
|
|
||||||
LatticeFermion r_o (FrbGrid);
|
|
||||||
LatticeFermion r_eo (FGrid);
|
|
||||||
|
|
||||||
|
assert((n2e)<1.0e-4);
|
||||||
|
|
||||||
|
FermionField src_e (FrbGrid);
|
||||||
|
FermionField src_o (FrbGrid);
|
||||||
|
FermionField r_e (FrbGrid);
|
||||||
|
FermionField r_o (FrbGrid);
|
||||||
|
FermionField r_eo (FGrid);
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
|
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
|
||||||
pickCheckerboard(Even,src_e,src);
|
pickCheckerboard(Even,src_e,src);
|
||||||
@ -291,10 +388,8 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
// S-direction is INNERMOST and takes no part in the parity.
|
// S-direction is INNERMOST and takes no part in the parity.
|
||||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::DhopEO "<<std::endl;
|
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
|
||||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
|
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
|
||||||
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
|
||||||
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||||
@ -308,13 +403,7 @@ int main (int argc, char ** argv)
|
|||||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
double t0=usecond();
|
double t0=usecond();
|
||||||
for(int i=0;i<ncall;i++){
|
for(int i=0;i<ncall;i++){
|
||||||
#ifdef CUDA_PROFILE
|
|
||||||
if(i==10) cudaProfilerStart();
|
|
||||||
#endif
|
|
||||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
#ifdef CUDA_PROFILE
|
|
||||||
if(i==20) cudaProfilerStop();
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
double t1=usecond();
|
double t1=usecond();
|
||||||
FGrid->Barrier();
|
FGrid->Barrier();
|
||||||
@ -338,14 +427,9 @@ int main (int argc, char ** argv)
|
|||||||
setCheckerboard(r_eo,r_e);
|
setCheckerboard(r_eo,r_e);
|
||||||
|
|
||||||
err = r_eo-result;
|
err = r_eo-result;
|
||||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
n2e= norm2(err);
|
||||||
if((norm2(err)>1.0e-4)){
|
std::cout<<GridLogMessage << "norm diff "<< n2e<<std::endl;
|
||||||
/*
|
assert(n2e<1.0e-4);
|
||||||
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
|
|
||||||
std::cout<< "Deo REF\n " <<result << std::endl;
|
|
||||||
std::cout<< "Deo ERR \n " << err <<std::endl;
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
|
|
||||||
pickCheckerboard(Even,src_e,err);
|
pickCheckerboard(Even,src_e,err);
|
||||||
pickCheckerboard(Odd,src_o,err);
|
pickCheckerboard(Odd,src_o,err);
|
||||||
@ -354,6 +438,4 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
assert(norm2(src_e)<1.0e-4);
|
assert(norm2(src_e)<1.0e-4);
|
||||||
assert(norm2(src_o)<1.0e-4);
|
assert(norm2(src_o)<1.0e-4);
|
||||||
Grid_finalize();
|
|
||||||
exit(0);
|
|
||||||
}
|
}
|
||||||
|
@@ -43,7 +43,7 @@ Gamma::Algebra Gmu [] = {
   Gamma::Algebra::GammaT
 };

-void Benchmark(int Ls, Coordinate Dirichlet);
+void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);

 int main (int argc, char ** argv)
 {
@@ -69,11 +69,19 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;

-  Benchmark(Ls,Dirichlet);
+  Benchmark(Ls,Dirichlet,false);
+
+  std::cout << "\n\n\n\n\n\n" <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+
+  Benchmark(Ls,Dirichlet,true);

   //////////////////
   // Domain decomposed
   //////////////////
+  /*
   Coordinate latt4 = GridDefaultLatt();
   Coordinate mpi   = GridDefaultMpi();
   Coordinate CommDim(Nd);
@@ -81,42 +89,35 @@ int main (int argc, char ** argv)
   GlobalSharedMemory::GetShmDims(mpi,shm);


-  //////////////////////
-  // Node level
-  //////////////////////
   std::cout << "\n\n\n\n\n\n" <<std::endl;
   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-  std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
+  //  std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;

   for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
-  // Dirichlet[0] = 0;
-  // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
-  // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
-  // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
-  // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
+  Dirichlet[0] = 0;
+  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
+  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
+  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
+  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];

-  Benchmark(Ls,Dirichlet);
+  Benchmark(Ls,Dirichlet,false);

   std::cout << "\n\n\n\n\n\n" <<std::endl;

   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-  std::cout << GridLogMessage<< " Testing without intranode communication " <<std::endl;
+  std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;

   for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
-  // Dirichlet[0] = 0;
-  // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
-  // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
-  // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
-  // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];

-  Benchmark(Ls,Dirichlet);
+  Benchmark(Ls,Dirichlet,true);
+  */

   Grid_finalize();
   exit(0);
 }
-void Benchmark(int Ls, Coordinate Dirichlet)
+void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
 {
   Coordinate latt4 = GridDefaultLatt();
   GridLogLayout();
@@ -132,21 +133,13 @@ void Benchmark(int Ls, Coordinate Dirichlet)
   typedef LatticeGaugeFieldF GaugeField;
   typedef LatticeColourMatrixF ColourMatrixField;
   typedef DomainWallFermionF FermionAction;
-#endif
-#ifdef DOUBLE
+#else
   typedef vComplexD Simd;
   typedef LatticeFermionD FermionField;
   typedef LatticeGaugeFieldD GaugeField;
   typedef LatticeColourMatrixD ColourMatrixField;
   typedef DomainWallFermionD FermionAction;
 #endif
-#ifdef DOUBLE2
-  typedef vComplexD2 Simd;
-  typedef LatticeFermionD2 FermionField;
-  typedef LatticeGaugeFieldD2 GaugeField;
-  typedef LatticeColourMatrixD2 ColourMatrixField;
-  typedef DomainWallFermionD2 FermionAction;
-#endif

   GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -269,6 +262,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
   FermionAction::ImplParams p;
   p.dirichlet=Dirichlet;
   FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
+  Dw.SloppyComms(sloppy);
   Dw.ImportGauge(Umu);

   int ncall =300;
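A minimal usage sketch of the new flag (illustrative only): the same benchmark is run once with precise and once with sloppy halo exchange, assuming the SloppyComms(bool) setter shown in the hunk above. The helper name runBothCommsModes is hypothetical and not part of Grid.

// Hypothetical driver: forwards the sloppy flag into Benchmark(), which in
// turn calls Dw.SloppyComms(sloppy) before timing Dhop, as in the diff above.
void runBothCommsModes(int Ls, Coordinate Dirichlet)
{
  for(bool sloppy : {false,true}){
    std::cout << GridLogMessage << (sloppy ? " sloppy" : " full")
              << " communication benchmark" << std::endl;
    Benchmark(Ls,Dirichlet,sloppy);
  }
}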
@@ -1,465 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif

#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif

using namespace std;
using namespace Grid;

////////////////////////
/// Move to domains ////
////////////////////////

Gamma::Algebra Gmu [] = {
  Gamma::Algebra::GammaX,
  Gamma::Algebra::GammaY,
  Gamma::Algebra::GammaZ,
  Gamma::Algebra::GammaT
};

void Benchmark(int Ls, Coordinate Dirichlet, int partial);

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);


  int threads = GridThread::GetThreads();

  int Ls=8;
  for(int i=0;i<argc;i++) {
    if(std::string(argv[i]) == "-Ls"){
      std::stringstream ss(argv[i+1]); ss >> Ls;
    }
  }

  //////////////////
  // With comms
  //////////////////
  Coordinate Dirichlet(Nd+1,0);

  for(auto partial : {0}) {
    std::cout << "\n\n\n\n\n\n" <<std::endl;
    std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
    std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
    std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
    Benchmark(Ls,Dirichlet,partial);
  }

  //////////////////
  // Domain decomposed
  //////////////////
  Coordinate latt4 = GridDefaultLatt();
  Coordinate mpi   = GridDefaultMpi();
  Coordinate CommDim(Nd);
  //Coordinate shm({2,1,1,1});
  Coordinate shm;
  GlobalSharedMemory::GetShmDims(mpi,shm);

  std::cout <<GridLogMessage << " Shared memory MPI decomp is " <<shm<<std::endl;

  //////////////////////
  // Node level
  //////////////////////
  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
  //  for(int d=0;d<Nd;d++) CommDim[d]= 1;
  Dirichlet[0] = 0;
  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];

  for(auto partial : {0,1}) {
    std::cout << "\n\n\n\n\n\n" <<std::endl;
    std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
    std::cout << GridLogMessage<< " Testing without internode communication partial dirichlet="<<partial <<std::endl;
    std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
    Benchmark(Ls,Dirichlet,partial);
  }

  for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
  Dirichlet[0] = 0;
  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];

  for(auto partial : {0,1}) {
    std::cout << "\n\n\n\n\n\n" <<std::endl;
    std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
    std::cout << GridLogMessage<< " Testing without intranode communication; partial dirichlet= "<<partial <<std::endl;
    std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
    Benchmark(Ls,Dirichlet,partial);
  }
  Grid_finalize();
  exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet, int partial)
{
  Coordinate latt4 = GridDefaultLatt();
  GridLogLayout();

  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
#define SINGLE
#ifdef SINGLE
  typedef vComplexF          Simd;
  typedef LatticeFermionF    FermionField;
  typedef LatticeGaugeFieldF GaugeField;
  typedef LatticeColourMatrixF ColourMatrixField;
  typedef DomainWallFermionF FermionAction;
#endif
#ifdef DOUBLE
  typedef vComplexD          Simd;
  typedef LatticeFermionD    FermionField;
  typedef LatticeGaugeFieldD GaugeField;
  typedef LatticeColourMatrixD ColourMatrixField;
  typedef DomainWallFermionD FermionAction;
#endif
#ifdef DOUBLE2
  typedef vComplexD2          Simd;
  typedef LatticeFermionD2    FermionField;
  typedef LatticeGaugeFieldD2 GaugeField;
  typedef LatticeColourMatrixD2 ColourMatrixField;
  typedef DomainWallFermionD2 FermionAction;
#endif

  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
  GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));

  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
  GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));


  FermionField src (FGrid); random(RNG5,src);
#if 0
  src = Zero();
  {
    Coordinate origin({0,0,0,latt4[2]-1,0});
    SpinColourVectorF tmp;
    tmp=Zero();
    tmp()(0)(0)=Complex(-2.0,0.0);
    std::cout << " source site 0 " << tmp<<std::endl;
    pokeSite(tmp,src,origin);
  }
#else
  RealD N2 = 1.0/::sqrt(norm2(src));
  src = src*N2;
#endif

  FermionField result(FGrid); result=Zero();
  FermionField    ref(FGrid);    ref=Zero();
  FermionField    tmp(FGrid);
  FermionField    err(FGrid);

  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
  GaugeField Umu(UGrid);
  GaugeField UmuFull(UGrid);
  GaugeField UmuCopy(UGrid);
  SU<Nc>::HotConfiguration(RNG4,Umu);
  UmuCopy=Umu;
  UmuFull=Umu;
  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;

  ////////////////////////////////////
  // Apply BCs
  ////////////////////////////////////
  Coordinate Block(4);
  for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];

  std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
  std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;

  DirichletFilter<GaugeField> Filter(Block);
  Filter.applyFilter(Umu);
  if(!partial) Filter.applyFilter(UmuCopy);

  ////////////////////////////////////
  // Naive wilson implementation
  ////////////////////////////////////
  std::vector<ColourMatrixField> U(4,UGrid);
  std::vector<ColourMatrixField> Ucopy(4,UGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
    Ucopy[mu] = PeekIndex<LorentzIndex>(UmuCopy,mu);
  }

  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;

  if (1)
  {
    ref = Zero();
    for(int mu=0;mu<Nd;mu++){
      int depth=dwf_compressor_depth;
      tmp = Cshift(src,mu+1,1);
      {
	autoView( tmp_v , tmp , CpuWrite);
	autoView( U_v , U[mu] , CpuRead);
	autoView( Ucopy_v, Ucopy[mu] , CpuRead);
	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
	  for(int s=0;s<Ls;s++){
	    if ( (s<depth) || (s>=Ls-depth)){
	      tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
	    } else {
	      tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
	    }
	  }
	}
      }
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
      {
	autoView( tmp_v , tmp , CpuWrite);
	autoView( U_v , U[mu] , CpuRead);
	autoView( Ucopy_v, Ucopy[mu] , CpuRead);
	autoView( src_v, src , CpuRead);
	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
	  for(int s=0;s<Ls;s++){
	    if ( (s<depth) || (s>=Ls-depth)){
	      tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
	    } else {
	      tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
	    }
	  }
	}
      }
      tmp =Cshift(tmp,mu+1,-1);
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
    }
    ref = -0.5*ref;
  }

  RealD mass=0.1;
  RealD M5  =1.8;

  RealD NP = UGrid->_Nprocessors;
  RealD NN = UGrid->NodeCount();

  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
  std::cout << GridLogMessage <<"* BCs for Dirichlet Block4 " << Block << std::endl;
  std::cout << GridLogMessage <<"* Partial Dirichlet BC = " << partial << std::endl;
  std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
#ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

  FermionAction::ImplParams p;
  p.dirichlet=Dirichlet;
  p.partialDirichlet=partial;
  FermionAction Dw(UmuFull,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);

  int ncall =1;
  RealD n2e;

  if (1) {
    FGrid->Barrier();
    Dw.Dhop(src,result,0);
    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.Dhop(src,result,0);
    }
    double t1=usecond();
    FGrid->Barrier();

    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=single_site_flops*volume*ncall;

    auto nsimd = Simd::Nsimd();
    auto simdwidth = sizeof(Simd);

    // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
    double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);

    // mem: Nd Wilson * Ls, Nd gauge, Nc colors
    double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);

    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
    err = ref-result;
    n2e = norm2(err);

    std::cout<<GridLogMessage << "norm diff   "<< n2e<< "  Line "<<__LINE__ <<std::endl;

    if(( n2e>1.0e-4) ) {
      std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
      FGrid->Barrier();

      DumpSliceNorm("s-slice ref ",ref,1);
      DumpSliceNorm("s-slice res ",result,1);
      DumpSliceNorm("s-slice error ",err,1);
      exit(-1);
    }
    assert (n2e< 1.0e-4 );
  }

  if (1)
  { // Naive wilson dag implementation

    ref = Zero();
    for(int mu=0;mu<Nd;mu++){

      int depth=dwf_compressor_depth;
      tmp = Cshift(src,mu+1,1);
      {
	autoView( tmp_v , tmp , CpuWrite);
	autoView( U_v , U[mu] , CpuRead);
	autoView( Ucopy_v, Ucopy[mu] , CpuRead);
	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
	  for(int s=0;s<Ls;s++){
	    if ( (s<depth) || (s>=Ls-depth)){
	      tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
	    } else {
	      tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
	    }
	  }
	}
      }
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
      {
	autoView( tmp_v , tmp , CpuWrite);
	autoView( U_v , U[mu] , CpuRead);
	autoView( Ucopy_v, Ucopy[mu] , CpuRead);
	autoView( src_v, src , CpuRead);
	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
	  for(int s=0;s<Ls;s++){
	    if ( (s<depth) || (s>=Ls-depth)){
	      tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
	    } else {
	      tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
	    }
	  }
	}
      }
      tmp =Cshift(tmp,mu+1,-1);
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
    }
    ref = -0.5*ref;
  }

  Dw.Dhop(src,result,DaggerYes);

  std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
  std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;

  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
  std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
  std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl;
  err = ref-result;
  n2e= norm2(err);
  std::cout<<GridLogMessage << "norm dag diff   "<< n2e<< " Line "<<__LINE__ <<std::endl;

  assert((n2e)<1.0e-4);

  FermionField src_e (FrbGrid);
  FermionField src_o (FrbGrid);
  FermionField r_e   (FrbGrid);
  FermionField r_o   (FrbGrid);
  FermionField r_eo  (FGrid);

  std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
  pickCheckerboard(Even,src_e,src);
  pickCheckerboard(Odd,src_o,src);

  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;


  // S-direction is INNERMOST and takes no part in the parity.
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
#ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
  {
    FGrid->Barrier();
    Dw.DhopEO(src_o,r_e,DaggerNo);
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.DhopEO(src_o,r_e,DaggerNo);
    }
    double t1=usecond();
    FGrid->Barrier();

    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=(single_site_flops*volume*ncall)/2.0;

    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl;
  }
  Dw.DhopEO(src_o,r_e,DaggerNo);
  Dw.DhopOE(src_e,r_o,DaggerNo);
  Dw.Dhop  (src  ,result,DaggerNo);

  std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
  std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
  std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;

  setCheckerboard(r_eo,r_o);
  setCheckerboard(r_eo,r_e);

  err = r_eo-result;
  n2e= norm2(err);
  std::cout<<GridLogMessage << "norm diff   "<< n2e<< " Line "<<__LINE__ <<std::endl;
  assert(n2e<1.0e-4);

  pickCheckerboard(Even,src_e,err);
  pickCheckerboard(Odd,src_o,err);
  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;

  assert(norm2(src_e)<1.0e-4);
  assert(norm2(src_o)<1.0e-4);
}
@@ -492,17 +492,18 @@ public:
     }
     FGrid->Barrier();
     double t1=usecond();
-    uint64_t ncall = 500;
-    FGrid->Broadcast(0,&ncall,sizeof(ncall));
+    uint64_t no = 50;
+    uint64_t ni = 100;

     //	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

     time_statistics timestat;
-    std::vector<double> t_time(ncall);
-    for(uint64_t i=0;i<ncall;i++){
+    std::vector<double> t_time(no);
+    for(uint64_t i=0;i<no;i++){
       t0=usecond();
-      Dw.DhopEO(src_o,r_e,DaggerNo);
+      for(uint64_t j=0;j<ni;j++){
+	Dw.DhopEO(src_o,r_e,DaggerNo);
+      }
       t1=usecond();
       t_time[i] = t1-t0;
     }
@@ -520,11 +521,11 @@ public:
     double mf_hi, mf_lo, mf_err;

     timestat.statistics(t_time);
-    mf_hi = flops/timestat.min;
-    mf_lo = flops/timestat.max;
+    mf_hi = flops/timestat.min*ni;
+    mf_lo = flops/timestat.max*ni;
     mf_err= flops/timestat.min * timestat.err/timestat.mean;

-    mflops = flops/timestat.mean;
+    mflops = flops/timestat.mean*ni;
     mflops_all.push_back(mflops);
     if ( mflops_best == 0   ) mflops_best = mflops;
     if ( mflops_worst== 0   ) mflops_worst= mflops;
@@ -535,6 +536,7 @@ public:
     std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
     std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
     std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
+    std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call        "<< timestat.mean/ni<<std::endl;

   }

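A brief sketch of the revised timing scheme in the hunks above (assumption: flops here counts a single DhopEO application). Each of the "no" outer samples now times "ni" back-to-back calls, so the per-sample mean is for ni calls and the mflop/s conversion carries the extra factor of ni, as the diff shows.

// Illustrative only: inner/outer loop timing and its conversion to mflop/s.
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
  double t0=usecond();
  for(uint64_t j=0;j<ni;j++) Dw.DhopEO(src_o,r_e,DaggerNo);
  t_time[i] = usecond()-t0;                    // time for ni calls, in us
}
timestat.statistics(t_time);
double mflops      = flops/timestat.mean*ni;   // mean covers ni calls
double us_per_call = timestat.mean/ni;         // new "us per call" report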
@@ -654,17 +656,19 @@ public:
     }
     FGrid->Barrier();
     double t1=usecond();
-    uint64_t ncall = 500;

-    FGrid->Broadcast(0,&ncall,sizeof(ncall));
+    uint64_t no = 50;
+    uint64_t ni = 100;

     //	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

     time_statistics timestat;
-    std::vector<double> t_time(ncall);
-    for(uint64_t i=0;i<ncall;i++){
+    std::vector<double> t_time(no);
+    for(uint64_t i=0;i<no;i++){
       t0=usecond();
-      Ds.DhopEO(src_o,r_e,DaggerNo);
+      for(uint64_t j=0;j<ni;j++){
+	Ds.DhopEO(src_o,r_e,DaggerNo);
+      }
       t1=usecond();
       t_time[i] = t1-t0;
     }
@@ -675,11 +679,11 @@ public:
     double mf_hi, mf_lo, mf_err;

     timestat.statistics(t_time);
-    mf_hi = flops/timestat.min;
-    mf_lo = flops/timestat.max;
+    mf_hi = flops/timestat.min*ni;
+    mf_lo = flops/timestat.max*ni;
     mf_err= flops/timestat.min * timestat.err/timestat.mean;

-    mflops = flops/timestat.mean;
+    mflops = flops/timestat.mean*ni;
     mflops_all.push_back(mflops);
     if ( mflops_best == 0   ) mflops_best = mflops;
     if ( mflops_worst== 0   ) mflops_worst= mflops;
@@ -689,6 +693,7 @@ public:
     std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
     std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
     std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
+    std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call        "<< timestat.mean/ni<<std::endl;

   }

@@ -792,19 +797,18 @@ public:
       Dc.M(src,r);
     }
     FGrid->Barrier();
-    double t1=usecond();
-    uint64_t ncall = 500;
+    uint64_t ni = 100;
+    uint64_t no = 50;

-    FGrid->Broadcast(0,&ncall,sizeof(ncall));

     //	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

     time_statistics timestat;
-    std::vector<double> t_time(ncall);
-    for(uint64_t i=0;i<ncall;i++){
-      t0=usecond();
-      Dc.M(src,r);
-      t1=usecond();
+    std::vector<double> t_time(no);
+    for(uint64_t i=0;i<no;i++){
+      double t0=usecond();
+      for(uint64_t j=0;j<ni;j++){
+	Dc.M(src,r);
+      }
+      double t1=usecond();
       t_time[i] = t1-t0;
     }
     FGrid->Barrier();
@@ -814,20 +818,21 @@ public:
     double mf_hi, mf_lo, mf_err;

     timestat.statistics(t_time);
-    mf_hi = flops/timestat.min;
-    mf_lo = flops/timestat.max;
+    mf_hi = flops/timestat.min*ni;
+    mf_lo = flops/timestat.max*ni;
     mf_err= flops/timestat.min * timestat.err/timestat.mean;

-    mflops = flops/timestat.mean;
+    mflops = flops/timestat.mean*ni;
     mflops_all.push_back(mflops);
     if ( mflops_best == 0   ) mflops_best = mflops;
     if ( mflops_worst== 0   ) mflops_worst= mflops;
     if ( mflops>mflops_best ) mflops_best = mflops;
     if ( mflops<mflops_worst) mflops_worst= mflops;

-    std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+    std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
     std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank   "<< mflops/NP<<std::endl;
     std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node   "<< mflops/NN<<std::endl;
+    std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call        "<< timestat.mean/ni<<std::endl;

   }

@@ -872,7 +877,7 @@ int main (int argc, char ** argv)
   int do_dslash=1;

   int sel=4;
-  std::vector<int> L_list({8,12,16,24});
+  std::vector<int> L_list({8,12,16,24,32});
   int selm1=sel-1;

   std::vector<double> clover;
@@ -151,7 +151,7 @@ AC_ARG_ENABLE([tracing],
 case ${ac_TRACING} in
   nvtx)
     AC_DEFINE([GRID_TRACING_NVTX],[1],[use NVTX])
-    LIBS="${LIBS} -lnvToolsExt64_1"
+    LIBS="${LIBS} -lnvToolsExt"
   ;;
   roctx)
     AC_DEFINE([GRID_TRACING_ROCTX],[1],[use ROCTX])
@@ -93,10 +93,13 @@ int main(int argc, char ** argv)
   Real coeff = (width*width) / Real(4*Iterations);

   chi=kronecker;

   // chi = (1-p^2/2N)^N kronecker
   for(int n = 0; n < Iterations; ++n) {
     Laplacian.M(chi,psi);
     chi = chi - coeff*psi;
+    RealD n2 = norm2(chi);
+    chi = chi * (1.0/std::sqrt(n2));
   }

   std::cout << " Wuppertal smeared operator is chi = \n" << chi <<std::endl;
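A short sketch of the smearing recursion changed above (assumption: Laplacian.M applies the lattice momentum-squared operator to chi). Each pass applies one factor of (1 - width^2 p^2 / (4 N)), and the newly added step renormalises chi to unit norm so repeated iterations cannot drift to under- or overflow.

// Illustrative only: Wuppertal/Gaussian smearing with per-iteration renormalisation.
Real coeff = (width*width) / Real(4*Iterations);
chi = kronecker;                              // start from a point source
for(int n = 0; n < Iterations; ++n) {
  Laplacian.M(chi,psi);                       // psi = p^2 chi
  chi = chi - coeff*psi;                      // chi <- (1 - coeff p^2) chi
  chi = chi * (1.0/std::sqrt(norm2(chi)));    // keep |chi| = 1 each iteration
}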
@@ -1,18 +1,19 @@
 #Ahead of time compile for PVC

-export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib"
-export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"
+export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
+export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"

 #JIT compile
 #export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
 #export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "

-../../configure \
+../configure \
 	--enable-simd=GPU \
 	--enable-reduction=grid \
 	--enable-gen-simd-width=64 \
 	--enable-comms=mpi-auto \
 	--enable-debug \
+	--prefix $HOME/gpt-install \
 	--disable-gparity \
 	--disable-fermion-reps \
 	--with-lime=$CLIME \
22	systems/Frontier-rocm631/config-command	Normal file
@@ -0,0 +1,22 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"



16	systems/Frontier-rocm631/sourceme631.sh	Normal file
@@ -0,0 +1,16 @@

echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh

#module load cce/15.0.1

module load rocm/6.3.1
module load cray-fftw
module load craype-accel-amd-gfx90a
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH

#Ugly hacks to get down level software working on current system
#export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
#ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
@@ -30,14 +30,10 @@ source ${root}/sourceme.sh

 export OMP_NUM_THREADS=7
 export MPICH_GPU_SUPPORT_ENABLED=1
-export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
+#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM

-for vol in 32.32.32.64
+#64.64.32.96
+for vol in 64.64.32.64
 do
-srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol
-srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol
-
-srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
-srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
+srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol -Ls 16
 done

@@ -3,20 +3,19 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
 --with-lime=$CLIME \
 --enable-unified=no \
 --enable-shm=nvlink \
---enable-tracing=timer \
+--enable-tracing=none \
 --enable-accelerator=hip \
 --enable-gen-simd-width=64 \
 --disable-gparity \
 --disable-fermion-reps \
 --enable-simd=GPU \
---enable-accelerator-cshift \
 --with-gmp=$OLCF_GMP_ROOT \
 --with-fftw=$FFTW_DIR/.. \
 --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
 --disable-fermion-reps \
 CXX=hipcc MPICXX=mpicxx \
-CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
-LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"
+CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
+LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"



@@ -1,12 +1,25 @@

+echo spack
 . /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
-spack load c-lime
-module load emacs
-module load PrgEnv-gnu
-module load rocm/6.0.0
-module load cray-mpich
-module load gmp
+module load cce/15.0.1
+module load rocm/5.3.0
 module load cray-fftw
 module load craype-accel-amd-gfx90a

+#Ugly hacks to get down level software working on current system
+export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
+ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
+
+#echo spack load c-lime
+#spack load c-lime
+#module load emacs
+##module load PrgEnv-gnu
+##module load cray-mpich
+##module load cray-fftw
+##module load craype-accel-amd-gfx90a
+##export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
 #Hack for lib
-#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
+##export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
273
systems/Jupiter/benchmarks/dwf.1node.perf
Normal file
273
systems/Jupiter/benchmarks/dwf.1node.perf
Normal file
@ -0,0 +1,273 @@
|
|||||||
|
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
|
||||||
|
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
|
||||||
|
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
|
||||||
|
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
|
||||||
|
SLURM detected
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device Number : 0
|
||||||
|
AcceleratorCudaInit[0]: ========================
|
||||||
|
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
|
||||||
|
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
|
||||||
|
AcceleratorCudaInit[0]: managedMemory: 1
|
||||||
|
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
|
||||||
|
AcceleratorCudaInit[0]: warpSize: 32
|
||||||
|
AcceleratorCudaInit[0]: pciBusID: 1
|
||||||
|
AcceleratorCudaInit[0]: pciDeviceID: 0
|
||||||
|
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
|
||||||
|
AcceleratorCudaInit: using default device
|
||||||
|
AcceleratorCudaInit: assume user either uses
|
||||||
|
AcceleratorCudaInit: a) IBM jsrun, or
|
||||||
|
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
|
||||||
|
AcceleratorCudaInit: Configure options --enable-setdevice=no
|
||||||
|
local rank 0 device 0 bus id: 0009:01:00.0
|
||||||
|
AcceleratorCudaInit: ================================================
|
||||||
|
SharedMemoryMpi: World communicator of size 4
|
||||||
|
SharedMemoryMpi: Node communicator of size 4
|
||||||
|
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002c0000000 - 40033fffffff for comms buffers
|
||||||
|
Setting up IPC
|
||||||
|
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|_ | | | | | | | | | | | | _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|_ GGGG RRRR III DDDD _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G R R I D D _|__
|
||||||
|
__|_ G GG RRRR I D D _|__
|
||||||
|
__|_ G G R R I D D _|__
|
||||||
|
__|_ GGGG R R III DDDD _|__
|
||||||
|
__|_ _|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
|
||||||
|
| | | | | | | | | | | | | |
|
||||||
|
|
||||||
|
|
||||||
|
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
|
||||||
|
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : MPI is initialised and logging filters activated
|
||||||
|
Grid : Message : ================================================
|
||||||
|
Grid : Message : This rank is running on host jpbo-119-30.jupiter.internal
|
||||||
|
Grid : Message : Requested 2147483648 byte stencil comms buffers
|
||||||
|
Grid : Message : MemoryManager Cache 81604378624 bytes
|
||||||
|
Grid : Message : MemoryManager::Init() setting up
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
|
||||||
|
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
|
||||||
|
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
|
||||||
|
Grid : Message : MemoryManager::Init() Using cudaMalloc
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Grid : Message : 0.303000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
|
||||||
|
Grid : Message : 0.309000 s : Testing with full communication
|
||||||
|
Grid : Message : 0.312000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
|
||||||
|
Grid : Message : 0.313000 s : Grid Layout
|
||||||
|
Grid : Message : 0.313000 s : Global lattice size : 32 32 64 64
|
||||||
|
Grid : Message : 0.319000 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 0.320000 s : MPI tasks : 1 1 2 2
|
||||||
|
Grid : Message : 0.129590 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 0.764790 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 0.764920 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 0.942440 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 1.149388 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 1.149404 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
local rank 1 device 0 bus id: 0019:01:00.0
|
||||||
|
local rank 2 device 0 bus id: 0029:01:00.0
|
||||||
|
local rank 3 device 0 bus id: 0039:01:00.0
|
||||||
|
Grid : Message : 43.893114 s : Drawing gauge field
|
||||||
|
Grid : Message : 54.574150 s : Random gauge initialised
|
||||||
|
Grid : Message : 54.574170 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
|
||||||
|
Grid : Message : 54.574172 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
|
||||||
|
Grid : Message : 54.580032 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 60.407451 s : *****************************************************************
|
||||||
|
Grid : Message : 60.407469 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 60.407470 s : *****************************************************************
|
||||||
|
Grid : Message : 60.407471 s : *****************************************************************
|
||||||
|
Grid : Message : 60.407472 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 60.407473 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 60.407475 s : * VComplex size is 64 B
|
||||||
|
Grid : Message : 60.407477 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 60.407479 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 60.407480 s : *****************************************************************
|
||||||
|
Grid : Message : 61.102178 s : Called warmup
|
||||||
|
Grid : Message : 62.177160 s : Called Dw 300 times in 1074958 us
|
||||||
|
Grid : Message : 62.177198 s : mflop/s = 24721998.6
|
||||||
|
Grid : Message : 62.177201 s : mflop/s per rank = 6180499.64
|
||||||
|
Grid : Message : 62.177204 s : mflop/s per node = 24721998.6
|
||||||
|
Grid : Message : 62.182696 s : norm diff 5.8108784e-14 Line 306
|
||||||
|
Grid : Message : 71.328862 s : ----------------------------------------------------------------
|
||||||
|
Grid : Message : 71.328884 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 71.328885 s : ----------------------------------------------------------------
|
||||||
|
Grid : Message : 71.328886 s : Called DwDag
|
||||||
|
Grid : Message : 71.328887 s : norm dag result 4.12810493
|
||||||
|
Grid : Message : 71.329493 s : norm dag ref 4.12810493
|
||||||
|
Grid : Message : 71.331967 s : norm dag diff 3.40632318e-14 Line 377
|
||||||
|
Grid : Message : 71.394727 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 71.803650 s : src_e0.500003185
|
||||||
|
Grid : Message : 71.819727 s : src_o0.499996882
|
||||||
|
Grid : Message : 71.821991 s : *********************************************************
|
||||||
|
Grid : Message : 71.821993 s : * Benchmarking DomainWallFermion::DhopEO
|
||||||
|
Grid : Message : 71.821995 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 71.821998 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 71.822002 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 71.822003 s : *********************************************************
|
||||||
|
Grid : Message : 72.377054 s : Deo mflop/s = 24065467
|
||||||
|
Grid : Message : 72.377071 s : Deo mflop/s per rank 6016366.75
|
||||||
|
Grid : Message : 72.377074 s : Deo mflop/s per node 24065467
|
||||||
|
Grid : Message : 72.624877 s : r_e2.06377678
|
||||||
|
Grid : Message : 72.625198 s : r_o2.06381058
|
||||||
|
Grid : Message : 72.625507 s : res4.12758736
|
||||||
|
Grid : Message : 73.759140 s : norm diff 0
|
||||||
|
Grid : Message : 73.868204 s : norm diff even 0
|
||||||
|
Grid : Message : 73.907201 s : norm diff odd 0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Grid : Message : 74.414580 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414582 s : Testing without internode communication
Grid : Message : 74.414584 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414586 s : Grid Layout
Grid : Message : 74.414586 s : Global lattice size : 32 32 64 64
Grid : Message : 74.414594 s : OpenMP threads : 4
Grid : Message : 74.414595 s : MPI tasks : 1 1 2 2
Grid : Message : 74.679364 s : Initialising 4d RNG
Grid : Message : 74.742332 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 74.742343 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 74.759525 s : Initialising 5d RNG
Grid : Message : 75.812412 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 75.812429 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 119.252016 s : Drawing gauge field
Grid : Message : 129.919846 s : Random gauge initialised
Grid : Message : 129.919863 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 129.919865 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 129.923611 s : Setting up Cshift based reference
Grid : Message : 135.522878 s : *****************************************************************
Grid : Message : 135.522897 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522900 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 135.522901 s : * Vectorising space-time by 8
Grid : Message : 135.522903 s : * VComplex size is 64 B
Grid : Message : 135.522905 s : * Using Overlapped Comms/Compute
Grid : Message : 135.522907 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 135.522908 s : *****************************************************************
Grid : Message : 136.151202 s : Called warmup
Grid : Message : 137.224721 s : Called Dw 300 times in 1073490 us
Grid : Message : 137.224748 s : mflop/s = 24755806
Grid : Message : 137.224751 s : mflop/s per rank = 6188951.49
Grid : Message : 137.224753 s : mflop/s per node = 24755806
Grid : Message : 137.235239 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 146.451686 s : ----------------------------------------------------------------
Grid : Message : 146.451708 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 146.451710 s : ----------------------------------------------------------------
Grid : Message : 146.451712 s : Called DwDag
Grid : Message : 146.451714 s : norm dag result 4.12810493
Grid : Message : 146.452323 s : norm dag ref 4.12810493
Grid : Message : 146.454799 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 146.498557 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 146.940894 s : src_e0.500003185
Grid : Message : 146.953676 s : src_o0.499996882
Grid : Message : 146.955927 s : *********************************************************
Grid : Message : 146.955929 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 146.955932 s : * Vectorising space-time by 8
Grid : Message : 146.955936 s : * Using Overlapped Comms/Compute
Grid : Message : 146.955938 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 146.955941 s : *********************************************************
Grid : Message : 147.511975 s : Deo mflop/s = 24036256.5
Grid : Message : 147.511989 s : Deo mflop/s per rank 6009064.13
Grid : Message : 147.511991 s : Deo mflop/s per node 24036256.5
Grid : Message : 147.522100 s : r_e2.06377678
Grid : Message : 147.522433 s : r_o2.06381058
Grid : Message : 147.522745 s : res4.12758736
Grid : Message : 148.229848 s : norm diff 0
Grid : Message : 149.233474 s : norm diff even 0
Grid : Message : 149.235815 s : norm diff odd 0

Grid : Message : 149.960985 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960990 s : Testing without intranode communication
Grid : Message : 149.960991 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960995 s : Grid Layout
Grid : Message : 149.960995 s : Global lattice size : 32 32 64 64
Grid : Message : 149.961003 s : OpenMP threads : 4
Grid : Message : 149.961004 s : MPI tasks : 1 1 2 2
Grid : Message : 150.155810 s : Initialising 4d RNG
Grid : Message : 150.800200 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 150.800340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 150.973420 s : Initialising 5d RNG
Grid : Message : 151.131117 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 151.131136 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 193.933765 s : Drawing gauge field
Grid : Message : 204.611551 s : Random gauge initialised
Grid : Message : 204.611574 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 204.611576 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 204.615265 s : Setting up Cshift based reference
Grid : Message : 210.117788 s : *****************************************************************
Grid : Message : 210.117807 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 210.117809 s : *****************************************************************
Grid : Message : 210.117810 s : *****************************************************************
Grid : Message : 210.117812 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 210.117813 s : * Vectorising space-time by 8
Grid : Message : 210.117814 s : * VComplex size is 64 B
Grid : Message : 210.117817 s : * Using Overlapped Comms/Compute
Grid : Message : 210.117818 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 210.117819 s : *****************************************************************
Grid : Message : 210.714641 s : Called warmup
Grid : Message : 211.892227 s : Called Dw 300 times in 1177557 us
Grid : Message : 211.892252 s : mflop/s = 22568003.2
Grid : Message : 211.892255 s : mflop/s per rank = 5642000.8
Grid : Message : 211.892257 s : mflop/s per node = 22568003.2
Grid : Message : 211.896037 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 220.751375 s : ----------------------------------------------------------------
Grid : Message : 220.751406 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 220.751409 s : ----------------------------------------------------------------
Grid : Message : 220.751411 s : Called DwDag
Grid : Message : 220.751412 s : norm dag result 4.12810493
Grid : Message : 220.753307 s : norm dag ref 4.12810493
Grid : Message : 220.755796 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 220.813226 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 221.697800 s : src_e0.500003185
Grid : Message : 221.890920 s : src_o0.499996882
Grid : Message : 221.913430 s : *********************************************************
Grid : Message : 221.913450 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 221.913480 s : * Vectorising space-time by 8
Grid : Message : 221.913500 s : * Using Overlapped Comms/Compute
Grid : Message : 221.913530 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 221.913550 s : *********************************************************
Grid : Message : 221.645213 s : Deo mflop/s = 24114032
Grid : Message : 221.645228 s : Deo mflop/s per rank 6028508.01
Grid : Message : 221.645231 s : Deo mflop/s per node 24114032
Grid : Message : 221.656021 s : r_e2.06377678
Grid : Message : 221.656389 s : r_o2.06381058
Grid : Message : 221.656698 s : res4.12758736
Grid : Message : 222.110075 s : norm diff 0
Grid : Message : 222.857692 s : norm diff even 0
Grid : Message : 222.875763 s : norm diff odd 0
Grid : Message : 223.598127 s : *******************************************
Grid : Message : 223.598145 s : ******* Grid Finalize ******
Grid : Message : 223.598146 s : *******************************************
286
systems/Jupiter/benchmarks/dwf.4node.perf
Normal file
@ -0,0 +1,286 @@
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002a0000000 - 40031fffffff for comms buffers
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |

Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-012-11.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc

Grid : Message : 0.834000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.838000 s : Testing with full communication
Grid : Message : 0.839000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.840000 s : Grid Layout
Grid : Message : 0.840000 s : Global lattice size : 64 64 64 64
Grid : Message : 0.846000 s : OpenMP threads : 4
Grid : Message : 0.846000 s : MPI tasks : 2 2 2 2
Grid : Message : 0.165970 s : Initialising 4d RNG
Grid : Message : 0.787270 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.787340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.960410 s : Initialising 5d RNG
Grid : Message : 1.142344 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.142352 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
local rank 1 device 0 bus id: 0019:01:00.0
Grid : Message : 44.657270 s : Drawing gauge field
Grid : Message : 55.247733 s : Random gauge initialised
Grid : Message : 55.247745 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 55.247747 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 55.253053 s : Setting up Cshift based reference
Grid : Message : 62.191747 s : *****************************************************************
Grid : Message : 62.191767 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 62.191768 s : *****************************************************************
Grid : Message : 62.191769 s : *****************************************************************
Grid : Message : 62.191769 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 62.191769 s : * Vectorising space-time by 8
Grid : Message : 62.191770 s : * VComplex size is 64 B
Grid : Message : 62.191771 s : * Using Overlapped Comms/Compute
Grid : Message : 62.191771 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 62.191772 s : *****************************************************************
Grid : Message : 62.857568 s : Called warmup
Grid : Message : 65.581790 s : Called Dw 300 times in 2200540 us
Grid : Message : 65.582120 s : mflop/s = 48306525
Grid : Message : 65.582140 s : mflop/s per rank = 3019157.81
Grid : Message : 65.582150 s : mflop/s per node = 12076631.3
Grid : Message : 65.637550 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 75.122153 s : ----------------------------------------------------------------
Grid : Message : 75.122166 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 75.122167 s : ----------------------------------------------------------------
Grid : Message : 75.122167 s : Called DwDag
Grid : Message : 75.122167 s : norm dag result 4.12801829
Grid : Message : 75.123295 s : norm dag ref 4.12801829
Grid : Message : 75.125890 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 75.188462 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 75.605683 s : src_e0.500004005
Grid : Message : 75.617824 s : src_o0.499996067
Grid : Message : 75.620089 s : *********************************************************
Grid : Message : 75.620091 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 75.620093 s : * Vectorising space-time by 8
Grid : Message : 75.620094 s : * Using Overlapped Comms/Compute
Grid : Message : 75.620095 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 75.620096 s : *********************************************************
Grid : Message : 76.732272 s : Deo mflop/s = 48068252.4
Grid : Message : 76.732283 s : Deo mflop/s per rank 3004265.77
Grid : Message : 76.732285 s : Deo mflop/s per node 12017063.1
Grid : Message : 76.749317 s : r_e2.06443136
Grid : Message : 76.749652 s : r_o2.06378451
Grid : Message : 76.749955 s : res4.12821587
Grid : Message : 77.198827 s : norm diff 0
Grid : Message : 77.981760 s : norm diff even 0
Grid : Message : 78.455900 s : norm diff odd 0

Grid : Message : 78.539333 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539337 s : Testing without internode communication
Grid : Message : 78.539338 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539339 s : Grid Layout
Grid : Message : 78.539339 s : Global lattice size : 64 64 64 64
Grid : Message : 78.539347 s : OpenMP threads : 4
Grid : Message : 78.539348 s : MPI tasks : 2 2 2 2
Grid : Message : 78.798501 s : Initialising 4d RNG
Grid : Message : 78.862916 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 78.862925 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 78.879916 s : Initialising 5d RNG
Grid : Message : 79.941271 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 79.941280 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 124.586264 s : Drawing gauge field
Grid : Message : 135.338090 s : Random gauge initialised
Grid : Message : 135.338102 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 135.338103 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 135.341266 s : Setting up Cshift based reference
Grid : Message : 142.604280 s : *****************************************************************
Grid : Message : 142.604450 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 142.604460 s : *****************************************************************
Grid : Message : 142.604470 s : *****************************************************************
Grid : Message : 142.604480 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 142.604480 s : * Vectorising space-time by 8
Grid : Message : 142.604500 s : * VComplex size is 64 B
Grid : Message : 142.604510 s : * Using Overlapped Comms/Compute
Grid : Message : 142.604510 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 142.604520 s : *****************************************************************
Grid : Message : 142.686034 s : Called warmup
Grid : Message : 144.868543 s : Called Dw 300 times in 2182483 us
Grid : Message : 144.868559 s : mflop/s = 48706194.1
Grid : Message : 144.868561 s : mflop/s per rank = 3044137.13
Grid : Message : 144.868562 s : mflop/s per node = 12176548.5
Grid : Message : 144.887595 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 153.622978 s : ----------------------------------------------------------------
Grid : Message : 153.622994 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 153.622995 s : ----------------------------------------------------------------
Grid : Message : 153.622995 s : Called DwDag
Grid : Message : 153.622996 s : norm dag result 4.12801829
Grid : Message : 153.623604 s : norm dag ref 4.12801829
Grid : Message : 153.626098 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 153.691426 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 154.148319 s : src_e0.500004005
Grid : Message : 154.151454 s : src_o0.499996067
Grid : Message : 154.153722 s : *********************************************************
Grid : Message : 154.153724 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 154.153725 s : * Vectorising space-time by 8
Grid : Message : 154.153726 s : * Using Overlapped Comms/Compute
Grid : Message : 154.153727 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 154.153728 s : *********************************************************
Grid : Message : 155.200671 s : Deo mflop/s = 51121022.4
Grid : Message : 155.200682 s : Deo mflop/s per rank 3195063.9
Grid : Message : 155.200684 s : Deo mflop/s per node 12780255.6
Grid : Message : 155.217204 s : r_e2.06443136
Grid : Message : 155.217550 s : r_o2.06378451
Grid : Message : 155.217869 s : res4.12821587
Grid : Message : 155.673744 s : norm diff 0
Grid : Message : 156.463329 s : norm diff even 0
Grid : Message : 156.878866 s : norm diff odd 0

Grid : Message : 157.620761 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620764 s : Testing without intranode communication
Grid : Message : 157.620765 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620766 s : Grid Layout
Grid : Message : 157.620766 s : Global lattice size : 64 64 64 64
Grid : Message : 157.620773 s : OpenMP threads : 4
Grid : Message : 157.620774 s : MPI tasks : 2 2 2 2
Grid : Message : 157.671479 s : Initialising 4d RNG
Grid : Message : 157.738691 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 157.738698 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 157.755651 s : Initialising 5d RNG
Grid : Message : 158.848676 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 158.848685 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 202.465158 s : Drawing gauge field
Grid : Message : 213.214546 s : Random gauge initialised
Grid : Message : 213.214561 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 213.214563 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 213.217711 s : Setting up Cshift based reference
Grid : Message : 219.662772 s : *****************************************************************
Grid : Message : 219.662786 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 219.662787 s : *****************************************************************
Grid : Message : 219.662788 s : *****************************************************************
Grid : Message : 219.662788 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 219.662789 s : * Vectorising space-time by 8
Grid : Message : 219.662790 s : * VComplex size is 64 B
Grid : Message : 219.662791 s : * Using Overlapped Comms/Compute
Grid : Message : 219.662791 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 219.662791 s : *****************************************************************
Grid : Message : 220.425592 s : Called warmup
Grid : Message : 222.536249 s : Called Dw 300 times in 2110597 us
Grid : Message : 222.536267 s : mflop/s = 50365105.5
Grid : Message : 222.536269 s : mflop/s per rank = 3147819.09
Grid : Message : 222.536270 s : mflop/s per node = 12591276.4
Grid : Message : 222.541053 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 232.135901 s : ----------------------------------------------------------------
Grid : Message : 232.135915 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 232.135916 s : ----------------------------------------------------------------
Grid : Message : 232.135917 s : Called DwDag
Grid : Message : 232.135918 s : norm dag result 4.12801829
Grid : Message : 232.151938 s : norm dag ref 4.12801829
Grid : Message : 232.154451 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 232.216117 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 232.630529 s : src_e0.500004005
Grid : Message : 232.643197 s : src_o0.499996067
Grid : Message : 232.645527 s : *********************************************************
Grid : Message : 232.645529 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 232.645532 s : * Vectorising space-time by 8
Grid : Message : 232.645533 s : * Using Overlapped Comms/Compute
Grid : Message : 232.645534 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 232.645535 s : *********************************************************
Grid : Message : 233.774184 s : Deo mflop/s = 47432091.9
Grid : Message : 233.774194 s : Deo mflop/s per rank 2964505.74
Grid : Message : 233.774196 s : Deo mflop/s per node 11858023
Grid : Message : 233.791552 s : r_e2.06443136
Grid : Message : 233.791899 s : r_o2.06378451
Grid : Message : 233.792204 s : res4.12821587
Grid : Message : 234.230783 s : norm diff 0
Grid : Message : 235.162780 s : norm diff even 0
Grid : Message : 235.291950 s : norm diff odd 0
Grid : Message : 235.765411 s : *******************************************
Grid : Message : 235.765424 s : ******* Grid Finalize ******
Grid : Message : 235.765425 s : *******************************************