mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-17 15:27:06 +01:00
Compare commits
2 Commits
specflow
...
cbd4de2c26
Author | SHA1 | Date | |
---|---|---|---|
cbd4de2c26 | |||
bdb302be66 |
@ -191,7 +191,7 @@ public:
|
|||||||
|
|
||||||
Lattice<sobj> pgbuf(&pencil_g);
|
Lattice<sobj> pgbuf(&pencil_g);
|
||||||
autoView(pgbuf_v , pgbuf, CpuWrite);
|
autoView(pgbuf_v , pgbuf, CpuWrite);
|
||||||
//std::cout << "CPU view" << std::endl;
|
std::cout << "CPU view" << std::endl;
|
||||||
|
|
||||||
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
||||||
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
||||||
@ -215,7 +215,7 @@ public:
|
|||||||
else if ( sign == forward ) div = 1.0;
|
else if ( sign == forward ) div = 1.0;
|
||||||
else assert(0);
|
else assert(0);
|
||||||
|
|
||||||
//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
|
std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
|
||||||
FFTW_plan p;
|
FFTW_plan p;
|
||||||
{
|
{
|
||||||
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
|
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
|
||||||
@ -229,7 +229,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Barrel shift and collect global pencil
|
// Barrel shift and collect global pencil
|
||||||
//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
|
std::cout << GridLogPerformance<<"Making pencil" << std::endl;
|
||||||
Coordinate lcoor(Nd), gcoor(Nd);
|
Coordinate lcoor(Nd), gcoor(Nd);
|
||||||
result = source;
|
result = source;
|
||||||
int pc = processor_coor[dim];
|
int pc = processor_coor[dim];
|
||||||
@ -251,7 +251,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
|
std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
|
||||||
// Loop over orthog coords
|
// Loop over orthog coords
|
||||||
int NN=pencil_g.lSites();
|
int NN=pencil_g.lSites();
|
||||||
GridStopWatch timer;
|
GridStopWatch timer;
|
||||||
@ -274,7 +274,7 @@ public:
|
|||||||
usec += timer.useconds();
|
usec += timer.useconds();
|
||||||
flops+= flops_call*NN;
|
flops+= flops_call*NN;
|
||||||
|
|
||||||
//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
|
std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
|
||||||
// writing out result
|
// writing out result
|
||||||
{
|
{
|
||||||
autoView(pgbuf_v,pgbuf,CpuRead);
|
autoView(pgbuf_v,pgbuf,CpuRead);
|
||||||
@ -291,7 +291,7 @@ public:
|
|||||||
}
|
}
|
||||||
result = result*div;
|
result = result*div;
|
||||||
|
|
||||||
//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
|
std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
|
||||||
// destroying plan
|
// destroying plan
|
||||||
FFTW<scalar>::fftw_destroy_plan(p);
|
FFTW<scalar>::fftw_destroy_plan(p);
|
||||||
#endif
|
#endif
|
||||||
|
@ -277,38 +277,6 @@ public:
|
|||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
template<class Matrix,class Field>
|
|
||||||
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
|
|
||||||
Matrix &_Mat;
|
|
||||||
RealD shift;
|
|
||||||
public:
|
|
||||||
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
|
|
||||||
// Support for coarsening to a multigrid
|
|
||||||
void OpDiag (const Field &in, Field &out) {
|
|
||||||
_Mat.Mdiag(in,out);
|
|
||||||
out = out + shift*in;
|
|
||||||
}
|
|
||||||
void OpDir (const Field &in, Field &out,int dir,int disp) {
|
|
||||||
_Mat.Mdir(in,out,dir,disp);
|
|
||||||
}
|
|
||||||
void OpDirAll (const Field &in, std::vector<Field> &out){
|
|
||||||
_Mat.MdirAll(in,out);
|
|
||||||
};
|
|
||||||
void Op (const Field &in, Field &out){
|
|
||||||
_Mat.M(in,out);
|
|
||||||
out = out + shift * in;
|
|
||||||
}
|
|
||||||
void AdjOp (const Field &in, Field &out){
|
|
||||||
_Mat.Mdag(in,out);
|
|
||||||
out = out + shift * in;
|
|
||||||
}
|
|
||||||
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
void HermOp(const Field &in, Field &out){
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////
|
||||||
// Even Odd Schur decomp operators; there are several
|
// Even Odd Schur decomp operators; there are several
|
||||||
|
@ -208,8 +208,8 @@ public:
|
|||||||
assert(Bkn.size()==batchCount);
|
assert(Bkn.size()==batchCount);
|
||||||
assert(Cmn.size()==batchCount);
|
assert(Cmn.size()==batchCount);
|
||||||
|
|
||||||
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||||
//assert(OpB!=GridBLAS_OP_T);
|
assert(OpB!=GridBLAS_OP_T);
|
||||||
|
|
||||||
int lda = m; // m x k column major
|
int lda = m; // m x k column major
|
||||||
int ldb = k; // k x n column major
|
int ldb = k; // k x n column major
|
||||||
@ -367,67 +367,28 @@ public:
|
|||||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
if (std::abs(beta) != 0.0)
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk * eBkn ;
|
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
if (std::abs(beta) != 0.0)
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk.adjoint() * eBkn ;
|
|
||||||
});
|
|
||||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
|
||||||
thread_for (p, batchCount, {
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
|
||||||
if (std::abs(beta) != 0.0)
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
if (std::abs(beta) != 0.0)
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk * eBkn.adjoint() ;
|
|
||||||
});
|
|
||||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
|
||||||
thread_for (p, batchCount, {
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||||
if (std::abs(beta) != 0.0)
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
|
||||||
} );
|
|
||||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
|
||||||
thread_for (p, batchCount, {
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
|
||||||
if (std::abs(beta) != 0.0)
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
|
||||||
} );
|
} );
|
||||||
} else {
|
} else {
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -453,8 +414,8 @@ public:
|
|||||||
RealD t2=usecond();
|
RealD t2=usecond();
|
||||||
int32_t batchCount = Amk.size();
|
int32_t batchCount = Amk.size();
|
||||||
|
|
||||||
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||||
//assert(OpB!=GridBLAS_OP_T);
|
assert(OpB!=GridBLAS_OP_T);
|
||||||
|
|
||||||
int lda = m; // m x k column major
|
int lda = m; // m x k column major
|
||||||
int ldb = k; // k x n column major
|
int ldb = k; // k x n column major
|
||||||
@ -553,70 +514,28 @@ public:
|
|||||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
if (std::abs(beta) != 0.0)
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk * eBkn ;
|
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
if (std::abs(beta) != 0.0)
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk.adjoint() * eBkn ;
|
|
||||||
});
|
|
||||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
|
||||||
thread_for (p, batchCount, {
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
|
||||||
if (std::abs(beta) != 0.0)
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
if (std::abs(beta) != 0.0)
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk * eBkn.adjoint() ;
|
|
||||||
});
|
|
||||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
|
||||||
thread_for (p, batchCount, {
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
|
||||||
if (std::abs(beta) != 0.0)
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk * eBkn.transpose() ;
|
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||||
if (std::abs(beta) != 0.0)
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
|
||||||
} );
|
|
||||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
|
||||||
thread_for (p, batchCount, {
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
|
||||||
if (std::abs(beta) != 0.0)
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
|
||||||
} );
|
} );
|
||||||
} else {
|
} else {
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -742,41 +661,29 @@ public:
|
|||||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
if (std::abs(beta) != 0.0)
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk * eBkn ;
|
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
if (std::abs(beta) != 0.0)
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
if (std::abs(beta) != 0.0)
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk * eBkn.transpose() ;
|
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
if (std::abs(beta) != 0.0)
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
} );
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
|
||||||
});
|
|
||||||
} else {
|
} else {
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
@ -902,40 +809,28 @@ public:
|
|||||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
if (std::abs(beta) != 0.0)
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk * eBkn ;
|
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
if (std::abs(beta) != 0.0)
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
if (std::abs(beta) != 0.0)
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk * eBkn.transpose() ;
|
|
||||||
});
|
});
|
||||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
thread_for (p, batchCount, {
|
thread_for (p, batchCount, {
|
||||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
if (std::abs(beta) != 0.0)
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
|
||||||
else
|
|
||||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
assert(0);
|
assert(0);
|
||||||
|
@ -245,10 +245,9 @@ until convergence
|
|||||||
_HermOp(src_n,tmp);
|
_HermOp(src_n,tmp);
|
||||||
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
|
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
|
||||||
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
|
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
|
||||||
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
|
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
|
||||||
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
|
|
||||||
RealD vden = norm2(src_n);
|
RealD vden = norm2(src_n);
|
||||||
RealD na = std::sqrt(vnum/vden);
|
RealD na = vnum/vden;
|
||||||
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
|
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
|
||||||
i=_MAX_ITER_IRL_MEVAPP_;
|
i=_MAX_ITER_IRL_MEVAPP_;
|
||||||
evalMaxApprox = na;
|
evalMaxApprox = na;
|
||||||
@ -256,7 +255,6 @@ until convergence
|
|||||||
src_n = tmp;
|
src_n = tmp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
|
|
||||||
|
|
||||||
std::vector<RealD> lme(Nm);
|
std::vector<RealD> lme(Nm);
|
||||||
std::vector<RealD> lme2(Nm);
|
std::vector<RealD> lme2(Nm);
|
||||||
|
@ -97,7 +97,7 @@ public:
|
|||||||
|
|
||||||
RealD scale;
|
RealD scale;
|
||||||
|
|
||||||
ConjugateGradient<FineField> CG(1.0e-3,400,false);
|
ConjugateGradient<FineField> CG(1.0e-2,100,false);
|
||||||
FineField noise(FineGrid);
|
FineField noise(FineGrid);
|
||||||
FineField Mn(FineGrid);
|
FineField Mn(FineGrid);
|
||||||
|
|
||||||
@ -110,7 +110,7 @@ public:
|
|||||||
|
|
||||||
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||||
|
|
||||||
for(int i=0;i<4;i++){
|
for(int i=0;i<1;i++){
|
||||||
|
|
||||||
CG(hermop,noise,subspace[b]);
|
CG(hermop,noise,subspace[b]);
|
||||||
|
|
||||||
@ -146,7 +146,7 @@ public:
|
|||||||
|
|
||||||
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
|
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
|
||||||
|
|
||||||
for(int i=0;i<2;i++){
|
for(int i=0;i<3;i++){
|
||||||
// void operator() (const Field &src, Field &psi){
|
// void operator() (const Field &src, Field &psi){
|
||||||
#if 1
|
#if 1
|
||||||
std::cout << GridLogMessage << " inverting on noise "<<std::endl;
|
std::cout << GridLogMessage << " inverting on noise "<<std::endl;
|
||||||
|
@ -441,20 +441,8 @@ public:
|
|||||||
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
|
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
// Galerkin projection of matrix
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
|
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||||
Aggregation<Fobj,CComplex,nbasis> & Subspace)
|
Aggregation<Fobj,CComplex,nbasis> & Subspace)
|
||||||
{
|
|
||||||
CoarsenOperator(linop,Subspace,Subspace);
|
|
||||||
}
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
// Petrov - Galerkin projection of matrix
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
|
|
||||||
Aggregation<Fobj,CComplex,nbasis> & U,
|
|
||||||
Aggregation<Fobj,CComplex,nbasis> & V)
|
|
||||||
{
|
{
|
||||||
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
|
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
|
||||||
GridBase *grid = FineGrid();
|
GridBase *grid = FineGrid();
|
||||||
@ -470,9 +458,11 @@ public:
|
|||||||
// Orthogonalise the subblocks over the basis
|
// Orthogonalise the subblocks over the basis
|
||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
CoarseScalar InnerProd(CoarseGrid());
|
CoarseScalar InnerProd(CoarseGrid());
|
||||||
blockOrthogonalise(InnerProd,V.subspace);
|
blockOrthogonalise(InnerProd,Subspace.subspace);
|
||||||
blockOrthogonalise(InnerProd,U.subspace);
|
|
||||||
|
|
||||||
|
// for(int s=0;s<Subspace.subspace.size();s++){
|
||||||
|
// std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
|
||||||
|
// }
|
||||||
const int npoint = geom.npoint;
|
const int npoint = geom.npoint;
|
||||||
|
|
||||||
Coordinate clatt = CoarseGrid()->GlobalDimensions();
|
Coordinate clatt = CoarseGrid()->GlobalDimensions();
|
||||||
@ -552,7 +542,7 @@ public:
|
|||||||
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
|
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
|
||||||
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
|
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
|
||||||
tphaseBZ-=usecond();
|
tphaseBZ-=usecond();
|
||||||
phaV = phaF[p]*V.subspace[i];
|
phaV = phaF[p]*Subspace.subspace[i];
|
||||||
tphaseBZ+=usecond();
|
tphaseBZ+=usecond();
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////
|
||||||
@ -565,7 +555,7 @@ public:
|
|||||||
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
|
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
|
||||||
|
|
||||||
tproj-=usecond();
|
tproj-=usecond();
|
||||||
blockProject(coarseInner,MphaV,U.subspace);
|
blockProject(coarseInner,MphaV,Subspace.subspace);
|
||||||
coarseInner = conjugate(pha[p]) * coarseInner;
|
coarseInner = conjugate(pha[p]) * coarseInner;
|
||||||
|
|
||||||
ComputeProj[p] = coarseInner;
|
ComputeProj[p] = coarseInner;
|
||||||
|
@ -69,7 +69,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
|
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
|
||||||
void construct(pointer __p, const _Tp& __val) { };
|
void construct(pointer __p, const _Tp& __val) { assert(0);};
|
||||||
void construct(pointer __p) { };
|
void construct(pointer __p) { };
|
||||||
void destroy(pointer __p) { };
|
void destroy(pointer __p) { };
|
||||||
};
|
};
|
||||||
@ -175,11 +175,10 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
|
|||||||
// Template typedefs
|
// Template typedefs
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
|
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
|
||||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
|
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; //
|
||||||
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
|
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
|
||||||
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
|
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
|
||||||
|
|
||||||
/*
|
|
||||||
template<class T> class vecView
|
template<class T> class vecView
|
||||||
{
|
{
|
||||||
protected:
|
protected:
|
||||||
@ -215,7 +214,6 @@ template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
|
|||||||
#define autoVecView(v_v,v,mode) \
|
#define autoVecView(v_v,v,mode) \
|
||||||
auto v_v = VectorView(v,mode); \
|
auto v_v = VectorView(v,mode); \
|
||||||
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
|
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
|
||||||
*/
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -9,7 +9,6 @@ static char print_buffer [ MAXLINE ];
|
|||||||
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
|
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
|
||||||
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
|
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
|
||||||
//#define dprintf(...)
|
//#define dprintf(...)
|
||||||
//#define mprintf(...)
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
// For caching copies of data on device
|
// For caching copies of data on device
|
||||||
@ -110,7 +109,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
|||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
assert(AccCache.state!=Empty);
|
assert(AccCache.state!=Empty);
|
||||||
|
|
||||||
dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
||||||
assert(AccCache.accLock==0);
|
assert(AccCache.accLock==0);
|
||||||
assert(AccCache.cpuLock==0);
|
assert(AccCache.cpuLock==0);
|
||||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||||
@ -120,7 +119,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
|||||||
DeviceBytes -=AccCache.bytes;
|
DeviceBytes -=AccCache.bytes;
|
||||||
LRUremove(AccCache);
|
LRUremove(AccCache);
|
||||||
AccCache.AccPtr=(uint64_t) NULL;
|
AccCache.AccPtr=(uint64_t) NULL;
|
||||||
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
||||||
}
|
}
|
||||||
uint64_t CpuPtr = AccCache.CpuPtr;
|
uint64_t CpuPtr = AccCache.CpuPtr;
|
||||||
EntryErase(CpuPtr);
|
EntryErase(CpuPtr);
|
||||||
@ -140,7 +139,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
|||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
assert(AccCache.state!=Empty);
|
assert(AccCache.state!=Empty);
|
||||||
|
|
||||||
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
|
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
|
||||||
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
|
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
|
||||||
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
|
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
|
||||||
if (AccCache.accLock!=0) return;
|
if (AccCache.accLock!=0) return;
|
||||||
@ -154,7 +153,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
|||||||
AccCache.AccPtr=(uint64_t)NULL;
|
AccCache.AccPtr=(uint64_t)NULL;
|
||||||
AccCache.state=CpuDirty; // CPU primary now
|
AccCache.state=CpuDirty; // CPU primary now
|
||||||
DeviceBytes -=AccCache.bytes;
|
DeviceBytes -=AccCache.bytes;
|
||||||
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
||||||
}
|
}
|
||||||
// uint64_t CpuPtr = AccCache.CpuPtr;
|
// uint64_t CpuPtr = AccCache.CpuPtr;
|
||||||
DeviceEvictions++;
|
DeviceEvictions++;
|
||||||
@ -168,7 +167,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
|
|||||||
assert(AccCache.AccPtr!=(uint64_t)NULL);
|
assert(AccCache.AccPtr!=(uint64_t)NULL);
|
||||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||||
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
|
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
|
||||||
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||||
DeviceToHostBytes+=AccCache.bytes;
|
DeviceToHostBytes+=AccCache.bytes;
|
||||||
DeviceToHostXfer++;
|
DeviceToHostXfer++;
|
||||||
AccCache.state=Consistent;
|
AccCache.state=Consistent;
|
||||||
@ -183,7 +182,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
|
|||||||
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
|
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
|
||||||
DeviceBytes+=AccCache.bytes;
|
DeviceBytes+=AccCache.bytes;
|
||||||
}
|
}
|
||||||
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
|
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx\n",
|
||||||
(uint64_t)AccCache.bytes,
|
(uint64_t)AccCache.bytes,
|
||||||
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||||
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
|
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
|
||||||
@ -211,7 +210,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
|
|||||||
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
|
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
|
||||||
{
|
{
|
||||||
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
||||||
dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
|
dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
|
||||||
AcceleratorViewClose((uint64_t)Ptr);
|
AcceleratorViewClose((uint64_t)Ptr);
|
||||||
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
||||||
CpuViewClose((uint64_t)Ptr);
|
CpuViewClose((uint64_t)Ptr);
|
||||||
@ -223,7 +222,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
|
|||||||
{
|
{
|
||||||
uint64_t CpuPtr = (uint64_t)_CpuPtr;
|
uint64_t CpuPtr = (uint64_t)_CpuPtr;
|
||||||
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
||||||
dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
|
dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
|
||||||
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
|
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
|
||||||
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
||||||
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
|
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
|
||||||
@ -234,9 +233,6 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
|
|||||||
}
|
}
|
||||||
void MemoryManager::EvictVictims(uint64_t bytes)
|
void MemoryManager::EvictVictims(uint64_t bytes)
|
||||||
{
|
{
|
||||||
if(bytes>=DeviceMaxBytes) {
|
|
||||||
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
|
|
||||||
}
|
|
||||||
assert(bytes<DeviceMaxBytes);
|
assert(bytes<DeviceMaxBytes);
|
||||||
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
|
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
|
||||||
if ( DeviceLRUBytes > 0){
|
if ( DeviceLRUBytes > 0){
|
||||||
@ -269,7 +265,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
|||||||
assert(AccCache.cpuLock==0); // Programming error
|
assert(AccCache.cpuLock==0); // Programming error
|
||||||
|
|
||||||
if(AccCache.state!=Empty) {
|
if(AccCache.state!=Empty) {
|
||||||
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
|
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld\n",
|
||||||
(uint64_t)AccCache.CpuPtr,
|
(uint64_t)AccCache.CpuPtr,
|
||||||
(uint64_t)CpuPtr,
|
(uint64_t)CpuPtr,
|
||||||
(uint64_t)AccCache.bytes,
|
(uint64_t)AccCache.bytes,
|
||||||
@ -309,7 +305,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
|||||||
AccCache.state = Consistent; // Empty + AccRead => Consistent
|
AccCache.state = Consistent; // Empty + AccRead => Consistent
|
||||||
}
|
}
|
||||||
AccCache.accLock= 1;
|
AccCache.accLock= 1;
|
||||||
dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
|
dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
|
||||||
} else if(AccCache.state==CpuDirty ){
|
} else if(AccCache.state==CpuDirty ){
|
||||||
if(mode==AcceleratorWriteDiscard) {
|
if(mode==AcceleratorWriteDiscard) {
|
||||||
CpuDiscard(AccCache);
|
CpuDiscard(AccCache);
|
||||||
@ -322,21 +318,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
|||||||
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
|
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
|
||||||
}
|
}
|
||||||
AccCache.accLock++;
|
AccCache.accLock++;
|
||||||
dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
|
dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
|
||||||
} else if(AccCache.state==Consistent) {
|
} else if(AccCache.state==Consistent) {
|
||||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||||
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
|
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
|
||||||
else
|
else
|
||||||
AccCache.state = Consistent; // Consistent + AccRead => Consistent
|
AccCache.state = Consistent; // Consistent + AccRead => Consistent
|
||||||
AccCache.accLock++;
|
AccCache.accLock++;
|
||||||
dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
|
dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
|
||||||
} else if(AccCache.state==AccDirty) {
|
} else if(AccCache.state==AccDirty) {
|
||||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||||
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
|
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
|
||||||
else
|
else
|
||||||
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
|
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
|
||||||
AccCache.accLock++;
|
AccCache.accLock++;
|
||||||
dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
|
dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
|
||||||
} else {
|
} else {
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
@ -345,7 +341,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
|||||||
// If view is opened on device must remove from LRU
|
// If view is opened on device must remove from LRU
|
||||||
if(AccCache.LRU_valid==1){
|
if(AccCache.LRU_valid==1){
|
||||||
// must possibly remove from LRU as now locked on GPU
|
// must possibly remove from LRU as now locked on GPU
|
||||||
dprintf("AccCache entry removed from LRU ");
|
dprintf("AccCache entry removed from LRU \n");
|
||||||
LRUremove(AccCache);
|
LRUremove(AccCache);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -368,10 +364,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
|
|||||||
AccCache.accLock--;
|
AccCache.accLock--;
|
||||||
// Move to LRU queue if not locked and close on device
|
// Move to LRU queue if not locked and close on device
|
||||||
if(AccCache.accLock==0) {
|
if(AccCache.accLock==0) {
|
||||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||||
LRUinsert(AccCache);
|
LRUinsert(AccCache);
|
||||||
} else {
|
} else {
|
||||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
|
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
|
||||||
|
@ -33,8 +33,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
///////////////////////////////////
|
///////////////////////////////////
|
||||||
#include <Grid/communicator/SharedMemory.h>
|
#include <Grid/communicator/SharedMemory.h>
|
||||||
|
|
||||||
#define NVLINK_GET
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
extern bool Stencil_force_mpi ;
|
extern bool Stencil_force_mpi ;
|
||||||
@ -149,8 +147,7 @@ public:
|
|||||||
sizeof(obj),d*100+p);
|
sizeof(obj),d*100+p);
|
||||||
|
|
||||||
}
|
}
|
||||||
if (!list.empty()) // avoid triggering assert in comms == none
|
CommsComplete(list);
|
||||||
CommsComplete(list);
|
|
||||||
for(int p=1;p<_processors[d];p++){
|
for(int p=1;p<_processors[d];p++){
|
||||||
accum = accum + column[p];
|
accum = accum + column[p];
|
||||||
}
|
}
|
||||||
@ -195,11 +192,6 @@ public:
|
|||||||
void *recv,
|
void *recv,
|
||||||
int recv_from_rank,int do_recv,
|
int recv_from_rank,int do_recv,
|
||||||
int xbytes,int rbytes,int dir);
|
int xbytes,int rbytes,int dir);
|
||||||
|
|
||||||
// Could do a PollHtoD and have a CommsMerge dependence
|
|
||||||
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
|
|
||||||
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
|
|
||||||
|
|
||||||
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
void *xmit,
|
void *xmit,
|
||||||
int xmit_to_rank,int do_xmit,
|
int xmit_to_rank,int do_xmit,
|
||||||
|
@ -30,7 +30,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
|
||||||
Grid_MPI_Comm CartesianCommunicator::communicator_world;
|
Grid_MPI_Comm CartesianCommunicator::communicator_world;
|
||||||
|
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
@ -363,6 +362,8 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|||||||
int bytes)
|
int bytes)
|
||||||
{
|
{
|
||||||
std::vector<MpiCommsRequest_t> reqs(0);
|
std::vector<MpiCommsRequest_t> reqs(0);
|
||||||
|
unsigned long xcrc = crc32(0L, Z_NULL, 0);
|
||||||
|
unsigned long rcrc = crc32(0L, Z_NULL, 0);
|
||||||
|
|
||||||
int myrank = _processor;
|
int myrank = _processor;
|
||||||
int ierr;
|
int ierr;
|
||||||
@ -378,6 +379,9 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|||||||
communicator,MPI_STATUS_IGNORE);
|
communicator,MPI_STATUS_IGNORE);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
|
|
||||||
|
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
|
||||||
|
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
||||||
|
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
|
||||||
}
|
}
|
||||||
// Basic Halo comms primitive
|
// Basic Halo comms primitive
|
||||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||||
@ -395,8 +399,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
|||||||
|
|
||||||
|
|
||||||
#ifdef ACCELERATOR_AWARE_MPI
|
#ifdef ACCELERATOR_AWARE_MPI
|
||||||
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
|
|
||||||
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
|
|
||||||
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||||
void *xmit,
|
void *xmit,
|
||||||
int dest,int dox,
|
int dest,int dox,
|
||||||
@ -438,15 +440,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
list.push_back(rrq);
|
list.push_back(rrq);
|
||||||
off_node_bytes+=rbytes;
|
off_node_bytes+=rbytes;
|
||||||
}
|
}
|
||||||
#ifdef NVLINK_GET
|
|
||||||
else {
|
|
||||||
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
|
|
||||||
assert(shm!=NULL);
|
|
||||||
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
// This is a NVLINK PUT
|
|
||||||
if (dox) {
|
if (dox) {
|
||||||
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||||
tag= dir+_processor*32;
|
tag= dir+_processor*32;
|
||||||
@ -455,11 +450,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
list.push_back(xrq);
|
list.push_back(xrq);
|
||||||
off_node_bytes+=xbytes;
|
off_node_bytes+=xbytes;
|
||||||
} else {
|
} else {
|
||||||
#ifndef NVLINK_GET
|
|
||||||
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
|
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
|
||||||
assert(shm!=NULL);
|
assert(shm!=NULL);
|
||||||
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return off_node_bytes;
|
return off_node_bytes;
|
||||||
@ -468,7 +461,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||||
{
|
{
|
||||||
int nreq=list.size();
|
int nreq=list.size();
|
||||||
/*finishes Get/Put*/
|
|
||||||
acceleratorCopySynchronise();
|
acceleratorCopySynchronise();
|
||||||
|
|
||||||
if (nreq==0) return;
|
if (nreq==0) return;
|
||||||
@ -568,105 +561,53 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
|
|||||||
|
|
||||||
if (dox) {
|
if (dox) {
|
||||||
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||||
|
#undef DEVICE_TO_HOST_CONCURRENT // pipeline
|
||||||
|
#ifdef DEVICE_TO_HOST_CONCURRENT
|
||||||
tag= dir+_processor*32;
|
tag= dir+_processor*32;
|
||||||
|
|
||||||
host_xmit = this->HostBufferMalloc(xbytes);
|
host_xmit = this->HostBufferMalloc(xbytes);
|
||||||
CommsRequest_t srq;
|
acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
|
||||||
|
|
||||||
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
|
|
||||||
|
|
||||||
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||||
// assert(ierr==0);
|
// assert(ierr==0);
|
||||||
// off_node_bytes+=xbytes;
|
// off_node_bytes+=xbytes;
|
||||||
|
|
||||||
|
CommsRequest_t srq;
|
||||||
srq.PacketType = InterNodeXmit;
|
srq.PacketType = InterNodeXmit;
|
||||||
srq.bytes = xbytes;
|
srq.bytes = xbytes;
|
||||||
// srq.req = xrq;
|
// srq.req = xrq;
|
||||||
srq.host_buf = host_xmit;
|
srq.host_buf = host_xmit;
|
||||||
srq.device_buf = xmit;
|
srq.device_buf = xmit;
|
||||||
srq.tag = tag;
|
|
||||||
srq.dest = dest;
|
|
||||||
srq.commdir = commdir;
|
|
||||||
list.push_back(srq);
|
list.push_back(srq);
|
||||||
|
#else
|
||||||
|
tag= dir+_processor*32;
|
||||||
|
|
||||||
|
host_xmit = this->HostBufferMalloc(xbytes);
|
||||||
|
const int chunks=1;
|
||||||
|
for(int n=0;n<chunks;n++){
|
||||||
|
void * host_xmitc = (void *)( (uint64_t) host_xmit + n*xbytes/chunks);
|
||||||
|
void * xmitc = (void *)( (uint64_t) xmit + n*xbytes/chunks);
|
||||||
|
acceleratorCopyFromDeviceAsynch(xmitc, host_xmitc,xbytes/chunks); // Make this Asynch
|
||||||
|
}
|
||||||
|
acceleratorCopySynchronise(); // Complete all pending copy transfers
|
||||||
|
|
||||||
|
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||||
|
assert(ierr==0);
|
||||||
|
off_node_bytes+=xbytes;
|
||||||
|
|
||||||
|
CommsRequest_t srq;
|
||||||
|
srq.PacketType = InterNodeXmit;
|
||||||
|
srq.bytes = xbytes;
|
||||||
|
srq.req = xrq;
|
||||||
|
srq.host_buf = host_xmit;
|
||||||
|
srq.device_buf = xmit;
|
||||||
|
list.push_back(srq);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return off_node_bytes;
|
return off_node_bytes;
|
||||||
}
|
}
|
||||||
/*
|
|
||||||
* In the interest of better pipelining, poll for completion on each DtoH and
|
|
||||||
* start MPI_ISend in the meantime
|
|
||||||
*/
|
|
||||||
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
|
|
||||||
{
|
|
||||||
int pending = 0;
|
|
||||||
do {
|
|
||||||
|
|
||||||
pending = 0;
|
|
||||||
|
|
||||||
for(int idx = 0; idx<list.size();idx++){
|
|
||||||
|
|
||||||
if ( list[idx].PacketType==InterNodeRecv ) {
|
|
||||||
|
|
||||||
int flag = 0;
|
|
||||||
MPI_Status status;
|
|
||||||
int ierr = MPI_Test(&list[idx].req,&flag,&status);
|
|
||||||
assert(ierr==0);
|
|
||||||
|
|
||||||
if ( flag ) {
|
|
||||||
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
|
|
||||||
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
|
|
||||||
list[idx].PacketType=InterNodeReceiveHtoD;
|
|
||||||
} else {
|
|
||||||
pending ++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
|
|
||||||
} while ( pending );
|
|
||||||
|
|
||||||
}
|
|
||||||
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
|
|
||||||
{
|
|
||||||
int pending = 0;
|
|
||||||
do {
|
|
||||||
|
|
||||||
pending = 0;
|
|
||||||
|
|
||||||
for(int idx = 0; idx<list.size();idx++){
|
|
||||||
|
|
||||||
if ( list[idx].PacketType==InterNodeXmit ) {
|
|
||||||
|
|
||||||
if ( acceleratorEventIsComplete(list[idx].ev) ) {
|
|
||||||
|
|
||||||
void *host_xmit = list[idx].host_buf;
|
|
||||||
uint32_t xbytes = list[idx].bytes;
|
|
||||||
int dest = list[idx].dest;
|
|
||||||
int tag = list[idx].tag;
|
|
||||||
int commdir = list[idx].commdir;
|
|
||||||
///////////////////
|
|
||||||
// Send packet
|
|
||||||
///////////////////
|
|
||||||
|
|
||||||
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
|
|
||||||
|
|
||||||
MPI_Request xrq;
|
|
||||||
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
|
||||||
assert(ierr==0);
|
|
||||||
|
|
||||||
list[idx].req = xrq; // Update the MPI request in the list
|
|
||||||
|
|
||||||
list[idx].PacketType=InterNodeXmitISend;
|
|
||||||
|
|
||||||
} else {
|
|
||||||
// not done, so return to polling loop
|
|
||||||
pending++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} while (pending);
|
|
||||||
}
|
|
||||||
|
|
||||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
void *xmit,
|
void *xmit,
|
||||||
@ -703,89 +644,69 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
* - complete all copies
|
* - complete all copies
|
||||||
* - post MPI send asynch
|
* - post MPI send asynch
|
||||||
*/
|
*/
|
||||||
#ifdef NVLINK_GET
|
|
||||||
if ( dor ) {
|
|
||||||
|
|
||||||
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
|
// static int printed;
|
||||||
// Intranode
|
// if((printed<8) && this->IsBoss() ) {
|
||||||
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
|
// printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
|
||||||
assert(shm!=NULL);
|
// printed++;
|
||||||
|
// }
|
||||||
CommsRequest_t srq;
|
|
||||||
|
|
||||||
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
|
|
||||||
|
|
||||||
srq.PacketType = IntraNodeRecv;
|
|
||||||
srq.bytes = xbytes;
|
|
||||||
// srq.req = xrq;
|
|
||||||
srq.host_buf = NULL;
|
|
||||||
srq.device_buf = xmit;
|
|
||||||
srq.tag = -1;
|
|
||||||
srq.dest = dest;
|
|
||||||
srq.commdir = dir;
|
|
||||||
list.push_back(srq);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
if (dox) {
|
if (dox) {
|
||||||
|
|
||||||
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
|
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||||
// Intranode
|
#ifdef DEVICE_TO_HOST_CONCURRENT
|
||||||
|
tag= dir+_processor*32;
|
||||||
|
// Find the send in the prepared list
|
||||||
|
int list_idx=-1;
|
||||||
|
for(int idx = 0; idx<list.size();idx++){
|
||||||
|
|
||||||
|
if ( (list[idx].device_buf==xmit)
|
||||||
|
&&(list[idx].PacketType==InterNodeXmit)
|
||||||
|
&&(list[idx].bytes==xbytes) ) {
|
||||||
|
|
||||||
|
list_idx = idx;
|
||||||
|
host_xmit = list[idx].host_buf;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert(list_idx != -1); // found it
|
||||||
|
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||||
|
assert(ierr==0);
|
||||||
|
list[list_idx].req = xrq; // Update the MPI request in the list
|
||||||
|
off_node_bytes+=xbytes;
|
||||||
|
#endif
|
||||||
|
} else {
|
||||||
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
|
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
|
||||||
assert(shm!=NULL);
|
assert(shm!=NULL);
|
||||||
|
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
||||||
CommsRequest_t srq;
|
|
||||||
|
|
||||||
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
|
||||||
|
|
||||||
srq.PacketType = IntraNodeXmit;
|
|
||||||
srq.bytes = xbytes;
|
|
||||||
// srq.req = xrq;
|
|
||||||
srq.host_buf = NULL;
|
|
||||||
srq.device_buf = xmit;
|
|
||||||
srq.tag = -1;
|
|
||||||
srq.dest = dest;
|
|
||||||
srq.commdir = dir;
|
|
||||||
list.push_back(srq);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
return off_node_bytes;
|
return off_node_bytes;
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||||
{
|
{
|
||||||
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
|
int nreq=list.size();
|
||||||
|
|
||||||
std::vector<MPI_Status> status;
|
if (nreq==0) return;
|
||||||
std::vector<MPI_Request> MpiRequests;
|
std::vector<MPI_Status> status(nreq);
|
||||||
|
std::vector<MPI_Request> MpiRequests(nreq);
|
||||||
for(int r=0;r<list.size();r++){
|
|
||||||
// Must check each Send buf is clear to reuse
|
|
||||||
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
|
|
||||||
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
|
|
||||||
}
|
|
||||||
|
|
||||||
int nreq=MpiRequests.size();
|
for(int r=0;r<nreq;r++){
|
||||||
|
MpiRequests[r] = list[r].req;
|
||||||
if (nreq>0) {
|
|
||||||
status.resize(MpiRequests.size());
|
|
||||||
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
|
|
||||||
assert(ierr==0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// for(int r=0;r<nreq;r++){
|
int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]);
|
||||||
// if ( list[r].PacketType==InterNodeRecv ) {
|
assert(ierr==0);
|
||||||
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
|
|
||||||
// }
|
for(int r=0;r<nreq;r++){
|
||||||
// }
|
if ( list[r].PacketType==InterNodeRecv ) {
|
||||||
|
acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
acceleratorCopySynchronise(); // Complete all pending copy transfers
|
||||||
list.resize(0); // Delete the list
|
list.resize(0); // Delete the list
|
||||||
this->HostBufferFreeAll(); // Clean up the buffer allocs
|
this->HostBufferFreeAll(); // Clean up the buffer allocs
|
||||||
#ifndef NVLINK_GET
|
this->StencilBarrier();
|
||||||
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
|
@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
|
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
|
||||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
void *xmit,
|
void *xmit,
|
||||||
int dest,
|
int dest,
|
||||||
@ -132,8 +132,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
|||||||
{
|
{
|
||||||
return 2.0*bytes;
|
return 2.0*bytes;
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
|
|
||||||
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
|
|
||||||
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||||
void *xmit,
|
void *xmit,
|
||||||
int xmit_to_rank,int dox,
|
int xmit_to_rank,int dox,
|
||||||
@ -141,7 +139,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
|
|||||||
int recv_from_rank,int dor,
|
int recv_from_rank,int dor,
|
||||||
int xbytes,int rbytes, int dir)
|
int xbytes,int rbytes, int dir)
|
||||||
{
|
{
|
||||||
return 0.0;
|
return xbytes+rbytes;
|
||||||
}
|
}
|
||||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
void *xmit,
|
void *xmit,
|
||||||
|
@ -50,30 +50,12 @@ typedef MPI_Request MpiCommsRequest_t;
|
|||||||
#ifdef ACCELERATOR_AWARE_MPI
|
#ifdef ACCELERATOR_AWARE_MPI
|
||||||
typedef MPI_Request CommsRequest_t;
|
typedef MPI_Request CommsRequest_t;
|
||||||
#else
|
#else
|
||||||
/*
|
enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv };
|
||||||
* Enable state transitions as each packet flows.
|
|
||||||
*/
|
|
||||||
enum PacketType_t {
|
|
||||||
FaceGather,
|
|
||||||
InterNodeXmit,
|
|
||||||
InterNodeRecv,
|
|
||||||
IntraNodeXmit,
|
|
||||||
IntraNodeRecv,
|
|
||||||
InterNodeXmitISend,
|
|
||||||
InterNodeReceiveHtoD
|
|
||||||
};
|
|
||||||
/*
|
|
||||||
*Package arguments needed for various actions along packet flow
|
|
||||||
*/
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
PacketType_t PacketType;
|
PacketType_t PacketType;
|
||||||
void *host_buf;
|
void *host_buf;
|
||||||
void *device_buf;
|
void *device_buf;
|
||||||
int dest;
|
|
||||||
int tag;
|
|
||||||
int commdir;
|
|
||||||
unsigned long bytes;
|
unsigned long bytes;
|
||||||
acceleratorEvent_t ev;
|
|
||||||
MpiCommsRequest_t req;
|
MpiCommsRequest_t req;
|
||||||
} CommsRequest_t;
|
} CommsRequest_t;
|
||||||
#endif
|
#endif
|
||||||
@ -137,7 +119,7 @@ public:
|
|||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
static void SharedMemoryAllocate(uint64_t bytes, int flags);
|
static void SharedMemoryAllocate(uint64_t bytes, int flags);
|
||||||
static void SharedMemoryFree(void);
|
static void SharedMemoryFree(void);
|
||||||
// static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
|
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
|
||||||
static void SharedMemoryZero(void *dest,size_t bytes);
|
static void SharedMemoryZero(void *dest,size_t bytes);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -542,12 +542,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
// Each MPI rank should allocate our own buffer
|
// Each MPI rank should allocate our own buffer
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#ifndef ACCELERATOR_AWARE_MPI
|
#ifndef ACCELERATOR_AWARE_MPI
|
||||||
// printf("Host buffer allocate for GPU non-aware MPI\n");
|
printf("Host buffer allocate for GPU non-aware MPI\n");
|
||||||
#if 0
|
#if 0
|
||||||
HostCommBuf= acceleratorAllocHost(bytes);
|
HostCommBuf= acceleratorAllocHost(bytes);
|
||||||
#else
|
#else
|
||||||
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
|
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
|
||||||
#if 0
|
#ifdef HAVE_NUMAIF_H
|
||||||
#warning "Moving host buffers to specific NUMA domain"
|
#warning "Moving host buffers to specific NUMA domain"
|
||||||
int numa;
|
int numa;
|
||||||
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
|
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
|
||||||
@ -916,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
|||||||
bzero(dest,bytes);
|
bzero(dest,bytes);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||||
//{
|
{
|
||||||
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||||
// acceleratorCopyToDevice(src,dest,bytes);
|
acceleratorCopyToDevice(src,dest,bytes);
|
||||||
//#else
|
#else
|
||||||
// bcopy(src,dest,bytes);
|
bcopy(src,dest,bytes);
|
||||||
//#endif
|
#endif
|
||||||
//}
|
}
|
||||||
////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////
|
||||||
// Global shared functionality finished
|
// Global shared functionality finished
|
||||||
// Now move to per communicator functionality
|
// Now move to per communicator functionality
|
||||||
@ -959,7 +959,6 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
|||||||
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
|
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
|
||||||
|
|
||||||
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
|
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
|
||||||
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
|
|
||||||
}
|
}
|
||||||
ShmBufferFreeAll();
|
ShmBufferFreeAll();
|
||||||
|
|
||||||
@ -990,7 +989,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
SharedMemoryTest();
|
//SharedMemoryTest();
|
||||||
}
|
}
|
||||||
//////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////
|
||||||
// On node barrier
|
// On node barrier
|
||||||
@ -1012,18 +1011,19 @@ void SharedMemory::SharedMemoryTest(void)
|
|||||||
check[0]=GlobalSharedMemory::WorldNode;
|
check[0]=GlobalSharedMemory::WorldNode;
|
||||||
check[1]=r;
|
check[1]=r;
|
||||||
check[2]=magic;
|
check[2]=magic;
|
||||||
acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
|
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ShmBarrier();
|
ShmBarrier();
|
||||||
for(uint64_t r=0;r<ShmSize;r++){
|
for(uint64_t r=0;r<ShmSize;r++){
|
||||||
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
|
ShmBarrier();
|
||||||
|
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
|
||||||
|
ShmBarrier();
|
||||||
assert(check[0]==GlobalSharedMemory::WorldNode);
|
assert(check[0]==GlobalSharedMemory::WorldNode);
|
||||||
assert(check[1]==r);
|
assert(check[1]==r);
|
||||||
assert(check[2]==magic);
|
assert(check[2]==magic);
|
||||||
|
ShmBarrier();
|
||||||
}
|
}
|
||||||
ShmBarrier();
|
|
||||||
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void *SharedMemory::ShmBuffer(int rank)
|
void *SharedMemory::ShmBuffer(int rank)
|
||||||
|
@ -68,7 +68,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
|
|||||||
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
|
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
#if 1
|
||||||
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
|
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
|
||||||
{
|
{
|
||||||
int sshift[2];
|
int sshift[2];
|
||||||
@ -125,11 +125,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||||
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
|
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
|
||||||
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
|
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
|
||||||
#ifndef ACCELERATOR_AWARE_MPI
|
|
||||||
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
|
|
||||||
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||||
RealD tcopy=0.0;
|
RealD tcopy=0.0;
|
||||||
@ -160,29 +156,16 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
// int rank = grid->_processor;
|
// int rank = grid->_processor;
|
||||||
int recv_from_rank;
|
int recv_from_rank;
|
||||||
int xmit_to_rank;
|
int xmit_to_rank;
|
||||||
|
|
||||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
tcomms-=usecond();
|
tcomms-=usecond();
|
||||||
grid->Barrier();
|
grid->Barrier();
|
||||||
|
|
||||||
#ifdef ACCELERATOR_AWARE_MPI
|
|
||||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)&recv_buf[0],
|
(void *)&recv_buf[0],
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
#else
|
|
||||||
// bouncy bouncy
|
|
||||||
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
|
|
||||||
grid->SendToRecvFrom((void *)&hsend_buf[0],
|
|
||||||
xmit_to_rank,
|
|
||||||
(void *)&hrecv_buf[0],
|
|
||||||
recv_from_rank,
|
|
||||||
bytes);
|
|
||||||
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
xbytes+=bytes;
|
xbytes+=bytes;
|
||||||
grid->Barrier();
|
grid->Barrier();
|
||||||
tcomms+=usecond();
|
tcomms+=usecond();
|
||||||
@ -243,16 +226,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||||
scalar_object * recv_buf_extract_mpi;
|
scalar_object * recv_buf_extract_mpi;
|
||||||
scalar_object * send_buf_extract_mpi;
|
scalar_object * send_buf_extract_mpi;
|
||||||
|
|
||||||
for(int s=0;s<Nsimd;s++){
|
for(int s=0;s<Nsimd;s++){
|
||||||
send_buf_extract[s].resize(buffer_size);
|
send_buf_extract[s].resize(buffer_size);
|
||||||
recv_buf_extract[s].resize(buffer_size);
|
recv_buf_extract[s].resize(buffer_size);
|
||||||
}
|
}
|
||||||
#ifndef ACCELERATOR_AWARE_MPI
|
|
||||||
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
|
|
||||||
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int bytes = buffer_size*sizeof(scalar_object);
|
int bytes = buffer_size*sizeof(scalar_object);
|
||||||
|
|
||||||
ExtractPointerArray<scalar_object> pointers(Nsimd); //
|
ExtractPointerArray<scalar_object> pointers(Nsimd); //
|
||||||
@ -304,22 +283,11 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
|
|
||||||
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
|
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
|
||||||
recv_buf_extract_mpi = &recv_buf_extract[i][0];
|
recv_buf_extract_mpi = &recv_buf_extract[i][0];
|
||||||
#ifdef ACCELERATOR_AWARE_MPI
|
|
||||||
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)recv_buf_extract_mpi,
|
(void *)recv_buf_extract_mpi,
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
#else
|
|
||||||
// bouncy bouncy
|
|
||||||
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
|
|
||||||
grid->SendToRecvFrom((void *)&hsend_buf[0],
|
|
||||||
xmit_to_rank,
|
|
||||||
(void *)&hrecv_buf[0],
|
|
||||||
recv_from_rank,
|
|
||||||
bytes);
|
|
||||||
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
xbytes+=bytes;
|
xbytes+=bytes;
|
||||||
grid->Barrier();
|
grid->Barrier();
|
||||||
@ -343,6 +311,234 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
|
{
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
|
||||||
|
GridBase *grid=rhs.Grid();
|
||||||
|
Lattice<vobj> temp(rhs.Grid());
|
||||||
|
|
||||||
|
int fd = rhs.Grid()->_fdimensions[dimension];
|
||||||
|
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||||
|
int pd = rhs.Grid()->_processors[dimension];
|
||||||
|
int simd_layout = rhs.Grid()->_simd_layout[dimension];
|
||||||
|
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
|
||||||
|
assert(simd_layout==1);
|
||||||
|
assert(comm_dim==1);
|
||||||
|
assert(shift>=0);
|
||||||
|
assert(shift<fd);
|
||||||
|
RealD tcopy=0.0;
|
||||||
|
RealD tgather=0.0;
|
||||||
|
RealD tscatter=0.0;
|
||||||
|
RealD tcomms=0.0;
|
||||||
|
uint64_t xbytes=0;
|
||||||
|
|
||||||
|
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||||
|
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
|
||||||
|
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
|
||||||
|
vobj *send_buf;
|
||||||
|
vobj *recv_buf;
|
||||||
|
{
|
||||||
|
grid->ShmBufferFreeAll();
|
||||||
|
size_t bytes = buffer_size*sizeof(vobj);
|
||||||
|
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
||||||
|
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
|
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||||
|
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
int comm_proc = ((x+sshift)/rd)%pd;
|
||||||
|
|
||||||
|
if (comm_proc==0) {
|
||||||
|
|
||||||
|
tcopy-=usecond();
|
||||||
|
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||||
|
tcopy+=usecond();
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
int words = buffer_size;
|
||||||
|
if (cbmask != 0x3) words=words>>1;
|
||||||
|
|
||||||
|
int bytes = words * sizeof(vobj);
|
||||||
|
|
||||||
|
tgather-=usecond();
|
||||||
|
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
|
||||||
|
tgather+=usecond();
|
||||||
|
|
||||||
|
// int rank = grid->_processor;
|
||||||
|
int recv_from_rank;
|
||||||
|
int xmit_to_rank;
|
||||||
|
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
|
|
||||||
|
tcomms-=usecond();
|
||||||
|
// grid->Barrier();
|
||||||
|
|
||||||
|
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
|
||||||
|
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||||
|
xmit_to_rank,
|
||||||
|
(void *)&recv_buf[0],
|
||||||
|
recv_from_rank,
|
||||||
|
bytes);
|
||||||
|
xbytes+=bytes;
|
||||||
|
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
|
||||||
|
|
||||||
|
// grid->Barrier();
|
||||||
|
tcomms+=usecond();
|
||||||
|
|
||||||
|
tscatter-=usecond();
|
||||||
|
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
|
||||||
|
tscatter+=usecond();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(Cshift_verbose){
|
||||||
|
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
|
{
|
||||||
|
GridBase *grid=rhs.Grid();
|
||||||
|
const int Nsimd = grid->Nsimd();
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
typedef typename vobj::scalar_object scalar_object;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
|
||||||
|
int fd = grid->_fdimensions[dimension];
|
||||||
|
int rd = grid->_rdimensions[dimension];
|
||||||
|
int ld = grid->_ldimensions[dimension];
|
||||||
|
int pd = grid->_processors[dimension];
|
||||||
|
int simd_layout = grid->_simd_layout[dimension];
|
||||||
|
int comm_dim = grid->_processors[dimension] >1 ;
|
||||||
|
|
||||||
|
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
|
||||||
|
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
|
||||||
|
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
|
||||||
|
|
||||||
|
assert(comm_dim==1);
|
||||||
|
assert(simd_layout==2);
|
||||||
|
assert(shift>=0);
|
||||||
|
assert(shift<fd);
|
||||||
|
RealD tcopy=0.0;
|
||||||
|
RealD tgather=0.0;
|
||||||
|
RealD tscatter=0.0;
|
||||||
|
RealD tcomms=0.0;
|
||||||
|
uint64_t xbytes=0;
|
||||||
|
|
||||||
|
int permute_type=grid->PermuteType(dimension);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
// Simd direction uses an extract/merge pair
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||||
|
// int words = sizeof(vobj)/sizeof(vector_type);
|
||||||
|
|
||||||
|
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
||||||
|
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||||
|
scalar_object * recv_buf_extract_mpi;
|
||||||
|
scalar_object * send_buf_extract_mpi;
|
||||||
|
{
|
||||||
|
size_t bytes = sizeof(scalar_object)*buffer_size;
|
||||||
|
grid->ShmBufferFreeAll();
|
||||||
|
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
||||||
|
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
||||||
|
}
|
||||||
|
for(int s=0;s<Nsimd;s++){
|
||||||
|
send_buf_extract[s].resize(buffer_size);
|
||||||
|
recv_buf_extract[s].resize(buffer_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
int bytes = buffer_size*sizeof(scalar_object);
|
||||||
|
|
||||||
|
ExtractPointerArray<scalar_object> pointers(Nsimd); //
|
||||||
|
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
|
||||||
|
|
||||||
|
///////////////////////////////////////////
|
||||||
|
// Work out what to send where
|
||||||
|
///////////////////////////////////////////
|
||||||
|
int cb = (cbmask==0x2)? Odd : Even;
|
||||||
|
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||||
|
|
||||||
|
// loop over outer coord planes orthog to dim
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
|
// FIXME call local permute copy if none are offnode.
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
pointers[i] = &send_buf_extract[i][0];
|
||||||
|
}
|
||||||
|
tgather-=usecond();
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
||||||
|
tgather+=usecond();
|
||||||
|
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
|
||||||
|
int inner_bit = (Nsimd>>(permute_type+1));
|
||||||
|
int ic= (i&inner_bit)? 1:0;
|
||||||
|
|
||||||
|
int my_coor = rd*ic + x;
|
||||||
|
int nbr_coor = my_coor+sshift;
|
||||||
|
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
|
||||||
|
|
||||||
|
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
|
||||||
|
int nbr_ox = (nbr_coor%rd); // outer coord of peer
|
||||||
|
int nbr_lane = (i&(~inner_bit));
|
||||||
|
|
||||||
|
int recv_from_rank;
|
||||||
|
int xmit_to_rank;
|
||||||
|
|
||||||
|
if (nbr_ic) nbr_lane|=inner_bit;
|
||||||
|
|
||||||
|
assert (sx == nbr_ox);
|
||||||
|
|
||||||
|
if(nbr_proc){
|
||||||
|
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
|
tcomms-=usecond();
|
||||||
|
// grid->Barrier();
|
||||||
|
|
||||||
|
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
|
||||||
|
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||||
|
xmit_to_rank,
|
||||||
|
(void *)recv_buf_extract_mpi,
|
||||||
|
recv_from_rank,
|
||||||
|
bytes);
|
||||||
|
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
|
||||||
|
xbytes+=bytes;
|
||||||
|
|
||||||
|
// grid->Barrier();
|
||||||
|
tcomms+=usecond();
|
||||||
|
rpointers[i] = &recv_buf_extract[i][0];
|
||||||
|
} else {
|
||||||
|
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
tscatter-=usecond();
|
||||||
|
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||||
|
tscatter+=usecond();
|
||||||
|
|
||||||
|
}
|
||||||
|
if(Cshift_verbose){
|
||||||
|
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -55,7 +55,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
|
|||||||
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
|
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
|
||||||
|
|
||||||
//copy offsets to device
|
//copy offsets to device
|
||||||
acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
|
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
|
||||||
|
|
||||||
|
|
||||||
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
|
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
|
||||||
@ -88,7 +88,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
|
|||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
|
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
|
||||||
|
|
||||||
//sync after copy
|
//sync after copy
|
||||||
accelerator_barrier();
|
accelerator_barrier();
|
||||||
|
@ -466,12 +466,6 @@ public:
|
|||||||
static deviceVector<vobj> recv_buf;
|
static deviceVector<vobj> recv_buf;
|
||||||
send_buf.resize(buffer_size*2*depth);
|
send_buf.resize(buffer_size*2*depth);
|
||||||
recv_buf.resize(buffer_size*2*depth);
|
recv_buf.resize(buffer_size*2*depth);
|
||||||
#ifndef ACCELERATOR_AWARE_MPI
|
|
||||||
static hostVector<vobj> hsend_buf;
|
|
||||||
static hostVector<vobj> hrecv_buf;
|
|
||||||
hsend_buf.resize(buffer_size*2*depth);
|
|
||||||
hrecv_buf.resize(buffer_size*2*depth);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
std::vector<MpiCommsRequest_t> fwd_req;
|
std::vector<MpiCommsRequest_t> fwd_req;
|
||||||
std::vector<MpiCommsRequest_t> bwd_req;
|
std::vector<MpiCommsRequest_t> bwd_req;
|
||||||
@ -501,17 +495,9 @@ public:
|
|||||||
t_gather+=usecond()-t;
|
t_gather+=usecond()-t;
|
||||||
|
|
||||||
t=usecond();
|
t=usecond();
|
||||||
#ifdef ACCELERATOR_AWARE_MPI
|
|
||||||
grid->SendToRecvFromBegin(fwd_req,
|
grid->SendToRecvFromBegin(fwd_req,
|
||||||
(void *)&send_buf[d*buffer_size], xmit_to_rank,
|
(void *)&send_buf[d*buffer_size], xmit_to_rank,
|
||||||
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
|
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
|
||||||
#else
|
|
||||||
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
|
|
||||||
grid->SendToRecvFromBegin(fwd_req,
|
|
||||||
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
|
|
||||||
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
|
|
||||||
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
|
|
||||||
#endif
|
|
||||||
t_comms+=usecond()-t;
|
t_comms+=usecond()-t;
|
||||||
}
|
}
|
||||||
for ( int d=0;d < depth ; d ++ ) {
|
for ( int d=0;d < depth ; d ++ ) {
|
||||||
@ -522,17 +508,9 @@ public:
|
|||||||
t_gather+= usecond() - t;
|
t_gather+= usecond() - t;
|
||||||
|
|
||||||
t=usecond();
|
t=usecond();
|
||||||
#ifdef ACCELERATOR_AWARE_MPI
|
|
||||||
grid->SendToRecvFromBegin(bwd_req,
|
grid->SendToRecvFromBegin(bwd_req,
|
||||||
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
|
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
|
||||||
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
|
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
|
||||||
#else
|
|
||||||
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
|
|
||||||
grid->SendToRecvFromBegin(bwd_req,
|
|
||||||
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
|
|
||||||
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
|
|
||||||
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
|
|
||||||
#endif
|
|
||||||
t_comms+=usecond()-t;
|
t_comms+=usecond()-t;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -484,11 +484,6 @@ public:
|
|||||||
this->face_table_computed=1;
|
this->face_table_computed=1;
|
||||||
assert(this->u_comm_offset==this->_unified_buffer_size);
|
assert(this->u_comm_offset==this->_unified_buffer_size);
|
||||||
accelerator_barrier();
|
accelerator_barrier();
|
||||||
#ifdef NVLINK_GET
|
|
||||||
this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
|
|
||||||
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
|
|
||||||
// Or issue barrier AFTER the DMA is running
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
|||||||
} else { \
|
} else { \
|
||||||
chi = coalescedRead(buf[SE->_offset],lane); \
|
chi = coalescedRead(buf[SE->_offset],lane); \
|
||||||
} \
|
} \
|
||||||
acceleratorSynchronise(); \
|
acceleratorSynchronise(); \
|
||||||
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
|
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
|
||||||
Recon(result, Uchi);
|
Recon(result, Uchi);
|
||||||
|
|
||||||
@ -517,7 +517,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
||||||
#endif
|
#endif
|
||||||
} else if( exterior ) {
|
} else if( exterior ) {
|
||||||
// // dependent on result of merge
|
// dependent on result of merge
|
||||||
acceleratorFenceComputeStream();
|
acceleratorFenceComputeStream();
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}
|
||||||
|
@ -363,16 +363,12 @@ public:
|
|||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
// std::cout << "Communicate Begin "<<std::endl;
|
|
||||||
// _grid->Barrier();
|
|
||||||
FlightRecorder::StepLog("Communicate begin");
|
FlightRecorder::StepLog("Communicate begin");
|
||||||
// All GPU kernel tasks must complete
|
// All GPU kernel tasks must complete
|
||||||
// accelerator_barrier(); // All kernels should ALREADY be complete
|
// accelerator_barrier(); // All kernels should ALREADY be complete
|
||||||
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
||||||
// But the HaloGather had a barrier too.
|
// But the HaloGather had a barrier too.
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
// std::cout << "Communicate prepare "<<i<<std::endl;
|
|
||||||
// _grid->Barrier();
|
|
||||||
_grid->StencilSendToRecvFromPrepare(MpiReqs,
|
_grid->StencilSendToRecvFromPrepare(MpiReqs,
|
||||||
Packets[i].send_buf,
|
Packets[i].send_buf,
|
||||||
Packets[i].to_rank,Packets[i].do_send,
|
Packets[i].to_rank,Packets[i].do_send,
|
||||||
@ -380,15 +376,8 @@ public:
|
|||||||
Packets[i].from_rank,Packets[i].do_recv,
|
Packets[i].from_rank,Packets[i].do_recv,
|
||||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||||
}
|
}
|
||||||
// std::cout << "Communicate PollDtoH "<<std::endl;
|
|
||||||
// _grid->Barrier();
|
|
||||||
_grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/
|
|
||||||
// std::cout << "Communicate CopySynch "<<std::endl;
|
|
||||||
// _grid->Barrier();
|
|
||||||
acceleratorCopySynchronise();
|
acceleratorCopySynchronise();
|
||||||
// Starts intranode
|
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
// std::cout << "Communicate Begin "<<i<<std::endl;
|
|
||||||
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
||||||
Packets[i].send_buf,
|
Packets[i].send_buf,
|
||||||
Packets[i].to_rank,Packets[i].do_send,
|
Packets[i].to_rank,Packets[i].do_send,
|
||||||
@ -406,14 +395,7 @@ public:
|
|||||||
|
|
||||||
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
// std::cout << "Communicate Complete "<<std::endl;
|
|
||||||
// _grid->Barrier();
|
|
||||||
FlightRecorder::StepLog("Start communicate complete");
|
FlightRecorder::StepLog("Start communicate complete");
|
||||||
// std::cout << "Communicate Complete PollIRecv "<<std::endl;
|
|
||||||
// _grid->Barrier();
|
|
||||||
_grid->StencilSendToRecvFromPollIRecv(MpiReqs);
|
|
||||||
// std::cout << "Communicate Complete Complete "<<std::endl;
|
|
||||||
// _grid->Barrier();
|
|
||||||
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
|
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
|
||||||
if ( this->partialDirichlet ) DslashLogPartial();
|
if ( this->partialDirichlet ) DslashLogPartial();
|
||||||
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||||
@ -446,7 +428,6 @@ public:
|
|||||||
Communicate();
|
Communicate();
|
||||||
CommsMergeSHM(compress);
|
CommsMergeSHM(compress);
|
||||||
CommsMerge(compress);
|
CommsMerge(compress);
|
||||||
accelerator_barrier();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
|
template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
|
||||||
@ -502,9 +483,6 @@ public:
|
|||||||
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
||||||
{
|
{
|
||||||
// accelerator_barrier();
|
// accelerator_barrier();
|
||||||
//////////////////////////////////
|
|
||||||
// I will overwrite my send buffers
|
|
||||||
//////////////////////////////////
|
|
||||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||||
|
|
||||||
assert(source.Grid()==_grid);
|
assert(source.Grid()==_grid);
|
||||||
@ -518,11 +496,7 @@ public:
|
|||||||
HaloGatherDir(source,compress,point,face_idx);
|
HaloGatherDir(source,compress,point,face_idx);
|
||||||
}
|
}
|
||||||
accelerator_barrier(); // All my local gathers are complete
|
accelerator_barrier(); // All my local gathers are complete
|
||||||
#ifdef NVLINK_GET
|
// _grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||||
_grid->StencilBarrier(); // He can now get mu local gather, I can get his
|
|
||||||
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
|
|
||||||
// Or issue barrier AFTER the DMA is running
|
|
||||||
#endif
|
|
||||||
face_table_computed=1;
|
face_table_computed=1;
|
||||||
assert(u_comm_offset==_unified_buffer_size);
|
assert(u_comm_offset==_unified_buffer_size);
|
||||||
}
|
}
|
||||||
@ -561,7 +535,6 @@ public:
|
|||||||
coalescedWrite(to[j] ,coalescedRead(from [j]));
|
coalescedWrite(to[j] ,coalescedRead(from [j]));
|
||||||
});
|
});
|
||||||
acceleratorFenceComputeStream();
|
acceleratorFenceComputeStream();
|
||||||
// Also fenced in WilsonKernels
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -690,7 +663,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
|
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
|
||||||
surface_list.resize(surface_list_size);
|
surface_list.resize(surface_list_size);
|
||||||
std::vector<int> surface_list_host(surface_list_size);
|
std::vector<int> surface_list_host(surface_list_size);
|
||||||
int32_t ss=0;
|
int32_t ss=0;
|
||||||
@ -710,7 +683,6 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
|
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
|
||||||
// std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
|
|
||||||
}
|
}
|
||||||
/// Introduce a block structure and switch off comms on boundaries
|
/// Introduce a block structure and switch off comms on boundaries
|
||||||
void DirichletBlock(const Coordinate &dirichlet_block)
|
void DirichletBlock(const Coordinate &dirichlet_block)
|
||||||
@ -802,8 +774,8 @@ public:
|
|||||||
this->_entries_host_p = &_entries[0];
|
this->_entries_host_p = &_entries[0];
|
||||||
this->_entries_p = &_entries_device[0];
|
this->_entries_p = &_entries_device[0];
|
||||||
|
|
||||||
// std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
|
std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
|
||||||
// <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
|
<<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
|
||||||
|
|
||||||
for(int ii=0;ii<npoints;ii++){
|
for(int ii=0;ii<npoints;ii++){
|
||||||
|
|
||||||
|
@ -215,7 +215,7 @@ inline void *acceleratorAllocHost(size_t bytes)
|
|||||||
auto err = cudaMallocHost((void **)&ptr,bytes);
|
auto err = cudaMallocHost((void **)&ptr,bytes);
|
||||||
if( err != cudaSuccess ) {
|
if( err != cudaSuccess ) {
|
||||||
ptr = (void *) NULL;
|
ptr = (void *) NULL;
|
||||||
printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
|
printf(" cudaMallocHost failed for %ld %s \n",bytes,cudaGetErrorString(err));
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
@ -226,7 +226,7 @@ inline void *acceleratorAllocShared(size_t bytes)
|
|||||||
auto err = cudaMallocManaged((void **)&ptr,bytes);
|
auto err = cudaMallocManaged((void **)&ptr,bytes);
|
||||||
if( err != cudaSuccess ) {
|
if( err != cudaSuccess ) {
|
||||||
ptr = (void *) NULL;
|
ptr = (void *) NULL;
|
||||||
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
|
printf(" cudaMallocManaged failed for %ld %s \n",bytes,cudaGetErrorString(err));
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
@ -237,38 +237,25 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
|||||||
auto err = cudaMalloc((void **)&ptr,bytes);
|
auto err = cudaMalloc((void **)&ptr,bytes);
|
||||||
if( err != cudaSuccess ) {
|
if( err != cudaSuccess ) {
|
||||||
ptr = (void *) NULL;
|
ptr = (void *) NULL;
|
||||||
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
|
printf(" cudaMalloc failed for %ld %s \n",bytes,cudaGetErrorString(err));
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef int acceleratorEvent_t;
|
|
||||||
|
|
||||||
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
||||||
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
||||||
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
|
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
|
||||||
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
inline void acceleratorCopyToDevice(void *from, void *to, size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||||
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
inline void acceleratorCopyToDevice(const void *from, void *to, size_t bytes) {cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||||
|
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||||
|
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
|
||||||
|
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
|
||||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
||||||
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
|
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||||
acceleratorCopyToDevice(to,from,bytes, cudaMemcpyHostToDevice);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
|
|
||||||
acceleratorCopyFromDevice(from,to,bytes);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
|
||||||
{
|
{
|
||||||
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
|
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
|
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
|
||||||
inline void acceleratorEventWait(acceleratorEvent_t ev)
|
|
||||||
{
|
|
||||||
//auto discard=cudaStreamSynchronize(ev);
|
|
||||||
}
|
|
||||||
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
|
|
||||||
|
|
||||||
|
|
||||||
inline int acceleratorIsCommunicable(void *ptr)
|
inline int acceleratorIsCommunicable(void *ptr)
|
||||||
@ -357,28 +344,11 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
|
|||||||
|
|
||||||
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
|
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
|
||||||
|
|
||||||
|
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
|
||||||
///////
|
inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); }
|
||||||
// Asynch event interface
|
inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); }
|
||||||
///////
|
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||||
typedef sycl::event acceleratorEvent_t;
|
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||||
|
|
||||||
inline void acceleratorEventWait(acceleratorEvent_t ev)
|
|
||||||
{
|
|
||||||
ev.wait();
|
|
||||||
}
|
|
||||||
|
|
||||||
inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
|
|
||||||
{
|
|
||||||
return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
|
|
||||||
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
|
|
||||||
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
|
|
||||||
|
|
||||||
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
|
||||||
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
|
||||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
|
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
|
||||||
|
|
||||||
inline int acceleratorIsCommunicable(void *ptr)
|
inline int acceleratorIsCommunicable(void *ptr)
|
||||||
@ -389,10 +359,8 @@ inline int acceleratorIsCommunicable(void *ptr)
|
|||||||
else return 0;
|
else return 0;
|
||||||
#endif
|
#endif
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
//////////////////////////////////////////////
|
//////////////////////////////////////////////
|
||||||
@ -492,7 +460,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
|
|||||||
inline void *acceleratorAllocHost(size_t bytes)
|
inline void *acceleratorAllocHost(size_t bytes)
|
||||||
{
|
{
|
||||||
void *ptr=NULL;
|
void *ptr=NULL;
|
||||||
auto err = hipHostMalloc((void **)&ptr,bytes);
|
auto err = hipMallocHost((void **)&ptr,bytes);
|
||||||
if( err != hipSuccess ) {
|
if( err != hipSuccess ) {
|
||||||
ptr = (void *) NULL;
|
ptr = (void *) NULL;
|
||||||
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
|
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
|
||||||
@ -525,35 +493,23 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
|||||||
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
|
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
|
||||||
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
|
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
|
||||||
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
|
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
|
||||||
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
||||||
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
||||||
|
|
||||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
|
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
|
||||||
|
|
||||||
typedef int acceleratorEvent_t;
|
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||||
|
|
||||||
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
|
||||||
{
|
{
|
||||||
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
|
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||||
acceleratorCopyToDevice(from,to,bytes);
|
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||||
acceleratorCopyFromDevice(from,to,bytes);
|
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
|
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
|
||||||
|
|
||||||
inline void acceleratorEventWait(acceleratorEvent_t ev)
|
|
||||||
{
|
|
||||||
// auto discard=hipStreamSynchronize(ev);
|
|
||||||
}
|
|
||||||
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
inline void acceleratorPin(void *ptr,unsigned long bytes)
|
inline void acceleratorPin(void *ptr,unsigned long bytes)
|
||||||
@ -590,8 +546,6 @@ inline void acceleratorPin(void *ptr,unsigned long bytes)
|
|||||||
|
|
||||||
#undef GRID_SIMT
|
#undef GRID_SIMT
|
||||||
|
|
||||||
typedef int acceleratorEvent_t;
|
|
||||||
|
|
||||||
inline void acceleratorMem(void)
|
inline void acceleratorMem(void)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
@ -611,12 +565,9 @@ inline void acceleratorMem(void)
|
|||||||
|
|
||||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
|
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
|
||||||
|
|
||||||
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyToDevice(from,to,bytes); return 0; }
|
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
|
||||||
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyFromDevice(from,to,bytes); return 0; }
|
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
|
||||||
inline void acceleratorEventWait(acceleratorEvent_t ev){}
|
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
|
||||||
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
|
|
||||||
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); return 0;}
|
|
||||||
|
|
||||||
inline void acceleratorCopySynchronise(void) {};
|
inline void acceleratorCopySynchronise(void) {};
|
||||||
|
|
||||||
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
||||||
@ -699,15 +650,19 @@ accelerator_inline void acceleratorFence(void)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
|
inline void acceleratorCopyDeviceToDevice(void *from,void *to, size_t bytes)
|
||||||
{
|
{
|
||||||
acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
|
acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
|
||||||
acceleratorCopySynchronise();
|
acceleratorCopySynchronise();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class T> void acceleratorPut(T& dev,const T&host)
|
template<class T> void acceleratorPut(T& dev,T&host)
|
||||||
{
|
{
|
||||||
acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
|
acceleratorCopyToDevice(&host, &dev, sizeof(T));
|
||||||
|
}
|
||||||
|
template<class T> void acceleratorPut(T& dev, const T& host)
|
||||||
|
{
|
||||||
|
acceleratorCopyToDevice(&host, &dev, sizeof(T));
|
||||||
}
|
}
|
||||||
template<class T> T acceleratorGet(T& dev)
|
template<class T> T acceleratorGet(T& dev)
|
||||||
{
|
{
|
||||||
|
@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#define thread_critical DO_PRAGMA(omp critical)
|
#define thread_critical DO_PRAGMA(omp critical)
|
||||||
|
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
inline void thread_bcopy(const void *from, void *to,size_t bytes)
|
inline void thread_bcopy(void *from, void *to,size_t bytes)
|
||||||
{
|
{
|
||||||
const uint64_t *ufrom = (const uint64_t *)from;
|
uint64_t *ufrom = (uint64_t *)from;
|
||||||
uint64_t *uto = (uint64_t *)to;
|
uint64_t *uto = (uint64_t *)to;
|
||||||
assert(bytes%8==0);
|
assert(bytes%8==0);
|
||||||
uint64_t words=bytes/8;
|
uint64_t words=bytes/8;
|
||||||
@ -84,7 +84,7 @@ inline void thread_bcopy(const void *from, void *to,size_t bytes)
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
inline void thread_bcopy(const void *from, void *to,size_t bytes)
|
inline void thread_bcopy(void *from, void *to,size_t bytes)
|
||||||
{
|
{
|
||||||
bcopy(from,to,bytes);
|
bcopy(from,to,bytes);
|
||||||
}
|
}
|
||||||
|
@ -509,14 +509,7 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
Grid_default_latt,
|
Grid_default_latt,
|
||||||
Grid_default_mpi);
|
Grid_default_mpi);
|
||||||
|
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
|
|
||||||
std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
|
|
||||||
FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
|
|
||||||
FlightRecorder::PrintEntireLog = 1;
|
|
||||||
FlightRecorder::ChecksumComms = 1;
|
|
||||||
FlightRecorder::ChecksumCommsSend=1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
|
||||||
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
|
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
|
||||||
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
|
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
|
||||||
@ -658,4 +651,3 @@ void Grid_debug_handler_init(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -50,7 +50,7 @@ namespace Grid{
|
|||||||
int64_t index64;
|
int64_t index64;
|
||||||
IndexFromCoorReversed(coor,index64,dims);
|
IndexFromCoorReversed(coor,index64,dims);
|
||||||
if ( index64>=2*1024*1024*1024LL ){
|
if ( index64>=2*1024*1024*1024LL ){
|
||||||
// std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
|
std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
|
||||||
}
|
}
|
||||||
assert(index64<2*1024*1024*1024LL);
|
assert(index64<2*1024*1024*1024LL);
|
||||||
index = (int) index64;
|
index = (int) index64;
|
||||||
|
@ -52,7 +52,7 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
int threads = GridThread::GetThreads();
|
int threads = GridThread::GetThreads();
|
||||||
|
|
||||||
int Ls=16;
|
int Ls=8;
|
||||||
for(int i=0;i<argc;i++) {
|
for(int i=0;i<argc;i++) {
|
||||||
if(std::string(argv[i]) == "-Ls"){
|
if(std::string(argv[i]) == "-Ls"){
|
||||||
std::stringstream ss(argv[i+1]); ss >> Ls;
|
std::stringstream ss(argv[i+1]); ss >> Ls;
|
||||||
|
@ -492,18 +492,17 @@ public:
|
|||||||
}
|
}
|
||||||
FGrid->Barrier();
|
FGrid->Barrier();
|
||||||
double t1=usecond();
|
double t1=usecond();
|
||||||
uint64_t no = 50;
|
uint64_t ncall = 500;
|
||||||
uint64_t ni = 100;
|
|
||||||
|
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
||||||
|
|
||||||
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||||
|
|
||||||
time_statistics timestat;
|
time_statistics timestat;
|
||||||
std::vector<double> t_time(no);
|
std::vector<double> t_time(ncall);
|
||||||
for(uint64_t i=0;i<no;i++){
|
for(uint64_t i=0;i<ncall;i++){
|
||||||
t0=usecond();
|
t0=usecond();
|
||||||
for(uint64_t j=0;j<ni;j++){
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
|
||||||
}
|
|
||||||
t1=usecond();
|
t1=usecond();
|
||||||
t_time[i] = t1-t0;
|
t_time[i] = t1-t0;
|
||||||
}
|
}
|
||||||
@ -521,11 +520,11 @@ public:
|
|||||||
double mf_hi, mf_lo, mf_err;
|
double mf_hi, mf_lo, mf_err;
|
||||||
|
|
||||||
timestat.statistics(t_time);
|
timestat.statistics(t_time);
|
||||||
mf_hi = flops/timestat.min*ni;
|
mf_hi = flops/timestat.min;
|
||||||
mf_lo = flops/timestat.max*ni;
|
mf_lo = flops/timestat.max;
|
||||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||||
|
|
||||||
mflops = flops/timestat.mean*ni;
|
mflops = flops/timestat.mean;
|
||||||
mflops_all.push_back(mflops);
|
mflops_all.push_back(mflops);
|
||||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||||
@ -536,7 +535,6 @@ public:
|
|||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -656,19 +654,17 @@ public:
|
|||||||
}
|
}
|
||||||
FGrid->Barrier();
|
FGrid->Barrier();
|
||||||
double t1=usecond();
|
double t1=usecond();
|
||||||
|
uint64_t ncall = 500;
|
||||||
|
|
||||||
uint64_t no = 50;
|
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
||||||
uint64_t ni = 100;
|
|
||||||
|
|
||||||
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||||
|
|
||||||
time_statistics timestat;
|
time_statistics timestat;
|
||||||
std::vector<double> t_time(no);
|
std::vector<double> t_time(ncall);
|
||||||
for(uint64_t i=0;i<no;i++){
|
for(uint64_t i=0;i<ncall;i++){
|
||||||
t0=usecond();
|
t0=usecond();
|
||||||
for(uint64_t j=0;j<ni;j++){
|
Ds.DhopEO(src_o,r_e,DaggerNo);
|
||||||
Ds.DhopEO(src_o,r_e,DaggerNo);
|
|
||||||
}
|
|
||||||
t1=usecond();
|
t1=usecond();
|
||||||
t_time[i] = t1-t0;
|
t_time[i] = t1-t0;
|
||||||
}
|
}
|
||||||
@ -679,11 +675,11 @@ public:
|
|||||||
double mf_hi, mf_lo, mf_err;
|
double mf_hi, mf_lo, mf_err;
|
||||||
|
|
||||||
timestat.statistics(t_time);
|
timestat.statistics(t_time);
|
||||||
mf_hi = flops/timestat.min*ni;
|
mf_hi = flops/timestat.min;
|
||||||
mf_lo = flops/timestat.max*ni;
|
mf_lo = flops/timestat.max;
|
||||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||||
|
|
||||||
mflops = flops/timestat.mean*ni;
|
mflops = flops/timestat.mean;
|
||||||
mflops_all.push_back(mflops);
|
mflops_all.push_back(mflops);
|
||||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||||
@ -693,7 +689,6 @@ public:
|
|||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -797,18 +792,19 @@ public:
|
|||||||
Dc.M(src,r);
|
Dc.M(src,r);
|
||||||
}
|
}
|
||||||
FGrid->Barrier();
|
FGrid->Barrier();
|
||||||
uint64_t ni = 100;
|
double t1=usecond();
|
||||||
uint64_t no = 50;
|
uint64_t ncall = 500;
|
||||||
|
|
||||||
|
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
||||||
|
|
||||||
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||||
|
|
||||||
time_statistics timestat;
|
time_statistics timestat;
|
||||||
std::vector<double> t_time(no);
|
std::vector<double> t_time(ncall);
|
||||||
for(uint64_t i=0;i<no;i++){
|
for(uint64_t i=0;i<ncall;i++){
|
||||||
double t0=usecond();
|
t0=usecond();
|
||||||
for(uint64_t j=0;j<ni;j++){
|
Dc.M(src,r);
|
||||||
Dc.M(src,r);
|
t1=usecond();
|
||||||
}
|
|
||||||
double t1=usecond();
|
|
||||||
t_time[i] = t1-t0;
|
t_time[i] = t1-t0;
|
||||||
}
|
}
|
||||||
FGrid->Barrier();
|
FGrid->Barrier();
|
||||||
@ -818,21 +814,20 @@ public:
|
|||||||
double mf_hi, mf_lo, mf_err;
|
double mf_hi, mf_lo, mf_err;
|
||||||
|
|
||||||
timestat.statistics(t_time);
|
timestat.statistics(t_time);
|
||||||
mf_hi = flops/timestat.min*ni;
|
mf_hi = flops/timestat.min;
|
||||||
mf_lo = flops/timestat.max*ni;
|
mf_lo = flops/timestat.max;
|
||||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||||
|
|
||||||
mflops = flops/timestat.mean*ni;
|
mflops = flops/timestat.mean;
|
||||||
mflops_all.push_back(mflops);
|
mflops_all.push_back(mflops);
|
||||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||||
if ( mflops>mflops_best ) mflops_best = mflops;
|
if ( mflops>mflops_best ) mflops_best = mflops;
|
||||||
if ( mflops<mflops_worst) mflops_worst= mflops;
|
if ( mflops<mflops_worst) mflops_worst= mflops;
|
||||||
|
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
|
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
|
||||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call "<< timestat.mean/ni<<std::endl;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -877,7 +872,7 @@ int main (int argc, char ** argv)
|
|||||||
int do_dslash=1;
|
int do_dslash=1;
|
||||||
|
|
||||||
int sel=4;
|
int sel=4;
|
||||||
std::vector<int> L_list({8,12,16,24,32});
|
std::vector<int> L_list({8,12,16,24});
|
||||||
int selm1=sel-1;
|
int selm1=sel-1;
|
||||||
|
|
||||||
std::vector<double> clover;
|
std::vector<double> clover;
|
||||||
|
@ -32,9 +32,15 @@ export MPICH_OFI_NIC_POLICY=GPU
|
|||||||
# Local vol 16.16.16.32
|
# Local vol 16.16.16.32
|
||||||
#
|
#
|
||||||
|
|
||||||
LX=16
|
VOL 128.64.128.96
|
||||||
|
MPI 4.4.4.3
|
||||||
|
NPROC 192
|
||||||
|
mpiexec -np 192 -ppn 12 -envall ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 4.4.4.3 --grid 128.64.128.96 --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap
|
||||||
|
|
||||||
|
|
||||||
|
LX=32
|
||||||
LY=16
|
LY=16
|
||||||
LZ=16
|
LZ=32
|
||||||
LT=32
|
LT=32
|
||||||
|
|
||||||
NX=2
|
NX=2
|
||||||
|
@ -19,7 +19,7 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
|
|||||||
|
|
||||||
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
|
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
|
||||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
||||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:4
|
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3
|
||||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
|
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
|
||||||
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
|
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
|
||||||
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
|
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
|
||||||
|
@ -1,19 +1,18 @@
|
|||||||
#Ahead of time compile for PVC
|
#Ahead of time compile for PVC
|
||||||
|
|
||||||
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
|
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib"
|
||||||
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"
|
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"
|
||||||
|
|
||||||
#JIT compile
|
#JIT compile
|
||||||
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
|
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
|
||||||
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
|
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
|
||||||
|
|
||||||
../configure \
|
../../configure \
|
||||||
--enable-simd=GPU \
|
--enable-simd=GPU \
|
||||||
--enable-reduction=grid \
|
--enable-reduction=grid \
|
||||||
--enable-gen-simd-width=64 \
|
--enable-gen-simd-width=64 \
|
||||||
--enable-comms=mpi-auto \
|
--enable-comms=mpi-auto \
|
||||||
--enable-debug \
|
--enable-debug \
|
||||||
--prefix $HOME/gpt-install \
|
|
||||||
--disable-gparity \
|
--disable-gparity \
|
||||||
--disable-fermion-reps \
|
--disable-fermion-reps \
|
||||||
--with-lime=$CLIME \
|
--with-lime=$CLIME \
|
||||||
|
@ -1,22 +0,0 @@
|
|||||||
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
|
|
||||||
../../configure --enable-comms=mpi-auto \
|
|
||||||
--with-lime=$CLIME \
|
|
||||||
--enable-unified=no \
|
|
||||||
--enable-shm=nvlink \
|
|
||||||
--enable-tracing=none \
|
|
||||||
--enable-accelerator=hip \
|
|
||||||
--enable-gen-simd-width=64 \
|
|
||||||
--disable-gparity \
|
|
||||||
--disable-fermion-reps \
|
|
||||||
--enable-simd=GPU \
|
|
||||||
--with-gmp=$OLCF_GMP_ROOT \
|
|
||||||
--with-fftw=$FFTW_DIR/.. \
|
|
||||||
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
|
|
||||||
--disable-fermion-reps \
|
|
||||||
CXX=hipcc MPICXX=mpicxx \
|
|
||||||
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
|
|
||||||
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,16 +0,0 @@
|
|||||||
|
|
||||||
echo spack
|
|
||||||
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
|
|
||||||
|
|
||||||
#module load cce/15.0.1
|
|
||||||
|
|
||||||
module load rocm/6.3.1
|
|
||||||
module load cray-fftw
|
|
||||||
module load craype-accel-amd-gfx90a
|
|
||||||
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
|
|
||||||
|
|
||||||
#Ugly hacks to get down level software working on current system
|
|
||||||
#export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
|
|
||||||
#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
|
|
||||||
#ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
|
|
||||||
|
|
@ -30,10 +30,14 @@ source ${root}/sourceme.sh
|
|||||||
|
|
||||||
export OMP_NUM_THREADS=7
|
export OMP_NUM_THREADS=7
|
||||||
export MPICH_GPU_SUPPORT_ENABLED=1
|
export MPICH_GPU_SUPPORT_ENABLED=1
|
||||||
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
|
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
|
||||||
#64.64.32.96
|
|
||||||
for vol in 64.64.32.64
|
for vol in 32.32.32.64
|
||||||
do
|
do
|
||||||
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol -Ls 16
|
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol
|
||||||
|
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol
|
||||||
|
|
||||||
|
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
|
||||||
|
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
|
||||||
done
|
done
|
||||||
|
|
||||||
|
@ -3,19 +3,20 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
|
|||||||
--with-lime=$CLIME \
|
--with-lime=$CLIME \
|
||||||
--enable-unified=no \
|
--enable-unified=no \
|
||||||
--enable-shm=nvlink \
|
--enable-shm=nvlink \
|
||||||
--enable-tracing=none \
|
--enable-tracing=timer \
|
||||||
--enable-accelerator=hip \
|
--enable-accelerator=hip \
|
||||||
--enable-gen-simd-width=64 \
|
--enable-gen-simd-width=64 \
|
||||||
--disable-gparity \
|
--disable-gparity \
|
||||||
--disable-fermion-reps \
|
--disable-fermion-reps \
|
||||||
--enable-simd=GPU \
|
--enable-simd=GPU \
|
||||||
|
--enable-accelerator-cshift \
|
||||||
--with-gmp=$OLCF_GMP_ROOT \
|
--with-gmp=$OLCF_GMP_ROOT \
|
||||||
--with-fftw=$FFTW_DIR/.. \
|
--with-fftw=$FFTW_DIR/.. \
|
||||||
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
|
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
|
||||||
--disable-fermion-reps \
|
--disable-fermion-reps \
|
||||||
CXX=hipcc MPICXX=mpicxx \
|
CXX=hipcc MPICXX=mpicxx \
|
||||||
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
|
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
|
||||||
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"
|
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,25 +1,12 @@
|
|||||||
|
|
||||||
echo spack
|
|
||||||
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
|
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
|
||||||
|
spack load c-lime
|
||||||
module load cce/15.0.1
|
module load emacs
|
||||||
module load rocm/5.3.0
|
module load PrgEnv-gnu
|
||||||
|
module load rocm/6.0.0
|
||||||
|
module load cray-mpich
|
||||||
|
module load gmp
|
||||||
module load cray-fftw
|
module load cray-fftw
|
||||||
module load craype-accel-amd-gfx90a
|
module load craype-accel-amd-gfx90a
|
||||||
|
|
||||||
#Ugly hacks to get down level software working on current system
|
|
||||||
export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
|
|
||||||
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
|
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
|
||||||
export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
|
|
||||||
ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
|
|
||||||
|
|
||||||
#echo spack load c-lime
|
|
||||||
#spack load c-lime
|
|
||||||
#module load emacs
|
|
||||||
##module load PrgEnv-gnu
|
|
||||||
##module load cray-mpich
|
|
||||||
##module load cray-fftw
|
|
||||||
##module load craype-accel-amd-gfx90a
|
|
||||||
##export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
|
|
||||||
#Hack for lib
|
#Hack for lib
|
||||||
##export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
|
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
|
||||||
|
@ -1,206 +0,0 @@
|
|||||||
The purpose of this file is to collate all non-obvious known magic shell variables
|
|
||||||
and compiler flags required for either correctness or performance on various systems.
|
|
||||||
|
|
||||||
A repository of work-arounds.
|
|
||||||
|
|
||||||
Contents:
|
|
||||||
1. Interconnect + MPI
|
|
||||||
2. Compilation
|
|
||||||
3. Profiling
|
|
||||||
|
|
||||||
************************
|
|
||||||
* 1. INTERCONNECT + MPI
|
|
||||||
************************
|
|
||||||
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
export OMPI_MCA_io=romio321
|
|
||||||
|
|
||||||
--------------------------------------
|
|
||||||
ROMIO fail with > 2GB per node read (32 bit issue)
|
|
||||||
--------------------------------------
|
|
||||||
|
|
||||||
Use later MPICH
|
|
||||||
|
|
||||||
https://github.com/paboyle/Grid/issues/381
|
|
||||||
|
|
||||||
https://github.com/pmodels/mpich/commit/3a479ab0
|
|
||||||
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
Slingshot: Frontier and Perlmutter libfabric slow down
|
|
||||||
and physical memory fragmentation
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
export FI_MR_CACHE_MONITOR=disabled
|
|
||||||
or
|
|
||||||
export FI_MR_CACHE_MONITOR=kdreg2
|
|
||||||
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
Perlmutter
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
|
|
||||||
export MPICH_RDMA_ENABLED_CUDA=1
|
|
||||||
export MPICH_GPU_IPC_ENABLED=1
|
|
||||||
export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
|
|
||||||
export MPICH_GPU_NO_ASYNC_MEMCPY=0
|
|
||||||
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
Frontier/LumiG
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
|
|
||||||
Hiding ROCR_VISIBLE_DEVICES triggers SDMA engines to be used for GPU-GPU
|
|
||||||
|
|
||||||
cat << EOF > select_gpu
|
|
||||||
#!/bin/bash
|
|
||||||
export MPICH_GPU_SUPPORT_ENABLED=1
|
|
||||||
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
|
|
||||||
export GPU_MAP=(0 1 2 3 7 6 5 4)
|
|
||||||
export NUMA_MAP=(3 3 1 1 2 2 0 0)
|
|
||||||
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
|
|
||||||
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
|
|
||||||
export HIP_VISIBLE_DEVICES=\$GPU
|
|
||||||
unset ROCR_VISIBLE_DEVICES
|
|
||||||
echo RANK \$SLURM_LOCALID using GPU \$GPU
|
|
||||||
exec numactl -m \$NUMA -N \$NUMA \$*
|
|
||||||
EOF
|
|
||||||
chmod +x ./select_gpu
|
|
||||||
|
|
||||||
srun ./select_gpu BINARY
|
|
||||||
|
|
||||||
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
Mellanox performance with A100 GPU (Tursa, Booster, Leonardo)
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
export OMPI_MCA_btl=^uct,openib
|
|
||||||
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
|
|
||||||
export UCX_RNDV_SCHEME=put_zcopy
|
|
||||||
export UCX_RNDV_THRESH=16384
|
|
||||||
export UCX_IB_GPU_DIRECT_RDMA=yes
|
|
||||||
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
Mellanox + A100 correctness (Tursa, Booster, Leonardo)
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
export UCX_MEMTYPE_CACHE=n
|
|
||||||
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
MPICH/Aurora/PVC correctness and performance
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
|
|
||||||
https://github.com/pmodels/mpich/issues/7302
|
|
||||||
|
|
||||||
--enable-cuda-aware-mpi=no
|
|
||||||
--enable-unified=no
|
|
||||||
|
|
||||||
Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI
|
|
||||||
Do not use SVM
|
|
||||||
|
|
||||||
Ideally use MPICH with fix to issue 7302:
|
|
||||||
|
|
||||||
https://github.com/pmodels/mpich/pull/7312
|
|
||||||
|
|
||||||
Ideally:
|
|
||||||
MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
|
|
||||||
|
|
||||||
Alternatives:
|
|
||||||
export MPIR_CVAR_NOLOCAL=1
|
|
||||||
export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000
|
|
||||||
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
MPICH/Aurora/PVC correctness and performance
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
|
|
||||||
Broken:
|
|
||||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
|
||||||
|
|
||||||
This gives good peformance without requiring
|
|
||||||
--enable-cuda-aware-mpi=no
|
|
||||||
|
|
||||||
But is an open issue reported by James Osborn
|
|
||||||
https://github.com/pmodels/mpich/issues/7139
|
|
||||||
|
|
||||||
Possibly resolved but unclear if in the installed software yet.
|
|
||||||
|
|
||||||
************************
|
|
||||||
* 2. COMPILATION
|
|
||||||
************************
|
|
||||||
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
G++ compiler breakage / graveyard
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
|
|
||||||
9.3.0, 10.3.1,
|
|
||||||
https://github.com/paboyle/Grid/issues/290
|
|
||||||
https://github.com/paboyle/Grid/issues/264
|
|
||||||
|
|
||||||
Working (-) Broken (X):
|
|
||||||
|
|
||||||
4.9.0 -
|
|
||||||
4.9.1 -
|
|
||||||
5.1.0 X
|
|
||||||
5.2.0 X
|
|
||||||
5.3.0 X
|
|
||||||
5.4.0 X
|
|
||||||
6.1.0 X
|
|
||||||
6.2.0 X
|
|
||||||
6.3.0 -
|
|
||||||
7.1.0 -
|
|
||||||
8.0.0 (HEAD) -
|
|
||||||
|
|
||||||
https://github.com/paboyle/Grid/issues/100
|
|
||||||
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
AMD GPU nodes :
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
|
|
||||||
multiple ROCM versions broken; use 5.3.0
|
|
||||||
manifests itself as wrong results in fp32
|
|
||||||
|
|
||||||
https://github.com/paboyle/Grid/issues/464
|
|
||||||
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
Aurora/PVC
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
|
|
||||||
SYCL ahead of time compilation (fixes rare runtime JIT errors and faster runtime, PB)
|
|
||||||
SYCL slow link and relocatable code issues (Christoph Lehner)
|
|
||||||
Opt large register file required for good performance in fp64
|
|
||||||
|
|
||||||
|
|
||||||
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
|
||||||
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
|
|
||||||
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fPIC"
|
|
||||||
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
Aurora/PVC useful extra options
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
|
|
||||||
Host only sanitizer:
|
|
||||||
-Xarch_host -fsanitize=leak
|
|
||||||
-Xarch_host -fsanitize=address
|
|
||||||
|
|
||||||
Deterministic MPI reduction:
|
|
||||||
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
|
|
||||||
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
|
|
||||||
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
|
|
||||||
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
|
|
||||||
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
|
|
||||||
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
************************
|
|
||||||
* 3. Visual profile tools
|
|
||||||
************************
|
|
||||||
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
Frontier/rocprof
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
Aurora/unitrace
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
Tursa/nsight-sys
|
|
||||||
--------------------------------------------------------------------
|
|
@ -1,32 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
#SBATCH --partition lqcd
|
|
||||||
#SBATCH --time=00:50:00
|
|
||||||
#SBATCH -A lqcdtest
|
|
||||||
#SBATCH -q lqcd
|
|
||||||
#SBATCH --exclusive
|
|
||||||
#SBATCH --nodes=1
|
|
||||||
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
|
|
||||||
#SBATCH --ntasks=1
|
|
||||||
#SBATCH --cpus-per-task=64
|
|
||||||
#SBATCH --qos lqcd
|
|
||||||
|
|
||||||
source sourceme.sh
|
|
||||||
|
|
||||||
export PLACES=(1:16:4 1:32:2 0:64:1);
|
|
||||||
export THR=(16 32 64)
|
|
||||||
|
|
||||||
for t in 2
|
|
||||||
do
|
|
||||||
|
|
||||||
export OMP_NUM_THREADS=${THR[$t]}
|
|
||||||
export OMP_PLACES=${PLACES[$t]}
|
|
||||||
export thr=${THR[$t]}
|
|
||||||
|
|
||||||
#for vol in 24.24.24.24 32.32.32.32 48.48.48.96
|
|
||||||
for vol in 48.48.48.96
|
|
||||||
do
|
|
||||||
srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr
|
|
||||||
done
|
|
||||||
#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr
|
|
||||||
done
|
|
||||||
|
|
@ -1,36 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
#SBATCH --partition lqcd
|
|
||||||
#SBATCH --time=00:50:00
|
|
||||||
#SBATCH -A lqcdtest
|
|
||||||
#SBATCH -q lqcd
|
|
||||||
#SBATCH --exclusive
|
|
||||||
#SBATCH --nodes=2
|
|
||||||
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
|
|
||||||
#SBATCH --ntasks=2
|
|
||||||
#SBATCH --cpus-per-task=64
|
|
||||||
#SBATCH --qos lqcd
|
|
||||||
|
|
||||||
source sourceme.sh
|
|
||||||
|
|
||||||
export PLACES=(1:16:4 1:32:2 0:64:1);
|
|
||||||
export THR=(16 32 64)
|
|
||||||
|
|
||||||
nodes=2
|
|
||||||
mpi=1.1.1.2
|
|
||||||
|
|
||||||
for t in 2
|
|
||||||
do
|
|
||||||
|
|
||||||
export OMP_NUM_THREADS=${THR[$t]}
|
|
||||||
export OMP_PLACES=${PLACES[$t]}
|
|
||||||
export thr=${THR[$t]}
|
|
||||||
|
|
||||||
#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr
|
|
||||||
|
|
||||||
for vol in 64.64.64.128
|
|
||||||
do
|
|
||||||
srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr
|
|
||||||
done
|
|
||||||
|
|
||||||
done
|
|
||||||
|
|
@ -1,16 +0,0 @@
|
|||||||
../../configure \
|
|
||||||
--enable-comms=mpi-auto \
|
|
||||||
--enable-unified=yes \
|
|
||||||
--enable-shm=shmopen \
|
|
||||||
--enable-shm-fast-path=shmopen \
|
|
||||||
--enable-accelerator=none \
|
|
||||||
--enable-simd=AVX512 \
|
|
||||||
--disable-accelerator-cshift \
|
|
||||||
--disable-fermion-reps \
|
|
||||||
--disable-gparity \
|
|
||||||
CXX=clang++ \
|
|
||||||
MPICXX=mpicxx \
|
|
||||||
CXXFLAGS="-std=c++17"
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
|||||||
source $HOME/spack/share/spack/setup-env.sh
|
|
||||||
spack load llvm@17.0.4
|
|
||||||
export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
|
|
||||||
module load openmpi
|
|
@ -1,17 +0,0 @@
|
|||||||
../../src/Grid/configure \
|
|
||||||
--prefix /home/pab/NPR/install \
|
|
||||||
--enable-comms=mpi-auto \
|
|
||||||
--enable-simd=AVX2 \
|
|
||||||
--enable-shm=none \
|
|
||||||
--enable-debug \
|
|
||||||
--with-lime=$CLIME \
|
|
||||||
--with-hdf5=$HDF5 \
|
|
||||||
--with-fftw=$FFTW \
|
|
||||||
--with-gmp=$GMP \
|
|
||||||
--with-mpfr=$MPFR \
|
|
||||||
--disable-gparity \
|
|
||||||
--disable-fermion-reps \
|
|
||||||
CXX=clang++ \
|
|
||||||
MPICXX=mpicxx \
|
|
||||||
CXXFLAGS="-std=c++17 "
|
|
||||||
|
|
@ -1,28 +0,0 @@
|
|||||||
source $HOME/spack/share/spack/setup-env.sh
|
|
||||||
spack load llvm@12
|
|
||||||
spack load autoconf%clang@12.0.1
|
|
||||||
spack load automake%clang@12.0.1
|
|
||||||
spack load c-lime%clang@12.0.1
|
|
||||||
spack load fftw%clang@12.0.1
|
|
||||||
spack load gmp%clang@12.0.1
|
|
||||||
spack load mpfr%clang@12.0.1
|
|
||||||
spack load openmpi%clang@12.0.1
|
|
||||||
spack load openssl%clang@12.0.1
|
|
||||||
spack load hdf5+cxx%clang@12.0.1
|
|
||||||
spack load cmake%clang@12.0.1
|
|
||||||
export FFTW=`spack find --paths fftw%clang@12.0.1 | grep ^fftw | awk '{print $2}' `
|
|
||||||
export HDF5=`spack find --paths hdf5+cxx%clang@12.0.1 | grep ^hdf5 | awk '{print $2}' `
|
|
||||||
export CLIME=`spack find --paths c-lime%clang@12.0.1 | grep ^c-lime | awk '{print $2}' `
|
|
||||||
export MPFR=`spack find --paths mpfr%clang@12.0.1 | grep ^mpfr | awk '{print $2}' `
|
|
||||||
export LLVM=`spack find --paths llvm@12 | grep ^llvm | awk '{print $2}' `
|
|
||||||
export OPENSSL=`spack find --paths openssl%clang@12.0.1 | grep openssl | awk '{print $2}' `
|
|
||||||
export GMP=`spack find --paths gmp%clang@12.0.1 | grep ^gmp | awk '{print $2}' `
|
|
||||||
export TCLAP=`spack find --paths tclap%clang@12.0.1 | grep ^tclap | awk '{print $2}' `
|
|
||||||
export LD_LIBRARY_PATH=${TCLAP}/lib:$LD_LIBRARY_PATH
|
|
||||||
export LD_LIBRARY_PATH=$MPFR/lib:$LD_LIBRARY_PATH
|
|
||||||
export LD_LIBRARY_PATH=$GMP/lib:$LD_LIBRARY_PATH
|
|
||||||
export LD_LIBRARY_PATH=$FFTW/lib:$LD_LIBRARY_PATH
|
|
||||||
export LD_LIBRARY_PATH=$LLVM/lib:$LD_LIBRARY_PATH
|
|
||||||
export LD_LIBRARY_PATH=$LLVM/lib/x86_64-unknown-linux-gnu/:$LD_LIBRARY_PATH
|
|
||||||
|
|
||||||
ulimit -s 81920
|
|
@ -1,19 +0,0 @@
|
|||||||
cd
|
|
||||||
git clone https://github.com/spack/spack.git
|
|
||||||
source $HOME/spack/share/spack/setup-env.sh
|
|
||||||
|
|
||||||
spack install llvm@12
|
|
||||||
|
|
||||||
spack install autoconf%clang@12.0.1
|
|
||||||
spack install automake%clang@12.0.1
|
|
||||||
spack install c-lime%clang@12.0.1
|
|
||||||
spack install fftw%clang@12.0.1
|
|
||||||
spack install gmp%clang@12.0.1
|
|
||||||
spack install mpfr%clang@12.0.1
|
|
||||||
spack install openmpi%clang@12.0.1
|
|
||||||
spack install openssl%clang@12.0.1
|
|
||||||
spack install hdf5+cxx%clang@12.0.1
|
|
||||||
spack install cmake%clang@12.0.1
|
|
||||||
spack install tclap%clang@12.0.1
|
|
||||||
spack install emacs%clang@12.0.1
|
|
||||||
|
|
@ -47,20 +47,20 @@ public:
|
|||||||
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
|
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
|
||||||
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
|
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
|
||||||
void Op (const Field &in, Field &out){
|
void Op (const Field &in, Field &out){
|
||||||
// std::cout << "Op: PVdag M "<<std::endl;
|
std::cout << "Op: PVdag M "<<std::endl;
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
_Mat.M(in,tmp);
|
_Mat.M(in,tmp);
|
||||||
_PV.Mdag(tmp,out);
|
_PV.Mdag(tmp,out);
|
||||||
}
|
}
|
||||||
void AdjOp (const Field &in, Field &out){
|
void AdjOp (const Field &in, Field &out){
|
||||||
// std::cout << "AdjOp: Mdag PV "<<std::endl;
|
std::cout << "AdjOp: Mdag PV "<<std::endl;
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
_PV.M(in,tmp);
|
_PV.M(in,tmp);
|
||||||
_Mat.Mdag(tmp,out);
|
_Mat.Mdag(tmp,out);
|
||||||
}
|
}
|
||||||
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
|
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
|
||||||
void HermOp(const Field &in, Field &out){
|
void HermOp(const Field &in, Field &out){
|
||||||
// std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
|
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
// _Mat.M(in,tmp);
|
// _Mat.M(in,tmp);
|
||||||
// _PV.Mdag(tmp,out);
|
// _PV.Mdag(tmp,out);
|
||||||
@ -83,14 +83,14 @@ public:
|
|||||||
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
|
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
|
||||||
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
|
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
|
||||||
void Op (const Field &in, Field &out){
|
void Op (const Field &in, Field &out){
|
||||||
// std::cout << "Op: PVdag M "<<std::endl;
|
std::cout << "Op: PVdag M "<<std::endl;
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
_Mat.M(in,tmp);
|
_Mat.M(in,tmp);
|
||||||
_PV.Mdag(tmp,out);
|
_PV.Mdag(tmp,out);
|
||||||
out = out + shift * in;
|
out = out + shift * in;
|
||||||
}
|
}
|
||||||
void AdjOp (const Field &in, Field &out){
|
void AdjOp (const Field &in, Field &out){
|
||||||
// std::cout << "AdjOp: Mdag PV "<<std::endl;
|
std::cout << "AdjOp: Mdag PV "<<std::endl;
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
_PV.M(tmp,out);
|
_PV.M(tmp,out);
|
||||||
_Mat.Mdag(in,tmp);
|
_Mat.Mdag(in,tmp);
|
||||||
@ -98,7 +98,7 @@ public:
|
|||||||
}
|
}
|
||||||
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
|
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
|
||||||
void HermOp(const Field &in, Field &out){
|
void HermOp(const Field &in, Field &out){
|
||||||
// std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
|
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
Op(in,tmp);
|
Op(in,tmp);
|
||||||
AdjOp(tmp,out);
|
AdjOp(tmp,out);
|
||||||
|
@ -1,14 +0,0 @@
|
|||||||
<?xml version="1.0"?>
|
|
||||||
<grid>
|
|
||||||
<LanczosParameters>
|
|
||||||
<mass>0.00107</mass>
|
|
||||||
<M5>1.8</M5>
|
|
||||||
<Ls>48</Ls>
|
|
||||||
<Nstop>10</Nstop>
|
|
||||||
<Nk>15</Nk>
|
|
||||||
<Np>85</Np>
|
|
||||||
<ChebyLow>0.003</ChebyLow>
|
|
||||||
<ChebyHigh>60</ChebyHigh>
|
|
||||||
<ChebyOrder>201</ChebyOrder>
|
|
||||||
</LanczosParameters>
|
|
||||||
</grid>
|
|
@ -1,346 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./tests/Test_dwf_G5R5.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Chulwoo Jung <chulwoo@bnl.gov>
|
|
||||||
From Duo and Bob's Chirality study
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution
|
|
||||||
directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
using namespace Grid;
|
|
||||||
;
|
|
||||||
|
|
||||||
//typedef WilsonFermionD FermionOp;
|
|
||||||
typedef DomainWallFermionD FermionOp;
|
|
||||||
typedef typename DomainWallFermionD::FermionField FermionField;
|
|
||||||
|
|
||||||
|
|
||||||
RealD AllZero(RealD x) { return 0.; }
|
|
||||||
|
|
||||||
namespace Grid {
|
|
||||||
|
|
||||||
struct LanczosParameters: Serializable {
|
|
||||||
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
|
|
||||||
RealD, mass ,
|
|
||||||
RealD, M5 ,
|
|
||||||
Integer, Ls,
|
|
||||||
Integer, Nstop,
|
|
||||||
Integer, Nk,
|
|
||||||
Integer, Np,
|
|
||||||
RealD, ChebyLow,
|
|
||||||
RealD, ChebyHigh,
|
|
||||||
Integer, ChebyOrder)
|
|
||||||
// Integer, StartTrajectory,
|
|
||||||
// Integer, Trajectories, /* @brief Number of sweeps in this run */
|
|
||||||
// bool, MetropolisTest,
|
|
||||||
// Integer, NoMetropolisUntil,
|
|
||||||
// std::string, StartingType,
|
|
||||||
// Integer, SW,
|
|
||||||
// RealD, Kappa,
|
|
||||||
// IntegratorParameters, MD)
|
|
||||||
|
|
||||||
LanczosParameters() {
|
|
||||||
////////////////////////////// Default values
|
|
||||||
mass = 0;
|
|
||||||
// MetropolisTest = true;
|
|
||||||
// NoMetropolisUntil = 10;
|
|
||||||
// StartTrajectory = 0;
|
|
||||||
// SW = 2;
|
|
||||||
// Trajectories = 10;
|
|
||||||
// StartingType = "HotStart";
|
|
||||||
/////////////////////////////////
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class ReaderClass >
|
|
||||||
LanczosParameters(Reader<ReaderClass> & TheReader){
|
|
||||||
initialize(TheReader);
|
|
||||||
}
|
|
||||||
|
|
||||||
template < class ReaderClass >
|
|
||||||
void initialize(Reader<ReaderClass> &TheReader){
|
|
||||||
// std::cout << GridLogMessage << "Reading HMC\n";
|
|
||||||
read(TheReader, "HMC", *this);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void print_parameters() const {
|
|
||||||
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
|
|
||||||
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
|
|
||||||
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
|
|
||||||
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
|
|
||||||
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
|
|
||||||
// MD.print_parameters();
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
|
||||||
Grid_init(&argc, &argv);
|
|
||||||
|
|
||||||
LanczosParameters LanParams;
|
|
||||||
#if 1
|
|
||||||
{
|
|
||||||
XmlReader HMCrd("LanParams.xml");
|
|
||||||
read(HMCrd,"LanczosParameters",LanParams);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
{
|
|
||||||
LanParams.mass = mass;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
std::cout << GridLogMessage<< LanParams <<std::endl;
|
|
||||||
{
|
|
||||||
XmlWriter HMCwr("LanParams.xml.out");
|
|
||||||
write(HMCwr,"LanczosParameters",LanParams);
|
|
||||||
}
|
|
||||||
|
|
||||||
int Ls=16;
|
|
||||||
RealD M5=1.8;
|
|
||||||
RealD mass = -1.0;
|
|
||||||
|
|
||||||
mass=LanParams.mass;
|
|
||||||
Ls=LanParams.Ls;
|
|
||||||
M5=LanParams.M5;
|
|
||||||
|
|
||||||
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
|
|
||||||
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
|
|
||||||
GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian* UrbGrid =
|
|
||||||
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
|
||||||
// GridCartesian* FGrid = UGrid;
|
|
||||||
// GridRedBlackCartesian* FrbGrid = UrbGrid;
|
|
||||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
|
|
||||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
|
|
||||||
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
|
|
||||||
|
|
||||||
std::vector<int> seeds4({1, 2, 3, 4});
|
|
||||||
std::vector<int> seeds5({5, 6, 7, 8});
|
|
||||||
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
|
||||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
|
||||||
GridParallelRNG RNG5rb(FrbGrid); RNG5.SeedFixedIntegers(seeds5);
|
|
||||||
|
|
||||||
LatticeGaugeField Umu(UGrid);
|
|
||||||
|
|
||||||
FieldMetaData header;
|
|
||||||
std::string file("./config");
|
|
||||||
|
|
||||||
int precision32 = 0;
|
|
||||||
int tworow = 0;
|
|
||||||
NerscIO::readConfiguration(Umu,header,file);
|
|
||||||
|
|
||||||
/*
|
|
||||||
std::vector<LatticeColourMatrix> U(4, UGrid);
|
|
||||||
for (int mu = 0; mu < Nd; mu++) {
|
|
||||||
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
int Nstop = 10;
|
|
||||||
int Nk = 20;
|
|
||||||
int Np = 80;
|
|
||||||
Nstop=LanParams.Nstop;
|
|
||||||
Nk=LanParams.Nk;
|
|
||||||
Np=LanParams.Np;
|
|
||||||
|
|
||||||
int Nm = Nk + Np;
|
|
||||||
int MaxIt = 10000;
|
|
||||||
RealD resid = 1.0e-5;
|
|
||||||
|
|
||||||
|
|
||||||
//while ( mass > - 5.0){
|
|
||||||
FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
|
||||||
MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <-----
|
|
||||||
// Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
|
|
||||||
Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf);
|
|
||||||
// Gamma5R5HermitianLinearOperator
|
|
||||||
std::vector<double> Coeffs{0, 1.};
|
|
||||||
Polynomial<FermionField> PolyX(Coeffs);
|
|
||||||
|
|
||||||
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
|
|
||||||
|
|
||||||
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
|
|
||||||
PlainHermOp<FermionField> Op (HermOp);
|
|
||||||
PlainHermOp<FermionField> Op2 (G5R5Herm);
|
|
||||||
|
|
||||||
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
|
|
||||||
|
|
||||||
std::vector<RealD> eval(Nm);
|
|
||||||
FermionField src(FGrid);
|
|
||||||
gaussian(RNG5, src);
|
|
||||||
std::vector<FermionField> evec(Nm, FGrid);
|
|
||||||
for (int i = 0; i < 1; i++) {
|
|
||||||
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
|
|
||||||
<< std::endl;
|
|
||||||
};
|
|
||||||
|
|
||||||
int Nconv;
|
|
||||||
IRL.calc(eval, evec, src, Nconv);
|
|
||||||
|
|
||||||
std::cout << mass <<" : " << eval << std::endl;
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
Gamma g5(Gamma::Algebra::Gamma5) ;
|
|
||||||
ComplexD dot;
|
|
||||||
FermionField tmp(FGrid);
|
|
||||||
// RealD eMe,eMMe;
|
|
||||||
for (int i = 0; i < Nstop ; i++) {
|
|
||||||
// tmp = g5*evec[i];
|
|
||||||
dot = innerProduct(evec[i],evec[i]);
|
|
||||||
// G5R5(tmp,evec[i]);
|
|
||||||
G5R5Herm.HermOpAndNorm(evec[i],tmp,eMe,eMMe);
|
|
||||||
std::cout <<"Norm "<<M5<<" "<< mass << " : " << i << " " << real(dot) << " " << imag(dot) << " "<< eMe << " " <<eMMe<< std::endl ;
|
|
||||||
for (int j = 0; j < Nstop ; j++) {
|
|
||||||
dot = innerProduct(tmp,evec[j]);
|
|
||||||
std::cout <<"G5R5 "<<M5<<" "<< mass << " : " << i << " " <<j<<" " << real(dot) << " " << imag(dot) << std::endl ;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// src = evec[0]+evec[1]+evec[2];
|
|
||||||
// mass += -0.1;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//**********************************************************************
|
|
||||||
//orthogonalization
|
|
||||||
//calculat the matrix
|
|
||||||
cout << "Start orthogonalization " << endl;
|
|
||||||
cout << "calculate the matrix element" << endl;
|
|
||||||
vector<LatticeFermion> G5R5Mevec(Nconv, FGrid);
|
|
||||||
vector<LatticeFermion> finalevec(Nconv, FGrid);
|
|
||||||
vector<RealD> eMe(Nconv), eMMe(Nconv);
|
|
||||||
for(int i = 0; i < Nconv; i++){
|
|
||||||
G5R5Herm.HermOpAndNorm(evec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
|
|
||||||
}
|
|
||||||
cout << "Re<evec, G5R5M(evec)>: " << endl;
|
|
||||||
cout << eMe << endl;
|
|
||||||
cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
|
|
||||||
cout << eMMe << endl;
|
|
||||||
vector<vector<ComplexD>> VevecG5R5Mevec(Nconv);
|
|
||||||
Eigen::MatrixXcd evecG5R5Mevec = Eigen::MatrixXcd::Zero(Nconv, Nconv);
|
|
||||||
for(int i = 0; i < Nconv; i++){
|
|
||||||
VevecG5R5Mevec[i].resize(Nconv);
|
|
||||||
for(int j = 0; j < Nconv; j++){
|
|
||||||
VevecG5R5Mevec[i][j] = innerProduct(evec[i], G5R5Mevec[j]);
|
|
||||||
evecG5R5Mevec(i, j) = VevecG5R5Mevec[i][j];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//calculate eigenvector
|
|
||||||
cout << "Eigen solver" << endl;
|
|
||||||
Eigen::SelfAdjointEigenSolver<Eigen::MatrixXcd> eigensolver(evecG5R5Mevec);
|
|
||||||
vector<RealD> eigeneval(Nconv);
|
|
||||||
vector<vector<ComplexD>> eigenevec(Nconv);
|
|
||||||
for(int i = 0; i < Nconv; i++){
|
|
||||||
eigeneval[i] = eigensolver.eigenvalues()[i];
|
|
||||||
eigenevec[i].resize(Nconv);
|
|
||||||
for(int j = 0; j < Nconv; j++){
|
|
||||||
eigenevec[i][j] = eigensolver.eigenvectors()(i, j);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//rotation
|
|
||||||
cout << "Do rotation" << endl;
|
|
||||||
for(int i = 0; i < Nconv; i++){
|
|
||||||
finalevec[i] = finalevec[i] - finalevec[i];
|
|
||||||
for(int j = 0; j < Nconv; j++){
|
|
||||||
finalevec[i] = eigenevec[j][i]*evec[j] + finalevec[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//normalize again;
|
|
||||||
for(int i = 0; i < Nconv; i++){
|
|
||||||
RealD tmp_RealD = norm2(finalevec[i]);
|
|
||||||
tmp_RealD = 1./pow(tmp_RealD, 0.5);
|
|
||||||
finalevec[i] = finalevec[i]*tmp_RealD;
|
|
||||||
}
|
|
||||||
|
|
||||||
//check
|
|
||||||
for(int i = 0; i < Nconv; i++){
|
|
||||||
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
//**********************************************************************
|
|
||||||
//sort the eigenvectors
|
|
||||||
vector<LatticeFermion> finalevec_copy(Nconv, FGrid);
|
|
||||||
for(int i = 0; i < Nconv; i++){
|
|
||||||
finalevec_copy[i] = finalevec[i];
|
|
||||||
}
|
|
||||||
vector<RealD> eMe_copy(eMe);
|
|
||||||
for(int i = 0; i < Nconv; i++){
|
|
||||||
eMe[i] = fabs(eMe[i]);
|
|
||||||
eMe_copy[i] = eMe[i];
|
|
||||||
}
|
|
||||||
sort(eMe_copy.begin(), eMe_copy.end());
|
|
||||||
for(int i = 0; i < Nconv; i++){
|
|
||||||
for(int j = 0; j < Nconv; j++){
|
|
||||||
if(eMe[j] == eMe_copy[i]){
|
|
||||||
finalevec[i] = finalevec_copy[j];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(int i = 0; i < Nconv; i++){
|
|
||||||
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
|
|
||||||
}
|
|
||||||
cout << "Re<evec, G5R5M(evec)>: " << endl;
|
|
||||||
cout << eMe << endl;
|
|
||||||
cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
|
|
||||||
cout << eMMe << endl;
|
|
||||||
|
|
||||||
|
|
||||||
// vector<LatticeFermion> finalevec(Nconv, FGrid);
|
|
||||||
// temporary, until doing rotation
|
|
||||||
// for(int i = 0; i < Nconv; i++)
|
|
||||||
// finalevec[i]=evec[i];
|
|
||||||
//**********************************************************************
|
|
||||||
//calculate chirality matrix
|
|
||||||
vector<LatticeFermion> G5evec(Nconv, FGrid);
|
|
||||||
vector<vector<ComplexD>> chiral_matrix(Nconv);
|
|
||||||
vector<vector<RealD>> chiral_matrix_real(Nconv);
|
|
||||||
for(int i = 0; i < Nconv; i++){
|
|
||||||
// G5evec[i] = G5evec[i] - G5evec[i];
|
|
||||||
G5evec[i] = Zero();
|
|
||||||
for(int j = 0; j < Ls/2; j++){
|
|
||||||
axpby_ssp(G5evec[i], 1., finalevec[i], 0., G5evec[i], j, j);
|
|
||||||
}
|
|
||||||
for(int j = Ls/2; j < Ls; j++){
|
|
||||||
axpby_ssp(G5evec[i], -1., finalevec[i], 0., G5evec[i], j, j);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(int i = 0; i < Nconv; i++){
|
|
||||||
chiral_matrix_real[i].resize(Nconv);
|
|
||||||
chiral_matrix[i].resize(Nconv);
|
|
||||||
for(int j = 0; j < Nconv; j++){
|
|
||||||
chiral_matrix[i][j] = innerProduct(finalevec[i], G5evec[j]);
|
|
||||||
chiral_matrix_real[i][j] = abs(chiral_matrix[i][j]);
|
|
||||||
std::cout <<" chiral_matrix_real "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(int i = 0; i < Nconv; i++){
|
|
||||||
if(chiral_matrix[i][i].real() < 0.){
|
|
||||||
chiral_matrix_real[i][i] = -1. * chiral_matrix_real[i][i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
Grid_finalize();
|
|
||||||
}
|
|
@ -1,278 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./tests/Test_dwf_lanczos.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Chulwoo Jung <chulwoo@bnl.gov>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution
|
|
||||||
directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
using namespace Grid;
|
|
||||||
;
|
|
||||||
|
|
||||||
typedef WilsonFermionD FermionOp;
|
|
||||||
typedef typename WilsonFermionD::FermionField FermionField;
|
|
||||||
|
|
||||||
|
|
||||||
RealD AllZero(RealD x) { return 0.; }
|
|
||||||
|
|
||||||
namespace Grid {
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
template<typename Field>
|
|
||||||
class RationalHermOp : public LinearFunction<Field> {
|
|
||||||
public:
|
|
||||||
using LinearFunction<Field>::operator();
|
|
||||||
// OperatorFunction<Field> & _poly;
|
|
||||||
LinearOperatorBase<Field> &_Linop;
|
|
||||||
RealD _massDen, _massNum;
|
|
||||||
|
|
||||||
FunctionHermOp(LinearOperatorBase<Field>& linop, RealD massDen,RealD massNum)
|
|
||||||
: _Linop(linop) ,_massDen(massDen),_massNum(massNum) {};
|
|
||||||
|
|
||||||
void operator()(const Field& in, Field& out) {
|
|
||||||
// _poly(_Linop,in,out);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
template<class Matrix,class Field>
|
|
||||||
class InvG5LinearOperator : public LinearOperatorBase<Field> {
|
|
||||||
Matrix &_Mat;
|
|
||||||
RealD _num;
|
|
||||||
RealD _Tol;
|
|
||||||
Integer _MaxIt;
|
|
||||||
Gamma g5;
|
|
||||||
|
|
||||||
public:
|
|
||||||
InvG5LinearOperator(Matrix &Mat,RealD num): _Mat(Mat),_num(num), _Tol(1e-12),_MaxIt(10000), g5(Gamma::Algebra::Gamma5) {};
|
|
||||||
|
|
||||||
// Support for coarsening to a multigrid
|
|
||||||
void OpDiag (const Field &in, Field &out) {
|
|
||||||
assert(0);
|
|
||||||
_Mat.Mdiag(in,out);
|
|
||||||
}
|
|
||||||
void OpDir (const Field &in, Field &out,int dir,int disp) {
|
|
||||||
assert(0);
|
|
||||||
_Mat.Mdir(in,out,dir,disp);
|
|
||||||
}
|
|
||||||
void OpDirAll (const Field &in, std::vector<Field> &out){
|
|
||||||
assert(0);
|
|
||||||
_Mat.MdirAll(in,out);
|
|
||||||
};
|
|
||||||
void Op (const Field &in, Field &out){
|
|
||||||
assert(0);
|
|
||||||
_Mat.M(in,out);
|
|
||||||
}
|
|
||||||
void AdjOp (const Field &in, Field &out){
|
|
||||||
assert(0);
|
|
||||||
_Mat.Mdag(in,out);
|
|
||||||
}
|
|
||||||
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
|
||||||
HermOp(in,out);
|
|
||||||
ComplexD dot = innerProduct(in,out);
|
|
||||||
n1=real(dot);
|
|
||||||
n2=norm2(out);
|
|
||||||
}
|
|
||||||
void HermOp(const Field &in, Field &out){
|
|
||||||
Field tmp(in.Grid());
|
|
||||||
MdagMLinearOperator<Matrix,Field> denom(_Mat);
|
|
||||||
ConjugateGradient<Field> CG(_Tol,_MaxIt);
|
|
||||||
_Mat.M(in,tmp);
|
|
||||||
tmp += _num*in;
|
|
||||||
_Mat.Mdag(tmp,out);
|
|
||||||
CG(denom,out,tmp);
|
|
||||||
out = g5*tmp;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
struct LanczosParameters: Serializable {
|
|
||||||
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
|
|
||||||
RealD, mass ,
|
|
||||||
RealD, resid,
|
|
||||||
RealD, ChebyLow,
|
|
||||||
RealD, ChebyHigh,
|
|
||||||
Integer, ChebyOrder)
|
|
||||||
// Integer, StartTrajectory,
|
|
||||||
// Integer, Trajectories, /* @brief Number of sweeps in this run */
|
|
||||||
// bool, MetropolisTest,
|
|
||||||
// Integer, NoMetropolisUntil,
|
|
||||||
// std::string, StartingType,
|
|
||||||
// Integer, SW,
|
|
||||||
// RealD, Kappa,
|
|
||||||
// IntegratorParameters, MD)
|
|
||||||
|
|
||||||
LanczosParameters() {
|
|
||||||
////////////////////////////// Default values
|
|
||||||
mass = 0;
|
|
||||||
// MetropolisTest = true;
|
|
||||||
// NoMetropolisUntil = 10;
|
|
||||||
// StartTrajectory = 0;
|
|
||||||
// SW = 2;
|
|
||||||
// Trajectories = 10;
|
|
||||||
// StartingType = "HotStart";
|
|
||||||
/////////////////////////////////
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class ReaderClass >
|
|
||||||
LanczosParameters(Reader<ReaderClass> & TheReader){
|
|
||||||
initialize(TheReader);
|
|
||||||
}
|
|
||||||
|
|
||||||
template < class ReaderClass >
|
|
||||||
void initialize(Reader<ReaderClass> &TheReader){
|
|
||||||
// std::cout << GridLogMessage << "Reading HMC\n";
|
|
||||||
read(TheReader, "HMC", *this);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void print_parameters() const {
|
|
||||||
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
|
|
||||||
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
|
|
||||||
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
|
|
||||||
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
|
|
||||||
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
|
|
||||||
// MD.print_parameters();
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
|
||||||
Grid_init(&argc, &argv);
|
|
||||||
|
|
||||||
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
|
|
||||||
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
|
|
||||||
GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian* UrbGrid =
|
|
||||||
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
|
||||||
GridCartesian* FGrid = UGrid;
|
|
||||||
GridRedBlackCartesian* FrbGrid = UrbGrid;
|
|
||||||
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
|
|
||||||
|
|
||||||
std::vector<int> seeds4({1, 2, 3, 4});
|
|
||||||
std::vector<int> seeds5({5, 6, 7, 8});
|
|
||||||
GridParallelRNG RNG5(FGrid);
|
|
||||||
RNG5.SeedFixedIntegers(seeds5);
|
|
||||||
GridParallelRNG RNG4(UGrid);
|
|
||||||
RNG4.SeedFixedIntegers(seeds4);
|
|
||||||
GridParallelRNG RNG5rb(FrbGrid);
|
|
||||||
RNG5.SeedFixedIntegers(seeds5);
|
|
||||||
|
|
||||||
LatticeGaugeField Umu(UGrid);
|
|
||||||
// SU<Nc>::HotConfiguration(RNG4, Umu);
|
|
||||||
|
|
||||||
FieldMetaData header;
|
|
||||||
std::string file("./config");
|
|
||||||
|
|
||||||
int precision32 = 0;
|
|
||||||
int tworow = 0;
|
|
||||||
// NerscIO::writeConfiguration(Umu,file,tworow,precision32);
|
|
||||||
NerscIO::readConfiguration(Umu,header,file);
|
|
||||||
|
|
||||||
/*
|
|
||||||
std::vector<LatticeColourMatrix> U(4, UGrid);
|
|
||||||
for (int mu = 0; mu < Nd; mu++) {
|
|
||||||
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
int Nstop = 5;
|
|
||||||
int Nk = 10;
|
|
||||||
int Np = 90;
|
|
||||||
int Nm = Nk + Np;
|
|
||||||
int MaxIt = 10000;
|
|
||||||
RealD resid = 1.0e-5;
|
|
||||||
|
|
||||||
RealD mass = -1.0;
|
|
||||||
|
|
||||||
LanczosParameters LanParams;
|
|
||||||
#if 1
|
|
||||||
{
|
|
||||||
XmlReader HMCrd("LanParams.xml");
|
|
||||||
read(HMCrd,"LanczosParameters",LanParams);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
{
|
|
||||||
LanParams.mass = mass;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
std::cout << GridLogMessage<< LanParams <<std::endl;
|
|
||||||
{
|
|
||||||
XmlWriter HMCwr("LanParams.xml.out");
|
|
||||||
write(HMCwr,"LanczosParameters",LanParams);
|
|
||||||
}
|
|
||||||
|
|
||||||
mass=LanParams.mass;
|
|
||||||
resid=LanParams.resid;
|
|
||||||
|
|
||||||
|
|
||||||
while ( mass > - 5.0){
|
|
||||||
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass);
|
|
||||||
InvG5LinearOperator<FermionOp,LatticeFermion> HermOp(WilsonOperator,-2.); /// <-----
|
|
||||||
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
|
|
||||||
// Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
|
|
||||||
|
|
||||||
std::vector<double> Coeffs{0, 0, 1.};
|
|
||||||
Polynomial<FermionField> PolyX(Coeffs);
|
|
||||||
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
|
|
||||||
|
|
||||||
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
|
|
||||||
// InvHermOp<FermionField> Op(WilsonOperator,HermOp);
|
|
||||||
PlainHermOp<FermionField> Op (HermOp);
|
|
||||||
// PlainHermOp<FermionField> Op2 (HermOp2);
|
|
||||||
|
|
||||||
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
|
|
||||||
|
|
||||||
std::vector<RealD> eval(Nm);
|
|
||||||
FermionField src(FGrid);
|
|
||||||
gaussian(RNG5, src);
|
|
||||||
std::vector<FermionField> evec(Nm, FGrid);
|
|
||||||
for (int i = 0; i < 1; i++) {
|
|
||||||
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
|
|
||||||
<< std::endl;
|
|
||||||
};
|
|
||||||
|
|
||||||
int Nconv;
|
|
||||||
IRL.calc(eval, evec, src, Nconv);
|
|
||||||
|
|
||||||
std::cout << mass <<" : " << eval << std::endl;
|
|
||||||
|
|
||||||
Gamma g5(Gamma::Algebra::Gamma5) ;
|
|
||||||
ComplexD dot;
|
|
||||||
FermionField tmp(FGrid);
|
|
||||||
for (int i = 0; i < Nstop ; i++) {
|
|
||||||
tmp = g5*evec[i];
|
|
||||||
dot = innerProduct(tmp,evec[i]);
|
|
||||||
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
|
|
||||||
}
|
|
||||||
src = evec[0]+evec[1]+evec[2];
|
|
||||||
mass += -0.1;
|
|
||||||
}
|
|
||||||
|
|
||||||
Grid_finalize();
|
|
||||||
}
|
|
@ -1,211 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./tests/Test_dwf_lanczos.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Chulwoo Jung <chulwoo@bnl.gov>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution
|
|
||||||
directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
using namespace Grid;
|
|
||||||
;
|
|
||||||
|
|
||||||
typedef WilsonFermionD FermionOp;
|
|
||||||
typedef typename WilsonFermionD::FermionField FermionField;
|
|
||||||
|
|
||||||
|
|
||||||
RealD AllZero(RealD x) { return 0.; }
|
|
||||||
|
|
||||||
namespace Grid {
|
|
||||||
|
|
||||||
struct LanczosParameters: Serializable {
|
|
||||||
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
|
|
||||||
RealD, mass ,
|
|
||||||
RealD, ChebyLow,
|
|
||||||
RealD, ChebyHigh,
|
|
||||||
Integer, ChebyOrder)
|
|
||||||
// Integer, StartTrajectory,
|
|
||||||
// Integer, Trajectories, /* @brief Number of sweeps in this run */
|
|
||||||
// bool, MetropolisTest,
|
|
||||||
// Integer, NoMetropolisUntil,
|
|
||||||
// std::string, StartingType,
|
|
||||||
// Integer, SW,
|
|
||||||
// RealD, Kappa,
|
|
||||||
// IntegratorParameters, MD)
|
|
||||||
|
|
||||||
LanczosParameters() {
|
|
||||||
////////////////////////////// Default values
|
|
||||||
mass = 0;
|
|
||||||
// MetropolisTest = true;
|
|
||||||
// NoMetropolisUntil = 10;
|
|
||||||
// StartTrajectory = 0;
|
|
||||||
// SW = 2;
|
|
||||||
// Trajectories = 10;
|
|
||||||
// StartingType = "HotStart";
|
|
||||||
/////////////////////////////////
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class ReaderClass >
|
|
||||||
LanczosParameters(Reader<ReaderClass> & TheReader){
|
|
||||||
initialize(TheReader);
|
|
||||||
}
|
|
||||||
|
|
||||||
template < class ReaderClass >
|
|
||||||
void initialize(Reader<ReaderClass> &TheReader){
|
|
||||||
// std::cout << GridLogMessage << "Reading HMC\n";
|
|
||||||
read(TheReader, "HMC", *this);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void print_parameters() const {
|
|
||||||
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
|
|
||||||
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
|
|
||||||
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
|
|
||||||
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
|
|
||||||
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
|
|
||||||
// MD.print_parameters();
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
|
||||||
Grid_init(&argc, &argv);
|
|
||||||
|
|
||||||
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
|
|
||||||
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
|
|
||||||
GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian* UrbGrid =
|
|
||||||
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
|
||||||
GridCartesian* FGrid = UGrid;
|
|
||||||
GridRedBlackCartesian* FrbGrid = UrbGrid;
|
|
||||||
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
|
|
||||||
|
|
||||||
std::vector<int> seeds4({1, 2, 3, 4});
|
|
||||||
std::vector<int> seeds5({5, 6, 7, 8});
|
|
||||||
GridParallelRNG RNG5(FGrid);
|
|
||||||
RNG5.SeedFixedIntegers(seeds5);
|
|
||||||
GridParallelRNG RNG4(UGrid);
|
|
||||||
RNG4.SeedFixedIntegers(seeds4);
|
|
||||||
GridParallelRNG RNG5rb(FrbGrid);
|
|
||||||
RNG5.SeedFixedIntegers(seeds5);
|
|
||||||
|
|
||||||
LatticeGaugeField Umu(UGrid);
|
|
||||||
// SU<Nc>::HotConfiguration(RNG4, Umu);
|
|
||||||
|
|
||||||
FieldMetaData header;
|
|
||||||
std::string file("./config");
|
|
||||||
|
|
||||||
int precision32 = 0;
|
|
||||||
int tworow = 0;
|
|
||||||
// NerscIO::writeConfiguration(Umu,file,tworow,precision32);
|
|
||||||
NerscIO::readConfiguration(Umu,header,file);
|
|
||||||
|
|
||||||
/*
|
|
||||||
std::vector<LatticeColourMatrix> U(4, UGrid);
|
|
||||||
for (int mu = 0; mu < Nd; mu++) {
|
|
||||||
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
int Nstop = 10;
|
|
||||||
int Nk = 20;
|
|
||||||
int Np = 80;
|
|
||||||
int Nm = Nk + Np;
|
|
||||||
int MaxIt = 10000;
|
|
||||||
RealD resid = 1.0e-5;
|
|
||||||
|
|
||||||
RealD mass = -1.0;
|
|
||||||
|
|
||||||
LanczosParameters LanParams;
|
|
||||||
#if 1
|
|
||||||
{
|
|
||||||
XmlReader HMCrd("LanParams.xml");
|
|
||||||
read(HMCrd,"LanczosParameters",LanParams);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
{
|
|
||||||
LanParams.mass = mass;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
std::cout << GridLogMessage<< LanParams <<std::endl;
|
|
||||||
{
|
|
||||||
XmlWriter HMCwr("LanParams.xml.out");
|
|
||||||
write(HMCwr,"LanczosParameters",LanParams);
|
|
||||||
}
|
|
||||||
|
|
||||||
mass=LanParams.mass;
|
|
||||||
|
|
||||||
|
|
||||||
while ( mass > - 5.0){
|
|
||||||
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
|
|
||||||
MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
|
|
||||||
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
|
|
||||||
Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
|
|
||||||
|
|
||||||
std::vector<double> Coeffs{0, 1.};
|
|
||||||
Polynomial<FermionField> PolyX(Coeffs);
|
|
||||||
// Chebyshev<FermionField> Cheby(0.5, 60., 31);
|
|
||||||
// RealD, ChebyLow,
|
|
||||||
// RealD, ChebyHigh,
|
|
||||||
// Integer, ChebyOrder)
|
|
||||||
|
|
||||||
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
|
|
||||||
|
|
||||||
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
|
|
||||||
PlainHermOp<FermionField> Op (HermOp);
|
|
||||||
PlainHermOp<FermionField> Op2 (HermOp2);
|
|
||||||
|
|
||||||
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
|
|
||||||
|
|
||||||
std::vector<RealD> eval(Nm);
|
|
||||||
FermionField src(FGrid);
|
|
||||||
gaussian(RNG5, src);
|
|
||||||
std::vector<FermionField> evec(Nm, FGrid);
|
|
||||||
for (int i = 0; i < 1; i++) {
|
|
||||||
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
|
|
||||||
<< std::endl;
|
|
||||||
};
|
|
||||||
|
|
||||||
int Nconv;
|
|
||||||
IRL.calc(eval, evec, src, Nconv);
|
|
||||||
|
|
||||||
std::cout << mass <<" : " << eval << std::endl;
|
|
||||||
|
|
||||||
Gamma g5(Gamma::Algebra::Gamma5) ;
|
|
||||||
ComplexD dot;
|
|
||||||
FermionField tmp(FGrid);
|
|
||||||
for (int i = 0; i < Nstop ; i++) {
|
|
||||||
tmp = g5*evec[i];
|
|
||||||
dot = innerProduct(tmp,evec[i]);
|
|
||||||
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
|
|
||||||
}
|
|
||||||
src = evec[0]+evec[1]+evec[2];
|
|
||||||
mass += -0.1;
|
|
||||||
}
|
|
||||||
|
|
||||||
Grid_finalize();
|
|
||||||
}
|
|
Reference in New Issue
Block a user