1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-13 20:57:06 +01:00

Compare commits

..

2 Commits

Author SHA1 Message Date
dcfc4ed81f Merge 32e6d58356 into ba9bbe0221 2025-02-13 11:26:59 +00:00
32e6d58356 use accelerator for setCheckerboard in RHMC 2021-12-22 23:43:43 +00:00
43 changed files with 178 additions and 1632 deletions

View File

@ -191,7 +191,7 @@ public:
Lattice<sobj> pgbuf(&pencil_g);
autoView(pgbuf_v , pgbuf, CpuWrite);
//std::cout << "CPU view" << std::endl;
std::cout << "CPU view" << std::endl;
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@ -215,7 +215,7 @@ public:
else if ( sign == forward ) div = 1.0;
else assert(0);
//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@ -229,7 +229,7 @@ public:
}
// Barrel shift and collect global pencil
//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
std::cout << GridLogPerformance<<"Making pencil" << std::endl;
Coordinate lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
@ -251,7 +251,7 @@ public:
}
}
//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
@ -274,7 +274,7 @@ public:
usec += timer.useconds();
flops+= flops_call*NN;
//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
// writing out result
{
autoView(pgbuf_v,pgbuf,CpuRead);
@ -291,7 +291,7 @@ public:
}
result = result*div;
//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif

View File

@ -277,38 +277,6 @@ public:
assert(0);
}
};
template<class Matrix,class Field>
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD shift;
public:
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
out = out + shift*in;
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
out = out + shift * in;
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
out = out + shift * in;
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
assert(0);
}
void HermOp(const Field &in, Field &out){
assert(0);
}
};
//////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several

View File

@ -208,8 +208,8 @@ public:
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
//assert(OpB!=GridBLAS_OP_T);
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@ -367,67 +367,28 @@ public:
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else {
assert(0);
@ -453,8 +414,8 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
//assert(OpB!=GridBLAS_OP_T);
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@ -553,70 +514,28 @@ public:
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else {
assert(0);
@ -742,41 +661,29 @@ public:
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
});
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
}
@ -902,40 +809,28 @@ public:
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else {
assert(0);

View File

@ -245,10 +245,9 @@ until convergence
_HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n);
RealD na = std::sqrt(vnum/vden);
RealD na = vnum/vden;
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
@ -256,7 +255,6 @@ until convergence
src_n = tmp;
}
}
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm);

View File

@ -97,7 +97,7 @@ public:
RealD scale;
ConjugateGradient<FineField> CG(1.0e-3,400,false);
ConjugateGradient<FineField> CG(1.0e-2,100,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
@ -110,7 +110,7 @@ public:
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<4;i++){
for(int i=0;i<1;i++){
CG(hermop,noise,subspace[b]);
@ -146,7 +146,7 @@ public:
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
for(int i=0;i<2;i++){
for(int i=0;i<3;i++){
// void operator() (const Field &src, Field &psi){
#if 1
std::cout << GridLogMessage << " inverting on noise "<<std::endl;

View File

@ -441,20 +441,8 @@ public:
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
}
#else
//////////////////////////////////////////////////////////////////////
// Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
CoarsenOperator(linop,Subspace,Subspace);
}
//////////////////////////////////////////////////////////////////////
// Petrov - Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & U,
Aggregation<Fobj,CComplex,nbasis> & V)
{
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
@ -470,9 +458,11 @@ public:
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,V.subspace);
blockOrthogonalise(InnerProd,U.subspace);
blockOrthogonalise(InnerProd,Subspace.subspace);
// for(int s=0;s<Subspace.subspace.size();s++){
// std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
// }
const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions();
@ -552,7 +542,7 @@ public:
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
tphaseBZ-=usecond();
phaV = phaF[p]*V.subspace[i];
phaV = phaF[p]*Subspace.subspace[i];
tphaseBZ+=usecond();
/////////////////////////////////////////////////////////////////////
@ -565,7 +555,7 @@ public:
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
tproj-=usecond();
blockProject(coarseInner,MphaV,U.subspace);
blockProject(coarseInner,MphaV,Subspace.subspace);
coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner;

View File

@ -69,7 +69,7 @@ public:
}
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p, const _Tp& __val) { assert(0);};
void construct(pointer __p) { };
void destroy(pointer __p) { };
};

View File

@ -234,9 +234,6 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
if(bytes>=DeviceMaxBytes) {
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
}
assert(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){

View File

@ -149,8 +149,7 @@ public:
sizeof(obj),d*100+p);
}
if (!list.empty()) // avoid triggering assert in comms == none
CommsComplete(list);
CommsComplete(list);
for(int p=1;p<_processors[d];p++){
accum = accum + column[p];
}

View File

@ -438,15 +438,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(rrq);
off_node_bytes+=rbytes;
}
#ifdef NVLINK_GET
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif
}
// This is a NVLINK PUT
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
@ -455,11 +448,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(xrq);
off_node_bytes+=xbytes;
} else {
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
}
}
return off_node_bytes;
@ -468,7 +459,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
/*finishes Get/Put*/
acceleratorCopySynchronise();
if (nreq==0) return;
@ -755,31 +746,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
// int nreq=list.size();
std::vector<MPI_Status> status;
std::vector<MPI_Request> MpiRequests;
for(int r=0;r<list.size();r++){
// Must check each Send buf is clear to reuse
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
}
// if (nreq==0) return;
// std::vector<MPI_Status> status(nreq);
// std::vector<MPI_Request> MpiRequests(nreq);
int nreq=MpiRequests.size();
if (nreq>0) {
status.resize(MpiRequests.size());
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
assert(ierr==0);
}
// for(int r=0;r<nreq;r++){
// MpiRequests[r] = list[r].req;
// }
// int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
// assert(ierr==0);
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs

View File

@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{
assert(0);
}
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,

View File

@ -137,7 +137,7 @@ public:
///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void);
// static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes);
};

View File

@ -542,12 +542,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
// printf("Host buffer allocate for GPU non-aware MPI\n");
printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#if 0
#ifdef HAVE_NUMAIF_H
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
@ -916,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
bzero(dest,bytes);
#endif
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
// acceleratorCopyToDevice(src,dest,bytes);
//#else
// bcopy(src,dest,bytes);
//#endif
//}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorCopyToDevice(src,dest,bytes);
#else
bcopy(src,dest,bytes);
#endif
}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
@ -959,7 +959,6 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
}
ShmBufferFreeAll();
@ -990,7 +989,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
}
#endif
SharedMemoryTest();
//SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier
@ -1012,18 +1011,19 @@ void SharedMemory::SharedMemoryTest(void)
check[0]=GlobalSharedMemory::WorldNode;
check[1]=r;
check[2]=magic;
acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
}
}
ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
ShmBarrier();
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==magic);
ShmBarrier();
}
ShmBarrier();
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
}
void *SharedMemory::ShmBuffer(int rank)

View File

@ -126,8 +126,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int cb= (cbmask==0x2)? Odd : Even;
@ -244,6 +244,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);

View File

@ -55,7 +55,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
//copy offsets to device
acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@ -88,7 +88,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
exit(EXIT_FAILURE);
}
acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
//sync after copy
accelerator_barrier();

View File

@ -485,6 +485,7 @@ public:
assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier();
#ifdef NVLINK_GET
#warning "NVLINK_GET"
this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
// Or issue barrier AFTER the DMA is running

View File

@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
acceleratorSynchronise(); \
acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi);
@ -504,7 +504,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
autoView(st_v , st,AcceleratorRead);
if( interior && exterior ) {
acceleratorFenceComputeStream();
// acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
#ifndef GRID_CUDA

View File

@ -86,8 +86,13 @@ public:
assert(ForceE.Checkerboard()==Even);
assert(ForceO.Checkerboard()==Odd);
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorSetCheckerboard(Force,ForceE);
acceleratorSetCheckerboard(Force,ForceO);
#else
setCheckerboard(Force,ForceE);
setCheckerboard(Force,ForceO);
#endif
Force=-Force;
delete forcecb;
@ -130,8 +135,13 @@ public:
assert(ForceE.Checkerboard()==Even);
assert(ForceO.Checkerboard()==Odd);
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorSetCheckerboard(Force,ForceE);
acceleratorSetCheckerboard(Force,ForceO);
#else
setCheckerboard(Force,ForceE);
setCheckerboard(Force,ForceO);
#endif
Force=-Force;
delete forcecb;

View File

@ -446,7 +446,6 @@ public:
Communicate();
CommsMergeSHM(compress);
CommsMerge(compress);
accelerator_barrier();
}
template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
@ -519,6 +518,7 @@ public:
}
accelerator_barrier(); // All my local gathers are complete
#ifdef NVLINK_GET
#warning "NVLINK_GET"
_grid->StencilBarrier(); // He can now get mu local gather, I can get his
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
// Or issue barrier AFTER the DMA is running
@ -690,7 +690,6 @@ public:
}
}
}
// std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
surface_list.resize(surface_list_size);
std::vector<int> surface_list_host(surface_list_size);
int32_t ss=0;
@ -710,7 +709,7 @@ public:
}
}
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
// std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
}
/// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block)
@ -802,8 +801,8 @@ public:
this->_entries_host_p = &_entries[0];
this->_entries_p = &_entries_device[0];
// std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
// <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
<<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
for(int ii=0;ii<npoints;ii++){

View File

@ -242,33 +242,19 @@ inline void *acceleratorAllocDevice(size_t bytes)
return ptr;
};
typedef int acceleratorEvent_t;
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyToDevice(to,from,bytes, cudaMemcpyHostToDevice);
return 0;
}
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyFromDevice(from,to,bytes);
return 0;
}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
return 0;
}
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
//auto discard=cudaStreamSynchronize(ev);
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
inline int acceleratorIsCommunicable(void *ptr)
@ -377,8 +363,8 @@ inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *t
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
inline int acceleratorIsCommunicable(void *ptr)
@ -492,7 +478,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
inline void *acceleratorAllocHost(size_t bytes)
{
void *ptr=NULL;
auto err = hipHostMalloc((void **)&ptr,bytes);
auto err = hipMallocHost((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
@ -525,35 +511,23 @@ inline void *acceleratorAllocDevice(size_t bytes)
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
typedef int acceleratorEvent_t;
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
return 0;
}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
acceleratorCopyToDevice(from,to,bytes);
return 0;
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
}
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
acceleratorCopyFromDevice(from,to,bytes);
return 0;
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
}
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
// auto discard=hipStreamSynchronize(ev);
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
#endif
inline void acceleratorPin(void *ptr,unsigned long bytes)
@ -590,8 +564,6 @@ inline void acceleratorPin(void *ptr,unsigned long bytes)
#undef GRID_SIMT
typedef int acceleratorEvent_t;
inline void acceleratorMem(void)
{
/*
@ -611,12 +583,9 @@ inline void acceleratorMem(void)
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyToDevice(from,to,bytes); return 0; }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyFromDevice(from,to,bytes); return 0; }
inline void acceleratorEventWait(acceleratorEvent_t ev){}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); return 0;}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
inline void acceleratorCopySynchronise(void) {};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
@ -705,9 +674,9 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
acceleratorCopySynchronise();
}
template<class T> void acceleratorPut(T& dev,const T&host)
template<class T> void acceleratorPut(T& dev,T&host)
{
acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
acceleratorCopyToDevice(&host,&dev,sizeof(T));
}
template<class T> T acceleratorGet(T& dev)
{

View File

@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define thread_critical DO_PRAGMA(omp critical)
#ifdef GRID_OMP
inline void thread_bcopy(const void *from, void *to,size_t bytes)
inline void thread_bcopy(void *from, void *to,size_t bytes)
{
const uint64_t *ufrom = (const uint64_t *)from;
uint64_t *ufrom = (uint64_t *)from;
uint64_t *uto = (uint64_t *)to;
assert(bytes%8==0);
uint64_t words=bytes/8;
@ -84,7 +84,7 @@ inline void thread_bcopy(const void *from, void *to,size_t bytes)
});
}
#else
inline void thread_bcopy(const void *from, void *to,size_t bytes)
inline void thread_bcopy(void *from, void *to,size_t bytes)
{
bcopy(from,to,bytes);
}

View File

@ -509,14 +509,7 @@ void Grid_init(int *argc,char ***argv)
Grid_default_latt,
Grid_default_mpi);
if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
FlightRecorder::PrintEntireLog = 1;
FlightRecorder::ChecksumComms = 1;
FlightRecorder::ChecksumCommsSend=1;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
@ -658,4 +651,3 @@ void Grid_debug_handler_init(void)
}
NAMESPACE_END(Grid);

View File

@ -50,7 +50,7 @@ namespace Grid{
int64_t index64;
IndexFromCoorReversed(coor,index64,dims);
if ( index64>=2*1024*1024*1024LL ){
// std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
}
assert(index64<2*1024*1024*1024LL);
index = (int) index64;

View File

@ -492,18 +492,17 @@ public:
}
FGrid->Barrier();
double t1=usecond();
uint64_t no = 50;
uint64_t ni = 100;
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
for(uint64_t j=0;j<ni;j++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
Dw.DhopEO(src_o,r_e,DaggerNo);
t1=usecond();
t_time[i] = t1-t0;
}
@ -521,11 +520,11 @@ public:
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max*ni;
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean*ni;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
@ -536,7 +535,6 @@ public:
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
}
@ -656,19 +654,17 @@ public:
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
uint64_t no = 50;
uint64_t ni = 100;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
for(uint64_t j=0;j<ni;j++){
Ds.DhopEO(src_o,r_e,DaggerNo);
}
Ds.DhopEO(src_o,r_e,DaggerNo);
t1=usecond();
t_time[i] = t1-t0;
}
@ -679,11 +675,11 @@ public:
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max*ni;
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean*ni;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
@ -693,7 +689,6 @@ public:
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
}
@ -797,18 +792,19 @@ public:
Dc.M(src,r);
}
FGrid->Barrier();
uint64_t ni = 100;
uint64_t no = 50;
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
double t0=usecond();
for(uint64_t j=0;j<ni;j++){
Dc.M(src,r);
}
double t1=usecond();
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
Dc.M(src,r);
t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
@ -818,21 +814,20 @@ public:
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max*ni;
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean*ni;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call "<< timestat.mean/ni<<std::endl;
}
@ -877,7 +872,7 @@ int main (int argc, char ** argv)
int do_dslash=1;
int sel=4;
std::vector<int> L_list({8,12,16,24,32});
std::vector<int> L_list({8,12,16,24});
int selm1=sel-1;
std::vector<double> clover;

View File

@ -1,19 +1,18 @@
#Ahead of time compile for PVC
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"
#JIT compile
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
../configure \
../../configure \
--enable-simd=GPU \
--enable-reduction=grid \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-debug \
--prefix $HOME/gpt-install \
--disable-gparity \
--disable-fermion-reps \
--with-lime=$CLIME \

View File

@ -1,22 +0,0 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"

View File

@ -1,16 +0,0 @@
echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
#module load cce/15.0.1
module load rocm/6.3.1
module load cray-fftw
module load craype-accel-amd-gfx90a
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Ugly hacks to get down level software working on current system
#export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
#ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .

View File

@ -30,10 +30,14 @@ source ${root}/sourceme.sh
export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#64.64.32.96
for vol in 64.64.32.64
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
for vol in 32.32.32.64
do
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol -Ls 16
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
done

View File

@ -3,19 +3,20 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=none \
--enable-tracing=timer \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--enable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"

View File

@ -1,25 +1,12 @@
echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
module load cce/15.0.1
module load rocm/5.3.0
spack load c-lime
module load emacs
module load PrgEnv-gnu
module load rocm/6.0.0
module load cray-mpich
module load gmp
module load cray-fftw
module load craype-accel-amd-gfx90a
#Ugly hacks to get down level software working on current system
export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
#echo spack load c-lime
#spack load c-lime
#module load emacs
##module load PrgEnv-gnu
##module load cray-mpich
##module load cray-fftw
##module load craype-accel-amd-gfx90a
##export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Hack for lib
##export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH

View File

@ -1,206 +0,0 @@
The purpose of this file is to collate all non-obvious known magic shell variables
and compiler flags required for either correctness or performance on various systems.
A repository of work-arounds.
Contents:
1. Interconnect + MPI
2. Compilation
3. Profiling
************************
* 1. INTERCONNECT + MPI
************************
--------------------------------------------------------------------
MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O
--------------------------------------------------------------------
export OMPI_MCA_io=romio321
--------------------------------------
ROMIO fail with > 2GB per node read (32 bit issue)
--------------------------------------
Use later MPICH
https://github.com/paboyle/Grid/issues/381
https://github.com/pmodels/mpich/commit/3a479ab0
--------------------------------------------------------------------
Slingshot: Frontier and Perlmutter libfabric slow down
and physical memory fragmentation
--------------------------------------------------------------------
export FI_MR_CACHE_MONITOR=disabled
or
export FI_MR_CACHE_MONITOR=kdreg2
--------------------------------------------------------------------
Perlmutter
--------------------------------------------------------------------
export MPICH_RDMA_ENABLED_CUDA=1
export MPICH_GPU_IPC_ENABLED=1
export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
export MPICH_GPU_NO_ASYNC_MEMCPY=0
--------------------------------------------------------------------
Frontier/LumiG
--------------------------------------------------------------------
Hiding ROCR_VISIBLE_DEVICES triggers SDMA engines to be used for GPU-GPU
cat << EOF > select_gpu
#!/bin/bash
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export GPU_MAP=(0 1 2 3 7 6 5 4)
export NUMA_MAP=(3 3 1 1 2 2 0 0)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
export HIP_VISIBLE_DEVICES=\$GPU
unset ROCR_VISIBLE_DEVICES
echo RANK \$SLURM_LOCALID using GPU \$GPU
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./select_gpu
srun ./select_gpu BINARY
--------------------------------------------------------------------
Mellanox performance with A100 GPU (Tursa, Booster, Leonardo)
--------------------------------------------------------------------
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
--------------------------------------------------------------------
Mellanox + A100 correctness (Tursa, Booster, Leonardo)
--------------------------------------------------------------------
export UCX_MEMTYPE_CACHE=n
--------------------------------------------------------------------
MPICH/Aurora/PVC correctness and performance
--------------------------------------------------------------------
https://github.com/pmodels/mpich/issues/7302
--enable-cuda-aware-mpi=no
--enable-unified=no
Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI
Do not use SVM
Ideally use MPICH with fix to issue 7302:
https://github.com/pmodels/mpich/pull/7312
Ideally:
MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
Alternatives:
export MPIR_CVAR_NOLOCAL=1
export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000
--------------------------------------------------------------------
MPICH/Aurora/PVC correctness and performance
--------------------------------------------------------------------
Broken:
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
This gives good peformance without requiring
--enable-cuda-aware-mpi=no
But is an open issue reported by James Osborn
https://github.com/pmodels/mpich/issues/7139
Possibly resolved but unclear if in the installed software yet.
************************
* 2. COMPILATION
************************
--------------------------------------------------------------------
G++ compiler breakage / graveyard
--------------------------------------------------------------------
9.3.0, 10.3.1,
https://github.com/paboyle/Grid/issues/290
https://github.com/paboyle/Grid/issues/264
Working (-) Broken (X):
4.9.0 -
4.9.1 -
5.1.0 X
5.2.0 X
5.3.0 X
5.4.0 X
6.1.0 X
6.2.0 X
6.3.0 -
7.1.0 -
8.0.0 (HEAD) -
https://github.com/paboyle/Grid/issues/100
--------------------------------------------------------------------
AMD GPU nodes :
--------------------------------------------------------------------
multiple ROCM versions broken; use 5.3.0
manifests itself as wrong results in fp32
https://github.com/paboyle/Grid/issues/464
--------------------------------------------------------------------
Aurora/PVC
--------------------------------------------------------------------
SYCL ahead of time compilation (fixes rare runtime JIT errors and faster runtime, PB)
SYCL slow link and relocatable code issues (Christoph Lehner)
Opt large register file required for good performance in fp64
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fPIC"
--------------------------------------------------------------------
Aurora/PVC useful extra options
--------------------------------------------------------------------
Host only sanitizer:
-Xarch_host -fsanitize=leak
-Xarch_host -fsanitize=address
Deterministic MPI reduction:
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
************************
* 3. Visual profile tools
************************
--------------------------------------------------------------------
Frontier/rocprof
--------------------------------------------------------------------
--------------------------------------------------------------------
Aurora/unitrace
--------------------------------------------------------------------
--------------------------------------------------------------------
Tursa/nsight-sys
--------------------------------------------------------------------

View File

@ -1,32 +0,0 @@
#!/bin/bash
#SBATCH --partition lqcd
#SBATCH --time=00:50:00
#SBATCH -A lqcdtest
#SBATCH -q lqcd
#SBATCH --exclusive
#SBATCH --nodes=1
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=64
#SBATCH --qos lqcd
source sourceme.sh
export PLACES=(1:16:4 1:32:2 0:64:1);
export THR=(16 32 64)
for t in 2
do
export OMP_NUM_THREADS=${THR[$t]}
export OMP_PLACES=${PLACES[$t]}
export thr=${THR[$t]}
#for vol in 24.24.24.24 32.32.32.32 48.48.48.96
for vol in 48.48.48.96
do
srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr
done
#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr
done

View File

@ -1,36 +0,0 @@
#!/bin/bash
#SBATCH --partition lqcd
#SBATCH --time=00:50:00
#SBATCH -A lqcdtest
#SBATCH -q lqcd
#SBATCH --exclusive
#SBATCH --nodes=2
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=64
#SBATCH --qos lqcd
source sourceme.sh
export PLACES=(1:16:4 1:32:2 0:64:1);
export THR=(16 32 64)
nodes=2
mpi=1.1.1.2
for t in 2
do
export OMP_NUM_THREADS=${THR[$t]}
export OMP_PLACES=${PLACES[$t]}
export thr=${THR[$t]}
#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr
for vol in 64.64.64.128
do
srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr
done
done

View File

@ -1,16 +0,0 @@
../../configure \
--enable-comms=mpi-auto \
--enable-unified=yes \
--enable-shm=shmopen \
--enable-shm-fast-path=shmopen \
--enable-accelerator=none \
--enable-simd=AVX512 \
--disable-accelerator-cshift \
--disable-fermion-reps \
--disable-gparity \
CXX=clang++ \
MPICXX=mpicxx \
CXXFLAGS="-std=c++17"

View File

@ -1,4 +0,0 @@
source $HOME/spack/share/spack/setup-env.sh
spack load llvm@17.0.4
export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
module load openmpi

View File

@ -1,17 +0,0 @@
../../src/Grid/configure \
--prefix /home/pab/NPR/install \
--enable-comms=mpi-auto \
--enable-simd=AVX2 \
--enable-shm=none \
--enable-debug \
--with-lime=$CLIME \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--with-gmp=$GMP \
--with-mpfr=$MPFR \
--disable-gparity \
--disable-fermion-reps \
CXX=clang++ \
MPICXX=mpicxx \
CXXFLAGS="-std=c++17 "

View File

@ -1,28 +0,0 @@
source $HOME/spack/share/spack/setup-env.sh
spack load llvm@12
spack load autoconf%clang@12.0.1
spack load automake%clang@12.0.1
spack load c-lime%clang@12.0.1
spack load fftw%clang@12.0.1
spack load gmp%clang@12.0.1
spack load mpfr%clang@12.0.1
spack load openmpi%clang@12.0.1
spack load openssl%clang@12.0.1
spack load hdf5+cxx%clang@12.0.1
spack load cmake%clang@12.0.1
export FFTW=`spack find --paths fftw%clang@12.0.1 | grep ^fftw | awk '{print $2}' `
export HDF5=`spack find --paths hdf5+cxx%clang@12.0.1 | grep ^hdf5 | awk '{print $2}' `
export CLIME=`spack find --paths c-lime%clang@12.0.1 | grep ^c-lime | awk '{print $2}' `
export MPFR=`spack find --paths mpfr%clang@12.0.1 | grep ^mpfr | awk '{print $2}' `
export LLVM=`spack find --paths llvm@12 | grep ^llvm | awk '{print $2}' `
export OPENSSL=`spack find --paths openssl%clang@12.0.1 | grep openssl | awk '{print $2}' `
export GMP=`spack find --paths gmp%clang@12.0.1 | grep ^gmp | awk '{print $2}' `
export TCLAP=`spack find --paths tclap%clang@12.0.1 | grep ^tclap | awk '{print $2}' `
export LD_LIBRARY_PATH=${TCLAP}/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$MPFR/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$GMP/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$FFTW/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LLVM/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LLVM/lib/x86_64-unknown-linux-gnu/:$LD_LIBRARY_PATH
ulimit -s 81920

View File

@ -1,19 +0,0 @@
cd
git clone https://github.com/spack/spack.git
source $HOME/spack/share/spack/setup-env.sh
spack install llvm@12
spack install autoconf%clang@12.0.1
spack install automake%clang@12.0.1
spack install c-lime%clang@12.0.1
spack install fftw%clang@12.0.1
spack install gmp%clang@12.0.1
spack install mpfr%clang@12.0.1
spack install openmpi%clang@12.0.1
spack install openssl%clang@12.0.1
spack install hdf5+cxx%clang@12.0.1
spack install cmake%clang@12.0.1
spack install tclap%clang@12.0.1
spack install emacs%clang@12.0.1

View File

@ -47,20 +47,20 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
void Op (const Field &in, Field &out){
// std::cout << "Op: PVdag M "<<std::endl;
std::cout << "Op: PVdag M "<<std::endl;
Field tmp(in.Grid());
_Mat.M(in,tmp);
_PV.Mdag(tmp,out);
}
void AdjOp (const Field &in, Field &out){
// std::cout << "AdjOp: Mdag PV "<<std::endl;
std::cout << "AdjOp: Mdag PV "<<std::endl;
Field tmp(in.Grid());
_PV.M(in,tmp);
_Mat.Mdag(tmp,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){
// std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
Field tmp(in.Grid());
// _Mat.M(in,tmp);
// _PV.Mdag(tmp,out);
@ -83,14 +83,14 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
void Op (const Field &in, Field &out){
// std::cout << "Op: PVdag M "<<std::endl;
std::cout << "Op: PVdag M "<<std::endl;
Field tmp(in.Grid());
_Mat.M(in,tmp);
_PV.Mdag(tmp,out);
out = out + shift * in;
}
void AdjOp (const Field &in, Field &out){
// std::cout << "AdjOp: Mdag PV "<<std::endl;
std::cout << "AdjOp: Mdag PV "<<std::endl;
Field tmp(in.Grid());
_PV.M(tmp,out);
_Mat.Mdag(in,tmp);
@ -98,7 +98,7 @@ public:
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){
// std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
Field tmp(in.Grid());
Op(in,tmp);
AdjOp(tmp,out);

View File

@ -1,14 +0,0 @@
<?xml version="1.0"?>
<grid>
<LanczosParameters>
<mass>0.00107</mass>
<M5>1.8</M5>
<Ls>48</Ls>
<Nstop>10</Nstop>
<Nk>15</Nk>
<Np>85</Np>
<ChebyLow>0.003</ChebyLow>
<ChebyHigh>60</ChebyHigh>
<ChebyOrder>201</ChebyOrder>
</LanczosParameters>
</grid>

View File

@ -1,346 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_G5R5.cc
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
From Duo and Bob's Chirality study
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
;
//typedef WilsonFermionD FermionOp;
typedef DomainWallFermionD FermionOp;
typedef typename DomainWallFermionD::FermionField FermionField;
RealD AllZero(RealD x) { return 0.; }
namespace Grid {
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, M5 ,
Integer, Ls,
Integer, Nstop,
Integer, Nk,
Integer, Np,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Integer, StartTrajectory,
// Integer, Trajectories, /* @brief Number of sweeps in this run */
// bool, MetropolisTest,
// Integer, NoMetropolisUntil,
// std::string, StartingType,
// Integer, SW,
// RealD, Kappa,
// IntegratorParameters, MD)
LanczosParameters() {
////////////////////////////// Default values
mass = 0;
// MetropolisTest = true;
// NoMetropolisUntil = 10;
// StartTrajectory = 0;
// SW = 2;
// Trajectories = 10;
// StartingType = "HotStart";
/////////////////////////////////
}
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
LanczosParameters LanParams;
#if 1
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
#else
{
LanParams.mass = mass;
}
#endif
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
int Ls=16;
RealD M5=1.8;
RealD mass = -1.0;
mass=LanParams.mass;
Ls=LanParams.Ls;
M5=LanParams.M5;
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian* UrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
// GridCartesian* FGrid = UGrid;
// GridRedBlackCartesian* FrbGrid = UrbGrid;
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5rb(FrbGrid); RNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
std::string file("./config");
int precision32 = 0;
int tworow = 0;
NerscIO::readConfiguration(Umu,header,file);
/*
std::vector<LatticeColourMatrix> U(4, UGrid);
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
*/
int Nstop = 10;
int Nk = 20;
int Np = 80;
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
//while ( mass > - 5.0){
FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <-----
// Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf);
// Gamma5R5HermitianLinearOperator
std::vector<double> Coeffs{0, 1.};
Polynomial<FermionField> PolyX(Coeffs);
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
PlainHermOp<FermionField> Op (HermOp);
PlainHermOp<FermionField> Op2 (G5R5Herm);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
<< std::endl;
};
int Nconv;
IRL.calc(eval, evec, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
#if 0
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
FermionField tmp(FGrid);
// RealD eMe,eMMe;
for (int i = 0; i < Nstop ; i++) {
// tmp = g5*evec[i];
dot = innerProduct(evec[i],evec[i]);
// G5R5(tmp,evec[i]);
G5R5Herm.HermOpAndNorm(evec[i],tmp,eMe,eMMe);
std::cout <<"Norm "<<M5<<" "<< mass << " : " << i << " " << real(dot) << " " << imag(dot) << " "<< eMe << " " <<eMMe<< std::endl ;
for (int j = 0; j < Nstop ; j++) {
dot = innerProduct(tmp,evec[j]);
std::cout <<"G5R5 "<<M5<<" "<< mass << " : " << i << " " <<j<<" " << real(dot) << " " << imag(dot) << std::endl ;
}
}
// src = evec[0]+evec[1]+evec[2];
// mass += -0.1;
#endif
//**********************************************************************
//orthogonalization
//calculat the matrix
cout << "Start orthogonalization " << endl;
cout << "calculate the matrix element" << endl;
vector<LatticeFermion> G5R5Mevec(Nconv, FGrid);
vector<LatticeFermion> finalevec(Nconv, FGrid);
vector<RealD> eMe(Nconv), eMMe(Nconv);
for(int i = 0; i < Nconv; i++){
G5R5Herm.HermOpAndNorm(evec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
}
cout << "Re<evec, G5R5M(evec)>: " << endl;
cout << eMe << endl;
cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
cout << eMMe << endl;
vector<vector<ComplexD>> VevecG5R5Mevec(Nconv);
Eigen::MatrixXcd evecG5R5Mevec = Eigen::MatrixXcd::Zero(Nconv, Nconv);
for(int i = 0; i < Nconv; i++){
VevecG5R5Mevec[i].resize(Nconv);
for(int j = 0; j < Nconv; j++){
VevecG5R5Mevec[i][j] = innerProduct(evec[i], G5R5Mevec[j]);
evecG5R5Mevec(i, j) = VevecG5R5Mevec[i][j];
}
}
//calculate eigenvector
cout << "Eigen solver" << endl;
Eigen::SelfAdjointEigenSolver<Eigen::MatrixXcd> eigensolver(evecG5R5Mevec);
vector<RealD> eigeneval(Nconv);
vector<vector<ComplexD>> eigenevec(Nconv);
for(int i = 0; i < Nconv; i++){
eigeneval[i] = eigensolver.eigenvalues()[i];
eigenevec[i].resize(Nconv);
for(int j = 0; j < Nconv; j++){
eigenevec[i][j] = eigensolver.eigenvectors()(i, j);
}
}
//rotation
cout << "Do rotation" << endl;
for(int i = 0; i < Nconv; i++){
finalevec[i] = finalevec[i] - finalevec[i];
for(int j = 0; j < Nconv; j++){
finalevec[i] = eigenevec[j][i]*evec[j] + finalevec[i];
}
}
//normalize again;
for(int i = 0; i < Nconv; i++){
RealD tmp_RealD = norm2(finalevec[i]);
tmp_RealD = 1./pow(tmp_RealD, 0.5);
finalevec[i] = finalevec[i]*tmp_RealD;
}
//check
for(int i = 0; i < Nconv; i++){
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
}
//**********************************************************************
//sort the eigenvectors
vector<LatticeFermion> finalevec_copy(Nconv, FGrid);
for(int i = 0; i < Nconv; i++){
finalevec_copy[i] = finalevec[i];
}
vector<RealD> eMe_copy(eMe);
for(int i = 0; i < Nconv; i++){
eMe[i] = fabs(eMe[i]);
eMe_copy[i] = eMe[i];
}
sort(eMe_copy.begin(), eMe_copy.end());
for(int i = 0; i < Nconv; i++){
for(int j = 0; j < Nconv; j++){
if(eMe[j] == eMe_copy[i]){
finalevec[i] = finalevec_copy[j];
}
}
}
for(int i = 0; i < Nconv; i++){
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
}
cout << "Re<evec, G5R5M(evec)>: " << endl;
cout << eMe << endl;
cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
cout << eMMe << endl;
// vector<LatticeFermion> finalevec(Nconv, FGrid);
// temporary, until doing rotation
// for(int i = 0; i < Nconv; i++)
// finalevec[i]=evec[i];
//**********************************************************************
//calculate chirality matrix
vector<LatticeFermion> G5evec(Nconv, FGrid);
vector<vector<ComplexD>> chiral_matrix(Nconv);
vector<vector<RealD>> chiral_matrix_real(Nconv);
for(int i = 0; i < Nconv; i++){
// G5evec[i] = G5evec[i] - G5evec[i];
G5evec[i] = Zero();
for(int j = 0; j < Ls/2; j++){
axpby_ssp(G5evec[i], 1., finalevec[i], 0., G5evec[i], j, j);
}
for(int j = Ls/2; j < Ls; j++){
axpby_ssp(G5evec[i], -1., finalevec[i], 0., G5evec[i], j, j);
}
}
for(int i = 0; i < Nconv; i++){
chiral_matrix_real[i].resize(Nconv);
chiral_matrix[i].resize(Nconv);
for(int j = 0; j < Nconv; j++){
chiral_matrix[i][j] = innerProduct(finalevec[i], G5evec[j]);
chiral_matrix_real[i][j] = abs(chiral_matrix[i][j]);
std::cout <<" chiral_matrix_real "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
}
}
for(int i = 0; i < Nconv; i++){
if(chiral_matrix[i][i].real() < 0.){
chiral_matrix_real[i][i] = -1. * chiral_matrix_real[i][i];
}
}
Grid_finalize();
}

View File

@ -1,278 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_lanczos.cc
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
;
typedef WilsonFermionD FermionOp;
typedef typename WilsonFermionD::FermionField FermionField;
RealD AllZero(RealD x) { return 0.; }
namespace Grid {
#if 0
template<typename Field>
class RationalHermOp : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
// OperatorFunction<Field> & _poly;
LinearOperatorBase<Field> &_Linop;
RealD _massDen, _massNum;
FunctionHermOp(LinearOperatorBase<Field>& linop, RealD massDen,RealD massNum)
: _Linop(linop) ,_massDen(massDen),_massNum(massNum) {};
void operator()(const Field& in, Field& out) {
// _poly(_Linop,in,out);
}
};
#endif
template<class Matrix,class Field>
class InvG5LinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD _num;
RealD _Tol;
Integer _MaxIt;
Gamma g5;
public:
InvG5LinearOperator(Matrix &Mat,RealD num): _Mat(Mat),_num(num), _Tol(1e-12),_MaxIt(10000), g5(Gamma::Algebra::Gamma5) {};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
assert(0);
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
assert(0);
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
assert(0);
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
Field tmp(in.Grid());
MdagMLinearOperator<Matrix,Field> denom(_Mat);
ConjugateGradient<Field> CG(_Tol,_MaxIt);
_Mat.M(in,tmp);
tmp += _num*in;
_Mat.Mdag(tmp,out);
CG(denom,out,tmp);
out = g5*tmp;
}
};
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, resid,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Integer, StartTrajectory,
// Integer, Trajectories, /* @brief Number of sweeps in this run */
// bool, MetropolisTest,
// Integer, NoMetropolisUntil,
// std::string, StartingType,
// Integer, SW,
// RealD, Kappa,
// IntegratorParameters, MD)
LanczosParameters() {
////////////////////////////// Default values
mass = 0;
// MetropolisTest = true;
// NoMetropolisUntil = 10;
// StartTrajectory = 0;
// SW = 2;
// Trajectories = 10;
// StartingType = "HotStart";
/////////////////////////////////
}
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian* UrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian* FGrid = UGrid;
GridRedBlackCartesian* FrbGrid = UrbGrid;
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid);
RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5rb(FrbGrid);
RNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
// SU<Nc>::HotConfiguration(RNG4, Umu);
FieldMetaData header;
std::string file("./config");
int precision32 = 0;
int tworow = 0;
// NerscIO::writeConfiguration(Umu,file,tworow,precision32);
NerscIO::readConfiguration(Umu,header,file);
/*
std::vector<LatticeColourMatrix> U(4, UGrid);
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
*/
int Nstop = 5;
int Nk = 10;
int Np = 90;
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
RealD mass = -1.0;
LanczosParameters LanParams;
#if 1
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
#else
{
LanParams.mass = mass;
}
#endif
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
mass=LanParams.mass;
resid=LanParams.resid;
while ( mass > - 5.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass);
InvG5LinearOperator<FermionOp,LatticeFermion> HermOp(WilsonOperator,-2.); /// <-----
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
// Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
std::vector<double> Coeffs{0, 0, 1.};
Polynomial<FermionField> PolyX(Coeffs);
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
// InvHermOp<FermionField> Op(WilsonOperator,HermOp);
PlainHermOp<FermionField> Op (HermOp);
// PlainHermOp<FermionField> Op2 (HermOp2);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
<< std::endl;
};
int Nconv;
IRL.calc(eval, evec, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
FermionField tmp(FGrid);
for (int i = 0; i < Nstop ; i++) {
tmp = g5*evec[i];
dot = innerProduct(tmp,evec[i]);
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
}
src = evec[0]+evec[1]+evec[2];
mass += -0.1;
}
Grid_finalize();
}

View File

@ -1,211 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_lanczos.cc
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
;
typedef WilsonFermionD FermionOp;
typedef typename WilsonFermionD::FermionField FermionField;
RealD AllZero(RealD x) { return 0.; }
namespace Grid {
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Integer, StartTrajectory,
// Integer, Trajectories, /* @brief Number of sweeps in this run */
// bool, MetropolisTest,
// Integer, NoMetropolisUntil,
// std::string, StartingType,
// Integer, SW,
// RealD, Kappa,
// IntegratorParameters, MD)
LanczosParameters() {
////////////////////////////// Default values
mass = 0;
// MetropolisTest = true;
// NoMetropolisUntil = 10;
// StartTrajectory = 0;
// SW = 2;
// Trajectories = 10;
// StartingType = "HotStart";
/////////////////////////////////
}
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian* UrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian* FGrid = UGrid;
GridRedBlackCartesian* FrbGrid = UrbGrid;
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid);
RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5rb(FrbGrid);
RNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
// SU<Nc>::HotConfiguration(RNG4, Umu);
FieldMetaData header;
std::string file("./config");
int precision32 = 0;
int tworow = 0;
// NerscIO::writeConfiguration(Umu,file,tworow,precision32);
NerscIO::readConfiguration(Umu,header,file);
/*
std::vector<LatticeColourMatrix> U(4, UGrid);
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
*/
int Nstop = 10;
int Nk = 20;
int Np = 80;
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
RealD mass = -1.0;
LanczosParameters LanParams;
#if 1
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
#else
{
LanParams.mass = mass;
}
#endif
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
mass=LanParams.mass;
while ( mass > - 5.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
std::vector<double> Coeffs{0, 1.};
Polynomial<FermionField> PolyX(Coeffs);
// Chebyshev<FermionField> Cheby(0.5, 60., 31);
// RealD, ChebyLow,
// RealD, ChebyHigh,
// Integer, ChebyOrder)
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
PlainHermOp<FermionField> Op (HermOp);
PlainHermOp<FermionField> Op2 (HermOp2);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
<< std::endl;
};
int Nconv;
IRL.calc(eval, evec, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
FermionField tmp(FGrid);
for (int i = 0; i < Nstop ; i++) {
tmp = g5*evec[i];
dot = innerProduct(tmp,evec[i]);
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
}
src = evec[0]+evec[1]+evec[2];
mass += -0.1;
}
Grid_finalize();
}