Mirror of https://github.com/paboyle/Grid.git, synced 2024-11-10 07:55:35 +00:00

Merge branch 'feature/staggered-comms-compute' of https://github.com/paboyle/Grid into feature/staggered-comms-compute

Commit 4f4181c54a
@@ -51,7 +51,7 @@ namespace Grid {
virtual void Op (const Field &in, Field &out) = 0; // Abstract base
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0;
virtual void HermOp(const Field &in, Field &out)=0;
};
@@ -305,36 +305,59 @@ namespace Grid {
class SchurStaggeredOperator : public SchurOperatorBase<Field> {
protected:
Matrix &_Mat;
Field tmp;
RealD mass;
double tMpc;
double tIP;
double tMeo;
double taxpby_norm;
uint64_t ncall;
public:
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
void Report(void)
{
std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " HermOpAndNorm.IP "<< tIP /ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " Mpc.MeoMoe "<< tMeo/ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " Mpc.axpby_norm "<< taxpby_norm/ncall<<" usec "<<std::endl;
}
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
{
assert( _Mat.isTrivialEE() );
mass = _Mat.Mass();
tMpc=0;
tIP =0;
tMeo=0;
taxpby_norm=0;
ncall=0;
}
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
GridLogIterative.TimingMode(1);
std::cout << GridLogIterative << " HermOpAndNorm "<<std::endl;
ncall++;
tMpc-=usecond();
n2 = Mpc(in,out);
std::cout << GridLogIterative << " HermOpAndNorm.Mpc "<<std::endl;
tMpc+=usecond();
tIP-=usecond();
ComplexD dot= innerProduct(in,out);
std::cout << GridLogIterative << " HermOpAndNorm.innerProduct "<<std::endl;
tIP+=usecond();
n1 = real(dot);
}
virtual void HermOp(const Field &in, Field &out){
std::cout << GridLogIterative << " HermOp "<<std::endl;
Mpc(in,out);
ncall++;
tMpc-=usecond();
_Mat.Meooe(in,out);
_Mat.Meooe(out,tmp);
tMpc+=usecond();
taxpby_norm-=usecond();
axpby(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
}
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
Field tmp2(in._grid);

std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
_Mat.Mooee(in,out);
_Mat.Mooee(out,tmp);
std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;

tMeo-=usecond();
_Mat.Meooe(in,out);
_Mat.Meooe(out,tmp2);
std::cout << GridLogIterative << " HermOp.MeooeMeooe "<<std::endl;

RealD nn=axpy_norm(out,-1.0,tmp2,tmp);
std::cout << GridLogIterative << " HermOp.axpy_norm "<<std::endl;
_Mat.Meooe(out,tmp);
tMeo+=usecond();
taxpby_norm-=usecond();
RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
return nn;
}
virtual RealD MpcDag (const Field &in, Field &out){
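For orientation (a note on the hunk above, not part of the diff): with the standard even/odd checkerboard split and a trivial diagonal block Mee = Moo = m, which is what assert( _Mat.isTrivialEE() ) and mass = _Mat.Mass() rely on, the operator the two Meooe calls plus the axpby/axpby_norm assemble is

\[
M = \begin{pmatrix} m & M_{eo} \\ M_{oe} & m \end{pmatrix},
\qquad
M_{pc} = m^2 - M_{eo} M_{oe},
\]

so HermOpAndNorm hands back n2 = ||M_pc in||^2 from the fused axpby_norm inside Mpc, and n1 = Re<in, M_pc in> from the innerProduct that follows.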
@@ -70,7 +70,6 @@ class ConjugateGradient : public OperatorFunction<Field> {

Linop.HermOpAndNorm(psi, mmp, d, b);

r = src - mmp;
p = r;
@@ -97,6 +96,9 @@ class ConjugateGradient : public OperatorFunction<Field> {
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;

GridStopWatch LinalgTimer;
GridStopWatch InnerTimer;
GridStopWatch AxpyNormTimer;
GridStopWatch LinearCombTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
@@ -106,30 +108,32 @@ class ConjugateGradient : public OperatorFunction<Field> {
c = cp;

MatrixTimer.Start();
Linop.HermOpAndNorm(p, mmp, d, qq);
Linop.HermOp(p, mmp);
MatrixTimer.Stop();

LinalgTimer.Start();
// AA
// RealD qqck = norm2(mmp);
// ComplexD dck = innerProduct(p,mmp);

InnerTimer.Start();
ComplexD dc = innerProduct(p,mmp);
InnerTimer.Stop();
d = dc.real();
a = c / d;
b_pred = a * (a * qq - d) / c;

AxpyNormTimer.Start();
cp = axpy_norm(r, -a, mmp, r);
AxpyNormTimer.Stop();
b = cp / c;

// Fuse these loops ; should be really easy
psi = a * p + psi;
p = p * b + r;

LinearCombTimer.Start();
parallel_for(int ss=0;ss<src._grid->oSites();ss++){
vstream(psi[ss], a * p[ss] + psi[ss]);
vstream(p [ss], b * p[ss] + r[ss]);
}
LinearCombTimer.Stop();
LinalgTimer.Stop();

std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;

// Stopping condition
if (cp <= rsq) {
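The parallel_for above fuses the two expression-template updates psi = a*p + psi and p = p*b + r into a single sweep, so p and psi are each traversed once per iteration instead of twice. A stand-alone sketch of the same fusion over plain arrays (hypothetical helper, not Grid's vstream/SIMD machinery):

#include <complex>
#include <vector>

// Fuse the two CG vector updates into one pass over memory:
//   psi <- a*p + psi   (uses the old p)
//   p   <- b*p + r     (overwrites p afterwards)
void fused_cg_update(std::vector<std::complex<double>> &psi,
                     std::vector<std::complex<double>> &p,
                     const std::vector<std::complex<double>> &r,
                     std::complex<double> a, std::complex<double> b)
{
  for (std::size_t ss = 0; ss < psi.size(); ss++) {
    const std::complex<double> pold = p[ss];
    psi[ss] = a * pold + psi[ss];
    p[ss]   = b * pold + r[ss];
  }
}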
@@ -150,6 +154,9 @@ class ConjugateGradient : public OperatorFunction<Field> {
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;

if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
@@ -244,19 +244,11 @@ namespace Grid {

template<class sobj,class vobj> strong_inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.checkerboard = x.checkerboard;
conformable(ret,x);
conformable(x,y);
axpy(ret,a,x,y);
return norm2(ret);
return axpy_norm_fast(ret,a,x,y);
}
template<class sobj,class vobj> strong_inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.checkerboard = x.checkerboard;
conformable(ret,x);
conformable(x,y);
axpby(ret,a,b,x,y);
return norm2(ret); // FIXME implement parallel norm in ss loop
return axpby_norm_fast(ret,a,b,x,y);
}

}
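Both wrappers keep their original meaning,

  axpy_norm(ret,a,x,y):    ret = a*x + y,     return ||ret||^2
  axpby_norm(ret,a,b,x,y): ret = a*x + b*y,   return ||ret||^2

but after this change they delegate to axpy_norm_fast / axpby_norm_fast (defined with the reductions in the hunks below), which form the linear combination and accumulate the norm in a single pass instead of calling norm2(ret) separately.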
@@ -33,7 +33,7 @@ namespace Grid {
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
ComplexD nrm = innerProduct(arg,arg);
auto nrm = innerProduct(arg,arg);
return std::real(nrm);
}
@@ -43,31 +43,84 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
{
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
scalar_type nrm;

GridBase *grid = left._grid;

std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());

const int pad = 8;

ComplexD inner;
Vector<ComplexD> sumarray(grid->SumArraySize()*pad);

parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);

decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation
for(int ss=myoff;ss<mywork+myoff; ss++){
vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]);
vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
}
sumarray[thr]=TensorRemove(vnrm) ;
// All threads sum across SIMD; reduce serial work at end
// one write per cacheline with streaming store
ComplexD tmp = Reduce(TensorRemove(vinner)) ;
vstream(sumarray[thr*pad],tmp);
}

vector_type vvnrm; vvnrm=zero; // sum across threads
inner=0.0;
for(int i=0;i<grid->SumArraySize();i++){
vvnrm = vvnrm+sumarray[i];
inner = inner+sumarray[i*pad];
}
nrm = Reduce(vvnrm);// sum across simd
right._grid->GlobalSum(nrm);
return nrm;
right._grid->GlobalSum(inner);
return inner;
}

/////////////////////////
// Fast axpby_norm
// z = a x + b y
// return norm z
/////////////////////////
template<class sobj,class vobj> strong_inline RealD
axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
sobj one(1.0);
return axpby_norm_fast(z,a,one,x,y);
}

template<class sobj,class vobj> strong_inline RealD
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
const int pad = 8;
z.checkerboard = x.checkerboard;
conformable(z,x);
conformable(x,y);

typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
RealD nrm;

GridBase *grid = x._grid;

Vector<RealD> sumarray(grid->SumArraySize()*pad);

parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff);

// private to thread; sub summation
decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vobj tmp = a*x._odata[ss]+b*y._odata[ss];
vnrm = vnrm + innerProductD(tmp,tmp);
vstream(z._odata[ss],tmp);
}
vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
}

nrm = 0.0; // sum across threads; linear in thread count but fast
for(int i=0;i<grid->SumArraySize();i++){
nrm = nrm+sumarray[i*pad];
}
z._grid->GlobalSum(nrm);
return nrm;
}


template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
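In both reductions above, pad = 8 spaces the per-thread partial results eight slots apart (128 bytes for the ComplexD array, 64 bytes for the RealD one), so each thread's single vstream write lands on its own cache line and threads do not false-share; the serial loop at the end then only touches SumArraySize() entries. A stand-alone sketch of the same pattern in plain OpenMP (hypothetical function, not Grid's Vector/GridThread/GlobalSum machinery):

#include <complex>
#include <omp.h>
#include <vector>

// Per-thread partial sums, padded so each partial lives on its own cache line.
std::complex<double> padded_sum(const std::vector<std::complex<double>> &x)
{
  const int pad  = 8;                               // 8 * 16 B = 128 B >= one cache line
  const int nthr = omp_get_max_threads();
  std::vector<std::complex<double>> partial(nthr * pad, 0.0);

  #pragma omp parallel
  {
    const int thr = omp_get_thread_num();
    std::complex<double> acc = 0.0;                 // private accumulator
    #pragma omp for
    for (long ss = 0; ss < (long)x.size(); ss++) acc += x[ss];
    partial[thr * pad] = acc;                       // one write, one cache line per thread
  }

  std::complex<double> total = 0.0;                 // short serial sweep across threads
  for (int thr = 0; thr < nthr; thr++) total += partial[thr * pad];
  return total;
}

Grid's versions additionally reduce across SIMD lanes with Reduce(TensorRemove(...)) before the per-thread write, and finish with a GlobalSum across MPI ranks.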
@@ -49,7 +49,8 @@ inline double usecond(void) {

typedef std::chrono::system_clock GridClock;
typedef std::chrono::time_point<GridClock> GridTimePoint;
typedef std::chrono::milliseconds GridTime;
typedef std::chrono::milliseconds GridMillisecs;
typedef std::chrono::microseconds GridTime;
typedef std::chrono::microseconds GridUsecs;

inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
@@ -57,6 +58,11 @@ inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milli
stream << time.count()<<" ms";
return stream;
}
inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time)
{
stream << time.count()<<" usec";
return stream;
}

class GridStopWatch {
private:
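The typedef switch above changes what a GridTime difference prints as: GridTime is now std::chrono::microseconds and the new operator<< appends " usec". A minimal stand-alone illustration of the same std::chrono pattern (plain C++, not Grid's GridStopWatch class itself):

#include <chrono>
#include <iostream>

typedef std::chrono::system_clock GridClock;
typedef std::chrono::time_point<GridClock> GridTimePoint;
typedef std::chrono::microseconds GridTime;   // was std::chrono::milliseconds before this change

inline std::ostream & operator<<(std::ostream &stream, const GridTime &time)
{
  stream << time.count() << " usec";
  return stream;
}

int main()
{
  GridTimePoint start = GridClock::now();
  // ... timed work goes here ...
  GridTimePoint stop = GridClock::now();
  std::cout << "Elapsed " << std::chrono::duration_cast<GridTime>(stop - start) << std::endl;
  return 0;
}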
@@ -62,9 +62,12 @@ namespace Grid {
virtual RealD M (const FermionField &in, FermionField &out)=0;
virtual RealD Mdag (const FermionField &in, FermionField &out)=0;

// half checkerboard operaions
// Query the even even properties to make algorithmic decisions
virtual int ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field
virtual int isTrivialEE(void) { return 0; };
virtual RealD Mass(void) {return 0.0;};

// half checkerboard operaions
virtual void Meooe (const FermionField &in, FermionField &out)=0;
virtual void MeooeDag (const FermionField &in, FermionField &out)=0;
virtual void Mooee (const FermionField &in, FermionField &out)=0;
@@ -327,6 +327,7 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF

template <class Impl>
void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
DhopCalls+=2;
conformable(in._grid, _grid); // verifies full grid
conformable(in._grid, out._grid);
@@ -337,6 +338,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &

template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
DhopCalls+=1;
conformable(in._grid, _cbgrid); // verifies half grid
conformable(in._grid, out._grid); // drops the cb check
@@ -348,6 +350,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField

template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) {
DhopCalls+=1;
conformable(in._grid, _cbgrid); // verifies half grid
conformable(in._grid, out._grid); // drops the cb check
@@ -400,13 +403,18 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
int len = U._grid->oSites();
const int LLs = 1;

DhopTotalTime -= usecond();

DhopFaceTime -= usecond();
st.Prepare();
st.HaloGather(in,compressor);
st.CommsMergeSHM(compressor);
DhopFaceTime += usecond();

//////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA reasons.
//////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime -= usecond();
#pragma omp parallel
{
int tid = omp_get_thread_num();
@@ -448,10 +456,14 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
st.CommunicateThreaded();
}
}
DhopComputeTime += usecond();

// First to enter, last to leave timing
DhopFaceTime -= usecond();
st.CommsMerge(compressor);
DhopFaceTime -= usecond();

DhopComputeTime2 -= usecond();
if (dag == DaggerYes) {
int sz=st.surface_list.size();
parallel_for (int ss = 0; ss < sz; ss++) {
@@ -465,6 +477,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
}
}
DhopComputeTime2 += usecond();
#else
assert(0);
#endif
@@ -480,9 +493,14 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
{
assert((dag == DaggerNo) || (dag == DaggerYes));

DhopTotalTime -= usecond();

DhopCommTime -= usecond();
Compressor compressor;
st.HaloExchange(in, compressor);
DhopCommTime += usecond();

DhopComputeTime -= usecond();
if (dag == DaggerYes) {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
@@ -492,8 +510,65 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
}
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
};

////////////////////////////////////////////////////////////////
// Reporting
////////////////////////////////////////////////////////////////
template<class Impl>
void ImprovedStaggeredFermion<Impl>::Report(void)
{
std::vector<int> latt = GridDefaultLatt();
RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();

std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;

std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls : "
<< DhopCalls << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime /Calls : "
<< DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime /Calls : "
<< DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;

// Average the compute time
_grid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;

RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;

RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;

std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
}
template<class Impl>
void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
{
DhopCalls = 0;
DhopTotalTime = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
DhopFaceTime = 0;

Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
}

FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);

//AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);
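Reading off Report() above: DhopComputeTime is accumulated in microseconds, so dividing a flop count by it already yields Mflop/s, and the printed per-call rate is

\[
\mathrm{mflops} = \frac{1154 \times V \times N_{\mathrm{DhopCalls}}}{2 \times t_{\mathrm{compute}}}
\]

with V the global 4-d lattice volume and the factor of 2 the code's own "red black counting" correction (a full Dhop bumps DhopCalls by 2, the half-checkerboard DhopOE/DhopEO by 1 each, as the earlier hunks show); Fullmflops is the same expression with DhopTotalTime, i.e. including communication, in the denominator.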
@@ -49,6 +49,18 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }

////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;

///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
@@ -141,7 +153,8 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
// protected:
public:
// any other parameters of action ???

virtual int isTrivialEE(void) { return 1; };
virtual RealD Mass(void) { return mass; }
RealD mass;
RealD u0;
RealD c1;
@@ -283,14 +283,12 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
DhopTotalTime-=usecond();
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
DhopTotalTime+=usecond();
}

template<class Impl>
@@ -404,6 +402,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,


//double t1=usecond();
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
st.HaloExchange(in,compressor);
@@ -424,6 +423,12 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
//double t2=usecond();
//std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;

}
/*CHANGE END*/
@@ -177,6 +177,9 @@ namespace QCD {
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
public:

virtual int isTrivialEE(void) { return 1; };
virtual RealD Mass(void) { return mass; }

GridBase *_FourDimGrid;
GridBase *_FourDimRedBlackGrid;
@@ -135,6 +135,8 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {

// protected:
public:
virtual RealD Mass(void) { return mass; }
virtual int isTrivialEE(void) { return 1; };
RealD mass;

GridBase *_grid;
@@ -74,6 +74,11 @@ int main (int argc, char ** argv)

LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);

double volume=1;
for(int mu=0;mu<Nd;mu++){
volume=volume*latt_size[mu];
}

RealD mass=0.003;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
@@ -90,14 +95,26 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0);
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
FermionField src4d(UGrid); random(pRNG,src4d);
FermionField src4d_o(UrbGrid); pickCheckerboard(Odd,src4d_o,src4d);
FermionField result4d_o(UrbGrid);

double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + == 1146
result4d_o=zero;
CG(HermOp4d,src4d_o,result4d_o);
{
double t1=usecond();
CG(HermOp4d,src4d_o,result4d_o);
double t2=usecond();
double ncall=CG.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp4d.Report();
}
Ds4d.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
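For the record, the deodoe_flops constant used in these timing blocks works out to

  16 * (3 * (6 + 8 + 8)) + 15 * 3 * 2 = 16 * 66 + 90 = 1146 flops per site,

multiplied by the 4-d volume; and because t2 - t1 from usecond() is in microseconds, flops/(t2-t1) is already in Mflop/s, which is what the "mflop/s" lines print.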
@@ -106,7 +123,17 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
Ds.ZeroCounters();
result_o=zero;
CG(HermOp,src_o,result_o);
{
double t1=usecond();
CG(HermOp,src_o,result_o);
double t2=usecond();
double ncall=CG.IterationsToComplete*Ls;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp.Report();
}
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -115,7 +142,18 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
Ds.ZeroCounters();
result_o=zero;
mCG(HermOp,src_o,result_o);
{
double t1=usecond();
mCG(HermOp,src_o,result_o);
double t2=usecond();
double ncall=mCG.IterationsToComplete*Ls;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp.Report();
}

Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -124,7 +162,17 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
Ds.ZeroCounters();
result_o=zero;
BCGrQ(HermOp,src_o,result_o);
{
double t1=usecond();
BCGrQ(HermOp,src_o,result_o);
double t2=usecond();
double ncall=BCGrQ.IterationsToComplete*Ls;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
HermOp.Report();
}

Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -77,6 +77,12 @@ int main (int argc, char ** argv)
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;

double volume=1;
for(int mu=0;mu<Nd;mu++){
volume=volume*latt_size[mu];
}

ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0);
MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
@@ -93,7 +99,19 @@ int main (int argc, char ** argv)
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
FermionField src4d(UGrid); random(pRNG,src4d);
FermionField result4d(UGrid); result4d=zero;
CG(HermOp4d,src4d,result4d);

double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + == 1146
{
double t1=usecond();
CG(HermOp4d,src4d,result4d);
double t2=usecond();
double ncall=CG.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
}

std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -101,9 +119,18 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
result=zero;
{
Ds.ZeroCounters();
double t1=usecond();
CG(HermOp,src,result);
double t2=usecond();
double ncall=CG.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
Ds.Report();
}
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;

std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -111,7 +138,16 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
result=zero;
Ds.ZeroCounters();
{
double t1=usecond();
mCG(HermOp,src,result);
double t2=usecond();
double ncall=CG.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
}
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -120,7 +156,16 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
result=zero;
Ds.ZeroCounters();
{
double t1=usecond();
BCGrQ(HermOp,src,result);
double t2=usecond();
double ncall=CG.IterationsToComplete;
double flops = deodoe_flops * ncall;
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
}
Ds.Report();
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -83,7 +83,19 @@ int main (int argc, char ** argv)

SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
ConjugateGradient<FermionField> CG(1.0e-8,10000);
double t1=usecond();
CG(HermOpEO,src_o,res_o);
double t2=usecond();

// Schur solver: uses DeoDoe => volume * 1146
double ncall=CG.IterationsToComplete;
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146

std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;


FermionField tmp(&RBGrid);