mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

Merge branch 'feature/staggered-comms-compute' of https://github.com/paboyle/Grid into feature/staggered-comms-compute

Azusa Yamaguchi 2018-05-02 14:59:13 +01:00
commit 4f4181c54a
14 changed files with 355 additions and 68 deletions

View File

@@ -51,7 +51,7 @@ namespace Grid {
     virtual void Op     (const Field &in, Field &out) = 0; // Abstract base
     virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base
-    virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
+    virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0;
     virtual void HermOp(const Field &in, Field &out)=0;
   };
@@ -305,36 +305,59 @@ namespace Grid {
   class SchurStaggeredOperator : public SchurOperatorBase<Field> {
    protected:
     Matrix &_Mat;
+    Field tmp;
+    RealD mass;
+    double tMpc;
+    double tIP;
+    double tMeo;
+    double taxpby_norm;
+    uint64_t ncall;
    public:
-    SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
+    void Report(void)
+    {
+      std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
+      std::cout << GridLogMessage << " HermOpAndNorm.IP "<< tIP /ncall<<" usec "<<std::endl;
+      std::cout << GridLogMessage << " Mpc.MeoMoe "<< tMeo/ncall<<" usec "<<std::endl;
+      std::cout << GridLogMessage << " Mpc.axpby_norm "<< taxpby_norm/ncall<<" usec "<<std::endl;
+    }
+    SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
+    {
+      assert( _Mat.isTrivialEE() );
+      mass = _Mat.Mass();
+      tMpc=0;
+      tIP =0;
+      tMeo=0;
+      taxpby_norm=0;
+      ncall=0;
+    }
     virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-      GridLogIterative.TimingMode(1);
-      std::cout << GridLogIterative << " HermOpAndNorm "<<std::endl;
+      ncall++;
+      tMpc-=usecond();
      n2 = Mpc(in,out);
-      std::cout << GridLogIterative << " HermOpAndNorm.Mpc "<<std::endl;
+      tMpc+=usecond();
+      tIP-=usecond();
      ComplexD dot= innerProduct(in,out);
-      std::cout << GridLogIterative << " HermOpAndNorm.innerProduct "<<std::endl;
+      tIP+=usecond();
      n1 = real(dot);
    }
    virtual void HermOp(const Field &in, Field &out){
-      std::cout << GridLogIterative << " HermOp "<<std::endl;
-      Mpc(in,out);
+      ncall++;
+      tMpc-=usecond();
+      _Mat.Meooe(in,out);
+      _Mat.Meooe(out,tmp);
+      tMpc+=usecond();
+      taxpby_norm-=usecond();
+      axpby(out,-1.0,mass*mass,tmp,in);
+      taxpby_norm+=usecond();
    }
    virtual RealD Mpc (const Field &in, Field &out) {
-      Field tmp(in._grid);
-      Field tmp2(in._grid);
-      std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
-      _Mat.Mooee(in,out);
-      _Mat.Mooee(out,tmp);
-      std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
+      tMeo-=usecond();
      _Mat.Meooe(in,out);
-      _Mat.Meooe(out,tmp2);
-      std::cout << GridLogIterative << " HermOp.MeooeMeooe "<<std::endl;
-      RealD nn=axpy_norm(out,-1.0,tmp2,tmp);
-      std::cout << GridLogIterative << " HermOp.axpy_norm "<<std::endl;
+      _Mat.Meooe(out,tmp);
+      tMeo+=usecond();
+      taxpby_norm-=usecond();
+      RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
+      taxpby_norm+=usecond();
      return nn;
    }
    virtual RealD MpcDag (const Field &in, Field &out){
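
The algebra behind this rewrite, as a sketch (implied by the code rather than stated in the commit): `isTrivialEE()` asserts that the even-even and odd-odd blocks of the checkerboarded staggered matrix are just the mass term, so the Schur-style normal operator needs no `Mooee` applications:

\[
M \;=\; \begin{pmatrix} m\,\mathbb{1} & M_{eo} \\ M_{oe} & m\,\mathbb{1} \end{pmatrix}
\quad\Longrightarrow\quad
M_{pc} \;=\; m^{2}\,\mathbb{1} \;-\; M_{oe}M_{eo},
\]

which is exactly `axpby(out,-1.0,mass*mass,tmp,in)` with `tmp = Meooe(Meooe(in))`. Because \(M_{eo}^{\dagger} = -M_{oe}\) for staggered fermions, \(M_{pc}\) is Hermitian and positive definite, so plain CG applies and `HermOp` can use a single `Mpc` application instead of `MpcDag(Mpc(...))`.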

View File

@@ -70,7 +70,6 @@ class ConjugateGradient : public OperatorFunction<Field> {
     Linop.HermOpAndNorm(psi, mmp, d, b);

     r = src - mmp;
     p = r;
@@ -97,6 +96,9 @@ class ConjugateGradient : public OperatorFunction<Field> {
               << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;

     GridStopWatch LinalgTimer;
+    GridStopWatch InnerTimer;
+    GridStopWatch AxpyNormTimer;
+    GridStopWatch LinearCombTimer;
     GridStopWatch MatrixTimer;
     GridStopWatch SolverTimer;
@@ -106,30 +108,32 @@ class ConjugateGradient : public OperatorFunction<Field> {
       c = cp;

       MatrixTimer.Start();
-      Linop.HermOpAndNorm(p, mmp, d, qq);
+      Linop.HermOp(p, mmp);
       MatrixTimer.Stop();

       LinalgTimer.Start();
+      // AA
+      // RealD qqck = norm2(mmp);
+      // ComplexD dck = innerProduct(p,mmp);
+      InnerTimer.Start();
+      ComplexD dc = innerProduct(p,mmp);
+      InnerTimer.Stop();
+      d = dc.real();
       a = c / d;
-      b_pred = a * (a * qq - d) / c;

+      AxpyNormTimer.Start();
       cp = axpy_norm(r, -a, mmp, r);
+      AxpyNormTimer.Stop();
       b = cp / c;

-      // Fuse these loops ; should be really easy
-      psi = a * p + psi;
-      p = p * b + r;
+      LinearCombTimer.Start();
+      parallel_for(int ss=0;ss<src._grid->oSites();ss++){
+        vstream(psi[ss], a * p[ss] + psi[ss]);
+        vstream(p  [ss], b * p[ss] + r[ss]);
+      }
+      LinearCombTimer.Stop();
       LinalgTimer.Stop();

       std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
                 << " residual " << cp << " target " << rsq << std::endl;
-      std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
-      std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;

       // Stopping condition
       if (cp <= rsq) {
@@ -150,6 +154,9 @@ class ConjugateGradient : public OperatorFunction<Field> {
         std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
         std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
         std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
+        std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;

         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
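
The `parallel_for`/`vstream` block above fuses the two updates `psi = a*p + psi` and `p = b*p + r` into a single pass, so `p` is loaded once and both results are written with streaming stores. A minimal standalone sketch of the same fusion (plain arrays and OpenMP in place of Grid's lattice types; not from the commit):

#include <vector>

// Fused CG vector update: one memory sweep instead of two.
//   psi <- a*p + psi
//   p   <- b*p + r
void fused_cg_update(std::vector<double> &psi, std::vector<double> &p,
                     const std::vector<double> &r, double a, double b) {
#pragma omp parallel for
  for (long i = 0; i < (long)psi.size(); i++) {
    const double pi = p[i];      // load p[i] once, reuse for both updates
    psi[i] = a * pi + psi[i];
    p[i]   = b * pi + r[i];
  }
}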

View File

@@ -244,19 +244,11 @@ namespace Grid {
   template<class sobj,class vobj> strong_inline
   RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
-    ret.checkerboard = x.checkerboard;
-    conformable(ret,x);
-    conformable(x,y);
-    axpy(ret,a,x,y);
-    return norm2(ret);
+    return axpy_norm_fast(ret,a,x,y);
   }
   template<class sobj,class vobj> strong_inline
   RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
-    ret.checkerboard = x.checkerboard;
-    conformable(ret,x);
-    conformable(x,y);
-    axpby(ret,a,b,x,y);
-    return norm2(ret); // FIXME implement parallel norm in ss loop
+    return axpby_norm_fast(ret,a,b,x,y);
   }
 }

View File

@@ -33,7 +33,7 @@ namespace Grid {
 // Deterministic Reduction operations
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
-  ComplexD nrm = innerProduct(arg,arg);
+  auto nrm = innerProduct(arg,arg);
   return std::real(nrm);
 }
@@ -43,31 +43,84 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_typeD vector_type;
-  scalar_type nrm;
   GridBase *grid = left._grid;
+  const int pad = 8;

-  std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
+  ComplexD inner;
+  Vector<ComplexD> sumarray(grid->SumArraySize()*pad);

   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
     int nwork, mywork, myoff;
     GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);

-    decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
+    decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation
     for(int ss=myoff;ss<mywork+myoff; ss++){
-      vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]);
+      vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
     }
-    sumarray[thr]=TensorRemove(vnrm) ;
+    // All threads sum across SIMD; reduce serial work at end
+    // one write per cacheline with streaming store
+    ComplexD tmp = Reduce(TensorRemove(vinner)) ;
+    vstream(sumarray[thr*pad],tmp);
   }

-  vector_type vvnrm; vvnrm=zero; // sum across threads
+  inner=0.0;
   for(int i=0;i<grid->SumArraySize();i++){
-    vvnrm = vvnrm+sumarray[i];
+    inner = inner+sumarray[i*pad];
   }
-  nrm = Reduce(vvnrm);// sum across simd
-  right._grid->GlobalSum(nrm);
-  return nrm;
+  right._grid->GlobalSum(inner);
+  return inner;
 }
+
+/////////////////////////
+// Fast axpby_norm
+// z = a x + b y
+// return norm z
+/////////////////////////
+template<class sobj,class vobj> strong_inline RealD
+axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
+{
+  sobj one(1.0);
+  return axpby_norm_fast(z,a,one,x,y);
+}
+
+template<class sobj,class vobj> strong_inline RealD
+axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
+{
+  const int pad = 8;
+  z.checkerboard = x.checkerboard;
+  conformable(z,x);
+  conformable(x,y);
+
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_typeD vector_type;
+  RealD nrm;
+
+  GridBase *grid = x._grid;
+
+  Vector<RealD> sumarray(grid->SumArraySize()*pad);
+
+  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
+    int nwork, mywork, myoff;
+    GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff);
+
+    // private to thread; sub summation
+    decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero;
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
+      vnrm = vnrm + innerProductD(tmp,tmp);
+      vstream(z._odata[ss],tmp);
+    }
+    vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
+  }
+
+  nrm = 0.0; // sum across threads; linear in thread count but fast
+  for(int i=0;i<grid->SumArraySize();i++){
+    nrm = nrm+sumarray[i*pad];
+  }
+  z._grid->GlobalSum(nrm);
+  return nrm;
+}

 template<class Op,class T1>
 inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
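
Both new routines share one trick: the per-thread partial sums live at `sumarray[thr*pad]` with `pad = 8`, so each thread's slot occupies its own 64-byte cache line (8 doubles) and the `vstream` write does not false-share with neighbouring threads; the final serial sum over threads is cheap. A standalone sketch of the pattern (OpenMP assumed; not from the commit):

#include <omp.h>
#include <vector>

// Reduction with padded per-thread slots: pad = 8 doubles = 64 bytes,
// one cache line per thread, mirroring sumarray(SumArraySize()*pad).
double padded_sum(const std::vector<double> &x) {
  const int pad  = 8;
  const int nthr = omp_get_max_threads();
  std::vector<double> partial(nthr * pad, 0.0);
#pragma omp parallel
  {
    const int thr = omp_get_thread_num();
    double local = 0.0;                  // thread-private sub summation
#pragma omp for
    for (long i = 0; i < (long)x.size(); i++) local += x[i];
    partial[thr * pad] = local;          // one write, own cache line
  }
  double total = 0.0;                    // serial over threads; cheap
  for (int t = 0; t < nthr; t++) total += partial[t * pad];
  return total;
}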

View File

@@ -49,7 +49,8 @@ inline double usecond(void) {

 typedef std::chrono::system_clock          GridClock;
 typedef std::chrono::time_point<GridClock> GridTimePoint;
-typedef std::chrono::milliseconds          GridTime;
+typedef std::chrono::milliseconds          GridMillisecs;
+typedef std::chrono::microseconds          GridTime;
 typedef std::chrono::microseconds          GridUsecs;

 inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
@@ -57,6 +58,11 @@ inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
   stream << time.count()<<" ms";
   return stream;
 }
+inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time)
+{
+  stream << time.count()<<" usec";
+  return stream;
+}

 class GridStopWatch {
 private:
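
Retargeting `GridTime` from milliseconds to microseconds means every stopwatch printout now routes through the new `operator<<` and reports in `usec`. A self-contained sketch of the same chrono idiom (stand-alone, not Grid code):

#include <chrono>
#include <iostream>

int main() {
  typedef std::chrono::system_clock GridClock;
  typedef std::chrono::microseconds GridTime;      // as retyped above

  GridClock::time_point t0 = GridClock::now();
  volatile double s = 0.0;
  for (int i = 0; i < 1000000; i++) s = s + 1.0;   // work to time
  GridClock::time_point t1 = GridClock::now();

  GridTime dt = std::chrono::duration_cast<GridTime>(t1 - t0);
  std::cout << dt.count() << " usec" << std::endl; // what the new operator<< prints
  return 0;
}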

View File

@@ -62,9 +62,12 @@ namespace Grid {
     virtual RealD M    (const FermionField &in, FermionField &out)=0;
     virtual RealD Mdag (const FermionField &in, FermionField &out)=0;

-    // half checkerboard operaions
+    // Query the even even properties to make algorithmic decisions
     virtual int   ConstEE(void)     { return 1; }; // clover returns zero as EE depends on gauge field
+    virtual int   isTrivialEE(void) { return 0; };
+    virtual RealD Mass(void)        { return 0.0; };

+    // half checkerboard operaions
     virtual void Meooe    (const FermionField &in, FermionField &out)=0;
     virtual void MeooeDag (const FermionField &in, FermionField &out)=0;
     virtual void Mooee    (const FermionField &in, FermionField &out)=0;

View File

@@ -327,6 +327,7 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int mu)

 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
+  DhopCalls+=2;
   conformable(in._grid, _grid); // verifies full grid
   conformable(in._grid, out._grid);
@@ -337,6 +338,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {

 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
+  DhopCalls+=1;
   conformable(in._grid, _cbgrid); // verifies half grid
   conformable(in._grid, out._grid); // drops the cb check
@@ -348,6 +350,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {

 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) {
+  DhopCalls+=1;
   conformable(in._grid, _cbgrid); // verifies half grid
   conformable(in._grid, out._grid); // drops the cb check
@@ -400,13 +403,18 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
   int len = U._grid->oSites();
   const int LLs = 1;

+  DhopTotalTime -= usecond();
+  DhopFaceTime -= usecond();
   st.Prepare();
   st.HaloGather(in,compressor);
   st.CommsMergeSHM(compressor);
+  DhopFaceTime += usecond();

   //////////////////////////////////////////////////////////////////////////////////////////////////////
   // Ugly explicit thread mapping introduced for OPA reasons.
   //////////////////////////////////////////////////////////////////////////////////////////////////////
+  DhopComputeTime -= usecond();
 #pragma omp parallel
   {
     int tid = omp_get_thread_num();
@@ -448,10 +456,14 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
       st.CommunicateThreaded();
     }
   }
+  DhopComputeTime += usecond();

   // First to enter, last to leave timing
+  DhopFaceTime -= usecond();
   st.CommsMerge(compressor);
+  DhopFaceTime += usecond();

+  DhopComputeTime2 -= usecond();
   if (dag == DaggerYes) {
     int sz=st.surface_list.size();
     parallel_for (int ss = 0; ss < sz; ss++) {
@@ -465,6 +477,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
       Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
     }
   }
+  DhopComputeTime2 += usecond();
 #else
   assert(0);
 #endif
@@ -480,9 +493,14 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
 {
   assert((dag == DaggerNo) || (dag == DaggerYes));

+  DhopTotalTime -= usecond();
+
+  DhopCommTime -= usecond();
   Compressor compressor;
   st.HaloExchange(in, compressor);
+  DhopCommTime += usecond();

+  DhopComputeTime -= usecond();
   if (dag == DaggerYes) {
     parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
       Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
@@ -492,8 +510,65 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
       Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
     }
   }
+  DhopComputeTime += usecond();
+  DhopTotalTime += usecond();
 };
+
+////////////////////////////////////////////////////////////////
+// Reporting
+////////////////////////////////////////////////////////////////
+template<class Impl>
+void ImprovedStaggeredFermion<Impl>::Report(void)
+{
+  std::vector<int> latt = GridDefaultLatt();
+  RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP = _grid->_Nprocessors;
+  RealD NN = _grid->NodeCount();
+
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls : "
+            << DhopCalls << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime /Calls : "
+            << DhopTotalTime / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime /Calls : "
+            << DhopCommTime / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls : "
+            << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+  // Average the compute time
+  _grid->GlobalSum(DhopComputeTime);
+  DhopComputeTime/=NP;
+
+  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
+
+  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil" <<std::endl; Stencil.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
+}
+
+template<class Impl>
+void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
+{
+  DhopCalls       = 0;
+  DhopTotalTime   = 0;
+  DhopCommTime    = 0;
+  DhopComputeTime = 0;
+  DhopFaceTime    = 0;
+
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}
+
 FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);

 //AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);
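
All the counters above use the same idiom: `T -= usecond()` before a region and `T += usecond()` after it, so each call adds its elapsed time to a running total with no per-call start/stop objects, and `Report()` divides by `DhopCalls` at the end. (The line after `st.CommsMerge` read `DhopFaceTime -= usecond();` in the extracted text; it is corrected to `+=` above, since an accumulator must add the stop stamp.) A standalone sketch of the idiom (the `usecond()` stand-in is an assumption, not Grid's implementation):

#include <chrono>
#include <cstdio>

// Stand-in for Grid's usecond(): wall-clock time in microseconds.
static double usecond(void) {
  using namespace std::chrono;
  return (double)duration_cast<microseconds>(
      system_clock::now().time_since_epoch()).count();
}

static double DhopCommTime = 0;  // accumulates across calls

void dhop_once(void) {
  DhopCommTime -= usecond();     // subtract start stamp...
  /* halo exchange would happen here */
  DhopCommTime += usecond();     // ...add stop stamp: net += elapsed
}

int main(void) {
  const int ncall = 100;
  for (int i = 0; i < ncall; i++) dhop_once();
  std::printf("CommTime/Calls = %f us\n", DhopCommTime / ncall);
  return 0;
}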

View File

@@ -49,6 +49,18 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedStaggeredFermionStatic {
   FermionField _tmp;
   FermionField &tmp(void) { return _tmp; }

+  ////////////////////////////////////////
+  // Performance monitoring
+  ////////////////////////////////////////
+  void Report(void);
+  void ZeroCounters(void);
+  double DhopTotalTime;
+  double DhopCalls;
+  double DhopCommTime;
+  double DhopComputeTime;
+  double DhopComputeTime2;
+  double DhopFaceTime;
+
   ///////////////////////////////////////////////////////////////
   // Implement the abstract base
   ///////////////////////////////////////////////////////////////
@@ -141,7 +153,8 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedStaggeredFermionStatic {
   // protected:
  public:
   // any other parameters of action ???
+  virtual int   isTrivialEE(void) { return 1; };
+  virtual RealD Mass(void)        { return mass; }
   RealD mass;
   RealD u0;
   RealD c1;

View File

@@ -283,14 +283,12 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                                     DoubledGaugeField & U,DoubledGaugeField & UUU,
                                                     const FermionField &in, FermionField &out,int dag)
 {
-  DhopTotalTime-=usecond();
 #ifdef GRID_OMP
   if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
     DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
   else
 #endif
     DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
-  DhopTotalTime+=usecond();
 }

 template<class Impl>
@@ -404,6 +402,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
+  //double t1=usecond();
   DhopTotalTime -= usecond();
   DhopCommTime  -= usecond();
   st.HaloExchange(in,compressor);
@@ -424,6 +423,12 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
   }
   DhopComputeTime += usecond();
   DhopTotalTime   += usecond();
+  //double t2=usecond();
+  //std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
+  //std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
+  //std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
+  //std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
+
 }
 /*CHANGE END*/

View File

@@ -177,6 +177,9 @@ namespace QCD {
   // Data members require to support the functionality
   ///////////////////////////////////////////////////////////////
  public:
+  virtual int   isTrivialEE(void) { return 1; };
+  virtual RealD Mass(void)        { return mass; }
+
   GridBase *_FourDimGrid;
   GridBase *_FourDimRedBlackGrid;

View File

@@ -135,6 +135,8 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
   // protected:
  public:
+  virtual RealD Mass(void)        { return mass; }
+  virtual int   isTrivialEE(void) { return 1; };
   RealD mass;

   GridBase *_grid;

View File

@@ -74,6 +74,11 @@ int main (int argc, char ** argv)
   LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);

+  double volume=1;
+  for(int mu=0;mu<Nd;mu++){
+    volume=volume*latt_size[mu];
+  }
+
   RealD mass=0.003;
   RealD c1=9.0/8.0;
   RealD c2=-1.0/24.0;
@@ -90,14 +95,26 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
   std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
   std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
-  ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
+  ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0);
   SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
   FermionField src4d(UGrid); random(pRNG,src4d);
   FermionField src4d_o(UrbGrid); pickCheckerboard(Odd,src4d_o,src4d);
   FermionField result4d_o(UrbGrid);

+  double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // = 16*66 + 90 = 1146 flops per site
   result4d_o=zero;
-  CG(HermOp4d,src4d_o,result4d_o);
+  {
+    double t1=usecond();
+    CG(HermOp4d,src4d_o,result4d_o);
+    double t2=usecond();
+    double ncall=CG.IterationsToComplete;
+    double flops = deodoe_flops * ncall;
+    std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
+    std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
+    HermOp4d.Report();
+  }
+  Ds4d.Report();

   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -106,7 +123,17 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   Ds.ZeroCounters();
   result_o=zero;
-  CG(HermOp,src_o,result_o);
+  {
+    double t1=usecond();
+    CG(HermOp,src_o,result_o);
+    double t2=usecond();
+    double ncall=CG.IterationsToComplete*Ls;
+    double flops = deodoe_flops * ncall;
+    std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
+    std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
+    HermOp.Report();
+  }
   Ds.Report();

   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -115,7 +142,18 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   Ds.ZeroCounters();
   result_o=zero;
-  mCG(HermOp,src_o,result_o);
+  {
+    double t1=usecond();
+    mCG(HermOp,src_o,result_o);
+    double t2=usecond();
+    double ncall=mCG.IterationsToComplete*Ls;
+    double flops = deodoe_flops * ncall;
+    std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
+    std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
+    HermOp.Report();
+  }
   Ds.Report();

   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -124,7 +162,17 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   Ds.ZeroCounters();
   result_o=zero;
-  BCGrQ(HermOp,src_o,result_o);
+  {
+    double t1=usecond();
+    BCGrQ(HermOp,src_o,result_o);
+    double t2=usecond();
+    double ncall=BCGrQ.IterationsToComplete*Ls;
+    double flops = deodoe_flops * ncall;
+    std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
+    std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
+    HermOp.Report();
+  }
   Ds.Report();

   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
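
The `deodoe_flops` constant counts one even-odd plus odd-even hop per site: 16 SU(3) matrix-vector multiplies (8 one-hop and 8 three-hop Naik directions, at \(3(6+8+8)=66\) flops each) plus 15 colour-vector complex additions (\(3\times 2 = 6\) flops each) to combine the 16 terms:

\[
16 \times 3(6+8+8) \;+\; 15 \times 3 \times 2 \;=\; 1056 + 90 \;=\; 1146 \ \text{flops per site},
\]

so each timed block reports `deodoe_flops * ncall / (t2-t1)` as mflop/s.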

View File

@@ -77,6 +77,12 @@ int main (int argc, char ** argv)
   RealD c1=9.0/8.0;
   RealD c2=-1.0/24.0;
   RealD u0=1.0;
+
+  double volume=1;
+  for(int mu=0;mu<Nd;mu++){
+    volume=volume*latt_size[mu];
+  }
+
   ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0);
   MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
@@ -93,7 +99,19 @@ int main (int argc, char ** argv)
   MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
   FermionField src4d(UGrid); random(pRNG,src4d);
   FermionField result4d(UGrid); result4d=zero;
-  CG(HermOp4d,src4d,result4d);
+
+  double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // = 16*66 + 90 = 1146 flops per site
+  {
+    double t1=usecond();
+    CG(HermOp4d,src4d,result4d);
+    double t2=usecond();
+    double ncall=CG.IterationsToComplete;
+    double flops = deodoe_flops * ncall;
+    std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
+    std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
+  }

   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -101,9 +119,18 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   result=zero;
+  {
     Ds.ZeroCounters();
+    double t1=usecond();
     CG(HermOp,src,result);
+    double t2=usecond();
+    double ncall=CG.IterationsToComplete;
+    double flops = deodoe_flops * ncall;
+    std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
+    std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
     Ds.Report();
+  }

   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -111,7 +138,16 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   result=zero;
   Ds.ZeroCounters();
+  {
+    double t1=usecond();
     mCG(HermOp,src,result);
+    double t2=usecond();
+    double ncall=mCG.IterationsToComplete;
+    double flops = deodoe_flops * ncall;
+    std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
+    std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
+  }
   Ds.Report();

   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
@@ -120,7 +156,16 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
   result=zero;
   Ds.ZeroCounters();
+  {
+    double t1=usecond();
     BCGrQ(HermOp,src,result);
+    double t2=usecond();
+    double ncall=BCGrQ.IterationsToComplete;
+    double flops = deodoe_flops * ncall;
+    std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
+    std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
+  }
   Ds.Report();

   std::cout << GridLogMessage << "************************************************************************ "<<std::endl;

View File

@@ -83,7 +83,19 @@ int main (int argc, char ** argv)
   SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
   ConjugateGradient<FermionField> CG(1.0e-8,10000);
+  double t1=usecond();
   CG(HermOpEO,src_o,res_o);
+  double t2=usecond();
+
+  // Schur solver: uses DeoDoe => volume * 1146
+  double ncall=CG.IterationsToComplete;
+  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // = 16*66 + 90 = 1146 flops per site
+  std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
+  std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
+  std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;

   FermionField tmp(&RBGrid);