diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 76b904e9..951486f2 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -43,7 +43,7 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, const ImplParams &p, const WilsonAnisotropyCoefficients &anis) - : + : Kernels(p), _grid(&Fgrid), _cbgrid(&Hgrid), @@ -70,8 +70,91 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, } +template +void WilsonFermion::Report(void) +{ + RealD NP = _FourDimGrid->_Nprocessors; + RealD NN = _FourDimGrid->NodeCount(); + RealD volume = Ls; + Coordinate latt = _FourDimGrid->GlobalDimensions(); + for(int mu=0;mu 0 ) { + std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; + std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls : " << DhopCalls << std::endl; + std::cout << GridLogMessage << "WilsonFermion TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl; + std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl; + + // Average the compute time + _FourDimGrid->GlobalSum(DhopComputeTime); + DhopComputeTime/=NP; + RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl; + + RealD Fullmflops = 1344*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; + + } + + if ( DerivCalls > 0 ) { + std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl; + std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls : " < 0 || DhopCalls > 0){ + std::cout << GridLogMessage << "WilsonFermion Stencil" < 0){ + std::cout << GridLogMessage << "WilsonFermion Stencil Reporti()" < +void WilsonFermion::ZeroCounters(void) { + DhopCalls = 0; // ok + DhopCommTime = 0; + DhopComputeTime = 0; + DhopComputeTime2= 0; + DhopFaceTime = 0; + DhopTotalTime = 0; + + DerivCalls = 0; // ok + DerivCommTime = 0; + DerivComputeTime = 0; + DerivDhopComputeTime = 0; + + Stencil.ZeroCounters(); + StencilEven.ZeroCounters(); + StencilOdd.ZeroCounters(); + Stencil.ZeroCountersi(); + StencilEven.ZeroCountersi(); + StencilOdd.ZeroCountersi(); +} + + template -void WilsonFermion::ImportGauge(const GaugeField &_Umu) +void WilsonFermion::ImportGauge(const GaugeField &_Umu) { GaugeField HUmu(_Umu.Grid()); @@ -132,7 +215,7 @@ void WilsonFermion::MeooeDag(const FermionField &in, FermionField &out) { DhopOE(in, out, DaggerYes); } } - + template void WilsonFermion::Mooee(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); @@ -151,7 +234,7 @@ void WilsonFermion::MooeeInv(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); out = (1.0/(diag_mass))*in; } - + template void WilsonFermion::MooeeInvDag(const FermionField &in, FermionField &out) { out.Checkerboard() = in.Checkerboard(); @@ -159,59 +242,59 @@ void WilsonFermion::MooeeInvDag(const FermionField &in, FermionField &out) } template void WilsonFermion::MomentumSpacePropagator(FermionField &out, const FermionField &in,RealD _m,std::vector twist) -{ +{ typedef typename FermionField::vector_type vector_type; typedef typename FermionField::scalar_type ScalComplex; typedef Lattice > LatComplex; - - // what type LatticeComplex + + // what type LatticeComplex conformable(_grid,out.Grid()); - + Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT }; - + Coordinate latt_size = _grid->_fdimensions; - + FermionField num (_grid); num = Zero(); LatComplex wilson(_grid); wilson= Zero(); LatComplex one (_grid); one = ScalComplex(1.0,0.0); - + LatComplex denom(_grid); denom= Zero(); - LatComplex kmu(_grid); + LatComplex kmu(_grid); ScalComplex ci(0.0,1.0); // momphase = n * 2pi / L for(int mu=0;mu void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat, const FermionField &A, const FermionField &B, int dag) { + DerivCalls++; assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); @@ -229,8 +313,11 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, FermionField Atilde(B.Grid()); Atilde = A; + DerivCommTime-=usecond(); st.HaloExchange(B, compressor); + DerivCommTime+=usecond(); + DerivComputeTime-=usecond(); for (int mu = 0; mu < Nd; mu++) { //////////////////////////////////////////////////////////////////////// // Flip gamma (1+g)<->(1-g) if dag @@ -238,6 +325,7 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, int gamma = mu; if (!dag) gamma += Nd; + DerivDhopComputeTime -= usecond(); int Ls=1; Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma); @@ -245,7 +333,9 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, // spin trace outer product ////////////////////////////////////////////////// Impl::InsertForce4D(mat, Btilde, Atilde, mu); + DerivDhopComputeTime += usecond(); } + DerivComputeTime += usecond(); } template @@ -265,7 +355,7 @@ void WilsonFermion::DhopDerivOE(GaugeField &mat, const FermionField &U, co conformable(U.Grid(), V.Grid()); //conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido) // Motivation: look at the SchurDiff operator - + assert(V.Checkerboard() == Even); assert(U.Checkerboard() == Odd); mat.Checkerboard() = Odd; @@ -288,6 +378,7 @@ void WilsonFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, co template void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int dag) { + DhopCalls+=2; conformable(in.Grid(), _grid); // verifies full grid conformable(in.Grid(), out.Grid()); @@ -298,6 +389,7 @@ void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int da template void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int dag) { + DhopCalls+=1; conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -309,6 +401,7 @@ void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int template void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int dag) { + DhopCalls+=1; conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -319,18 +412,18 @@ void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int d } template -void WilsonFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) +void WilsonFermion::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { DhopDir(in, out, dir, disp); } template -void WilsonFermion::MdirAll(const FermionField &in, std::vector &out) +void WilsonFermion::MdirAll(const FermionField &in, std::vector &out) { DhopDirAll(in, out); } template -void WilsonFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) +void WilsonFermion::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { Compressor compressor(DaggerNo); Stencil.HaloExchange(in, compressor); @@ -342,12 +435,12 @@ void WilsonFermion::DhopDir(const FermionField &in, FermionField &out, int DhopDirCalc(in, out, dirdisp, gamma, DaggerNo); }; template -void WilsonFermion::DhopDirAll(const FermionField &in, std::vector &out) +void WilsonFermion::DhopDirAll(const FermionField &in, std::vector &out) { Compressor compressor(DaggerNo); Stencil.HaloExchange(in, compressor); - assert((out.size()==8)||(out.size()==9)); + assert((out.size()==8)||(out.size()==9)); for(int dir=0;dir::DhopDirAll(const FermionField &in, std::vector -void WilsonFermion::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) +void WilsonFermion::DhopDirCalc(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) { int Ls=1; uint64_t Nsite=in.oSites(); @@ -371,15 +464,16 @@ template void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, - FermionField &out, int dag) + FermionField &out, int dag) { + DhopTotalTime-=usecond(); #ifdef GRID_OMP if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) DhopInternalOverlappedComms(st,lo,U,in,out,dag); else -#endif +#endif DhopInternalSerial(st,lo,U,in,out,dag); - + DhopTotalTime+=usecond(); } template @@ -397,38 +491,53 @@ void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO ///////////////////////////// std::vector > requests; st.Prepare(); + DhopFaceTime-=usecond(); st.HaloGather(in,compressor); + DhopFaceTime+=usecond(); + + DhopCommTime -=usecond(); st.CommunicateBegin(requests); ///////////////////////////// // Overlap with comms ///////////////////////////// + DhopFaceTime-=usecond(); st.CommsMergeSHM(compressor); + DhopFaceTime+=usecond(); ///////////////////////////// // do the compute interior ///////////////////////////// int Opt = WilsonKernelsStatic::Opt; + DhopComputeTime-=usecond(); if (dag == DaggerYes) { Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); } else { Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); - } + } + DhopComputeTime+=usecond(); ///////////////////////////// // Complete comms ///////////////////////////// st.CommunicateComplete(requests); + DhopCommTime +=usecond(); + + DhopFaceTime-=usecond(); st.CommsMerge(compressor); + DhopFaceTime+=usecond(); ///////////////////////////// // do the compute exterior ///////////////////////////// + + DhopComputeTime2-=usecond(); if (dag == DaggerYes) { Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); } else { Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); } + DhopComputeTime2+=usecond(); }; @@ -439,20 +548,24 @@ void WilsonFermion::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, FermionField &out, int dag) { assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); + DhopCommTime-=usecond(); st.HaloExchange(in, compressor); + DhopCommTime+=usecond(); + DhopComputeTime-=usecond(); int Opt = WilsonKernelsStatic::Opt; if (dag == DaggerYes) { Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out); } else { Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out); } + DhopComputeTime+=usecond(); }; /*Change ends */ /******************************************************************************* * Conserved current utilities for Wilson fermions, for contracting propagators - * to make a conserved current sink or inserting the conserved current + * to make a conserved current sink or inserting the conserved current * sequentially. ******************************************************************************/ template @@ -493,11 +606,11 @@ void WilsonFermion::ContractConservedCurrent(PropagatorField &q_in_1, template -void WilsonFermion::SeqConservedCurrent(PropagatorField &q_in, +void WilsonFermion::SeqConservedCurrent(PropagatorField &q_in, PropagatorField &q_out, Current curr_type, unsigned int mu, - unsigned int tmin, + unsigned int tmin, unsigned int tmax, ComplexField &lattice_cmplx) { @@ -535,24 +648,24 @@ void WilsonFermion::SeqConservedCurrent(PropagatorField &q_in, Integer timeSlices = Reduce(t_mask()); if (timeSlices > 0) { - Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU], - q_out_v[sU], + Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU], + q_out_v[sU], Umu_v, sU, mu, t_mask); } // Repeat for backward direction. - t_mask() = ((coords_v[sU] >= (tmin + tshift)) && + t_mask() = ((coords_v[sU] >= (tmin + tshift)) && (coords_v[sU] <= (tmax + tshift))); - - //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3) + + //if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3) unsigned int t0 = 0; if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 )); - + timeSlices = Reduce(t_mask()); if (timeSlices > 0) { - Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU], - q_out_v[sU], + Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU], + q_out_v[sU], Umu_v, sU, mu, t_mask); } });