From bd0430b34f8cde299a2833bab45628da8d2cbd56 Mon Sep 17 00:00:00 2001 From: paboyle Date: Tue, 29 Nov 2016 22:27:55 +0000 Subject: [PATCH] Serialisation in malloc fixed --- benchmarks/Benchmark_mooee.cc | 128 +++++------------- lib/qcd/action/fermion/CayleyFermion5D.cc | 44 ++++++ lib/qcd/action/fermion/CayleyFermion5D.h | 12 ++ .../action/fermion/CayleyFermion5Dcache.cc | 20 +++ lib/qcd/action/fermion/CayleyFermion5Dvec.cc | 22 ++- 5 files changed, 128 insertions(+), 98 deletions(-) diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc index c895109f..097e6da3 100644 --- a/benchmarks/Benchmark_mooee.cc +++ b/benchmarks/Benchmark_mooee.cc @@ -70,7 +70,7 @@ int main (int argc, char ** argv) if (1) { - const int ncall=100; + const int ncall=1000; std::cout << GridLogMessage<< "*********************************************************" <Barrier(); - double t0,t1; - t0=usecond(); - for(int i=0;iBarrier(); - - std::cout<Barrier(); - t0=usecond(); - for (int i = 0; i < ncall; i++) { - Dw.DhopEO(src_o, r_e, DaggerNo); - } - t1=usecond(); - FGrid->Barrier(); - std::cout<Barrier(); - t0=usecond(); - for (int i = 0; i < ncall; i++) { - Dw.Mooee(src_o, r_o); - } - t1=usecond(); - FGrid->Barrier(); - std::cout<Barrier(); \ + t0=usecond(); \ + for(int i=0;iBarrier(); \ + Dw.CayleyReport(); \ + std::cout<Barrier(); - t0=usecond(); - for (int i = 0; i < ncall; i++) { - Dw.MooeeInv(src_o, r_o); - } - t1=usecond(); - FGrid->Barrier(); - std::cout<Barrier(); \ + t0=usecond(); \ + for(int i=0;iBarrier(); \ + Dw.CayleyReport(); \ + std::cout<Barrier(); - t0=usecond(); - for (int i = 0; i < ncall; i++) { - Dw.Meooe(src_o, r_e); - } - t1=usecond(); - FGrid->Barrier(); - std::cout<Barrier(); double t0,t1; - t0=usecond(); - for(int i=0;iBarrier(); - - std::cout<Barrier(); - t0=usecond(); - for (int i = 0; i < ncall; i++) { - Dw.DhopEO(src_o, r_e, DaggerNo); - } - t1=usecond(); - FGrid->Barrier(); - std::cout<Barrier(); - t0=usecond(); - for (int i = 0; i < ncall; i++) { - Dw.Mooee(src_o, r_o); - } - t1=usecond(); - FGrid->Barrier(); - std::cout<Barrier(); - t0=usecond(); - for (int i = 0; i < ncall; i++) { - Dw.MooeeInv(src_o, r_o); - } - t1=usecond(); - FGrid->Barrier(); - std::cout<Barrier(); - t0=usecond(); - for (int i = 0; i < ncall; i++) { - Dw.Meooe(src_o, r_e); - } - t1=usecond(); - FGrid->Barrier(); - std::cout<::Dminus(const FermionField &psi, FermionField &chi) axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi } } + + +template void CayleyFermion5D::CayleyReport(void) +{ + this->Report(); + std::vector latt = GridDefaultLatt(); + RealD volume = this->Ls; for(int mu=0;mu_FourDimGrid->_Nprocessors; + if ( M5Dcalls > 0 ) { + std::cout << GridLogMessage << "#### M5D calls report " << std::endl; + std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls : " << M5Dcalls << std::endl; + std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << M5Dtime / M5Dcalls << " us" << std::endl; + + // Flops = 6.0*(Nc*Ns) *Ls*vol + RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + } + + if ( MooeeInvCalls > 0 ) { + + std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl; + std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl; + std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl; + + // Flops = 9*12*Ls*vol/2 + RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting + std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; + std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; + } + +} +template void CayleyFermion5D::CayleyZeroCounters(void) +{ + this->ZeroCounters(); + M5Dflops=0; + M5Dcalls=0; + M5Dtime=0; + MooeeInvFlops=0; + MooeeInvCalls=0; + MooeeInvTime=0; +} + + template void CayleyFermion5D::DminusDag(const FermionField &psi, FermionField &chi) { diff --git a/lib/qcd/action/fermion/CayleyFermion5D.h b/lib/qcd/action/fermion/CayleyFermion5D.h index 1d8c2b95..6fb58234 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.h +++ b/lib/qcd/action/fermion/CayleyFermion5D.h @@ -120,6 +120,18 @@ namespace Grid { GridRedBlackCartesian &FourDimRedBlackGrid, RealD _mass,RealD _M5,const ImplParams &p= ImplParams()); + + + void CayleyReport(void); + void CayleyZeroCounters(void); + + double M5Dflops; + double M5Dcalls; + double M5Dtime; + + double MooeeInvFlops; + double MooeeInvCalls; + double MooeeInvTime; protected: void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); diff --git a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc index 62e91dd4..8e7df945 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc @@ -51,6 +51,9 @@ void CayleyFermion5D::M5D(const FermionField &psi, GridBase *grid=psi._grid; assert(phi.checkerboard == psi.checkerboard); chi.checkerboard=psi.checkerboard; + // Flops = 6.0*(Nc*Ns) *Ls*vol + M5Dcalls++; + M5Dtime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=Ls){ // adds Ls for(int s=0;s @@ -91,6 +95,9 @@ void CayleyFermion5D::M5Ddag(const FermionField &psi, assert(phi.checkerboard == psi.checkerboard); chi.checkerboard=psi.checkerboard; + // Flops = 6.0*(Nc*Ns) *Ls*vol + M5Dcalls++; + M5Dtime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=Ls){ // adds Ls auto tmp = psi._odata[0]; @@ -116,6 +123,7 @@ PARALLEL_FOR_LOOP } } } + M5Dtime+=usecond(); } template @@ -126,10 +134,14 @@ void CayleyFermion5D::MooeeInv (const FermionField &psi, FermionField & chi.checkerboard=psi.checkerboard; + MooeeInvCalls++; + MooeeInvTime-=usecond(); + PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=Ls){ // adds Ls auto tmp = psi._odata[0]; + // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops // Apply (L^{\prime})^{-1} chi[ss]=psi[ss]; // chi[0]=psi[0] for(int s=1;s @@ -166,6 +181,8 @@ void CayleyFermion5D::MooeeInvDag (const FermionField &psi, FermionField & assert(psi.checkerboard == psi.checkerboard); chi.checkerboard=psi.checkerboard; + MooeeInvCalls++; + MooeeInvTime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=Ls){ // adds Ls @@ -197,6 +214,9 @@ PARALLEL_FOR_LOOP chi[ss+s] = chi[ss+s] - lee[s]*tmp; } } + + MooeeInvTime+=usecond(); + } #ifdef CAYLEY_DPERP_CACHE diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index f6569923..3f3f215c 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -71,6 +71,7 @@ void CayleyFermion5D::M5D(const FermionField &psi, chi.checkerboard=psi.checkerboard; + // just directly address via type pun typedef typename Simd::scalar_type scalar_type; scalar_type * u_p = (scalar_type *)&u[0]; @@ -86,6 +87,8 @@ void CayleyFermion5D::M5D(const FermionField &psi, d_p[ss] = diag[s]; }} + M5Dcalls++; + M5Dtime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=LLs){ // adds LLs @@ -115,6 +118,7 @@ PARALLEL_FOR_LOOP } } + M5Dtime+=usecond(); } template @@ -154,6 +158,8 @@ void CayleyFermion5D::M5Ddag(const FermionField &psi, d_p[ss] = diag[s]; }} + M5Dcalls++; + M5Dtime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=LLs){ // adds LLs @@ -183,6 +189,7 @@ PARALLEL_FOR_LOOP } } + M5Dtime+=usecond(); } template @@ -250,13 +257,11 @@ void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField } } + MooeeInvCalls++; + MooeeInvTime-=usecond(); // Dynamic allocate on stack to get per thread without serialised heap acces -PARALLEL_FOR_LOOP - for(auto site=0;site SitePplus(LLs); Vector SitePminus(LLs); @@ -267,6 +272,9 @@ PARALLEL_FOR_LOOP SiteHalfSpinor BcastP; SiteHalfSpinor BcastM; +#pragma omp for + for(auto site=0;site