Serialisation in malloc fixed

2026-08-03 01:13:29 +01:00 · 2016-11-29 22:27:55 +00:00
parent 2f92b4860b
commit bd0430b34f
5 changed files with 128 additions and 98 deletions
@@ -70,7 +70,7 @@ int main (int argc, char ** argv)

  if (1)
  {
-    const int ncall=100;
+    const int ncall=1000;

    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
    std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
@@ -81,18 +81,7 @@ int main (int argc, char ** argv)
    LatticeFermion result(FGrid);

    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-
-    FGrid->Barrier();
-
    double t0,t1;
-    t0=usecond();
-    for(int i=0;i<ncall;i++){
-      Dw.Dhop(src,result,0);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-
-    std::cout<<GridLogMessage << "Called Dhop "<< (t1-t0)/ncall<<" us"<<std::endl;

    LatticeFermion r_eo(FGrid);
    LatticeFermion src_e (FrbGrid);
@@ -109,48 +98,44 @@ int main (int argc, char ** argv)
    r_e = zero;
    r_o = zero;

-    FGrid->Barrier();
-    t0=usecond();
-    for (int i = 0; i < ncall; i++) {
-      Dw.DhopEO(src_o, r_e, DaggerNo);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-    std::cout<<GridLogMessage << "Called DhopEO "<< (t1-t0)/ncall<<" us"<<std::endl;

-    FGrid->Barrier();
-    t0=usecond();
-    for (int i = 0; i < ncall; i++) {
-      Dw.Mooee(src_o, r_o);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-    std::cout<<GridLogMessage << "Called Mooee "<< (t1-t0)/ncall<<" us"<<std::endl;
+#define BENCH_DW(A,in,out)	/* Times ncall calls of Dw.A(in,out); uses Dw, FGrid, t0, t1 and ncall from the expansion site */	\
+    Dw.CayleyZeroCounters();	/* reset the counters so the report below covers only this loop */	\
+    FGrid->Barrier();		/* barrier before the timer starts */		\
+    t0=usecond();				\
+    for(int i=0;i<ncall;i++){			\
+      Dw. A (in,out);				\
+    }						\
+    t1=usecond();		/* timer stops before the trailing barrier */		\
+    FGrid->Barrier();				\
+    Dw.CayleyReport();		/* print the counters accumulated during the loop */			\
+    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl; /* elapsed time divided by ncall; #A stringifies the method name */ \
+    std::cout<<GridLogMessage << "******************"<<std::endl; /* expands to several statements with no do/while wrapper: use only as a full statement inside braces */

-    FGrid->Barrier();
-    t0=usecond();
-    for (int i = 0; i < ncall; i++) {
-      Dw.MooeeInv(src_o, r_o);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-    std::cout<<GridLogMessage << "Called MooeeInv "<< (t1-t0)/ncall<<" us"<<std::endl;
+#define BENCH_DW_MEO(A,in,out)	/* Same timing harness as BENCH_DW, but calls Dw.A(in,out,0) with a fixed third argument of 0 */	\
+    Dw.CayleyZeroCounters();	/* reset the counters so the report below covers only this loop */	\
+    FGrid->Barrier();		/* barrier before the timer starts */		\
+    t0=usecond();				\
+    for(int i=0;i<ncall;i++){			\
+      Dw. A (in,out,0);		/* NOTE(review): the replaced DhopEO loop passed DaggerNo here; presumably equal to 0 - confirm */		\
+    }						\
+    t1=usecond();		/* timer stops before the trailing barrier */		\
+    FGrid->Barrier();				\
+    Dw.CayleyReport();		/* print the counters accumulated during the loop */			\
+    std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl; /* elapsed time divided by ncall; #A stringifies the method name */ \
+    std::cout<<GridLogMessage << "******************"<<std::endl; /* expands to several statements with no do/while wrapper: use only as a full statement inside braces */

-
-    FGrid->Barrier();
-    t0=usecond();
-    for (int i = 0; i < ncall; i++) {
-      Dw.Meooe(src_o, r_e);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-    std::cout<<GridLogMessage << "Called Meooe "<< (t1-t0)/ncall<<" us"<<std::endl;
+    BENCH_DW_MEO(Dhop    ,src,result);
+    BENCH_DW_MEO(DhopEO  ,src_o,r_e);
+    BENCH_DW(Meooe   ,src_o,r_e);
+    BENCH_DW(Mooee   ,src_o,r_o);
+    BENCH_DW(MooeeInv,src_o,r_o);

  }

  if (1)
  {
-    const int ncall=100;
+    const int ncall=1000;

    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
    std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionVec5dR::Dhop "<<std::endl;
@@ -168,14 +153,6 @@ int main (int argc, char ** argv)
    FGrid->Barrier();

    double t0,t1;
-    t0=usecond();
-    for(int i=0;i<ncall;i++){
-      Dw.Dhop(src,result,0);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-
-    std::cout<<GridLogMessage << "Called Vec5D Dhop "<< (t1-t0)/ncall<<" us"<<std::endl;

    LatticeFermion r_eo(sFGrid);
    LatticeFermion src_e (sFrbGrid);
@@ -192,46 +169,13 @@ int main (int argc, char ** argv)
    r_e = zero;
    r_o = zero;

-    FGrid->Barrier();
-    t0=usecond();
-    for (int i = 0; i < ncall; i++) {
-      Dw.DhopEO(src_o, r_e, DaggerNo);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-    std::cout<<GridLogMessage << "Called Vec5D DhopEO "<< (t1-t0)/ncall<<" us"<<std::endl;
-
-    FGrid->Barrier();
-    t0=usecond();
-    for (int i = 0; i < ncall; i++) {
-      Dw.Mooee(src_o, r_o);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-    std::cout<<GridLogMessage << "Called Vec5D Mooee "<< (t1-t0)/ncall<<" us"<<std::endl;
-
-    FGrid->Barrier();
-    t0=usecond();
-    for (int i = 0; i < ncall; i++) {
-      Dw.MooeeInv(src_o, r_o);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-    std::cout<<GridLogMessage << "Called Vec5D MooeeInv "<< (t1-t0)/ncall<<" us"<<std::endl;
-
-
-    FGrid->Barrier();
-    t0=usecond();
-    for (int i = 0; i < ncall; i++) {
-      Dw.Meooe(src_o, r_e);
-    }
-    t1=usecond();
-    FGrid->Barrier();
-    std::cout<<GridLogMessage << "Called Vec5D Meooe "<< (t1-t0)/ncall<<" us"<<std::endl;
+    BENCH_DW_MEO(Dhop    ,src,result);
+    BENCH_DW_MEO(DhopEO  ,src_o,r_e);
+    BENCH_DW(Meooe   ,src_o,r_e);
+    BENCH_DW(Mooee   ,src_o,r_o);
+    BENCH_DW(MooeeInv,src_o,r_o);

  }

-
-
  Grid_finalize();
 }
@@ -62,6 +62,50 @@ void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
  }
 }
+
+
+template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void) // Prints this->Report(), then M5D and MooeeInv call counts, time per call and flop rates.
+{
+  this->Report(); // emit the report provided by this->Report() first
+  std::vector<int> latt = GridDefaultLatt();          // extents returned by GridDefaultLatt(), not read from this operator's grids
+  RealD volume = this->Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu]; // volume = Ls * latt[0] * ... * latt[Nd-1]
+  RealD NP     = this->_FourDimGrid->_Nprocessors; // divisor for the per-rank figures below
+  if ( M5Dcalls > 0 ) { // skip the section when nothing was counted (also avoids dividing by zero calls)
+    std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
+    std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls     : " << M5Dcalls   << std::endl;
+    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls       : " << M5Dtime / M5Dcalls << " us" << std::endl;
+
+    // Flops = 6.0*(Nc*Ns) *Ls*vol  (Nc*Ns is written as 12 below; time is accumulated in us, so flops/us = Mflop/s)
+    RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  }
+
+  if ( MooeeInvCalls > 0 ) { // same layout as the M5D section, using the MooeeInv counters
+
+    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
+    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
+    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
+
+    // Flops = 9*12*Ls*vol/2  (time is accumulated in us, so flops/us = Mflop/s)
+    RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  }
+
+}
+template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void) // Calls this->ZeroCounters() and sets the six M5D / MooeeInv counters to zero.
+{
+  this->ZeroCounters(); // reset whatever this->ZeroCounters() tracks before clearing the members below
+  M5Dflops=0; // only ever zeroed in the code visible in this change
+  M5Dcalls=0; // call count reported by CayleyReport()
+  M5Dtime=0; // elapsed time in us reported by CayleyReport()
+  MooeeInvFlops=0; // only ever zeroed in the code visible in this change
+  MooeeInvCalls=0; // call count reported by CayleyReport()
+  MooeeInvTime=0; // elapsed time in us reported by CayleyReport()
+}
+
+
 template<class Impl>  
 void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
 {
@@ -120,6 +120,18 @@ namespace Grid {
 		      GridRedBlackCartesian &FourDimRedBlackGrid,
 		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());

+      
+
+     void CayleyReport(void); // prints Report() followed by the M5D / MooeeInv counter summaries
+     void CayleyZeroCounters(void); // calls ZeroCounters() and sets the six counters below to zero
+
+     double M5Dflops; // zeroed by CayleyZeroCounters(); no other use is visible in this change
+     double M5Dcalls; // incremented on each M5D / M5Ddag call
+     double M5Dtime; // usecond() time accumulated around the M5D / M5Ddag site loops
+
+     double MooeeInvFlops; // zeroed by CayleyZeroCounters(); no other use is visible in this change
+     double MooeeInvCalls; // incremented on each MooeeInv / MooeeInvDag / MooeeInternal call
+     double MooeeInvTime; // usecond() time accumulated around the MooeeInv / MooeeInvDag / MooeeInternal site loops

    protected:
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
@@ -51,6 +51,9 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
  GridBase *grid=psi._grid;
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  M5Dcalls++;
+  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    for(int s=0;s<Ls;s++){
@@ -76,6 +79,7 @@ PARALLEL_FOR_LOOP
      }
    }
  }
+  M5Dtime+=usecond();
 }

 template<class Impl>  
@@ -91,6 +95,9 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;

+  // Flops = 6.0*(Nc*Ns) *Ls*vol
+  M5Dcalls++;
+  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0];
@@ -116,6 +123,7 @@ PARALLEL_FOR_LOOP
      }
    }
  }
+  M5Dtime+=usecond();
 }

 template<class Impl>
@@ -126,10 +134,14 @@ void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &

  chi.checkerboard=psi.checkerboard;

+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
+
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0];

+    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
    // Apply (L^{\prime})^{-1}
    chi[ss]=psi[ss]; // chi[0]=psi[0]
    for(int s=1;s<Ls;s++){
@@ -155,6 +167,9 @@ PARALLEL_FOR_LOOP
      chi[ss+s] = chi[ss+s] - uee[s]*tmp;
    }
  }
+
+  MooeeInvTime+=usecond();
+
 }

 template<class Impl>
@@ -166,6 +181,8 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
  assert(psi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;

+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();

 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
@@ -197,6 +214,9 @@ PARALLEL_FOR_LOOP
      chi[ss+s] = chi[ss+s] - lee[s]*tmp;
    }
  }
+
+  MooeeInvTime+=usecond();
+
 }

 #ifdef CAYLEY_DPERP_CACHE
@@ -71,6 +71,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,

  chi.checkerboard=psi.checkerboard;

+
  // just directly address via type pun
  typedef typename Simd::scalar_type scalar_type;
  scalar_type * u_p = (scalar_type *)&u[0];
@@ -86,6 +87,8 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
    d_p[ss] = diag[s];
  }}

+  M5Dcalls++;
+  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs

@@ -115,6 +118,7 @@ PARALLEL_FOR_LOOP

    }
  }
+  M5Dtime+=usecond();
 }

 template<class Impl>  
@@ -154,6 +158,8 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
    d_p[ss] = diag[s];
  }}

+  M5Dcalls++;
+  M5Dtime-=usecond();
 PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs

@@ -183,6 +189,7 @@ PARALLEL_FOR_LOOP

    }
  }
+  M5Dtime+=usecond();
 }

 template<class Impl>
@@ -250,13 +257,11 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
    }
  }
  
+  MooeeInvCalls++;
+  MooeeInvTime-=usecond();
  // Dynamic allocate on stack to get per thread without serialised heap acces
-PARALLEL_FOR_LOOP
-  for(auto site=0;site<vol;site++){
-    
-    //    SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
-    //    SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
-    //    SiteSpinor     *SiteChi   =(SiteSpinor *)     alloca(LLs*sizeof(SiteSpinor));
+#pragma omp parallel  
+  {

    Vector<SiteHalfSpinor> SitePplus(LLs);
    Vector<SiteHalfSpinor> SitePminus(LLs);
@@ -267,6 +272,9 @@ PARALLEL_FOR_LOOP
    SiteHalfSpinor BcastP;
    SiteHalfSpinor BcastM;

+#pragma omp for 
+  for(auto site=0;site<vol;site++){
+
    for(int s=0;s<LLs;s++){
      int lex = s+LLs*site;
      spProj5p(SitePplus[s] ,psi[lex]);
@@ -294,6 +302,8 @@ PARALLEL_FOR_LOOP
      chi[lex] = SiteChi[s]*0.5;
    }
  }
+  }
+  MooeeInvTime+=usecond();
 }

 INSTANTIATE_DPERP(DomainWallVec5dImplD);