mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 14:04:32 +00:00 
			
		
		
		
	Serialisation in malloc fixed
This commit is contained in:
		@@ -62,6 +62,50 @@ void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
 | 
			
		||||
    axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
 | 
			
		||||
{
 | 
			
		||||
  this->Report();
 | 
			
		||||
  std::vector<int> latt = GridDefaultLatt();          
 | 
			
		||||
  RealD volume = this->Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
 | 
			
		||||
  RealD NP     = this->_FourDimGrid->_Nprocessors;
 | 
			
		||||
  if ( M5Dcalls > 0 ) {
 | 
			
		||||
    std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls     : " << M5Dcalls   << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls       : " << M5Dtime / M5Dcalls << " us" << std::endl;
 | 
			
		||||
 | 
			
		||||
    // Flops = 6.0*(Nc*Ns) *Ls*vol
 | 
			
		||||
    RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
 | 
			
		||||
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( MooeeInvCalls > 0 ) {
 | 
			
		||||
 | 
			
		||||
    std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls     : " << MooeeInvCalls   << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls            : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
 | 
			
		||||
 | 
			
		||||
    // Flops = 9*12*Ls*vol/2
 | 
			
		||||
    RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
 | 
			
		||||
    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
 | 
			
		||||
    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
 | 
			
		||||
{
 | 
			
		||||
  this->ZeroCounters();
 | 
			
		||||
  M5Dflops=0;
 | 
			
		||||
  M5Dcalls=0;
 | 
			
		||||
  M5Dtime=0;
 | 
			
		||||
  MooeeInvFlops=0;
 | 
			
		||||
  MooeeInvCalls=0;
 | 
			
		||||
  MooeeInvTime=0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>  
 | 
			
		||||
void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
 | 
			
		||||
{
 | 
			
		||||
 
 | 
			
		||||
@@ -120,6 +120,18 @@ namespace Grid {
 | 
			
		||||
		      GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
			
		||||
		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
 | 
			
		||||
 | 
			
		||||
      
 | 
			
		||||
 | 
			
		||||
     void CayleyReport(void);
 | 
			
		||||
     void CayleyZeroCounters(void);
 | 
			
		||||
 | 
			
		||||
     double M5Dflops;
 | 
			
		||||
     double M5Dcalls;
 | 
			
		||||
     double M5Dtime;
 | 
			
		||||
 | 
			
		||||
     double MooeeInvFlops;
 | 
			
		||||
     double MooeeInvCalls;
 | 
			
		||||
     double MooeeInvTime;
 | 
			
		||||
 | 
			
		||||
    protected:
 | 
			
		||||
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
 | 
			
		||||
 
 | 
			
		||||
@@ -51,6 +51,9 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 | 
			
		||||
  GridBase *grid=psi._grid;
 | 
			
		||||
  assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
  chi.checkerboard=psi.checkerboard;
 | 
			
		||||
  // Flops = 6.0*(Nc*Ns) *Ls*vol
 | 
			
		||||
  M5Dcalls++;
 | 
			
		||||
  M5Dtime-=usecond();
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 | 
			
		||||
    for(int s=0;s<Ls;s++){
 | 
			
		||||
@@ -76,6 +79,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  M5Dtime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>  
 | 
			
		||||
@@ -91,6 +95,9 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 | 
			
		||||
  assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
  chi.checkerboard=psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
  // Flops = 6.0*(Nc*Ns) *Ls*vol
 | 
			
		||||
  M5Dcalls++;
 | 
			
		||||
  M5Dtime-=usecond();
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 | 
			
		||||
    auto tmp = psi._odata[0];
 | 
			
		||||
@@ -116,6 +123,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  M5Dtime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
@@ -126,10 +134,14 @@ void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &
 | 
			
		||||
 | 
			
		||||
  chi.checkerboard=psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
  MooeeInvCalls++;
 | 
			
		||||
  MooeeInvTime-=usecond();
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 | 
			
		||||
    auto tmp = psi._odata[0];
 | 
			
		||||
 | 
			
		||||
    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
 | 
			
		||||
    // Apply (L^{\prime})^{-1}
 | 
			
		||||
    chi[ss]=psi[ss]; // chi[0]=psi[0]
 | 
			
		||||
    for(int s=1;s<Ls;s++){
 | 
			
		||||
@@ -155,6 +167,9 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
      chi[ss+s] = chi[ss+s] - uee[s]*tmp;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  MooeeInvTime+=usecond();
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
@@ -166,6 +181,8 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
 | 
			
		||||
  assert(psi.checkerboard == psi.checkerboard);
 | 
			
		||||
  chi.checkerboard=psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
  MooeeInvCalls++;
 | 
			
		||||
  MooeeInvTime-=usecond();
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
 | 
			
		||||
@@ -197,6 +214,9 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
      chi[ss+s] = chi[ss+s] - lee[s]*tmp;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  MooeeInvTime+=usecond();
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#ifdef CAYLEY_DPERP_CACHE
 | 
			
		||||
 
 | 
			
		||||
@@ -71,6 +71,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 | 
			
		||||
 | 
			
		||||
  chi.checkerboard=psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // just directly address via type pun
 | 
			
		||||
  typedef typename Simd::scalar_type scalar_type;
 | 
			
		||||
  scalar_type * u_p = (scalar_type *)&u[0];
 | 
			
		||||
@@ -86,6 +87,8 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
 | 
			
		||||
    d_p[ss] = diag[s];
 | 
			
		||||
  }}
 | 
			
		||||
 | 
			
		||||
  M5Dcalls++;
 | 
			
		||||
  M5Dtime-=usecond();
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
 | 
			
		||||
 | 
			
		||||
@@ -115,6 +118,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  M5Dtime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>  
 | 
			
		||||
@@ -154,6 +158,8 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
 | 
			
		||||
    d_p[ss] = diag[s];
 | 
			
		||||
  }}
 | 
			
		||||
 | 
			
		||||
  M5Dcalls++;
 | 
			
		||||
  M5Dtime-=usecond();
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
  for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
 | 
			
		||||
 | 
			
		||||
@@ -183,6 +189,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  M5Dtime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
@@ -250,13 +257,11 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  MooeeInvCalls++;
 | 
			
		||||
  MooeeInvTime-=usecond();
 | 
			
		||||
  // Dynamic allocate on stack to get per thread without serialised heap acces
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
  for(auto site=0;site<vol;site++){
 | 
			
		||||
    
 | 
			
		||||
    //    SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
 | 
			
		||||
    //    SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
 | 
			
		||||
    //    SiteSpinor     *SiteChi   =(SiteSpinor *)     alloca(LLs*sizeof(SiteSpinor));
 | 
			
		||||
#pragma omp parallel  
 | 
			
		||||
  {
 | 
			
		||||
 | 
			
		||||
    Vector<SiteHalfSpinor> SitePplus(LLs);
 | 
			
		||||
    Vector<SiteHalfSpinor> SitePminus(LLs);
 | 
			
		||||
@@ -267,6 +272,9 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    SiteHalfSpinor BcastP;
 | 
			
		||||
    SiteHalfSpinor BcastM;
 | 
			
		||||
 | 
			
		||||
#pragma omp for 
 | 
			
		||||
  for(auto site=0;site<vol;site++){
 | 
			
		||||
 | 
			
		||||
    for(int s=0;s<LLs;s++){
 | 
			
		||||
      int lex = s+LLs*site;
 | 
			
		||||
      spProj5p(SitePplus[s] ,psi[lex]);
 | 
			
		||||
@@ -294,6 +302,8 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
      chi[lex] = SiteChi[s]*0.5;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
  MooeeInvTime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
INSTANTIATE_DPERP(DomainWallVec5dImplD);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user