1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Serialisation in malloc fixed

This commit is contained in:
paboyle 2016-11-29 22:27:55 +00:00
parent 2f92b4860b
commit bd0430b34f
5 changed files with 128 additions and 98 deletions

View File

@ -70,7 +70,7 @@ int main (int argc, char ** argv)
if (1)
{
const int ncall=100;
const int ncall=1000;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
@ -81,18 +81,7 @@ int main (int argc, char ** argv)
LatticeFermion result(FGrid);
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
FGrid->Barrier();
double t0,t1;
t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
t1=usecond();
FGrid->Barrier();
std::cout<<GridLogMessage << "Called Dhop "<< (t1-t0)/ncall<<" us"<<std::endl;
LatticeFermion r_eo(FGrid);
LatticeFermion src_e (FrbGrid);
@ -109,48 +98,44 @@ int main (int argc, char ** argv)
r_e = zero;
r_o = zero;
FGrid->Barrier();
t0=usecond();
for (int i = 0; i < ncall; i++) {
Dw.DhopEO(src_o, r_e, DaggerNo);
}
t1=usecond();
FGrid->Barrier();
std::cout<<GridLogMessage << "Called DhopEO "<< (t1-t0)/ncall<<" us"<<std::endl;
FGrid->Barrier();
t0=usecond();
for (int i = 0; i < ncall; i++) {
Dw.Mooee(src_o, r_o);
}
t1=usecond();
FGrid->Barrier();
std::cout<<GridLogMessage << "Called Mooee "<< (t1-t0)/ncall<<" us"<<std::endl;
#define BENCH_DW(A,in,out) \
Dw.CayleyZeroCounters(); \
FGrid->Barrier(); \
t0=usecond(); \
for(int i=0;i<ncall;i++){ \
Dw. A (in,out); \
} \
t1=usecond(); \
FGrid->Barrier(); \
Dw.CayleyReport(); \
std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
std::cout<<GridLogMessage << "******************"<<std::endl;
FGrid->Barrier();
t0=usecond();
for (int i = 0; i < ncall; i++) {
Dw.MooeeInv(src_o, r_o);
}
t1=usecond();
FGrid->Barrier();
std::cout<<GridLogMessage << "Called MooeeInv "<< (t1-t0)/ncall<<" us"<<std::endl;
#define BENCH_DW_MEO(A,in,out) \
Dw.CayleyZeroCounters(); \
FGrid->Barrier(); \
t0=usecond(); \
for(int i=0;i<ncall;i++){ \
Dw. A (in,out,0); \
} \
t1=usecond(); \
FGrid->Barrier(); \
Dw.CayleyReport(); \
std::cout<<GridLogMessage << "Called " #A " "<< (t1-t0)/ncall<<" us"<<std::endl;\
std::cout<<GridLogMessage << "******************"<<std::endl;
FGrid->Barrier();
t0=usecond();
for (int i = 0; i < ncall; i++) {
Dw.Meooe(src_o, r_e);
}
t1=usecond();
FGrid->Barrier();
std::cout<<GridLogMessage << "Called Meooe "<< (t1-t0)/ncall<<" us"<<std::endl;
BENCH_DW_MEO(Dhop ,src,result);
BENCH_DW_MEO(DhopEO ,src_o,r_e);
BENCH_DW(Meooe ,src_o,r_e);
BENCH_DW(Mooee ,src_o,r_o);
BENCH_DW(MooeeInv,src_o,r_o);
}
if (1)
{
const int ncall=100;
const int ncall=1000;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionVec5dR::Dhop "<<std::endl;
@ -168,14 +153,6 @@ int main (int argc, char ** argv)
FGrid->Barrier();
double t0,t1;
t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
t1=usecond();
FGrid->Barrier();
std::cout<<GridLogMessage << "Called Vec5D Dhop "<< (t1-t0)/ncall<<" us"<<std::endl;
LatticeFermion r_eo(sFGrid);
LatticeFermion src_e (sFrbGrid);
@ -192,46 +169,13 @@ int main (int argc, char ** argv)
r_e = zero;
r_o = zero;
FGrid->Barrier();
t0=usecond();
for (int i = 0; i < ncall; i++) {
Dw.DhopEO(src_o, r_e, DaggerNo);
}
t1=usecond();
FGrid->Barrier();
std::cout<<GridLogMessage << "Called Vec5D DhopEO "<< (t1-t0)/ncall<<" us"<<std::endl;
FGrid->Barrier();
t0=usecond();
for (int i = 0; i < ncall; i++) {
Dw.Mooee(src_o, r_o);
}
t1=usecond();
FGrid->Barrier();
std::cout<<GridLogMessage << "Called Vec5D Mooee "<< (t1-t0)/ncall<<" us"<<std::endl;
FGrid->Barrier();
t0=usecond();
for (int i = 0; i < ncall; i++) {
Dw.MooeeInv(src_o, r_o);
}
t1=usecond();
FGrid->Barrier();
std::cout<<GridLogMessage << "Called Vec5D MooeeInv "<< (t1-t0)/ncall<<" us"<<std::endl;
FGrid->Barrier();
t0=usecond();
for (int i = 0; i < ncall; i++) {
Dw.Meooe(src_o, r_e);
}
t1=usecond();
FGrid->Barrier();
std::cout<<GridLogMessage << "Called Vec5D Meooe "<< (t1-t0)/ncall<<" us"<<std::endl;
BENCH_DW_MEO(Dhop ,src,result);
BENCH_DW_MEO(DhopEO ,src_o,r_e);
BENCH_DW(Meooe ,src_o,r_e);
BENCH_DW(Mooee ,src_o,r_o);
BENCH_DW(MooeeInv,src_o,r_o);
}
Grid_finalize();
}

View File

@ -62,6 +62,50 @@ void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp,s,s);// chi = (1-c[s] D_W) psi
}
}
template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
{
this->Report();
std::vector<int> latt = GridDefaultLatt();
RealD volume = this->Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = this->_FourDimGrid->_Nprocessors;
if ( M5Dcalls > 0 ) {
std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls : " << M5Dcalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << M5Dtime / M5Dcalls << " us" << std::endl;
// Flops = 6.0*(Nc*Ns) *Ls*vol
RealD mflops = 6.0*12*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
}
if ( MooeeInvCalls > 0 ) {
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
// Flops = 9*12*Ls*vol/2
RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
}
}
template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
{
this->ZeroCounters();
M5Dflops=0;
M5Dcalls=0;
M5Dtime=0;
MooeeInvFlops=0;
MooeeInvCalls=0;
MooeeInvTime=0;
}
template<class Impl>
void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi)
{

View File

@ -120,6 +120,18 @@ namespace Grid {
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
void CayleyReport(void);
void CayleyZeroCounters(void);
double M5Dflops;
double M5Dcalls;
double M5Dtime;
double MooeeInvFlops;
double MooeeInvCalls;
double MooeeInvTime;
protected:
void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);

View File

@ -51,6 +51,9 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
GridBase *grid=psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
M5Dcalls++;
M5Dtime-=usecond();
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
for(int s=0;s<Ls;s++){
@ -76,6 +79,7 @@ PARALLEL_FOR_LOOP
}
}
}
M5Dtime+=usecond();
}
template<class Impl>
@ -91,6 +95,9 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
M5Dcalls++;
M5Dtime-=usecond();
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
auto tmp = psi._odata[0];
@ -116,6 +123,7 @@ PARALLEL_FOR_LOOP
}
}
}
M5Dtime+=usecond();
}
template<class Impl>
@ -126,10 +134,14 @@ void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &
chi.checkerboard=psi.checkerboard;
MooeeInvCalls++;
MooeeInvTime-=usecond();
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
auto tmp = psi._odata[0];
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
// Apply (L^{\prime})^{-1}
chi[ss]=psi[ss]; // chi[0]=psi[0]
for(int s=1;s<Ls;s++){
@ -155,6 +167,9 @@ PARALLEL_FOR_LOOP
chi[ss+s] = chi[ss+s] - uee[s]*tmp;
}
}
MooeeInvTime+=usecond();
}
template<class Impl>
@ -166,6 +181,8 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &
assert(psi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard;
MooeeInvCalls++;
MooeeInvTime-=usecond();
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
@ -197,6 +214,9 @@ PARALLEL_FOR_LOOP
chi[ss+s] = chi[ss+s] - lee[s]*tmp;
}
}
MooeeInvTime+=usecond();
}
#ifdef CAYLEY_DPERP_CACHE

View File

@ -71,6 +71,7 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
chi.checkerboard=psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type * u_p = (scalar_type *)&u[0];
@ -86,6 +87,8 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
d_p[ss] = diag[s];
}}
M5Dcalls++;
M5Dtime-=usecond();
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
@ -115,6 +118,7 @@ PARALLEL_FOR_LOOP
}
}
M5Dtime+=usecond();
}
template<class Impl>
@ -154,6 +158,8 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
d_p[ss] = diag[s];
}}
M5Dcalls++;
M5Dtime-=usecond();
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
@ -183,6 +189,7 @@ PARALLEL_FOR_LOOP
}
}
M5Dtime+=usecond();
}
template<class Impl>
@ -250,13 +257,11 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField
}
}
MooeeInvCalls++;
MooeeInvTime-=usecond();
// Dynamic allocate on stack to get per thread without serialised heap acces
PARALLEL_FOR_LOOP
for(auto site=0;site<vol;site++){
// SiteHalfSpinor *SitePplus =(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
// SiteHalfSpinor *SitePminus=(SiteHalfSpinor *) alloca(LLs*sizeof(SiteHalfSpinor));
// SiteSpinor *SiteChi =(SiteSpinor *) alloca(LLs*sizeof(SiteSpinor));
#pragma omp parallel
{
Vector<SiteHalfSpinor> SitePplus(LLs);
Vector<SiteHalfSpinor> SitePminus(LLs);
@ -267,6 +272,9 @@ PARALLEL_FOR_LOOP
SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM;
#pragma omp for
for(auto site=0;site<vol;site++){
for(int s=0;s<LLs;s++){
int lex = s+LLs*site;
spProj5p(SitePplus[s] ,psi[lex]);
@ -294,6 +302,8 @@ PARALLEL_FOR_LOOP
chi[lex] = SiteChi[s]*0.5;
}
}
}
MooeeInvTime+=usecond();
}
INSTANTIATE_DPERP(DomainWallVec5dImplD);