1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-19 00:07:05 +01:00

Improvements to the assembler interface that let us move chunks of the

site and s loop into the kernels. This will save on function call overhead and
guarantee L2 prefetching strategy is right since OMP can't distribute the
sub-chunks of work.
This commit is contained in:
paboyle
2016-06-09 01:12:36 -07:00
parent d9408893b3
commit 55f65b81b5
10 changed files with 77 additions and 87 deletions

View File

@ -95,11 +95,6 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
// Allocate the required comms buffer
ImportGauge(_Umu);
alltime=0;
commtime=0;
jointime=0;
dslashtime=0;
dslash1time=0;
}
template<class Impl>
@ -291,33 +286,6 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
}
template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
return;
#if 0
std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
std::cout<<GridLogMessage << "Wilson5d time "<<alltime <<" us"<<std::endl;
std::cout<<GridLogMessage << "HaloBegin time "<<commtime <<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
std::cout<<GridLogMessage << "HaloComplete time "<<jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "******************** Stencil"<<std::endl;
std::cout<<GridLogMessage << "Stencil all gather time "<<Stencil.halogtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil nosplice gather time "<<Stencil.nosplicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil splice gather time "<<Stencil.splicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil gather "<<Stencil.gathertime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil gather simd "<<Stencil.gathermtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil merge simd "<<Stencil.mergetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil spin simd "<<Stencil.spintime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil MB/s "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil join time "<<Stencil.jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
#endif
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
const FermionField &A,
@ -341,42 +309,28 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
const FermionField &in, FermionField &out,int dag)
{
// assert((dag==DaggerNo) ||(dag==DaggerYes));
alltime-=usecond();
Compressor compressor(dag);
int LLs = in._grid->_rdimensions[0];
commtime -=usecond();
st.HaloExchange(in,compressor);
commtime +=usecond();
jointime -=usecond();
jointime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
dslashtime -=usecond();
if ( dag == DaggerYes ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF=s+LLs*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
int sF=LLs*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=lo.Reorder(ss);
int sU=ss;
int sF=LLs*sU;
for(int s=0;s<LLs;s++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
sF++;
}
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
}
}
dslashtime +=usecond();
alltime+=usecond();
}