1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-13 04:37:05 +01:00

Overlap comms compute changes

This commit is contained in:
paboyle
2016-01-10 19:20:16 +00:00
parent c99d748da6
commit d19321dfde
8 changed files with 220 additions and 184 deletions

View File

@ -58,7 +58,6 @@ namespace QCD {
UmuOdd (&Hgrid)
{
// Allocate the required comms buffer
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
ImportGauge(_Umu);
}
@ -153,7 +152,7 @@ namespace QCD {
FermionField Atilde(B._grid);
Atilde = A;
st.HaloExchange(B,comm_buf,compressor);
st.HaloExchange(B,compressor);
for(int mu=0;mu<Nd;mu++){
@ -168,7 +167,7 @@ namespace QCD {
////////////////////////
PARALLEL_FOR_LOOP
for(int sss=0;sss<B._grid->oSites();sss++){
Kernels::DiracOptDhopDir(st,U,comm_buf,sss,sss,B,Btilde,mu,gamma);
Kernels::DiracOptDhopDir(st,U,st.comm_buf,sss,sss,B,Btilde,mu,gamma);
}
//////////////////////////////////////////////////
@ -274,11 +273,11 @@ PARALLEL_FOR_LOOP
Compressor compressor(dag);
Stencil.HaloExchange(in,comm_buf,compressor);
Stencil.HaloExchange(in,compressor);
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sss,sss,in,out,dirdisp,gamma);
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sss,sss,in,out,dirdisp,gamma);
}
};
@ -300,30 +299,30 @@ PARALLEL_FOR_LOOP
assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag);
st.HaloExchange(in,comm_buf,compressor);
st.HaloExchange(in,compressor);
if ( dag == DaggerYes ) {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out);
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out);
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
}
}
} else {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out);
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out);
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
}
}
@ -338,8 +337,7 @@ PARALLEL_FOR_LOOP
Compressor compressor(dag);
std::thread comms_thread = st.HaloExchangeBegin(in,comm_buf,compressor);
comms_thread.join();
auto handle = st.HaloExchangeBegin(in,compressor);
bool local = true;
bool nonlocal = false;
@ -347,28 +345,29 @@ PARALLEL_FOR_LOOP
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
}
} else {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
}
}
st.HaloExchangeComplete(handle);
local = false;
nonlocal = true;
@ -376,24 +375,24 @@ PARALLEL_FOR_LOOP
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
}
} else {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
}
}

View File

@ -152,9 +152,6 @@ namespace Grid {
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
// Comms buffer
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
};

View File

@ -98,12 +98,11 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
}
// Allocate the required comms buffer
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
ImportGauge(_Umu);
commtime=0;
jointime=0;
dslashtime=0;
dslash1time=0;
}
template<class Impl>
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
@ -121,7 +120,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
Compressor compressor(DaggerNo);
Stencil.HaloExchange(in,comm_buf,compressor);
Stencil.HaloExchange(in,compressor);
int skip = (disp==1) ? 0 : 1;
@ -136,7 +135,7 @@ PARALLEL_FOR_LOOP
for(int s=0;s<Ls;s++){
int sU=ss;
int sF = s+Ls*sU;
Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sF,sU,in,out,dirdisp,gamma);
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma);
}
}
};
@ -159,7 +158,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
FermionField Btilde(B._grid);
FermionField Atilde(B._grid);
st.HaloExchange(B,comm_buf,compressor);
st.HaloExchange(B,compressor);
Atilde=A;
@ -184,7 +183,7 @@ PARALLEL_FOR_LOOP
assert ( sF< B._grid->oSites());
assert ( sU< U._grid->oSites());
Kernels::DiracOptDhopDir(st,U,comm_buf,sF,sU,B,Btilde,mu,gamma);
Kernels::DiracOptDhopDir(st,U,st.comm_buf,sF,sU,B,Btilde,mu,gamma);
////////////////////////////
// spin trace outer product
@ -236,9 +235,10 @@ template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "join time "<<jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
std::cout<<GridLogMessage << "join time "<<jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil All time "<<Stencil.halotime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil nosplice time "<<Stencil.nosplicetime<<" us"<<std::endl;
@ -299,11 +299,11 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
int nwork = U._grid->oSites();
commtime -=usecond();
std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor);
auto handle = st.HaloExchangeBegin(in,compressor);
st.HaloExchangeComplete(handle);
commtime +=usecond();
jointime -=usecond();
thr.join();
jointime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
@ -319,7 +319,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
@ -330,7 +330,7 @@ PARALLEL_FOR_LOOP
for(sd=0;sd<Ls;sd++){
int sU=ss;
int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
}
}
@ -362,7 +362,7 @@ PARALLEL_FOR_LOOP
sU = lo.Reorder(sU);
}
sF = s+Ls*sU;
Kernels::DiracOptAsmDhopSite(st,U,comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
}
}
}
@ -387,7 +387,7 @@ PARALLEL_FOR_LOOP
sU=ss+ ssoff;
for(int s=soff;s<soff+swork;s++){
sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
}
@ -398,7 +398,7 @@ PARALLEL_FOR_LOOP
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
@ -407,7 +407,7 @@ PARALLEL_FOR_LOOP
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out);
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
}
@ -432,7 +432,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
int nwork = U._grid->oSites();
commtime -=usecond();
std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor);
auto handle = st.HaloExchangeBegin(in,compressor);
commtime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
@ -450,7 +450,7 @@ PARALLEL_FOR_LOOP
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
@ -461,7 +461,7 @@ PARALLEL_FOR_LOOP
for(sd=0;sd<Ls;sd++){
int sU=ss;
int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
@ -473,7 +473,7 @@ PARALLEL_FOR_LOOP
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
@ -482,7 +482,7 @@ PARALLEL_FOR_LOOP
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
@ -490,12 +490,12 @@ PARALLEL_FOR_LOOP
dslashtime +=usecond();
jointime -=usecond();
thr.join();
st.HaloExchangeComplete(handle);
jointime +=usecond();
local = false;
nonlocal = true;
dslashtime -=usecond();
dslash1time -=usecond();
if ( dag == DaggerYes ) {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
@ -503,7 +503,7 @@ PARALLEL_FOR_LOOP
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
@ -514,7 +514,7 @@ PARALLEL_FOR_LOOP
for(sd=0;sd<Ls;sd++){
int sU=ss;
int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
@ -526,7 +526,7 @@ PARALLEL_FOR_LOOP
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
@ -535,13 +535,12 @@ PARALLEL_FOR_LOOP
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
}
dslashtime +=usecond();
dslash1time +=usecond();
}

View File

@ -64,6 +64,7 @@ namespace Grid {
double jointime;
double commtime;
double dslashtime;
double dslash1time;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////