mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-13 04:37:05 +01:00
Overlap comms compute changes
This commit is contained in:
@ -58,7 +58,6 @@ namespace QCD {
|
||||
UmuOdd (&Hgrid)
|
||||
{
|
||||
// Allocate the required comms buffer
|
||||
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
|
||||
ImportGauge(_Umu);
|
||||
}
|
||||
|
||||
@ -153,7 +152,7 @@ namespace QCD {
|
||||
FermionField Atilde(B._grid);
|
||||
Atilde = A;
|
||||
|
||||
st.HaloExchange(B,comm_buf,compressor);
|
||||
st.HaloExchange(B,compressor);
|
||||
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
|
||||
@ -168,7 +167,7 @@ namespace QCD {
|
||||
////////////////////////
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<B._grid->oSites();sss++){
|
||||
Kernels::DiracOptDhopDir(st,U,comm_buf,sss,sss,B,Btilde,mu,gamma);
|
||||
Kernels::DiracOptDhopDir(st,U,st.comm_buf,sss,sss,B,Btilde,mu,gamma);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
@ -274,11 +273,11 @@ PARALLEL_FOR_LOOP
|
||||
|
||||
Compressor compressor(dag);
|
||||
|
||||
Stencil.HaloExchange(in,comm_buf,compressor);
|
||||
Stencil.HaloExchange(in,compressor);
|
||||
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sss,sss,in,out,dirdisp,gamma);
|
||||
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sss,sss,in,out,dirdisp,gamma);
|
||||
}
|
||||
|
||||
};
|
||||
@ -300,30 +299,30 @@ PARALLEL_FOR_LOOP
|
||||
assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
|
||||
Compressor compressor(dag);
|
||||
st.HaloExchange(in,comm_buf,compressor);
|
||||
st.HaloExchange(in,compressor);
|
||||
|
||||
if ( dag == DaggerYes ) {
|
||||
if( HandOptDslash ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out);
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out);
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if( HandOptDslash ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out);
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out);
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -338,8 +337,7 @@ PARALLEL_FOR_LOOP
|
||||
|
||||
Compressor compressor(dag);
|
||||
|
||||
std::thread comms_thread = st.HaloExchangeBegin(in,comm_buf,compressor);
|
||||
comms_thread.join();
|
||||
auto handle = st.HaloExchangeBegin(in,compressor);
|
||||
|
||||
bool local = true;
|
||||
bool nonlocal = false;
|
||||
@ -347,28 +345,29 @@ PARALLEL_FOR_LOOP
|
||||
if( HandOptDslash ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if( HandOptDslash ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
st.HaloExchangeComplete(handle);
|
||||
|
||||
local = false;
|
||||
nonlocal = true;
|
||||
@ -376,24 +375,24 @@ PARALLEL_FOR_LOOP
|
||||
if( HandOptDslash ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if( HandOptDslash ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -152,9 +152,6 @@ namespace Grid {
|
||||
DoubledGaugeField Umu;
|
||||
DoubledGaugeField UmuEven;
|
||||
DoubledGaugeField UmuOdd;
|
||||
|
||||
// Comms buffer
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
|
||||
};
|
||||
|
||||
|
@ -98,12 +98,11 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
||||
}
|
||||
|
||||
// Allocate the required comms buffer
|
||||
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
|
||||
|
||||
ImportGauge(_Umu);
|
||||
commtime=0;
|
||||
jointime=0;
|
||||
dslashtime=0;
|
||||
dslash1time=0;
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
|
||||
@ -121,7 +120,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
|
||||
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
|
||||
|
||||
Compressor compressor(DaggerNo);
|
||||
Stencil.HaloExchange(in,comm_buf,compressor);
|
||||
Stencil.HaloExchange(in,compressor);
|
||||
|
||||
int skip = (disp==1) ? 0 : 1;
|
||||
|
||||
@ -136,7 +135,7 @@ PARALLEL_FOR_LOOP
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sU=ss;
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sF,sU,in,out,dirdisp,gamma);
|
||||
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma);
|
||||
}
|
||||
}
|
||||
};
|
||||
@ -159,7 +158,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
|
||||
FermionField Btilde(B._grid);
|
||||
FermionField Atilde(B._grid);
|
||||
|
||||
st.HaloExchange(B,comm_buf,compressor);
|
||||
st.HaloExchange(B,compressor);
|
||||
|
||||
Atilde=A;
|
||||
|
||||
@ -184,7 +183,7 @@ PARALLEL_FOR_LOOP
|
||||
assert ( sF< B._grid->oSites());
|
||||
assert ( sU< U._grid->oSites());
|
||||
|
||||
Kernels::DiracOptDhopDir(st,U,comm_buf,sF,sU,B,Btilde,mu,gamma);
|
||||
Kernels::DiracOptDhopDir(st,U,st.comm_buf,sF,sU,B,Btilde,mu,gamma);
|
||||
|
||||
////////////////////////////
|
||||
// spin trace outer product
|
||||
@ -236,9 +235,10 @@ template<class Impl>
|
||||
void WilsonFermion5D<Impl>::Report(void)
|
||||
{
|
||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "join time "<<jointime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "join time "<<jointime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil All time "<<Stencil.halotime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil nosplice time "<<Stencil.nosplicetime<<" us"<<std::endl;
|
||||
@ -299,11 +299,11 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
|
||||
int nwork = U._grid->oSites();
|
||||
|
||||
commtime -=usecond();
|
||||
std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor);
|
||||
auto handle = st.HaloExchangeBegin(in,compressor);
|
||||
st.HaloExchangeComplete(handle);
|
||||
commtime +=usecond();
|
||||
|
||||
jointime -=usecond();
|
||||
thr.join();
|
||||
jointime +=usecond();
|
||||
|
||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||
@ -319,7 +319,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -330,7 +330,7 @@ PARALLEL_FOR_LOOP
|
||||
for(sd=0;sd<Ls;sd++){
|
||||
int sU=ss;
|
||||
int sF = sd+Ls*sU;
|
||||
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -362,7 +362,7 @@ PARALLEL_FOR_LOOP
|
||||
sU = lo.Reorder(sU);
|
||||
}
|
||||
sF = s+Ls*sU;
|
||||
Kernels::DiracOptAsmDhopSite(st,U,comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
|
||||
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -387,7 +387,7 @@ PARALLEL_FOR_LOOP
|
||||
sU=ss+ ssoff;
|
||||
for(int s=soff;s<soff+swork;s++){
|
||||
sF = s+Ls*sU;
|
||||
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -398,7 +398,7 @@ PARALLEL_FOR_LOOP
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -407,7 +407,7 @@ PARALLEL_FOR_LOOP
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out);
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -432,7 +432,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
|
||||
int nwork = U._grid->oSites();
|
||||
|
||||
commtime -=usecond();
|
||||
std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor);
|
||||
auto handle = st.HaloExchangeBegin(in,compressor);
|
||||
commtime +=usecond();
|
||||
|
||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||
@ -450,7 +450,7 @@ PARALLEL_FOR_LOOP
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -461,7 +461,7 @@ PARALLEL_FOR_LOOP
|
||||
for(sd=0;sd<Ls;sd++){
|
||||
int sU=ss;
|
||||
int sF = sd+Ls*sU;
|
||||
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -473,7 +473,7 @@ PARALLEL_FOR_LOOP
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -482,7 +482,7 @@ PARALLEL_FOR_LOOP
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -490,12 +490,12 @@ PARALLEL_FOR_LOOP
|
||||
dslashtime +=usecond();
|
||||
|
||||
jointime -=usecond();
|
||||
thr.join();
|
||||
st.HaloExchangeComplete(handle);
|
||||
jointime +=usecond();
|
||||
|
||||
local = false;
|
||||
nonlocal = true;
|
||||
dslashtime -=usecond();
|
||||
dslash1time -=usecond();
|
||||
if ( dag == DaggerYes ) {
|
||||
if( this->HandOptDslash ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
@ -503,7 +503,7 @@ PARALLEL_FOR_LOOP
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -514,7 +514,7 @@ PARALLEL_FOR_LOOP
|
||||
for(sd=0;sd<Ls;sd++){
|
||||
int sU=ss;
|
||||
int sF = sd+Ls*sU;
|
||||
Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -526,7 +526,7 @@ PARALLEL_FOR_LOOP
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -535,13 +535,12 @@ PARALLEL_FOR_LOOP
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
dslashtime +=usecond();
|
||||
|
||||
dslash1time +=usecond();
|
||||
|
||||
}
|
||||
|
||||
|
@ -64,6 +64,7 @@ namespace Grid {
|
||||
double jointime;
|
||||
double commtime;
|
||||
double dslashtime;
|
||||
double dslash1time;
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Implement the abstract base
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
Reference in New Issue
Block a user