1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-19 00:07:05 +01:00

Merge branch 'develop' into feature/hmc_generalise

This commit is contained in:
Guido Cossu
2017-05-01 12:13:56 +01:00
69 changed files with 3971 additions and 3179 deletions

View File

@ -118,6 +118,18 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
// Allocate the required comms buffer
ImportGauge(_Umu);
// Build lists of exterior only nodes
int LLs = FiveDimGrid._rdimensions[0];
int vol4;
vol4=FourDimGrid.oSites();
Stencil.BuildSurfaceList(LLs,vol4);
vol4=FourDimRedBlackGrid.oSites();
StencilEven.BuildSurfaceList(LLs,vol4);
StencilOdd.BuildSurfaceList(LLs,vol4);
std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
<<" " << StencilEven.surface_list.size()<<std::endl;
}
template<class Impl>
@ -359,6 +371,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DhopTotalTime+=usecond();
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
@ -372,12 +385,21 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
int LLs = in._grid->_rdimensions[0];
int len = U._grid->oSites();
DhopFaceTime-=usecond();
st.HaloExchangeOptGather(in,compressor);
DhopFaceTime+=usecond();
std::vector<std::vector<CommsRequest_t> > reqs;
// Rely on async comms; start comms before merge of local data
DhopCommTime-=usecond();
st.CommunicateBegin(reqs);
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+=usecond();
// Perhaps use omp task and region
#pragma omp parallel
{
int nthreads = omp_get_num_threads();
@ -388,8 +410,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
int sF = LLs * myoff;
if ( me == 0 ) {
DhopCommTime-=usecond();
st.CommunicateBegin(reqs);
st.CommunicateComplete(reqs);
DhopCommTime+=usecond();
} else {
@ -402,28 +422,37 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
}
DhopFaceTime-=usecond();
st.CommsMerge();
st.CommsMerge(compressor);
DhopFaceTime+=usecond();
#pragma omp parallel
{
int nthreads = omp_get_num_threads();
int me = omp_get_thread_num();
int myoff, mywork;
GridThread::GetWork(len,me,mywork,myoff,nthreads);
int sF = LLs * myoff;
// Exterior links in stencil
if ( me==0 ) DhopComputeTime2-=usecond();
if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1);
else Kernels::DhopSite (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1);
if ( me==0 ) DhopComputeTime2+=usecond();
}// end parallel region
// Load imbalance alert. Should use dynamic schedule OMP for loop
// Perhaps create a list of only those sites with face work, and
// load balance process the list.
DhopComputeTime2-=usecond();
if (dag == DaggerYes) {
int sz=st.surface_list.size();
parallel_for (int ss = 0; ss < sz; ss++) {
int sU = st.surface_list[ss];
int sF = LLs * sU;
Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
}
} else {
int sz=st.surface_list.size();
parallel_for (int ss = 0; ss < sz; ss++) {
int sU = st.surface_list[ss];
int sF = LLs * sU;
Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1);
}
}
DhopComputeTime2+=usecond();
#else
assert(0);
#endif
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
@ -642,7 +671,6 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
}
FermOpTemplateInstantiate(WilsonFermion5D);
GparityFermOpTemplateInstantiate(WilsonFermion5D);