Mirror of https://github.com/paboyle/Grid.git (synced 2025-04-04 11:15:55 +01:00)
Adding a trial for OpenMP overhead minimisation
commit e67fc2be18 (parent f7b1060aed)
@@ -418,6 +418,126 @@ PARALLEL_FOR_LOOP
   alltime+=usecond();
 }
 
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo,
+                                                 DoubledGaugeField & U,
+                                                 const FermionField &in, FermionField &out,int dag)
+{
+  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
+  alltime-=usecond();
+  Compressor compressor(dag);
+
+  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
+  int threads = GridThread::GetThreads();
+  int HT      = GridThread::GetHyperThreads();
+  int cores   = GridThread::GetCores();
+  int nwork   = U._grid->oSites();
+
+  commtime -=usecond();
+  auto handle = st.HaloExchangeBegin(in,compressor);
+  st.HaloExchangeComplete(handle);
+  commtime +=usecond();
+
+  jointime -=usecond();
+  jointime +=usecond();
+
+  // Dhop takes the 4d grid from U, and makes a 5d index for fermion.
+  // Note the loop ordering and data layout.
+  // Designed to create
+  // - per thread reuse in L1 cache for U
+  // - 8 linear, unit stride access streams per thread for the Fermion, so they are hw prefetchable.
+
+#pragma omp parallel
+  {
+    for(int jjj=0;jjj<1000;jjj++){
+#pragma omp barrier
+      dslashtime -=usecond();
+      if ( dag == DaggerYes ) {
+        if( this->HandOptDslash ) {
+#pragma omp for
+          for(int ss=0;ss<U._grid->oSites();ss++){
+            int sU=ss;
+            for(int s=0;s<Ls;s++){
+              int sF = s+Ls*sU;
+              Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
+            }
+          }
+        } else {
+#pragma omp for
+          for(int ss=0;ss<U._grid->oSites();ss++){
+            {
+              int sd;
+              for(sd=0;sd<Ls;sd++){
+                int sU=ss;
+                int sF = sd+Ls*sU;
+                Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
+              }
+            }
+          }
+        }
+      } else {
+        if( this->AsmOptDslash ) {
+          //    for(int i=0;i<1;i++){
+          //    for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+          //      PerformanceCounter Counter(i);
+          //      Counter.Start();
+
+#pragma omp for
+          for(int t=0;t<threads;t++){
+
+            int hyperthread = t%HT;
+            int core        = t/HT;
+
+            int sswork, swork,soff,ssoff, sU,sF;
+
+            GridThread::GetWork(nwork,core,sswork,ssoff,cores);
+            GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
+
+            for(int ss=0;ss<sswork;ss++){
+              for(int s=soff;s<soff+swork;s++){
+
+                sU=ss+ssoff;
+
+                if ( LebesgueOrder::UseLebesgueOrder ) {
+                  sU = lo.Reorder(sU);
+                }
+                sF = s+Ls*sU;
+                Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+              }
+            }
+          }
+          //      Counter.Stop();
+          //      Counter.Report();
+          //    }
+        } else if( this->HandOptDslash ) {
+#pragma omp for
+          for(int ss=0;ss<U._grid->oSites();ss++){
+            int sU=ss;
+            for(int s=0;s<Ls;s++){
+              int sF = s+Ls*sU;
+              Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+            }
+          }
+        } else {
+#pragma omp for
+          for(int ss=0;ss<U._grid->oSites();ss++){
+            int sU=ss;
+            for(int s=0;s<Ls;s++){
+              int sF = s+Ls*sU;
+              Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+            }
+          }
+        }
+      }
+    }
+  }
+  dslashtime +=usecond();
+  alltime+=usecond();
+}
+
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
                                                             DoubledGaugeField & U,
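For orientation, here is a minimal stand-alone sketch of the overhead-minimisation pattern the hunk above trials. This is illustrative C++/OpenMP only, not Grid code: a simple axpy loop stands in for the dslash kernel, and the 1000-iteration repeat mirrors the jjj loop. The point is that the OpenMP team is forked once, and each repetition pays only a barrier plus a workshared loop, rather than a full thread fork/join as a per-iteration "#pragma omp parallel for" would.

// Build with e.g.: g++ -O2 -fopenmp sketch.cc
#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
  const int N = 1 << 20, iters = 1000;
  std::vector<double> x(N, 1.0), y(N, 0.0);

  double t0 = omp_get_wtime();
#pragma omp parallel            // fork the thread team exactly once
  {
    for (int jjj = 0; jjj < iters; jjj++) {
#pragma omp barrier             // per-iteration sync, mirroring the trial
#pragma omp for                 // workshare without re-forking the team
      for (int i = 0; i < N; i++) y[i] += 2.0 * x[i];
    }
  }
  double t1 = omp_get_wtime();
  printf("persistent team: %f s, y[0]=%f\n", t1 - t0, y[0]);
  return 0;
}

Note also the 5d index used in the kernel calls above: sF = s + Ls*sU keeps the fifth-dimension loop innermost, so a thread re-reads the same 4d gauge links U[sU] for all Ls fermion sites, which is the per-thread L1 reuse the in-code comment describes.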
@@ -120,6 +120,13 @@ namespace Grid {
                               FermionField &out,
                               int dag);
 
+    void DhopInternalOMPbench(StencilImpl & st,
+                              LebesgueOrder &lo,
+                              DoubledGaugeField &U,
+                              const FermionField &in,
+                              FermionField &out,
+                              int dag);
+
     void DhopInternalCommsThenCompute(StencilImpl & st,
                                       LebesgueOrder &lo,
                                       DoubledGaugeField &U,
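The AsmOptDslash branch in the first hunk splits work explicitly: physical cores divide the 4d outer sites (nwork = U._grid->oSites()) while the SMT siblings of each core divide the fifth dimension Ls, so hyperthreads sharing a core also share the same gauge links in cache. The exact semantics of GridThread::GetWork are not shown in this diff; the sketch below assumes the natural near-equal contiguous-block split, and GetWorkSketch is a hypothetical stand-in.

#include <cstdio>

// Assumed semantics of GridThread::GetWork(nwork, id, work, off, units):
// split nwork items into 'units' near-equal contiguous blocks and return
// block 'id' as a (work, off) pair; the first nwork%units blocks get one extra.
static void GetWorkSketch(int nwork, int id, int &work, int &off, int units) {
  int base = nwork / units;
  int rem  = nwork % units;
  work = base + (id < rem ? 1 : 0);
  off  = id * base + (id < rem ? id : rem);
}

int main() {
  const int threads = 8, HT = 2, cores = threads / HT;
  const int nwork = 10, Ls = 8;       // e.g. 10 outer sites, Ls = 8
  for (int t = 0; t < threads; t++) {
    int hyperthread = t % HT;         // SMT sibling index within a core
    int core        = t / HT;         // physical-core index
    int sswork, ssoff, swork, soff;
    GetWorkSketch(nwork, core,        sswork, ssoff, cores); // sites per core
    GetWorkSketch(Ls,    hyperthread, swork,  soff,  HT);    // s-slices per sibling
    printf("t=%d core=%d ht=%d sites[%d,%d) s[%d,%d)\n",
           t, core, hyperthread, ssoff, ssoff + sswork, soff, soff + swork);
  }
  return 0;
}

With threads=8 and HT=2 this prints four cores, each owning a contiguous block of sites, with the two SMT siblings of each core each taking half of Ls.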