diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index b78f030e..581a3fc5 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -418,6 +418,126 @@ PARALLEL_FOR_LOOP alltime+=usecond(); } +template +void WilsonFermion5D::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo, + DoubledGaugeField & U, + const FermionField &in, FermionField &out,int dag) +{ + // assert((dag==DaggerNo) ||(dag==DaggerYes)); + alltime-=usecond(); + Compressor compressor(dag); + + // Assume balanced KMP_AFFINITY; this is forced in GridThread.h + + int threads = GridThread::GetThreads(); + int HT = GridThread::GetHyperThreads(); + int cores = GridThread::GetCores(); + int nwork = U._grid->oSites(); + + commtime -=usecond(); + auto handle = st.HaloExchangeBegin(in,compressor); + st.HaloExchangeComplete(handle); + commtime +=usecond(); + + jointime -=usecond(); + jointime +=usecond(); + + // Dhop takes the 4d grid from U, and makes a 5d index for fermion + // Not loop ordering and data layout. + // Designed to create + // - per thread reuse in L1 cache for U + // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable. + +#pragma omp parallel + { + for(int jjj=0;jjj<1000;jjj++){ +#pragma omp barrier + dslashtime -=usecond(); + if ( dag == DaggerYes ) { + if( this->HandOptDslash ) { +#pragma omp for + for(int ss=0;ssoSites();ss++){ + int sU=ss; + for(int s=0;soSites();ss++){ + { + int sd; + for(sd=0;sdAsmOptDslash ) { + // for(int i=0;i<1;i++){ + // for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){ + // PerformanceCounter Counter(i); + // Counter.Start(); + +#pragma omp for + for(int t=0;tHandOptDslash ) { +#pragma omp for + + for(int ss=0;ssoSites();ss++){ + int sU=ss; + for(int s=0;soSites();ss++){ + int sU=ss; + for(int s=0;s void WilsonFermion5D::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h index 30e663e8..164a3c1a 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.h +++ b/lib/qcd/action/fermion/WilsonFermion5D.h @@ -120,6 +120,13 @@ namespace Grid { FermionField &out, int dag); + void DhopInternalOMPbench(StencilImpl & st, + LebesgueOrder &lo, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, + int dag); + void DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField &U,