From fd1eb7de13f56b7407f90994a7e00d5b12961da2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 26 Apr 2017 02:34:52 -0400 Subject: [PATCH] Clean implementation of the exterior faces listing only those points on the boundary --- lib/qcd/action/fermion/WilsonFermion5D.cc | 57 ++++------------------- 1 file changed, 10 insertions(+), 47 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 13d26086..fb20108b 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -367,6 +367,7 @@ void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, DhopTotalTime+=usecond(); } + template void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U, @@ -380,7 +381,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg int LLs = in._grid->_rdimensions[0]; int len = U._grid->oSites(); - + DhopFaceTime-=usecond(); st.HaloExchangeOptGather(in,compressor); DhopFaceTime+=usecond(); @@ -390,6 +391,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg st.CommunicateBegin(reqs); st.CommsMergeSHM(compressor); + // Perhaps use omp task and region #pragma omp parallel { int nthreads = omp_get_num_threads(); @@ -419,70 +421,31 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg // Load imbalance alert. Should use dynamic schedule OMP for loop // Perhaps create a list of only those sites with face work, and // load balance process the list. 
-#if 1 - -#if 0 -#pragma omp parallel - { - int nthreads = omp_get_num_threads(); - int me = omp_get_thread_num(); - int myoff, mywork; - - GridThread::GetWork(len,me,mywork,myoff,nthreads); - int sF = LLs * myoff; - - // Exterior links in stencil - if ( me==0 ) DhopComputeTime2-=usecond(); - if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1); - else Kernels::DhopSite (st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,0,1); - if ( me==0 ) DhopComputeTime2+=usecond(); - }// end parallel region -#else DhopComputeTime2-=usecond(); if (dag == DaggerYes) { -#pragma omp parallel for schedule(static,1) - for (int ss = 0; ss < st.surface_list.size(); ss++) { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { int sU = st.surface_list[ss]; int sF = LLs * sU; Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); } } else { -#pragma omp parallel for schedule(static,1) - for (int ss = 0; ss < st.surface_list.size(); ss++) { + int sz=st.surface_list.size(); + parallel_for (int ss = 0; ss < sz; ss++) { int sU = st.surface_list[ss]; int sF = LLs * sU; Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); } } DhopComputeTime2+=usecond(); -#endif - - -#else -DhopComputeTime2-=usecond(); - if (dag == DaggerYes) { -#pragma omp parallel for schedule(static,4) - for (int ss = 0; ss < U._grid->oSites(); ss++) { - int sU = ss; - int sF = LLs * sU; - Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); - } - } else { -#pragma omp parallel for schedule(static,1) - for (int ss = 0; ss < U._grid->oSites(); ss++) { - int sU = ss; - int sF = LLs * sU; - Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,0,1); - } - } -DhopComputeTime2+=usecond(); -#endif - #else assert(0); #endif } + + + template void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, DoubledGaugeField & U,