diff --git a/lib/Stencil.h b/lib/Stencil.h index da5cc51a..f48fc6f1 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -465,8 +465,6 @@ namespace Grid { std::thread HaloExchangeBegin(const Lattice &source,std::vector > & u_comm_buf,compressor &compress) { return std::thread([&] { this->HaloExchangeBlocking(source,u_comm_buf,compress); }); - // std::thread t(&HaloExchangeBlocking,this,source,u_comm_buf,compress); - // return t; } void HaloExchangeBlocking(const Lattice &source,std::vector > &u_comm_buf,compressor &compress) diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 2e880c5b..9a1669b6 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -102,6 +102,7 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, ImportGauge(_Umu); commtime=0; + jointime=0; dslashtime=0; } template @@ -237,6 +238,7 @@ void WilsonFermion5D::Report(void) std::cout<::DhopInternalCommsThenCompute(StencilImpl & st, Lebes int nwork = U._grid->oSites(); commtime -=usecond(); - st.HaloExchange(in,comm_buf,compressor); + std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor); commtime +=usecond(); + + jointime -=usecond(); + thr.join(); + jointime +=usecond(); // Dhop takes the 4d grid from U, and makes a 5d index for fermion // Not loop ordering and data layout. @@ -483,9 +489,9 @@ PARALLEL_FOR_LOOP } dslashtime +=usecond(); - commtime -=usecond(); + jointime -=usecond(); thr.join(); - commtime +=usecond(); + jointime +=usecond(); local = false; nonlocal = true; diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h index 6b44141a..efe0f043 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.h +++ b/lib/qcd/action/fermion/WilsonFermion5D.h @@ -61,6 +61,7 @@ namespace Grid { public: INHERIT_IMPL_TYPES(Impl); typedef WilsonKernels Kernels; + double jointime; double commtime; double dslashtime; ///////////////////////////////////////////////////////////////