From a2ff068e297347bc24a987ad4277e92761e4d380 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 6 Nov 2015 03:47:14 -0800 Subject: [PATCH] Asm and threading for many core --- lib/qcd/action/fermion/WilsonFermion5D.cc | 83 +++++++++++++++++++++-- 1 file changed, 79 insertions(+), 4 deletions(-) diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 66ca67d5..dde1fcac 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -1,4 +1,5 @@ #include +#include namespace Grid { namespace QCD { @@ -7,6 +8,7 @@ namespace QCD { const std::vector WilsonFermion5DStatic::directions ({1,2,3,4, 1, 2, 3, 4}); const std::vector WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1}); int WilsonFermion5DStatic::HandOptDslash; +int WilsonFermion5DStatic::AsmOptDslash; // 5d lattice for DWF. template @@ -220,6 +222,13 @@ void WilsonFermion5D::DhopInternal(CartesianStencil & st, LebesgueOrder &l Compressor compressor(dag); + // Assume balanced KMP_AFFINITY; this is forced in GridThread.h + + int threads = GridThread::GetThreads(); + int HT = GridThread::GetHyperThreads(); + int cores = GridThread::GetCores(); + int nwork = U._grid->oSites(); + st.HaloExchange(in,comm_buf,compressor); // Dhop takes the 4d grid from U, and makes a 5d index for fermion @@ -229,10 +238,13 @@ void WilsonFermion5D::DhopInternal(CartesianStencil & st, LebesgueOrder &l // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable. if ( dag == DaggerYes ) { if( this->HandOptDslash ) { -PARALLEL_FOR_LOOP +#pragma omp parallel for schedule(static) for(int ss=0;ssoSites();ss++){ for(int s=0;sHandOptDslash ) { -PARALLEL_FOR_LOOP + if( this->AsmOptDslash ) { + // for(int i=0;i<1;i++){ + // for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){ + // PerformanceCounter Counter(i); + // Counter.Start(); + +#pragma omp parallel for + for(int t=0;tHandOptDslash ) { + +#pragma omp parallel for + for(int t=0;toSites();ss++){ for(int s=0;s