Improvements to the assembler interface that let us move chunks of the site and s loop into the kernels.

This will save on function call overhead and guarantee that the L2 prefetching strategy is right, since OMP can't distribute the sub-chunks of work.
2026-01-16 23:04:42 +00:00 · 2016-06-09 01:12:36 -07:00
parent d9408893b3
commit 55f65b81b5
10 changed files with 77 additions and 87 deletions
--- a/lib/qcd/action/fermion/WilsonFermion.cc
+++ b/lib/qcd/action/fermion/WilsonFermion.cc
@@ -296,12 +296,12 @@ PARALLEL_FOR_LOOP
    if ( dag == DaggerYes ) {
 PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
-	Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
+	Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out);
      }
    } else {
 PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
-	Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
+	Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out);
      }
    }
  };