Improvements to the assembler interface that let us move chunks of the

site and s loop into the kernels. This will save on function call overhead and guarantee that the L2 prefetching strategy is right, since OMP can't distribute the sub-chunks of work.
2025-07-31 11:47:06 +01:00 · 2016-06-09 01:12:36 -07:00
parent d9408893b3
commit 55f65b81b5
10 changed files with 77 additions and 87 deletions
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -132,19 +132,21 @@ int main (int argc, char ** argv)
  
  RealD NP = UGrid->_Nprocessors;

-  for(int doasm=0;doasm<1;doasm++){
+  for(int doasm=1;doasm<2;doasm++){

    QCD::WilsonKernelsStatic::AsmOpt=doasm;

  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
  
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-  int ncall =50;
+  int ncall =10;
  if (1) {

    double t0=usecond();
    for(int i=0;i<ncall;i++){
+      __SSC_START;
      Dw.Dhop(src,result,0);
+      __SSC_STOP;
    }
    double t1=usecond();