From a2ff068e297347bc24a987ad4277e92761e4d380 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Fri, 6 Nov 2015 03:47:14 -0800
Subject: [PATCH] Asm and threading for many core

---
 lib/qcd/action/fermion/WilsonFermion5D.cc | 83 +++++++++++++++++++++--
 1 file changed, 79 insertions(+), 4 deletions(-)
diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc
index 66ca67d5..dde1fcac 100644
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -1,4 +1,5 @@
 #include <Grid.h>
+#include <PerfCount.h>
 
 namespace Grid {
 namespace QCD {
@@ -7,6 +8,7 @@ namespace QCD {
 const std::vector<int> WilsonFermion5DStatic::directions   ({1,2,3,4, 1, 2, 3, 4});
 const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
 int WilsonFermion5DStatic::HandOptDslash;
+int WilsonFermion5DStatic::AsmOptDslash;
 
   // 5d lattice for DWF.
 template<class Impl>
@@ -220,6 +222,13 @@ void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &l
 
   Compressor compressor(dag);
 
+  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
+
+  int threads = GridThread::GetThreads();
+  int HT      = GridThread::GetHyperThreads();
+  int cores   = GridThread::GetCores();
+  int nwork = U._grid->oSites();
+  
   st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
   
   // Dhop takes the 4d grid from U, and makes a 5d index for fermion
@@ -229,10 +238,13 @@ void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &l
   // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
   if ( dag == DaggerYes ) {
     if( this->HandOptDslash ) {
-PARALLEL_FOR_LOOP
+#pragma omp parallel for schedule(static)
       for(int ss=0;ss<U._grid->oSites();ss++){
 	for(int s=0;s<Ls;s++){
 	  int sU=ss;
+	  if (    LebesgueOrder::UseLebesgueOrder ) {
+	    sU=lo.Reorder(ss);
+	  }
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
 	  }
@@ -251,16 +263,79 @@ PARALLEL_FOR_LOOP
       }
     }
   } else {
-    if( this->HandOptDslash ) {
-PARALLEL_FOR_LOOP
+    if( this->AsmOptDslash ) {
+      //      for(int i=0;i<1;i++){
+      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+      //	PerformanceCounter Counter(i);
+      //	Counter.Start();
+
+#pragma omp parallel for 
+      for(int t=0;t<threads;t++){
+
+	int hyperthread = t%HT;
+	int core        = t/HT;
+
+        int sswork, swork,soff,ssoff,  sU,sF;
+	
+	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
+	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
+
+	for(int ss=0;ss<sswork;ss++){
+	  for(int s=soff;s<soff+swork;s++){
+
+	    sU=ss+ ssoff;
+
+	    if ( LebesgueOrder::UseLebesgueOrder ) {
+	      sU = lo.Reorder(sU);
+	    }
+	    sF = s+Ls*sU;
+	    Kernels::DiracOptAsmDhopSite(st,U,comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
+	  }
+	}
+      }
+      //      Counter.Stop();
+      //      Counter.Report();
+      //      }
+    } else if( this->HandOptDslash ) {
+
+#pragma omp parallel for 
+      for(int t=0;t<threads;t++){
+
+	int hyperthread = t%HT;
+	int core        = t/HT;
+
+        int sswork, swork,soff, sU,sF;
+
+	sswork = (nwork + cores-1)/cores;
+	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
+
+	for(int ss=0;ss<sswork;ss++){
+	  sU=ss+core*sswork; // max locality within an L2 slice
+	  if ( LebesgueOrder::UseLebesgueOrder ) {
+	    sU = lo.Reorder(sU);
+	  }
+	  if ( sU < nwork ) {
+	    for(int s=soff;s<soff+swork;s++){
+	      sF = s+Ls*sU;
+	      Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
+	    }
+	  }
+	}
+      }
+
+      /*
+#pragma omp parallel for schedule(static)
       for(int ss=0;ss<U._grid->oSites();ss++){
 	for(int s=0;s<Ls;s++){
-	  //	  int sU=lo.Reorder(ss);
 	  int sU=ss;
+	  if (    LebesgueOrder::UseLebesgueOrder ) {
+	    sU=lo.Reorder(ss);
+	  }
 	  int sF = s+Ls*sU;
 	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 	}
       }
+      */
 
     } else { 
 PARALLEL_FOR_LOOP