Adding a trial for openmp overhead minimisation

2026-07-26 21:43:27 +01:00 · 2016-03-31 16:00:37 +01:00
parent f7b1060aed
commit e67fc2be18
2 changed files with 127 additions and 0 deletions
@@ -418,6 +418,126 @@ PARALLEL_FOR_LOOP
  alltime+=usecond();
 }

+template<class Impl>
+void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo,
+						 DoubledGaugeField & U,
+						 const FermionField &in, FermionField &out,int dag)
+{
+  // Trial benchmark of the Dhop site loops: one halo exchange, then the kernel
+  // sweep is repeated bench_reps times inside a single OpenMP parallel region,
+  // presumably so the cost of opening the region is paid once, not per sweep.
+  //   st  : stencil; performs the halo exchange and supplies comm_buf
+  //   lo  : Lebesgue reordering, consulted only on the asm kernel path
+  //   U   : doubled gauge field; U._grid->oSites() is the outer site count
+  //   in  : source fermion field       out : destination fermion field
+  //   dag : DaggerYes selects the *Dag kernels, any other value the plain ones
+  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
+  alltime-=usecond();
+  Compressor compressor(dag);
+
+  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
+  int threads = GridThread::GetThreads();
+  int HT      = GridThread::GetHyperThreads();
+  int cores   = GridThread::GetCores();
+  int nwork   = U._grid->oSites();
+  const int bench_reps = 1000; // back-to-back sweeps timed as one block
+
+  commtime -=usecond();
+  auto handle = st.HaloExchangeBegin(in,compressor);
+  st.HaloExchangeComplete(handle);
+  commtime +=usecond();
+
+  // Placeholder: nothing is executed between these two timer calls.
+  jointime -=usecond();
+  jointime +=usecond();
+
+  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
+  // Note loop ordering and data layout.
+  // Designed to create 
+  // - per thread reuse in L1 cache for U
+  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
+
+  // Start the timer once, on the calling thread, before the team is created.
+  // Doing it inside the parallel region would make every thread modify the
+  // shared counter unsynchronised on each repetition (a data race), and those
+  // many starts would not pair with the single stop after the region.
+  dslashtime -=usecond();
+#pragma omp parallel 
+  {
+    for(int rep=0;rep<bench_reps;rep++){
+      // Line the whole team up before each repetition begins.
+#pragma omp barrier
+      if ( dag == DaggerYes ) {
+        if( this->HandOptDslash ) {
+#pragma omp for
+          for(int ss=0;ss<nwork;ss++){
+            int sU=ss;
+            for(int s=0;s<Ls;s++){
+              int sF = s+Ls*sU;
+              Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
+            }
+          }
+        } else {
+#pragma omp for
+          for(int ss=0;ss<nwork;ss++){
+            int sU=ss;
+            for(int s=0;s<Ls;s++){
+              int sF = s+Ls*sU;
+              Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
+            }
+          }
+        }
+      } else {
+        if( this->AsmOptDslash ) {
+          // One loop iteration per thread id: t is split into (core, hyperthread);
+          // GetWork hands each core a slice of the 4d sites and each hyperthread a slice of s.
+#pragma omp for
+          for(int t=0;t<threads;t++){
+            int hyperthread = t%HT;
+            int core        = t/HT;
+
+            int sswork, swork,soff,ssoff;
+            GridThread::GetWork(nwork,core,sswork,ssoff,cores);
+            GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
+
+            for(int ss=0;ss<sswork;ss++){
+              for(int s=soff;s<soff+swork;s++){
+                int sU=ss+ ssoff;
+                if ( LebesgueOrder::UseLebesgueOrder ) {
+                  sU = lo.Reorder(sU);
+                }
+                int sF = s+Ls*sU;
+                Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+              }
+            }
+          }
+        } else if( this->HandOptDslash ) {
+#pragma omp for
+          for(int ss=0;ss<nwork;ss++){
+            int sU=ss;
+            for(int s=0;s<Ls;s++){
+              int sF = s+Ls*sU;
+              Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+            }
+          }
+        } else {
+#pragma omp for
+          for(int ss=0;ss<nwork;ss++){
+            int sU=ss;
+            for(int s=0;s<Ls;s++){
+              int sF = s+Ls*sU;
+              Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
+            }
+          }
+        }
+      }
+    }
+  }
+  // The team has joined; stop the timers on the calling thread.
+  dslashtime +=usecond();
+  alltime+=usecond();
+}
+
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
 						     DoubledGaugeField & U,
@@ -120,6 +120,13 @@ namespace Grid {
 			FermionField &out,
 			int dag);

+      void DhopInternalOMPbench(StencilImpl & st, // OpenMP-overhead trial of Dhop; st: stencil used for the halo exchange
+				LebesgueOrder &lo,        // lo: Lebesgue site reordering (asm kernel path only)
+				DoubledGaugeField &U,     // U: doubled gauge field; its grid sets the outer site count
+				const FermionField &in,   // in: source fermion field
+				FermionField &out,        // out: destination fermion field
+				int dag);                 // dag: DaggerYes selects the dagger kernels
+
      void DhopInternalCommsThenCompute(StencilImpl & st,
 			LebesgueOrder &lo,
 			DoubledGaugeField &U,