mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 21:44:33 +00:00 
			
		
		
		
	Adding extra benchmark
This commit is contained in:
		@@ -450,7 +450,7 @@ void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
  {
 | 
			
		||||
  for(int jjj=0;jjj<1000;jjj++){
 | 
			
		||||
  for(int jjj=0;jjj<100;jjj++){
 | 
			
		||||
#pragma omp barrier
 | 
			
		||||
  dslashtime -=usecond();
 | 
			
		||||
  if ( dag == DaggerYes ) {
 | 
			
		||||
@@ -538,6 +538,124 @@ void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder
 | 
			
		||||
  alltime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo,
 | 
			
		||||
						DoubledGaugeField & U,
 | 
			
		||||
						const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
			
		||||
  alltime-=usecond();
 | 
			
		||||
  Compressor compressor(dag);
 | 
			
		||||
 | 
			
		||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  int HT      = GridThread::GetHyperThreads();
 | 
			
		||||
  int cores   = GridThread::GetCores();
 | 
			
		||||
  int nwork = U._grid->oSites();
 | 
			
		||||
  
 | 
			
		||||
  commtime -=usecond();
 | 
			
		||||
  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
			
		||||
  st.HaloExchangeComplete(handle);
 | 
			
		||||
  commtime +=usecond();
 | 
			
		||||
 | 
			
		||||
  jointime -=usecond();
 | 
			
		||||
  jointime +=usecond();
 | 
			
		||||
  
 | 
			
		||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
			
		||||
  // Not loop ordering and data layout.
 | 
			
		||||
  // Designed to create 
 | 
			
		||||
  // - per thread reuse in L1 cache for U
 | 
			
		||||
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
  {
 | 
			
		||||
  for(int jjj=0;jjj<100;jjj++){
 | 
			
		||||
#pragma omp barrier
 | 
			
		||||
  dslashtime -=usecond();
 | 
			
		||||
  if ( dag == DaggerYes ) {
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=0;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	{
 | 
			
		||||
	  int sd;
 | 
			
		||||
	  for(sd=0;sd<Ls;sd++){
 | 
			
		||||
	    int sU=0;
 | 
			
		||||
	    int sF = sd+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    if( this->AsmOptDslash ) {
 | 
			
		||||
      //      for(int i=0;i<1;i++){
 | 
			
		||||
      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
			
		||||
      //	PerformanceCounter Counter(i);
 | 
			
		||||
      //	Counter.Start();
 | 
			
		||||
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int t=0;t<threads;t++){
 | 
			
		||||
 | 
			
		||||
	int hyperthread = t%HT;
 | 
			
		||||
	int core        = t/HT;
 | 
			
		||||
 | 
			
		||||
        int sswork, swork,soff,ssoff,  sU,sF;
 | 
			
		||||
	
 | 
			
		||||
	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 | 
			
		||||
	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
			
		||||
 | 
			
		||||
	for(int ss=0;ss<sswork;ss++){
 | 
			
		||||
	  for(int s=soff;s<soff+swork;s++){
 | 
			
		||||
 | 
			
		||||
	    sU=0;
 | 
			
		||||
	    sF = s+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      //      Counter.Stop();
 | 
			
		||||
      //      Counter.Report();
 | 
			
		||||
      //      }
 | 
			
		||||
    } else if( this->HandOptDslash ) {
 | 
			
		||||
#pragma omp for
 | 
			
		||||
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=0;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=0;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU; 
 | 
			
		||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
  dslashtime +=usecond();
 | 
			
		||||
  alltime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
 | 
			
		||||
						     DoubledGaugeField & U,
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user