mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 21:44:33 +00:00 
			
		
		
		
	simd in 5th dimension support
This commit is contained in:
		@@ -288,12 +288,8 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 | 
			
		||||
					 const FermionField &in, FermionField &out,int dag) 
 | 
			
		||||
  {
 | 
			
		||||
    if ( Impl::overlapCommsCompute () ) { 
 | 
			
		||||
      DhopInternalCommsOverlapCompute(st,U,in,out,dag);
 | 
			
		||||
    } else { 
 | 
			
		||||
    DhopInternalCommsThenCompute(st,U,in,out,dag);
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
 | 
			
		||||
							 const FermionField &in, FermionField &out,int dag) {
 | 
			
		||||
@@ -331,15 +327,6 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
 | 
			
		||||
						     const FermionField &in, FermionField &out,int dag) {
 | 
			
		||||
 | 
			
		||||
    assert(0);
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
  FermOpTemplateInstantiate(WilsonFermion);
 | 
			
		||||
  GparityFermOpTemplateInstantiate(WilsonFermion);
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -116,9 +116,6 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
      void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
 | 
			
		||||
				    const FermionField &in, FermionField &out,int dag) ;
 | 
			
		||||
      void DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
 | 
			
		||||
				    const FermionField &in, FermionField &out,int dag) ;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      // Constructor
 | 
			
		||||
      WilsonFermion(GaugeField &_Umu,
 | 
			
		||||
 
 | 
			
		||||
@@ -68,10 +68,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 | 
			
		||||
  // some assertions
 | 
			
		||||
  assert(FiveDimGrid._ndimension==5);
 | 
			
		||||
  assert(FourDimGrid._ndimension==4);
 | 
			
		||||
  
 | 
			
		||||
  assert(FiveDimRedBlackGrid._ndimension==5);
 | 
			
		||||
  assert(FourDimRedBlackGrid._ndimension==4);
 | 
			
		||||
 | 
			
		||||
  assert(FiveDimRedBlackGrid._checker_dim==1);
 | 
			
		||||
 | 
			
		||||
  // Dimension zero of the five-d is the Ls direction
 | 
			
		||||
@@ -106,6 +104,70 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 | 
			
		||||
  dslashtime=0;
 | 
			
		||||
  dslash1time=0;
 | 
			
		||||
}  
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,
 | 
			
		||||
				       GridCartesian         &FiveDimGrid,
 | 
			
		||||
				       GridRedBlackCartesian &FiveDimRedBlackGrid,
 | 
			
		||||
				       GridCartesian         &FourDimGrid,
 | 
			
		||||
				       GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
			
		||||
				       RealD _M5,const ImplParams &p) :
 | 
			
		||||
  Kernels(p),
 | 
			
		||||
  _FiveDimGrid        (&FiveDimGrid),
 | 
			
		||||
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
 | 
			
		||||
  _FourDimGrid        (&FourDimGrid),
 | 
			
		||||
  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
 | 
			
		||||
  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
 | 
			
		||||
  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
 | 
			
		||||
  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
 | 
			
		||||
  M5(_M5),
 | 
			
		||||
  Umu(_FourDimGrid),
 | 
			
		||||
  UmuEven(_FourDimRedBlackGrid),
 | 
			
		||||
  UmuOdd (_FourDimRedBlackGrid),
 | 
			
		||||
  Lebesgue(_FourDimGrid),
 | 
			
		||||
  LebesgueEvenOdd(_FourDimRedBlackGrid)
 | 
			
		||||
{
 | 
			
		||||
  int nsimd = Simd::Nsimd();
 | 
			
		||||
 | 
			
		||||
  // some assertions
 | 
			
		||||
  assert(FiveDimGrid._ndimension==5);
 | 
			
		||||
  assert(FiveDimRedBlackGrid._ndimension==5);
 | 
			
		||||
  assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
 | 
			
		||||
  assert(FourDimGrid._ndimension==4);
 | 
			
		||||
  assert(FourDimRedBlackGrid._ndimension==4);
 | 
			
		||||
 | 
			
		||||
  // Dimension zero of the five-d is the Ls direction
 | 
			
		||||
  Ls=FiveDimGrid._fdimensions[0];
 | 
			
		||||
  assert(FiveDimGrid._processors[0]         ==1);
 | 
			
		||||
  assert(FiveDimGrid._simd_layout[0]        ==nsimd);
 | 
			
		||||
 | 
			
		||||
  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
 | 
			
		||||
  assert(FiveDimRedBlackGrid._processors[0] ==1);
 | 
			
		||||
  assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
 | 
			
		||||
 | 
			
		||||
  // Other dimensions must match the decomposition of the four-D fields 
 | 
			
		||||
  for(int d=0;d<4;d++){
 | 
			
		||||
    assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]);
 | 
			
		||||
    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
 | 
			
		||||
 | 
			
		||||
    assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]);
 | 
			
		||||
    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
 | 
			
		||||
 | 
			
		||||
    assert(FourDimGrid._simd_layout[d]=1);
 | 
			
		||||
    assert(FourDimRedBlackGrid._simd_layout[d]  ==1);
 | 
			
		||||
    assert(FourDimRedBlackGrid._simd_layout[d]  ==1);
 | 
			
		||||
    assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
 | 
			
		||||
 | 
			
		||||
    assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]);
 | 
			
		||||
    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
 | 
			
		||||
    assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Allocate the required comms buffer
 | 
			
		||||
  ImportGauge(_Umu);
 | 
			
		||||
}  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 | 
			
		||||
{
 | 
			
		||||
@@ -294,11 +356,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
 | 
			
		||||
  Compressor compressor(dag);
 | 
			
		||||
 | 
			
		||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  int HT      = GridThread::GetHyperThreads();
 | 
			
		||||
  int cores   = GridThread::GetCores();
 | 
			
		||||
  int nwork = U._grid->oSites();
 | 
			
		||||
  int LLs = in._grid->_rdimensions[0];
 | 
			
		||||
  
 | 
			
		||||
  commtime -=usecond();
 | 
			
		||||
  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
			
		||||
@@ -318,97 +376,48 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	for(int s=0;s<LLs;s++){
 | 
			
		||||
	  int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  int sF = s+LLs*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	{
 | 
			
		||||
	  int sd;
 | 
			
		||||
	  for(sd=0;sd<Ls;sd++){
 | 
			
		||||
	for(int s=0;s<LLs;s++){
 | 
			
		||||
	  int sU=ss;
 | 
			
		||||
	    int sF = sd+Ls*sU;
 | 
			
		||||
	  int sF = s+LLs*sU;
 | 
			
		||||
	  Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    if( this->AsmOptDslash ) {
 | 
			
		||||
      //      for(int i=0;i<1;i++){
 | 
			
		||||
      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
			
		||||
      //	PerformanceCounter Counter(i);
 | 
			
		||||
      //	Counter.Start();
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel for 
 | 
			
		||||
      for(int t=0;t<threads;t++){
 | 
			
		||||
 | 
			
		||||
	int hyperthread = t%HT;
 | 
			
		||||
	int core        = t/HT;
 | 
			
		||||
 | 
			
		||||
        int sswork, swork,soff,ssoff,  sU,sF;
 | 
			
		||||
	
 | 
			
		||||
	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 | 
			
		||||
	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
			
		||||
 | 
			
		||||
	for(int ss=0;ss<sswork;ss++){
 | 
			
		||||
	  for(int s=soff;s<soff+swork;s++){
 | 
			
		||||
 | 
			
		||||
	    sU=ss+ ssoff;
 | 
			
		||||
 | 
			
		||||
	    if ( LebesgueOrder::UseLebesgueOrder ) {
 | 
			
		||||
	      sU = lo.Reorder(sU);
 | 
			
		||||
	    }
 | 
			
		||||
	    sF = s+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      //      Counter.Stop();
 | 
			
		||||
      //      Counter.Report();
 | 
			
		||||
      //      }
 | 
			
		||||
    } else if( this->HandOptDslash ) {
 | 
			
		||||
      /*
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel for schedule(static)
 | 
			
		||||
      for(int t=0;t<threads;t++){
 | 
			
		||||
 | 
			
		||||
	int hyperthread = t%HT;
 | 
			
		||||
	int core        = t/HT;
 | 
			
		||||
 | 
			
		||||
        int sswork, swork,soff,ssoff,  sU,sF;
 | 
			
		||||
	
 | 
			
		||||
	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 | 
			
		||||
	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
			
		||||
 | 
			
		||||
	for(int ss=0;ss<sswork;ss++){
 | 
			
		||||
	  sU=ss+ ssoff;
 | 
			
		||||
	  for(int s=soff;s<soff+swork;s++){
 | 
			
		||||
	    sF = s+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      */
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	for(int s=0;s<LLs;s++){
 | 
			
		||||
	  int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  int sF = s+LLs*sU;
 | 
			
		||||
	  Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else if( this->HandOptDslash ) {
 | 
			
		||||
PARALLEL_FOR_LOOP     
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	for(int s=0;s<LLs;s++){
 | 
			
		||||
	  int sU=ss;
 | 
			
		||||
	  int sF = s+LLs*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	for(int s=0;s<LLs;s++){
 | 
			
		||||
	  int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU; 
 | 
			
		||||
	  int sF = s+LLs*sU; 
 | 
			
		||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
@@ -418,251 +427,6 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
  alltime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo,
 | 
			
		||||
						 DoubledGaugeField & U,
 | 
			
		||||
						 const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
			
		||||
  alltime-=usecond();
 | 
			
		||||
  Compressor compressor(dag);
 | 
			
		||||
 | 
			
		||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  int HT      = GridThread::GetHyperThreads();
 | 
			
		||||
  int cores   = GridThread::GetCores();
 | 
			
		||||
  int nwork = U._grid->oSites();
 | 
			
		||||
  
 | 
			
		||||
  commtime -=usecond();
 | 
			
		||||
  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
			
		||||
  st.HaloExchangeComplete(handle);
 | 
			
		||||
  commtime +=usecond();
 | 
			
		||||
 | 
			
		||||
  jointime -=usecond();
 | 
			
		||||
  jointime +=usecond();
 | 
			
		||||
  
 | 
			
		||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
			
		||||
  // Not loop ordering and data layout.
 | 
			
		||||
  // Designed to create 
 | 
			
		||||
  // - per thread reuse in L1 cache for U
 | 
			
		||||
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
  {
 | 
			
		||||
  for(int jjj=0;jjj<100;jjj++){
 | 
			
		||||
#pragma omp barrier
 | 
			
		||||
  dslashtime -=usecond();
 | 
			
		||||
  if ( dag == DaggerYes ) {
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	{
 | 
			
		||||
	  int sd;
 | 
			
		||||
	  for(sd=0;sd<Ls;sd++){
 | 
			
		||||
	    int sU=ss;
 | 
			
		||||
	    int sF = sd+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    if( this->AsmOptDslash ) {
 | 
			
		||||
      //      for(int i=0;i<1;i++){
 | 
			
		||||
      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
			
		||||
      //	PerformanceCounter Counter(i);
 | 
			
		||||
      //	Counter.Start();
 | 
			
		||||
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int t=0;t<threads;t++){
 | 
			
		||||
 | 
			
		||||
	int hyperthread = t%HT;
 | 
			
		||||
	int core        = t/HT;
 | 
			
		||||
 | 
			
		||||
        int sswork, swork,soff,ssoff,  sU,sF;
 | 
			
		||||
	
 | 
			
		||||
	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 | 
			
		||||
	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
			
		||||
 | 
			
		||||
	for(int ss=0;ss<sswork;ss++){
 | 
			
		||||
	  for(int s=soff;s<soff+swork;s++){
 | 
			
		||||
 | 
			
		||||
	    sU=ss+ ssoff;
 | 
			
		||||
 | 
			
		||||
	    if ( LebesgueOrder::UseLebesgueOrder ) {
 | 
			
		||||
	      sU = lo.Reorder(sU);
 | 
			
		||||
	    }
 | 
			
		||||
	    sF = s+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      //      Counter.Stop();
 | 
			
		||||
      //      Counter.Report();
 | 
			
		||||
      //      }
 | 
			
		||||
    } else if( this->HandOptDslash ) {
 | 
			
		||||
#pragma omp for
 | 
			
		||||
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU; 
 | 
			
		||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
  dslashtime +=usecond();
 | 
			
		||||
  alltime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo,
 | 
			
		||||
						DoubledGaugeField & U,
 | 
			
		||||
						const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
			
		||||
  alltime-=usecond();
 | 
			
		||||
  Compressor compressor(dag);
 | 
			
		||||
 | 
			
		||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  int HT      = GridThread::GetHyperThreads();
 | 
			
		||||
  int cores   = GridThread::GetCores();
 | 
			
		||||
  int nwork = U._grid->oSites();
 | 
			
		||||
  
 | 
			
		||||
  commtime -=usecond();
 | 
			
		||||
  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
			
		||||
  st.HaloExchangeComplete(handle);
 | 
			
		||||
  commtime +=usecond();
 | 
			
		||||
 | 
			
		||||
  jointime -=usecond();
 | 
			
		||||
  jointime +=usecond();
 | 
			
		||||
  
 | 
			
		||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
			
		||||
  // Not loop ordering and data layout.
 | 
			
		||||
  // Designed to create 
 | 
			
		||||
  // - per thread reuse in L1 cache for U
 | 
			
		||||
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
  {
 | 
			
		||||
  for(int jjj=0;jjj<100;jjj++){
 | 
			
		||||
#pragma omp barrier
 | 
			
		||||
  dslashtime -=usecond();
 | 
			
		||||
  if ( dag == DaggerYes ) {
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=0;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	{
 | 
			
		||||
	  int sd;
 | 
			
		||||
	  for(sd=0;sd<Ls;sd++){
 | 
			
		||||
	    int sU=0;
 | 
			
		||||
	    int sF = sd+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    if( this->AsmOptDslash ) {
 | 
			
		||||
      //      for(int i=0;i<1;i++){
 | 
			
		||||
      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
			
		||||
      //	PerformanceCounter Counter(i);
 | 
			
		||||
      //	Counter.Start();
 | 
			
		||||
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int t=0;t<threads;t++){
 | 
			
		||||
 | 
			
		||||
	int hyperthread = t%HT;
 | 
			
		||||
	int core        = t/HT;
 | 
			
		||||
 | 
			
		||||
        int sswork, swork,soff,ssoff,  sU,sF;
 | 
			
		||||
	
 | 
			
		||||
	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 | 
			
		||||
	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
			
		||||
 | 
			
		||||
	for(int ss=0;ss<sswork;ss++){
 | 
			
		||||
	  for(int s=soff;s<soff+swork;s++){
 | 
			
		||||
 | 
			
		||||
	    sU=0;
 | 
			
		||||
	    sF = s+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      //      Counter.Stop();
 | 
			
		||||
      //      Counter.Report();
 | 
			
		||||
      //      }
 | 
			
		||||
    } else if( this->HandOptDslash ) {
 | 
			
		||||
#pragma omp for
 | 
			
		||||
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=0;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=0;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU; 
 | 
			
		||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
  dslashtime +=usecond();
 | 
			
		||||
  alltime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
 | 
			
		||||
						     DoubledGaugeField & U,
 | 
			
		||||
						     const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
@@ -706,6 +470,8 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
 | 
			
		||||
 | 
			
		||||
FermOpTemplateInstantiate(WilsonFermion5D);
 | 
			
		||||
GparityFermOpTemplateInstantiate(WilsonFermion5D);
 | 
			
		||||
template class WilsonFermion5D<DomainWallRedBlack5dImplF>;		
 | 
			
		||||
template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -87,6 +87,7 @@ namespace Grid {
 | 
			
		||||
      virtual void   MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
 | 
			
		||||
      virtual void   MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
 | 
			
		||||
      virtual void   MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
 | 
			
		||||
      virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
 | 
			
		||||
 | 
			
		||||
      // These can be overridden by fancy 5d chiral action
 | 
			
		||||
      virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
 | 
			
		||||
@@ -121,32 +122,12 @@ namespace Grid {
 | 
			
		||||
			FermionField &out,
 | 
			
		||||
			int dag);
 | 
			
		||||
 | 
			
		||||
      void DhopInternalOMPbench(StencilImpl & st,
 | 
			
		||||
				LebesgueOrder &lo,
 | 
			
		||||
				DoubledGaugeField &U,
 | 
			
		||||
				const FermionField &in, 
 | 
			
		||||
				FermionField &out,
 | 
			
		||||
				int dag);
 | 
			
		||||
 | 
			
		||||
      void DhopInternalL1bench(StencilImpl & st,
 | 
			
		||||
				LebesgueOrder &lo,
 | 
			
		||||
				DoubledGaugeField &U,
 | 
			
		||||
				const FermionField &in, 
 | 
			
		||||
				FermionField &out,
 | 
			
		||||
				int dag);
 | 
			
		||||
 | 
			
		||||
      void DhopInternalCommsThenCompute(StencilImpl & st,
 | 
			
		||||
			LebesgueOrder &lo,
 | 
			
		||||
			DoubledGaugeField &U,
 | 
			
		||||
			const FermionField &in, 
 | 
			
		||||
			FermionField &out,
 | 
			
		||||
			int dag);
 | 
			
		||||
      void DhopInternalCommsOverlapCompute(StencilImpl & st,
 | 
			
		||||
			LebesgueOrder &lo,
 | 
			
		||||
			DoubledGaugeField &U,
 | 
			
		||||
			const FermionField &in, 
 | 
			
		||||
			FermionField &out,
 | 
			
		||||
			int dag);
 | 
			
		||||
 | 
			
		||||
      // Constructors
 | 
			
		||||
      WilsonFermion5D(GaugeField &_Umu,
 | 
			
		||||
@@ -156,6 +137,15 @@ namespace Grid {
 | 
			
		||||
		      GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
			
		||||
		      double _M5,const ImplParams &p= ImplParams());
 | 
			
		||||
 | 
			
		||||
      // Constructors
 | 
			
		||||
      WilsonFermion5D(int simd, 
 | 
			
		||||
		      GaugeField &_Umu,
 | 
			
		||||
		      GridCartesian         &FiveDimGrid,
 | 
			
		||||
		      GridRedBlackCartesian &FiveDimRedBlackGrid,
 | 
			
		||||
		      GridCartesian         &FourDimGrid,
 | 
			
		||||
		      GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
			
		||||
		      double _M5,const ImplParams &p= ImplParams());
 | 
			
		||||
 | 
			
		||||
      // DoubleStore
 | 
			
		||||
      void ImportGauge(const GaugeField &_Umu);
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -529,5 +529,7 @@ void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  FermOpTemplateInstantiate(WilsonKernels);
 | 
			
		||||
template class WilsonKernels<DomainWallRedBlack5dImplF>;		
 | 
			
		||||
template class WilsonKernels<DomainWallRedBlack5dImplD>;
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 
 | 
			
		||||
@@ -256,5 +256,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
 | 
			
		||||
  template class WilsonKernels<WilsonImplD>; 
 | 
			
		||||
  template class WilsonKernels<GparityWilsonImplF>;
 | 
			
		||||
  template class WilsonKernels<GparityWilsonImplD>;
 | 
			
		||||
  template class WilsonKernels<DomainWallRedBlack5dImplF>;
 | 
			
		||||
  template class WilsonKernels<DomainWallRedBlack5dImplD>;
 | 
			
		||||
}}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -54,14 +54,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    Chi_11 = ref()(1)(1);\
 | 
			
		||||
    Chi_12 = ref()(1)(2);
 | 
			
		||||
 | 
			
		||||
// To splat or not to splat depends on the implementation
 | 
			
		||||
#define MULT_2SPIN(A)\
 | 
			
		||||
   auto & ref(U._odata[sU](A));	\
 | 
			
		||||
    U_00 = ref()(0,0);\
 | 
			
		||||
    U_10 = ref()(1,0);\
 | 
			
		||||
    U_20 = ref()(2,0);\
 | 
			
		||||
    U_01 = ref()(0,1);\
 | 
			
		||||
    U_11 = ref()(1,1);				\
 | 
			
		||||
    U_21 = ref()(2,1);\
 | 
			
		||||
   Impl::loadLinkElement(U_00,ref()(0,0));	\
 | 
			
		||||
   Impl::loadLinkElement(U_10,ref()(1,0));	\
 | 
			
		||||
   Impl::loadLinkElement(U_20,ref()(2,0));	\
 | 
			
		||||
   Impl::loadLinkElement(U_01,ref()(0,1));	\
 | 
			
		||||
   Impl::loadLinkElement(U_11,ref()(1,1));	\
 | 
			
		||||
   Impl::loadLinkElement(U_21,ref()(2,1));	\
 | 
			
		||||
    UChi_00 = U_00*Chi_00;\
 | 
			
		||||
    UChi_10 = U_00*Chi_10;\
 | 
			
		||||
    UChi_01 = U_10*Chi_00;\
 | 
			
		||||
@@ -74,9 +75,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    UChi_11+= U_11*Chi_11;\
 | 
			
		||||
    UChi_02+= U_21*Chi_01;\
 | 
			
		||||
    UChi_12+= U_21*Chi_11;\
 | 
			
		||||
    U_00 = ref()(0,2);\
 | 
			
		||||
    U_10 = ref()(1,2);\
 | 
			
		||||
    U_20 = ref()(2,2);\
 | 
			
		||||
    Impl::loadLinkElement(U_00,ref()(0,2));	\
 | 
			
		||||
    Impl::loadLinkElement(U_10,ref()(1,2));	\
 | 
			
		||||
    Impl::loadLinkElement(U_20,ref()(2,2));	\
 | 
			
		||||
    UChi_00+= U_00*Chi_02;\
 | 
			
		||||
    UChi_10+= U_00*Chi_12;\
 | 
			
		||||
    UChi_01+= U_10*Chi_02;\
 | 
			
		||||
@@ -84,6 +85,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    UChi_02+= U_20*Chi_02;\
 | 
			
		||||
    UChi_12+= U_20*Chi_12;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#define PERMUTE_DIR(dir)			\
 | 
			
		||||
      permute##dir(Chi_00,Chi_00);\
 | 
			
		||||
      permute##dir(Chi_01,Chi_01);\
 | 
			
		||||
@@ -809,7 +811,6 @@ int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Doub
 | 
			
		||||
							     int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
 | 
			
		||||
  //check consistency of return types between these functions and the ones in WilsonKernels.cc
 | 
			
		||||
  return 0;
 | 
			
		||||
  
 | 
			
		||||
}
 | 
			
		||||
@@ -843,6 +844,47 @@ int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,D
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  //////////////
 | 
			
		||||
/*
 | 
			
		||||
template<>
 | 
			
		||||
int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							     int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
 | 
			
		||||
  return 0;
 | 
			
		||||
  
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<>
 | 
			
		||||
int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<>
 | 
			
		||||
int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							     int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<>
 | 
			
		||||
int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								int sF,int sU,const FermionField &in, FermionField &out)
 | 
			
		||||
{
 | 
			
		||||
  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
*/
 | 
			
		||||
 | 
			
		||||
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
							       int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
@@ -870,4 +912,21 @@ template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilI
 | 
			
		||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
									 int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								      int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
								      int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
									 int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
			
		||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
			
		||||
									 int ss,int sU,const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user