Mirror of https://github.com/paboyle/Grid.git
	Pushed the overlap comms tweaks
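The hunks below thread a completion flag through the stencil communications so that the SIMD merge can overlap with the rest of the halo exchange: each Packet gains a volatile done flag that the communication thread sets when its transfer finishes, each Merge records the packet_id it depends on, and CommsMerge() spins on that flag (timed as spintime) before merging that buffer. The following is a minimal stand-alone sketch of that producer/consumer pattern, not Grid code; the Packet/Merge names mirror the diff, but the threading and data are simplified assumptions, and std::atomic replaces the volatile Integer so the example has well-defined semantics on its own.

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

struct Packet { std::atomic<int> done{0}; };               // stands in for the diff's volatile done flag
struct Merge  { int packet_id; std::vector<double> data; }; // each merge remembers which packet it needs

int main(void) {
  std::vector<Packet> Packets(4);
  std::vector<Merge>  Mergers;
  for (int p = 0; p < 4; p++)
    Mergers.push_back({p, std::vector<double>(8, double(p))});

  // "Comms" thread: mark each packet complete as its transfer finishes.
  std::thread comms([&]{
    for (auto &pkt : Packets) pkt.done.store(1, std::memory_order_release);
  });

  // CommsMerge(): spin until the packet a merge depends on is done, then merge it.
  double sum = 0.0;
  for (auto &m : Mergers) {
    while (!Packets[m.packet_id].done.load(std::memory_order_acquire)) {} // spin for completion
    for (double x : m.data) sum += x;                                     // merge this buffer
  }
  comms.join();
  std::printf("merged %zu buffers, sum = %g\n", Mergers.size(), sum);
  return 0;
}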
@@ -117,6 +117,7 @@ int main (int argc, char ** argv)
  typename DomainWallFermionR::ImplParams params;
  params.overlapCommsCompute = overlapComms;

  RealD NP = UGrid->_Nprocessors;

  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);

@@ -136,6 +137,7 @@ int main (int argc, char ** argv)
    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
    err = ref-result;
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
    Dw.Report();

@@ -193,6 +195,7 @@ int main (int argc, char ** argv)
    double flops=(1344.0*volume*ncall)/2;

    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
  }

  Dw.DhopEO(src_o,r_e,DaggerNo);
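The benchmark now also prints a per-node rate alongside the aggregate one; it is simply the same flops/(t1-t0) figure divided by the processor count NP. A small illustration of the arithmetic, under the assumption that times are in microseconds as returned by Grid's usecond(), is sketched below with placeholder numbers:

#include <cstdio>

// Illustration only, not Grid code: flops is the operation count for the timed
// region, t0 and t1 are timestamps in microseconds, NP is the number of ranks.
// Flops per microsecond is numerically Mflop/s.
static void report_rates(double flops, double t0, double t1, double NP) {
  std::printf("mflop/s =          %g\n", flops/(t1-t0));
  std::printf("mflop/s per node = %g\n", flops/(t1-t0)/NP);
}

int main(void) {
  report_rates(/*flops=*/1.0e9, /*t0=*/0.0, /*t1=*/5.0e4, /*NP=*/16.0); // placeholder values
  return 0;
}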
@@ -96,6 +96,7 @@ namespace Grid {
	Integer to_rank;
	Integer from_rank;
	Integer bytes;
	volatile Integer done;
      };

      std::vector<Packet> Packets;

@@ -107,6 +108,8 @@ namespace Grid {
	p.to_rank  = to;
	p.from_rank= from;
	p.bytes    = bytes;
	p.done     = 0;
	comms_bytes+=2.0*bytes;
	Packets.push_back(p);
      }

@@ -118,6 +121,7 @@ namespace Grid {
				Packets[i].recv_buf,
				Packets[i].from_rank,
				Packets[i].bytes);
	  Packets[i].done = 1;
	}
	commtime+=usecond();
      }

@@ -129,27 +133,38 @@ namespace Grid {
        cobj * mpointer;
	std::vector<scalar_object *> rpointers;
	Integer buffer_size;
	Integer packet_id;
      };

      std::vector<Merge> Mergers;

      void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size) {
      void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size,Integer packet_id) {
	Merge m;
	m.mpointer = merge_p;
	m.rpointers= rpointers;
	m.buffer_size = buffer_size;
	m.packet_id   = packet_id;
	Mergers.push_back(m);
      }

      void CommsMerge(void ) {
	mergetime-=usecond();
	//PARALLEL_NESTED_LOOP2
	for(int i=0;i<Mergers.size();i++){

	spintime-=usecond();
	int packet_id = Mergers[i].packet_id;
	while(! Packets[packet_id].done ); // spin for completion
	spintime+=usecond();

	mergetime-=usecond();
PARALLEL_FOR_LOOP
	  for(int o=0;o<Mergers[i].buffer_size;o++){
	    merge(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
	  }
	    merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
	  }
	mergetime+=usecond();

	}
      }

      ////////////////////////////////////////

@@ -188,6 +203,8 @@ PARALLEL_FOR_LOOP
      double commtime;
      double halogtime;
      double mergetime;
      double spintime;
      double comms_bytes;
      double gathermtime;
      double splicetime;
      double nosplicetime;

@@ -206,9 +223,11 @@ PARALLEL_FOR_LOOP
      commtime=0;
      halogtime=0;
      mergetime=0;
      spintime=0;
      gathermtime=0;
      splicetime=0;
      nosplicetime=0;
      comms_bytes=0;
#endif
      _npoints = npoints;
      _grid    = grid;

@@ -218,8 +237,9 @@ PARALLEL_FOR_LOOP

      int osites  = _grid->oSites();

      for(int i=0;i<npoints;i++){
      for(int ii=0;ii<npoints;ii++){

	int i = ii; // reverse direction to get SIMD comms done first
	int point = i;

	_entries[i].resize( osites);

@@ -512,10 +532,10 @@ PARALLEL_FOR_LOOP

      void HaloExchangeComplete(std::thread &thr)
      {
	CommsMerge(); // spins
	jointime-=usecond();
	thr.join();
	jointime+=usecond();
	CommsMerge();
      }

      void HaloGather(const Lattice<vobj> &source,compressor &compress)

@@ -750,7 +770,7 @@ PARALLEL_FOR_LOOP
		}
	      }

	      AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size);
	      AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1);

	      u_comm_offset     +=buffer_size;
	    }
@@ -249,7 +249,9 @@ void WilsonFermion5D<Impl>::Report(void)
  std::cout<<GridLogMessage << "Stencil gather        "<<Stencil.gathertime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil gather simd   "<<Stencil.gathermtime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil merge  simd   "<<Stencil.mergetime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil spin   simd   "<<Stencil.spintime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "********************"<<std::endl;
  std::cout<<GridLogMessage << "Stencil MB/s          "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
  std::cout<<GridLogMessage << "Stencil comm     time "<<Stencil.commtime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil join     time "<<Stencil.jointime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "********************"<<std::endl;

@@ -425,6 +427,8 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
  alltime-=usecond();

  int calls;
  int updates;
  Compressor compressor(dag);

  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
@@ -62,11 +62,11 @@ namespace Grid {
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
			      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);

     void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
     int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
			      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);

     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
     int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
				 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
				 int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
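In the header hunk above and the implementation hunks below, the hand-unrolled Dhop kernels change from void to int: the diff adds return 1 after the blocks that stream a result to the output site and return 0 at the end, so a pass that finds nothing to write at a site (for example a nonlocal-only pass at a site with no off-node contributions) reports that fact. A hypothetical caller could count updated sites from the return value; the sketch below mocks the kernel with a trivial stand-in, and the "every fourth site has halo input" pattern is invented purely for illustration:

#include <cstdio>

// Mocked stand-in for the kernels' new contract (not Grid code): return 1 if
// the site was written, 0 if a nonlocal-only pass found nothing to do there.
static int mock_dhop_site(int ss, bool local, bool nonlocal) {
  bool wrote = local || (nonlocal && (ss % 4 == 0));
  return wrote ? 1 : 0;
}

int main(void) {
  const int nsites = 16;
  int updates = 0;
  for (int ss = 0; ss < nsites; ss++)
    updates += mock_dhop_site(ss, /*local=*/false, /*nonlocal=*/true);
  std::printf("sites updated in the nonlocal pass: %d / %d\n", updates, nsites);
  return 0;
}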
@@ -310,7 +310,7 @@ namespace QCD {

template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
						   int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
{

@@ -385,6 +385,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
    if ( SE->_permute ) {
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
    }

  }

  if ( Nonlocal && (!SE->_is_local) ) {

@@ -397,7 +398,6 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
    num++;
  }

  // Yp
  SE=st.GetEntry(ptype,Yp,ss);
  offset = SE->_offset;

@@ -556,6 +556,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
    vstream(ref()(3)(0),result_30*(-0.5));
    vstream(ref()(3)(1),result_31*(-0.5));
    vstream(ref()(3)(2),result_32*(-0.5));
    return 1;
  } else if ( num ) {
    vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
    vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));

@@ -569,14 +570,16 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
    vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
    vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
    vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
    return 1;
  }
  return 0;
}

template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
						std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
						int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
{

@@ -822,6 +825,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
    vstream(ref()(3)(0),result_30*(-0.5));
    vstream(ref()(3)(1),result_31*(-0.5));
    vstream(ref()(3)(2),result_32*(-0.5));
    return 1;
  } else if ( num ) {
    vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
    vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));

@@ -835,7 +839,9 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
    vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
    vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
    vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
    return 1;
  }
  return 0;
}

  /*
@@ -217,5 +217,50 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
    }
  }
 }

template<class vobj> inline
void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
{
  typedef typename vobj::scalar_type scalar_type ;
  typedef typename vobj::vector_type vector_type ;

  const int Nsimd=vobj::vector_type::Nsimd();
  const int words=sizeof(vobj)/sizeof(vector_type);

  scalar_type *pointer;
  scalar_type *vp = (scalar_type *)&vec;

  //  assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);

  for(int i=0;i<Nsimd;i++){
    pointer=(scalar_type *)&extracted[i][offset];
    for(int w=0;w<words;w++){
      vp[w*Nsimd+i] = pointer[w];
    }
  }
}

template<class vobj> inline
void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
{
  typedef typename vobj::scalar_type scalar_type ;
  typedef typename vobj::vector_type vector_type ;

  const int Nsimd=vobj::vector_type::Nsimd();
  const int words=sizeof(vobj)/sizeof(vector_type);

  scalar_type *pointer;
  scalar_type *vp = (scalar_type *)&vec;
  //  assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);

  for(int w=0;w<words;w++){
    for(int i=0;i<Nsimd;i++){
      pointer=(scalar_type *)&extracted[i][offset];
      vp[w*Nsimd+i] =pointer[w];
    }
  }
}

}

#endif
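The new merge1/merge2 helpers above interleave per-lane scalar data back into a SIMD-vectorized object: word w of lane i lands at vp[w*Nsimd+i], so all lanes of a given word sit contiguously. merge1 walks lanes in the outer loop and merge2 walks words in the outer loop, but both produce the same layout. A stand-alone toy version of that interleaving with plain floats (not Grid's tensor types) is:

#include <cstdio>
#include <vector>

// Toy version of the interleaving done by merge1/merge2: gather `words` scalars
// from each of `Nsimd` lanes into one word-major buffer, destination index
// w*Nsimd + i for word w of lane i.
int main(void) {
  const int Nsimd = 4, words = 3;
  std::vector<std::vector<float>> lanes(Nsimd, std::vector<float>(words));
  for (int i = 0; i < Nsimd; i++)
    for (int w = 0; w < words; w++)
      lanes[i][w] = 10.0f*i + w;           // lane i, word w

  std::vector<float> vp(Nsimd*words);
  for (int i = 0; i < Nsimd; i++)          // merge1 order: lanes outer, words inner
    for (int w = 0; w < words; w++)
      vp[w*Nsimd + i] = lanes[i][w];

  for (int k = 0; k < Nsimd*words; k++) std::printf("%g ", vp[k]);
  std::printf("\n");                       // all lanes of word 0, then word 1, ...
  return 0;
}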