mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 21:44:33 +00:00 
			
		
		
		
	Pushed the overlap comms tweaks
This commit is contained in:
		@@ -116,7 +116,8 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  typename DomainWallFermionR::ImplParams params; 
 | 
					  typename DomainWallFermionR::ImplParams params; 
 | 
				
			||||||
  params.overlapCommsCompute = overlapComms;
 | 
					  params.overlapCommsCompute = overlapComms;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  RealD NP = UGrid->_Nprocessors;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
 | 
					  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
@@ -136,6 +137,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
 | 
					    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
 | 
				
			||||||
    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
 | 
					    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
 | 
				
			||||||
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
					    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
 | 
				
			||||||
    err = ref-result; 
 | 
					    err = ref-result; 
 | 
				
			||||||
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 | 
					    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 | 
				
			||||||
    Dw.Report();
 | 
					    Dw.Report();
 | 
				
			||||||
@@ -193,6 +195,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
    double flops=(1344.0*volume*ncall)/2;
 | 
					    double flops=(1344.0*volume*ncall)/2;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
					    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  Dw.DhopEO(src_o,r_e,DaggerNo);
 | 
					  Dw.DhopEO(src_o,r_e,DaggerNo);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -96,6 +96,7 @@ namespace Grid {
 | 
				
			|||||||
	Integer to_rank;
 | 
						Integer to_rank;
 | 
				
			||||||
	Integer from_rank;
 | 
						Integer from_rank;
 | 
				
			||||||
	Integer bytes;
 | 
						Integer bytes;
 | 
				
			||||||
 | 
						volatile Integer done;
 | 
				
			||||||
      };
 | 
					      };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      std::vector<Packet> Packets;
 | 
					      std::vector<Packet> Packets;
 | 
				
			||||||
@@ -107,6 +108,8 @@ namespace Grid {
 | 
				
			|||||||
	p.to_rank  = to;
 | 
						p.to_rank  = to;
 | 
				
			||||||
	p.from_rank= from;
 | 
						p.from_rank= from;
 | 
				
			||||||
	p.bytes    = bytes;
 | 
						p.bytes    = bytes;
 | 
				
			||||||
 | 
						p.done     = 0;
 | 
				
			||||||
 | 
						comms_bytes+=2.0*bytes;
 | 
				
			||||||
	Packets.push_back(p);
 | 
						Packets.push_back(p);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -118,6 +121,7 @@ namespace Grid {
 | 
				
			|||||||
				Packets[i].recv_buf,
 | 
									Packets[i].recv_buf,
 | 
				
			||||||
				Packets[i].from_rank,
 | 
									Packets[i].from_rank,
 | 
				
			||||||
				Packets[i].bytes);
 | 
									Packets[i].bytes);
 | 
				
			||||||
 | 
						  Packets[i].done = 1;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	commtime+=usecond();
 | 
						commtime+=usecond();
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
@@ -129,27 +133,38 @@ namespace Grid {
 | 
				
			|||||||
        cobj * mpointer;
 | 
					        cobj * mpointer;
 | 
				
			||||||
	std::vector<scalar_object *> rpointers;
 | 
						std::vector<scalar_object *> rpointers;
 | 
				
			||||||
	Integer buffer_size;
 | 
						Integer buffer_size;
 | 
				
			||||||
 | 
						Integer packet_id;
 | 
				
			||||||
      };
 | 
					      };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      std::vector<Merge> Mergers;
 | 
					      std::vector<Merge> Mergers;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size) {
 | 
					      void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size,Integer packet_id) {
 | 
				
			||||||
	Merge m;
 | 
						Merge m;
 | 
				
			||||||
	m.mpointer = merge_p;
 | 
						m.mpointer = merge_p;
 | 
				
			||||||
	m.rpointers= rpointers;
 | 
						m.rpointers= rpointers;
 | 
				
			||||||
	m.buffer_size = buffer_size;
 | 
						m.buffer_size = buffer_size;
 | 
				
			||||||
 | 
						m.packet_id   = packet_id;
 | 
				
			||||||
	Mergers.push_back(m);
 | 
						Mergers.push_back(m);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      void CommsMerge(void ) { 
 | 
					      void CommsMerge(void ) { 
 | 
				
			||||||
	mergetime-=usecond();
 | 
						//PARALLEL_NESTED_LOOP2 
 | 
				
			||||||
	for(int i=0;i<Mergers.size();i++){	
 | 
						for(int i=0;i<Mergers.size();i++){	
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						spintime-=usecond();
 | 
				
			||||||
 | 
						int packet_id = Mergers[i].packet_id;
 | 
				
			||||||
 | 
						while(! Packets[packet_id].done ); // spin for completion
 | 
				
			||||||
 | 
						spintime+=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						mergetime-=usecond();
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
	  for(int o=0;o<Mergers[i].buffer_size;o++){
 | 
						  for(int o=0;o<Mergers[i].buffer_size;o++){
 | 
				
			||||||
	    merge(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
 | 
						    merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
	mergetime+=usecond();
 | 
						mergetime+=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      ////////////////////////////////////////
 | 
					      ////////////////////////////////////////
 | 
				
			||||||
@@ -188,6 +203,8 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
      double commtime;
 | 
					      double commtime;
 | 
				
			||||||
      double halogtime;
 | 
					      double halogtime;
 | 
				
			||||||
      double mergetime;
 | 
					      double mergetime;
 | 
				
			||||||
 | 
					      double spintime;
 | 
				
			||||||
 | 
					      double comms_bytes;
 | 
				
			||||||
      double gathermtime;
 | 
					      double gathermtime;
 | 
				
			||||||
      double splicetime;
 | 
					      double splicetime;
 | 
				
			||||||
      double nosplicetime;
 | 
					      double nosplicetime;
 | 
				
			||||||
@@ -206,9 +223,11 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
      commtime=0;
 | 
					      commtime=0;
 | 
				
			||||||
      halogtime=0;
 | 
					      halogtime=0;
 | 
				
			||||||
      mergetime=0;
 | 
					      mergetime=0;
 | 
				
			||||||
 | 
					      spintime=0;
 | 
				
			||||||
      gathermtime=0;
 | 
					      gathermtime=0;
 | 
				
			||||||
      splicetime=0;
 | 
					      splicetime=0;
 | 
				
			||||||
      nosplicetime=0;
 | 
					      nosplicetime=0;
 | 
				
			||||||
 | 
					      comms_bytes=0;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
      _npoints = npoints;
 | 
					      _npoints = npoints;
 | 
				
			||||||
      _grid    = grid;
 | 
					      _grid    = grid;
 | 
				
			||||||
@@ -218,8 +237,9 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
      int osites  = _grid->oSites();
 | 
					      int osites  = _grid->oSites();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      for(int i=0;i<npoints;i++){
 | 
					      for(int ii=0;ii<npoints;ii++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						int i = ii; // reverse direction to get SIMD comms done first
 | 
				
			||||||
	int point = i;
 | 
						int point = i;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	_entries[i].resize( osites);
 | 
						_entries[i].resize( osites);
 | 
				
			||||||
@@ -512,10 +532,10 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
      void HaloExchangeComplete(std::thread &thr) 
 | 
					      void HaloExchangeComplete(std::thread &thr) 
 | 
				
			||||||
      {
 | 
					      {
 | 
				
			||||||
 | 
						CommsMerge(); // spins
 | 
				
			||||||
	jointime-=usecond();
 | 
						jointime-=usecond();
 | 
				
			||||||
	thr.join();
 | 
						thr.join();
 | 
				
			||||||
	jointime+=usecond();
 | 
						jointime+=usecond();
 | 
				
			||||||
	CommsMerge();
 | 
					 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      void HaloGather(const Lattice<vobj> &source,compressor &compress)
 | 
					      void HaloGather(const Lattice<vobj> &source,compressor &compress)
 | 
				
			||||||
@@ -750,7 +770,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
		}
 | 
							}
 | 
				
			||||||
	      }
 | 
						      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	      AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size);
 | 
						      AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	      u_comm_offset     +=buffer_size;
 | 
						      u_comm_offset     +=buffer_size;
 | 
				
			||||||
	    }
 | 
						    }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -249,7 +249,9 @@ void WilsonFermion5D<Impl>::Report(void)
 | 
				
			|||||||
  std::cout<<GridLogMessage << "Stencil gather        "<<Stencil.gathertime<<" us"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "Stencil gather        "<<Stencil.gathertime<<" us"<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil gather simd   "<<Stencil.gathermtime<<" us"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "Stencil gather simd   "<<Stencil.gathermtime<<" us"<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil merge  simd   "<<Stencil.mergetime<<" us"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "Stencil merge  simd   "<<Stencil.mergetime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil spin   simd   "<<Stencil.spintime<<" us"<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil MB/s          "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil comm     time "<<Stencil.commtime<<" us"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "Stencil comm     time "<<Stencil.commtime<<" us"<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Stencil join     time "<<Stencil.jointime<<" us"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "Stencil join     time "<<Stencil.jointime<<" us"<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
				
			||||||
@@ -425,6 +427,8 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
 | 
				
			|||||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
					  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
				
			||||||
  alltime-=usecond();
 | 
					  alltime-=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int calls;
 | 
				
			||||||
 | 
					  int updates;
 | 
				
			||||||
  Compressor compressor(dag);
 | 
					  Compressor compressor(dag);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
					  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -62,13 +62,13 @@ namespace Grid {
 | 
				
			|||||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
								      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					     int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
			       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			       int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
								      int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
				
			||||||
 | 
					     
 | 
				
			||||||
     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					     int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
				  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
				  int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
									 int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     WilsonKernels(const ImplParams &p= ImplParams());
 | 
					     WilsonKernels(const ImplParams &p= ImplParams());
 | 
				
			||||||
     
 | 
					     
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -310,7 +310,7 @@ namespace QCD {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
											   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
						   int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
											   int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -385,6 +385,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
 | 
				
			|||||||
    if ( SE->_permute ) {
 | 
					    if ( SE->_permute ) {
 | 
				
			||||||
      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
					      PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
					  if ( Nonlocal && (!SE->_is_local) ) { 
 | 
				
			||||||
@@ -397,7 +398,6 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
 | 
				
			|||||||
    num++;  
 | 
					    num++;  
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Yp
 | 
					  // Yp
 | 
				
			||||||
  SE=st.GetEntry(ptype,Yp,ss);
 | 
					  SE=st.GetEntry(ptype,Yp,ss);
 | 
				
			||||||
  offset = SE->_offset;
 | 
					  offset = SE->_offset;
 | 
				
			||||||
@@ -556,6 +556,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
 | 
				
			|||||||
    vstream(ref()(3)(0),result_30*(-0.5));
 | 
					    vstream(ref()(3)(0),result_30*(-0.5));
 | 
				
			||||||
    vstream(ref()(3)(1),result_31*(-0.5));
 | 
					    vstream(ref()(3)(1),result_31*(-0.5));
 | 
				
			||||||
    vstream(ref()(3)(2),result_32*(-0.5));
 | 
					    vstream(ref()(3)(2),result_32*(-0.5));
 | 
				
			||||||
 | 
					    return 1;
 | 
				
			||||||
  } else if ( num ) { 
 | 
					  } else if ( num ) { 
 | 
				
			||||||
    vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
 | 
					    vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
 | 
				
			||||||
    vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
 | 
					    vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
 | 
				
			||||||
@@ -569,14 +570,16 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
 | 
				
			|||||||
    vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
 | 
					    vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
 | 
				
			||||||
    vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
 | 
					    vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
 | 
				
			||||||
    vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
 | 
					    vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
 | 
				
			||||||
 | 
					    return 1;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  return 0;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
						std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
											std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
						int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
											int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -822,6 +825,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
 | 
				
			|||||||
    vstream(ref()(3)(0),result_30*(-0.5));
 | 
					    vstream(ref()(3)(0),result_30*(-0.5));
 | 
				
			||||||
    vstream(ref()(3)(1),result_31*(-0.5));
 | 
					    vstream(ref()(3)(1),result_31*(-0.5));
 | 
				
			||||||
    vstream(ref()(3)(2),result_32*(-0.5));
 | 
					    vstream(ref()(3)(2),result_32*(-0.5));
 | 
				
			||||||
 | 
					    return 1;
 | 
				
			||||||
  } else if ( num ) { 
 | 
					  } else if ( num ) { 
 | 
				
			||||||
    vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
 | 
					    vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
 | 
				
			||||||
    vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
 | 
					    vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
 | 
				
			||||||
@@ -835,7 +839,9 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
 | 
				
			|||||||
    vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
 | 
					    vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
 | 
				
			||||||
    vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
 | 
					    vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
 | 
				
			||||||
    vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
 | 
					    vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
 | 
				
			||||||
 | 
					    return 1;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  return 0;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  /*
 | 
					  /*
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -217,5 +217,50 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 }
 | 
					 }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class vobj> inline 
 | 
				
			||||||
 | 
					void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  typedef typename vobj::scalar_type scalar_type ;
 | 
				
			||||||
 | 
					  typedef typename vobj::vector_type vector_type ;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  const int Nsimd=vobj::vector_type::Nsimd();
 | 
				
			||||||
 | 
					  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  scalar_type *pointer;
 | 
				
			||||||
 | 
					  scalar_type *vp = (scalar_type *)&vec;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //  assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int i=0;i<Nsimd;i++){
 | 
				
			||||||
 | 
					    pointer=(scalar_type *)&extracted[i][offset];
 | 
				
			||||||
 | 
					    for(int w=0;w<words;w++){
 | 
				
			||||||
 | 
					      vp[w*Nsimd+i] = pointer[w];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class vobj> inline 
 | 
				
			||||||
 | 
					void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  typedef typename vobj::scalar_type scalar_type ;
 | 
				
			||||||
 | 
					  typedef typename vobj::vector_type vector_type ;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  const int Nsimd=vobj::vector_type::Nsimd();
 | 
				
			||||||
 | 
					  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  scalar_type *pointer;
 | 
				
			||||||
 | 
					  scalar_type *vp = (scalar_type *)&vec;
 | 
				
			||||||
 | 
					  //  assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int w=0;w<words;w++){
 | 
				
			||||||
 | 
					    for(int i=0;i<Nsimd;i++){
 | 
				
			||||||
 | 
					      pointer=(scalar_type *)&extracted[i][offset];
 | 
				
			||||||
 | 
					      vp[w*Nsimd+i] =pointer[w];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user