mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 21:44:33 +00:00 
			
		
		
		
	Shook out the stencil code to the point where I think the Wilson dslash is correct.
Still to do: audit the code carefully, consolidate the stencil and cshift implementations, and then benchmark and optimise.
This commit is contained in:
		@@ -5,21 +5,23 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
    template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
 | 
			
		||||
    {
 | 
			
		||||
      typedef typename iobj::scalar_object scalar_object;
 | 
			
		||||
      typedef typename iobj::scalar_type scalar_type;
 | 
			
		||||
      typedef typename iobj::vector_type vector_type;
 | 
			
		||||
 | 
			
		||||
      GridBase *grid = l._grid;
 | 
			
		||||
      int Nsimd = grid->iSites();
 | 
			
		||||
 | 
			
		||||
      std::vector<int> gcoor;
 | 
			
		||||
      std::vector<scalar_type> mergebuf(Nsimd);
 | 
			
		||||
      std::vector<scalar_type *> mergeptr(Nsimd);
 | 
			
		||||
 | 
			
		||||
      vector_type vI;
 | 
			
		||||
      for(int o=0;o<grid->oSites();o++){
 | 
			
		||||
	for(int i=0;i<grid->iSites();i++){
 | 
			
		||||
	  grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
 | 
			
		||||
	  mergebuf[i]=gcoor[mu];
 | 
			
		||||
	  mergeptr[i]=&mergebuf[i];
 | 
			
		||||
	  mergebuf[i]=(Integer)gcoor[mu];
 | 
			
		||||
	}
 | 
			
		||||
	merge(vI,mergeptr);
 | 
			
		||||
	AmergeA<vector_type,scalar_type>(vI,mergebuf);
 | 
			
		||||
	l._odata[o]=vI;
 | 
			
		||||
      }
 | 
			
		||||
    };
 | 
			
		||||
 
 | 
			
		||||
@@ -94,15 +94,12 @@ namespace Grid {
 | 
			
		||||
      grid->Broadcast(grid->BossRank(),s);
 | 
			
		||||
 | 
			
		||||
      std::vector<sobj> buf(Nsimd);
 | 
			
		||||
      std::vector<scalar_type *> pointers(Nsimd);  
 | 
			
		||||
 | 
			
		||||
      // extract-modify-merge cycle is easiest way and this is not perf critical
 | 
			
		||||
      if ( rank == grid->ThisRank() ) {
 | 
			
		||||
	for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
 | 
			
		||||
	extract(l._odata[odx],pointers);
 | 
			
		||||
	extract(l._odata[odx],buf);
 | 
			
		||||
	buf[idx] = s;
 | 
			
		||||
	for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
 | 
			
		||||
	merge(l._odata[odx],pointers);
 | 
			
		||||
	merge(l._odata[odx],buf);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      return;
 | 
			
		||||
@@ -127,13 +124,12 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
      int rank,odx,idx;
 | 
			
		||||
      grid->GlobalCoorToRankIndex(rank,odx,idx,site);
 | 
			
		||||
      std::vector<sobj> buf(Nsimd);
 | 
			
		||||
      std::vector<scalar_type *> pointers(Nsimd);  
 | 
			
		||||
      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
 | 
			
		||||
 | 
			
		||||
      extract(l._odata[odx],pointers);
 | 
			
		||||
      
 | 
			
		||||
      std::vector<sobj> buf(Nsimd);
 | 
			
		||||
      extract(l._odata[odx],buf);
 | 
			
		||||
 | 
			
		||||
      s = buf[idx];
 | 
			
		||||
 | 
			
		||||
      grid->Broadcast(rank,s);
 | 
			
		||||
 | 
			
		||||
      return;
 | 
			
		||||
@@ -160,10 +156,8 @@ namespace Grid {
 | 
			
		||||
      odx= grid->oIndex(site);
 | 
			
		||||
 | 
			
		||||
      std::vector<sobj> buf(Nsimd);
 | 
			
		||||
      std::vector<scalar_type *> pointers(Nsimd);  
 | 
			
		||||
      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
 | 
			
		||||
 | 
			
		||||
      extract(l._odata[odx],pointers);
 | 
			
		||||
      extract(l._odata[odx],buf);
 | 
			
		||||
      
 | 
			
		||||
      s = buf[idx];
 | 
			
		||||
 | 
			
		||||
@@ -188,16 +182,13 @@ namespace Grid {
 | 
			
		||||
      odx= grid->oIndex(site);
 | 
			
		||||
 | 
			
		||||
      std::vector<sobj> buf(Nsimd);
 | 
			
		||||
      std::vector<scalar_type *> pointers(Nsimd);  
 | 
			
		||||
      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
 | 
			
		||||
 | 
			
		||||
      // extract-modify-merge cycle is easiest way and this is not perf critical
 | 
			
		||||
      extract(l._odata[odx],pointers);
 | 
			
		||||
      extract(l._odata[odx],buf);
 | 
			
		||||
      
 | 
			
		||||
      buf[idx] = s;
 | 
			
		||||
 | 
			
		||||
      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
 | 
			
		||||
      merge(l._odata[odx],pointers);
 | 
			
		||||
      merge(l._odata[odx],buf);
 | 
			
		||||
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
 
 | 
			
		||||
@@ -66,9 +66,7 @@ namespace Grid {
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      std::vector<sobj>               buf(Nsimd);
 | 
			
		||||
      std::vector<scalar_type *> pointers(Nsimd);  
 | 
			
		||||
      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
 | 
			
		||||
      extract(vsum,pointers);
 | 
			
		||||
      extract(vsum,buf);
 | 
			
		||||
 | 
			
		||||
      for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -26,8 +26,21 @@ namespace Grid {
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
  // real scalars are one component
 | 
			
		||||
  // Fill a single-component scalar (e.g. a real number) with one random sample.
  // Draws exactly once from dist using gen and stores the result in s.
  template<class scalar,class distribution,class generator>
  void fillScalar(scalar &s,distribution &dist,generator & gen)
  {
    auto sample = dist(gen);
    s = sample;
  }
 | 
			
		||||
  // Fill a single-precision complex scalar with two samples from dist.
  //
  // The two draws are sequenced explicitly. C++ does not specify the order in
  // which function (constructor) arguments are evaluated, so passing both
  // dist(gen) calls directly as arguments would let the compiler choose which
  // draw becomes the real part, making the random stream toolchain-dependent.
  // NOTE(review): ComplexF is declared outside this view; it is assumed to be
  // constructible from (real, imaginary) as used here.
  template<class distribution,class generator> void fillScalar(ComplexF &s,distribution &dist, generator &gen)
  {
    auto re = dist(gen); // first draw  -> real part
    auto im = dist(gen); // second draw -> imaginary part
    s=ComplexF(re,im);
  }
 | 
			
		||||
  // Fill a double-precision complex scalar with two samples from dist.
  //
  // The two draws are sequenced explicitly. C++ does not specify the order in
  // which function (constructor) arguments are evaluated, so passing both
  // dist(gen) calls directly as arguments would let the compiler choose which
  // draw becomes the real part, making the random stream toolchain-dependent.
  // NOTE(review): ComplexD is declared outside this view; it is assumed to be
  // constructible from (real, imaginary) as used here.
  template<class distribution,class generator> void fillScalar(ComplexD &s,distribution &dist,generator &gen)
  {
    auto re = dist(gen); // first draw  -> real part
    auto im = dist(gen); // second draw -> imaginary part
    s=ComplexD(re,im);
  }
 | 
			
		||||
  
 | 
			
		||||
  class GridRNGbase {
 | 
			
		||||
 | 
			
		||||
  public:
 | 
			
		||||
@@ -64,20 +77,6 @@ namespace Grid {
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    // real scalars are one component
 | 
			
		||||
    template<class scalar,class distribution> void fillScalar(scalar &s,distribution &dist)
 | 
			
		||||
    {
 | 
			
		||||
      s=dist(_generators[0]);
 | 
			
		||||
    }
 | 
			
		||||
    // Fill a single-precision complex scalar with two samples from dist, both
    // drawn using the first generator held by this object (_generators[0]).
    //
    // The draws are sequenced explicitly: the evaluation order of constructor
    // arguments is unspecified in C++, so making both draws inside the
    // argument list would leave it to the compiler which one becomes the real
    // part and which the imaginary part.
    template<class distribution> void fillScalar(ComplexF &s,distribution &dist)
    {
      auto re = dist(_generators[0]); // first draw  -> real part
      auto im = dist(_generators[0]); // second draw -> imaginary part
      s=ComplexF(re,im);
    }
 | 
			
		||||
    // Fill a double-precision complex scalar with two samples from dist, both
    // drawn using the first generator held by this object (_generators[0]).
    //
    // The draws are sequenced explicitly: the evaluation order of constructor
    // arguments is unspecified in C++, so making both draws inside the
    // argument list would leave it to the compiler which one becomes the real
    // part and which the imaginary part.
    template<class distribution> void fillScalar(ComplexD &s,distribution &dist)
    {
      auto re = dist(_generators[0]); // first draw  -> real part
      auto im = dist(_generators[0]); // second draw -> imaginary part
      s=ComplexD(re,im);
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    template <class sobj,class distribution> inline void fill(sobj &l,distribution &dist){
 | 
			
		||||
 | 
			
		||||
@@ -88,7 +87,7 @@ namespace Grid {
 | 
			
		||||
      scalar_type *buf = (scalar_type *) & l;
 | 
			
		||||
 | 
			
		||||
      for(int idx=0;idx<words;idx++){
 | 
			
		||||
	fillScalar(buf[idx],dist);
 | 
			
		||||
	fillScalar(buf[idx],dist,_generators[0]);
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
@@ -96,47 +95,47 @@ namespace Grid {
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    template <class distribution>  inline void fill(ComplexF &l,distribution &dist){
 | 
			
		||||
      fillScalar(l,dist);
 | 
			
		||||
      fillScalar(l,dist,_generators[0]);
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(ComplexD &l,distribution &dist){
 | 
			
		||||
      fillScalar(l,dist);
 | 
			
		||||
      fillScalar(l,dist,_generators[0]);
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(RealF &l,distribution &dist){
 | 
			
		||||
      fillScalar(l,dist);
 | 
			
		||||
      fillScalar(l,dist,_generators[0]);
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(RealD &l,distribution &dist){
 | 
			
		||||
      fillScalar(l,dist);
 | 
			
		||||
      fillScalar(l,dist,_generators[0]);
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    // vector fill
 | 
			
		||||
    template <class distribution>  inline void fill(vComplexF &l,distribution &dist){
 | 
			
		||||
      RealF *pointer=(RealF *)&l;
 | 
			
		||||
      for(int i=0;i<2*vComplexF::Nsimd();i++){
 | 
			
		||||
	fillScalar(pointer[i],dist);
 | 
			
		||||
	fillScalar(pointer[i],dist,_generators[0]);
 | 
			
		||||
      }
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(vComplexD &l,distribution &dist){
 | 
			
		||||
      RealD *pointer=(RealD *)&l;
 | 
			
		||||
      for(int i=0;i<2*vComplexD::Nsimd();i++){
 | 
			
		||||
	fillScalar(pointer[i],dist);
 | 
			
		||||
	fillScalar(pointer[i],dist,_generators[0]);
 | 
			
		||||
      }
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(vRealF &l,distribution &dist){
 | 
			
		||||
      RealF *pointer=(RealF *)&l;
 | 
			
		||||
      for(int i=0;i<vRealF::Nsimd();i++){
 | 
			
		||||
	fillScalar(pointer[i],dist);
 | 
			
		||||
	fillScalar(pointer[i],dist,_generators[0]);
 | 
			
		||||
      }
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
    template <class distribution>  inline void fill(vRealD &l,distribution &dist){
 | 
			
		||||
      RealD *pointer=(RealD *)&l;
 | 
			
		||||
      for(int i=0;i<vRealD::Nsimd();i++){
 | 
			
		||||
	fillScalar(pointer[i],dist);
 | 
			
		||||
	fillScalar(pointer[i],dist,_generators[0]);
 | 
			
		||||
      }
 | 
			
		||||
      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
 | 
			
		||||
    }
 | 
			
		||||
@@ -187,18 +186,31 @@ namespace Grid {
 | 
			
		||||
    {
 | 
			
		||||
      std::vector<int> gcoor;
 | 
			
		||||
 | 
			
		||||
      for(int gidx=0;gidx<_grid->_gsites;gidx++){
 | 
			
		||||
      int gsites = _grid->_gsites;
 | 
			
		||||
 | 
			
		||||
      typename source::result_type init = src();
 | 
			
		||||
      std::ranlux48 pseeder(init);
 | 
			
		||||
      std::uniform_int_distribution<uint64_t> ui;
 | 
			
		||||
 | 
			
		||||
      for(int gidx=0;gidx<gsites;gidx++){
 | 
			
		||||
 | 
			
		||||
	int rank,o_idx,i_idx;
 | 
			
		||||
	_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
 | 
			
		||||
	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 | 
			
		||||
 | 
			
		||||
	int l_idx=generator_idx(o_idx,i_idx);
 | 
			
		||||
	
 | 
			
		||||
	std::vector<int> site_seeds(4);
 | 
			
		||||
	for(int i=0;i<4;i++){
 | 
			
		||||
	  site_seeds[i]= ui(pseeder);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	typename source::result_type init = src();
 | 
			
		||||
	_grid->Broadcast(0,(void *)&site_seeds[0],sizeof(int)*site_seeds.size());
 | 
			
		||||
 | 
			
		||||
	_grid->Broadcast(0,(void *)&init,sizeof(init));
 | 
			
		||||
	if( rank == _grid->ThisRank() ){
 | 
			
		||||
	  _generators[l_idx] = std::ranlux48(init);
 | 
			
		||||
	  fixedSeed ssrc(site_seeds);
 | 
			
		||||
	  typename source::result_type sinit = ssrc();
 | 
			
		||||
	  _generators[l_idx] = std::ranlux48(sinit);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      _seeded=1;
 | 
			
		||||
@@ -210,6 +222,7 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
    template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,distribution &dist){
 | 
			
		||||
 | 
			
		||||
      typedef typename vobj::scalar_object scalar_object;
 | 
			
		||||
      typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
      typedef typename vobj::vector_type vector_type;
 | 
			
		||||
      
 | 
			
		||||
@@ -217,25 +230,22 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
      int     Nsimd =_grid->Nsimd();
 | 
			
		||||
      int     osites=_grid->oSites();
 | 
			
		||||
      int words=sizeof(scalar_object)/sizeof(scalar_type);
 | 
			
		||||
 | 
			
		||||
      int words = sizeof(vobj)/sizeof(vector_type);
 | 
			
		||||
      std::vector<std::vector<scalar_type> > buf(Nsimd,std::vector<scalar_type>(words));
 | 
			
		||||
      std::vector<scalar_type *> pointers(Nsimd);  
 | 
			
		||||
      std::vector<scalar_object> buf(Nsimd);
 | 
			
		||||
      
 | 
			
		||||
      for(int ss=0;ss<osites;ss++){
 | 
			
		||||
 | 
			
		||||
	for(int si=0;si<Nsimd;si++){
 | 
			
		||||
 | 
			
		||||
	  int gdx = generator_idx(ss,si); // index of generator state
 | 
			
		||||
 | 
			
		||||
	  pointers[si] = (scalar_type *)&buf[si][0];
 | 
			
		||||
	  scalar_type *pointer = (scalar_type *)&buf[si];
 | 
			
		||||
	  for(int idx=0;idx<words;idx++){
 | 
			
		||||
	    pointers[si][idx] = dist(_generators[gdx]);
 | 
			
		||||
	    fillScalar(pointer[idx],dist,_generators[gdx]);
 | 
			
		||||
	  }
 | 
			
		||||
 | 
			
		||||
	}
 | 
			
		||||
	// merge into SIMD lanes
 | 
			
		||||
	merge(l._odata[ss],pointers);
 | 
			
		||||
	merge(l._odata[ss],buf);
 | 
			
		||||
      }
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user