mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Slice summation working. May move this into lattice/Grid_lattice_reduction however
This commit is contained in:
		@@ -98,6 +98,7 @@ public:
 | 
			
		||||
#include <lattice/Grid_lattice_coordinate.h>
 | 
			
		||||
#include <lattice/Grid_lattice_rng.h>
 | 
			
		||||
#include <lattice/Grid_lattice_transfer.h>
 | 
			
		||||
#include <Grid_summation.h>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -126,16 +126,16 @@ namespace Grid {
 | 
			
		||||
/////////////////////////////////////////////////////////////////
 | 
			
		||||
// Generic extract/merge/permute
 | 
			
		||||
/////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
template<class vsimd,class scalar>
 | 
			
		||||
inline void Gextract(const vsimd &y,std::vector<scalar *> &extracted){
 | 
			
		||||
  // FIXME: bounce off stack is painful
 | 
			
		||||
  // temporary hack while I figure out better way.
 | 
			
		||||
  // There are intrinsics to do this work without the storage.
 | 
			
		||||
  // FIXME: bounce off memory is painful
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int Nsimd=vsimd::Nsimd();
 | 
			
		||||
  int s=Nsimd/Nextr;
 | 
			
		||||
 | 
			
		||||
  std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd); 
 | 
			
		||||
 | 
			
		||||
  vstore(y,&buf[0]);
 | 
			
		||||
  for(int i=0;i<Nextr;i++){
 | 
			
		||||
    *extracted[i] = buf[i*s];
 | 
			
		||||
@@ -143,6 +143,19 @@ inline void Gextract(const vsimd &y,std::vector<scalar *> &extracted){
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
template<class vsimd,class scalar>
 | 
			
		||||
inline void Gextract(const vsimd &y,std::vector<scalar> &extracted){
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int Nsimd=vsimd::Nsimd();
 | 
			
		||||
  int s=Nsimd/Nextr;
 | 
			
		||||
 | 
			
		||||
  std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd); 
 | 
			
		||||
 | 
			
		||||
  vstore(y,&buf[0]);
 | 
			
		||||
  for(int i=0;i<Nextr;i++){
 | 
			
		||||
    extracted[i] = buf[i*s];
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
template<class vsimd,class scalar>
 | 
			
		||||
inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int Nsimd=vsimd::Nsimd();
 | 
			
		||||
@@ -158,23 +171,6 @@ inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
 | 
			
		||||
  vset(y,&buf[0]); 
 | 
			
		||||
};
 | 
			
		||||
template<class vsimd,class scalar>
 | 
			
		||||
inline void Gextract(const vsimd &y,std::vector<scalar> &extracted){
 | 
			
		||||
  // FIXME: bounce off stack is painful
 | 
			
		||||
  // temporary hack while I figure out better way.
 | 
			
		||||
  // There are intrinsics to do this work without the storage.
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int Nsimd=vsimd::Nsimd();
 | 
			
		||||
  int s=Nsimd/Nextr;
 | 
			
		||||
 | 
			
		||||
  std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd); 
 | 
			
		||||
 | 
			
		||||
  vstore(y,&buf[0]);
 | 
			
		||||
 | 
			
		||||
  for(int i=0;i<Nextr;i++){
 | 
			
		||||
    extracted[i] = buf[i*s];
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
template<class vsimd,class scalar>
 | 
			
		||||
inline void Gmerge(vsimd &y,std::vector<scalar> &extracted){
 | 
			
		||||
  int Nextr=extracted.size();
 | 
			
		||||
  int Nsimd=vsimd::Nsimd();
 | 
			
		||||
 
 | 
			
		||||
@@ -1,7 +1,8 @@
 | 
			
		||||
#ifndef GRID_SUMMATION_H
 | 
			
		||||
#define GRID_SUMMATION_H
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
void subdivides(GridBase *coarse,GridBase *fine)
 | 
			
		||||
inline void subdivides(GridBase *coarse,GridBase *fine)
 | 
			
		||||
{
 | 
			
		||||
  assert(coarse->_ndimension == fine->_ndimension);
 | 
			
		||||
 | 
			
		||||
@@ -9,58 +10,43 @@ void subdivides(GridBase *coarse,GridBase *fine)
 | 
			
		||||
 | 
			
		||||
  // local and global volumes subdivide cleanly after SIMDization
 | 
			
		||||
  for(int d=0;d<_ndimension;d++){
 | 
			
		||||
    assert(coarse->_processors[d]  == fine->_processors[d]);
 | 
			
		||||
    assert(coarse->_simd_layout[d] == fine->_simd_layout[d]);
 | 
			
		||||
    assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]); 
 | 
			
		||||
    assert((fine->_ldimensions[d] / coarse->_ldimensions[d])* coarse->_ldimensions[d]==fine->_ldimensions[d]); 
 | 
			
		||||
    assert((fine->_gdimensions[d] / coarse->_gdimensions[d])* coarse->_gdimensions[d]==fine->_gdimensions[d]); 
 | 
			
		||||
    assert((fine->_fdimensions[d] / coarse->_fdimensions[d])* coarse->_fdimensions[d]==fine->_fdimensions[d]); 
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// useful in multigrid project;
 | 
			
		||||
// Generic name : Coarsen?
 | 
			
		||||
//              : SubMeshSum?
 | 
			
		||||
//
 | 
			
		||||
template<class vobj>
 | 
			
		||||
inline void sumBlocks(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
 | 
			
		||||
{
 | 
			
		||||
  GridBase * fine  = findData._grid;
 | 
			
		||||
  GridBase * coarse= findData._grid;
 | 
			
		||||
  GridBase * fine  = fineData._grid;
 | 
			
		||||
  GridBase * coarse= coarseData._grid;
 | 
			
		||||
 | 
			
		||||
  subdivides(coars,fine); // require they map
 | 
			
		||||
  subdivides(coarse,fine); // require they map
 | 
			
		||||
 | 
			
		||||
  int _ndimension = coarse->_ndimension;
 | 
			
		||||
  
 | 
			
		||||
  std::vector<bool> replicated(_ndimension,false);
 | 
			
		||||
  std::vector<int>  block_r   (_dimension);
 | 
			
		||||
  std::vector<int>  block_f   (_dimension);
 | 
			
		||||
 | 
			
		||||
  std::vector<int>  block_r      (_ndimension);
 | 
			
		||||
  
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
  // Detect whether the result is replicated in dimension d
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
  
 | 
			
		||||
  for(int d=0 ; d<_ndimension;d++){
 | 
			
		||||
    if ( (_fdimensions[d] == 1) && (coarse->_processors[d]>1) ) {
 | 
			
		||||
      replicated[d]=true;
 | 
			
		||||
    }
 | 
			
		||||
    block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
 | 
			
		||||
    block_l[d] = fine->_ldimensions[d] / coarse->_ldimensions[d];
 | 
			
		||||
    block_f[d] = fine->_fdimensions[d] / coarse->_fdimensions[d];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  coaseData=zero;
 | 
			
		||||
 | 
			
		||||
  //FIXME Bagel's strategy: loop over fine sites
 | 
			
		||||
  // identify corresponding coarse site, but coarse sites are 
 | 
			
		||||
  // divided across threads. Not so easy to do in openmp but
 | 
			
		||||
  // there must be a way
 | 
			
		||||
  coarseData=zero;
 | 
			
		||||
  for(int sf=0;sf<fine->oSites();sf++){
 | 
			
		||||
    
 | 
			
		||||
    int sc;
 | 
			
		||||
    vobj sum=zero;
 | 
			
		||||
    std::vector<int> coor_c(_ndimension);
 | 
			
		||||
    std::vector<int> coor_f(_ndimension);
 | 
			
		||||
 | 
			
		||||
    GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 | 
			
		||||
 | 
			
		||||
    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/fine->_rdimensions;
 | 
			
		||||
 | 
			
		||||
    for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
 | 
			
		||||
    GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
 | 
			
		||||
 | 
			
		||||
    coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];
 | 
			
		||||
@@ -68,4 +54,75 @@ inline void sumBlocks(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
 | 
			
		||||
  }
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
 | 
			
		||||
  GridBase  *grid = Data._grid;
 | 
			
		||||
  const int    Nd = grid->_ndimension;
 | 
			
		||||
  const int Nsimd = grid->Nsimd();
 | 
			
		||||
 | 
			
		||||
  assert(orthogdim >= 0);
 | 
			
		||||
  assert(orthogdim < Nd);
 | 
			
		||||
 | 
			
		||||
  int fd=grid->_fdimensions[orthogdim];
 | 
			
		||||
  int ld=grid->_ldimensions[orthogdim];
 | 
			
		||||
  int rd=grid->_rdimensions[orthogdim];
 | 
			
		||||
 | 
			
		||||
  std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
 | 
			
		||||
  std::vector<sobj> lsSum(ld,sobj(zero)); // sum across these down to scalars
 | 
			
		||||
  std::vector<sobj> extracted(Nsimd);     // splitting the SIMD
 | 
			
		||||
 | 
			
		||||
  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
 | 
			
		||||
  for(int r=0;r<rd;r++){
 | 
			
		||||
    lvSum[r]=zero;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  std::vector<int>  coor(Nd);  
 | 
			
		||||
 | 
			
		||||
  // sum over reduced dimension planes, breaking out orthog dir
 | 
			
		||||
  for(int ss=0;ss<grid->oSites();ss++){
 | 
			
		||||
    GridBase::CoorFromIndex(coor,ss,grid->_rdimensions);
 | 
			
		||||
    int r = coor[orthogdim];
 | 
			
		||||
    lvSum[r]=lvSum[r]+Data._odata[ss];
 | 
			
		||||
  }  
 | 
			
		||||
 | 
			
		||||
  // Sum across simd lanes in the plane, breaking out orthog dir.
 | 
			
		||||
  std::vector<int> icoor(Nd);
 | 
			
		||||
 | 
			
		||||
  for(int rt=0;rt<rd;rt++){
 | 
			
		||||
 | 
			
		||||
    extract(lvSum[rt],extracted);
 | 
			
		||||
 | 
			
		||||
    for(int idx=0;idx<Nsimd;idx++){
 | 
			
		||||
 | 
			
		||||
      grid->iCoorFromIindex(icoor,idx);
 | 
			
		||||
 | 
			
		||||
      int ldx =rt+icoor[orthogdim]*rd;
 | 
			
		||||
 | 
			
		||||
      lsSum[ldx]=lsSum[ldx]+extracted[idx];
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  // sum over nodes.
 | 
			
		||||
  sobj gsum;
 | 
			
		||||
  for(int t=0;t<fd;t++){
 | 
			
		||||
    int pt = t/ld; // processor plane
 | 
			
		||||
    int lt = t%ld;
 | 
			
		||||
    if ( pt == grid->_processor_coor[orthogdim] ) {
 | 
			
		||||
      gsum=lsSum[lt];
 | 
			
		||||
    } else {
 | 
			
		||||
      gsum=zero;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    grid->GlobalSum(gsum);
 | 
			
		||||
 | 
			
		||||
    result[t]=gsum;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -93,7 +93,7 @@ public:
 | 
			
		||||
    static inline void CoorFromIndex (std::vector<int>& coor,int index,std::vector<int> &dims){
 | 
			
		||||
      int nd= dims.size();
 | 
			
		||||
      coor.resize(nd);
 | 
			
		||||
      for(int d=0;d<_ndimension;d++){
 | 
			
		||||
      for(int d=0;d<nd;d++){
 | 
			
		||||
	coor[d] = index % dims[d];
 | 
			
		||||
	index   = index / dims[d];
 | 
			
		||||
      }
 | 
			
		||||
@@ -102,7 +102,7 @@ public:
 | 
			
		||||
      int nd=dims.size();
 | 
			
		||||
      int stride=1;
 | 
			
		||||
      index=0;
 | 
			
		||||
      for(int d=0;d<_ndimension;d++){
 | 
			
		||||
      for(int d=0;d<nd;d++){
 | 
			
		||||
	index = index+stride*coor[d];
 | 
			
		||||
	stride=stride*dims[d];
 | 
			
		||||
      }
 | 
			
		||||
@@ -164,6 +164,25 @@ public:
 | 
			
		||||
	mult*=_gdimensions[mu];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
 | 
			
		||||
    {
 | 
			
		||||
      pcoor.resize(_ndimension);
 | 
			
		||||
      lcoor.resize(_ndimension);
 | 
			
		||||
      for(int mu=0;mu<_ndimension;mu++){
 | 
			
		||||
	pcoor[mu] = gcoor[mu]/_ldimensions[mu];
 | 
			
		||||
	lcoor[mu] = gcoor[mu]%_ldimensions[mu];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
 | 
			
		||||
    {
 | 
			
		||||
      std::vector<int> pcoor;
 | 
			
		||||
      std::vector<int> lcoor;
 | 
			
		||||
      GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
 | 
			
		||||
      rank = RankFromProcessorCoor(pcoor);
 | 
			
		||||
      i_idx= iIndex(lcoor);
 | 
			
		||||
      o_idx= oIndex(lcoor);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
 | 
			
		||||
    {
 | 
			
		||||
      gcoor.resize(_ndimension);
 | 
			
		||||
@@ -191,24 +210,6 @@ public:
 | 
			
		||||
      gcoor.resize(_ndimension);
 | 
			
		||||
      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu];
 | 
			
		||||
    }
 | 
			
		||||
    void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
 | 
			
		||||
    {
 | 
			
		||||
      pcoor.resize(_ndimension);
 | 
			
		||||
      lcoor.resize(_ndimension);
 | 
			
		||||
      for(int mu=0;mu<_ndimension;mu++){
 | 
			
		||||
	pcoor[mu] = gcoor[mu]/_ldimensions[mu];
 | 
			
		||||
	lcoor[mu] = gcoor[mu]%_ldimensions[mu];
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
 | 
			
		||||
    {
 | 
			
		||||
      std::vector<int> pcoor;
 | 
			
		||||
      std::vector<int> lcoor;
 | 
			
		||||
      GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
 | 
			
		||||
      rank = RankFromProcessorCoor(pcoor);
 | 
			
		||||
      i_idx= iIndex(lcoor);
 | 
			
		||||
      o_idx= oIndex(lcoor);
 | 
			
		||||
    }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -25,7 +25,9 @@ namespace Grid {
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class vobj>
 | 
			
		||||
    inline auto innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) ->decltype(innerProduct(left._odata[0],right._odata[0]))
 | 
			
		||||
    inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) 
 | 
			
		||||
      //    inline auto innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) 
 | 
			
		||||
      //->decltype(innerProduct(left._odata[0],right._odata[0]))
 | 
			
		||||
    {
 | 
			
		||||
      typedef typename vobj::scalar_type scalar;
 | 
			
		||||
      decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero;
 | 
			
		||||
@@ -54,6 +56,7 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
      vsum=zero;
 | 
			
		||||
      ssum=zero;
 | 
			
		||||
      //FIXME make this loop parallelisable
 | 
			
		||||
      for(int ss=0;ss<arg._grid->oSites(); ss++){
 | 
			
		||||
	vsum = vsum + arg._odata[ss];
 | 
			
		||||
      }
 | 
			
		||||
 
 | 
			
		||||
@@ -241,5 +241,37 @@ public:
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template<class vobj> inline 
 | 
			
		||||
void extract(const vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type ;
 | 
			
		||||
 | 
			
		||||
  int Nsimd=vobj::vector_type::Nsimd();
 | 
			
		||||
  
 | 
			
		||||
  extracted.resize(Nsimd);
 | 
			
		||||
 | 
			
		||||
  std::vector<scalar_type *> pointers(Nsimd);
 | 
			
		||||
  for(int i=0;i<Nsimd;i++) 
 | 
			
		||||
    pointers[i] =(scalar_type *)& extracted[i];
 | 
			
		||||
  
 | 
			
		||||
  extract(vec,pointers);
 | 
			
		||||
}
 | 
			
		||||
template<class vobj> inline 
 | 
			
		||||
void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
			
		||||
  typedef typename vobj::vector_type vector_type ;
 | 
			
		||||
  
 | 
			
		||||
  int Nsimd=vobj::vector_type::Nsimd();
 | 
			
		||||
  assert(extracted.size()==Nsimd);
 | 
			
		||||
 | 
			
		||||
  std::vector<scalar_type *> pointers(Nsimd);
 | 
			
		||||
  for(int i=0;i<Nsimd;i++) 
 | 
			
		||||
    pointers[i] =(scalar_type *)& extracted[i];
 | 
			
		||||
  
 | 
			
		||||
  merge(vec,pointers);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user