Mirror of https://github.com/paboyle/Grid.git
	Merge branch 'develop' of github.com:paboyle/Grid into develop
@@ -308,32 +308,34 @@ namespace Grid {
    public:
      SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
      virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
	GridLogIterative.TimingMode(1);
	std::cout << GridLogIterative << " HermOpAndNorm "<<std::endl;
	n2 = Mpc(in,out);
	std::cout << GridLogIterative << " HermOpAndNorm.Mpc "<<std::endl;
	ComplexD dot= innerProduct(in,out);
	std::cout << GridLogIterative << " HermOpAndNorm.innerProduct "<<std::endl;
	n1 = real(dot);
      }
      virtual void HermOp(const Field &in, Field &out){
	std::cout << GridLogIterative << " HermOp "<<std::endl;
	Mpc(in,out);
      }
      virtual  RealD Mpc      (const Field &in, Field &out) {
	Field tmp(in._grid);
	Field tmp2(in._grid);

	std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
	_Mat.Mooee(in,out);
	_Mat.Mooee(out,tmp);
	std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;

	_Mat.Meooe(in,out);
	_Mat.Meooe(out,tmp2);
	std::cout << GridLogIterative << " HermOp.MeooeMeooe "<<std::endl;

	return axpy_norm(out,-1.0,tmp2,tmp);
#if 0
	//... much prefer conventional Schur norm
	_Mat.Meooe(in,tmp);
	_Mat.MooeeInv(tmp,out);
	_Mat.Meooe(out,tmp);
	_Mat.Mooee(in,out);
        return axpy_norm(out,-1.0,tmp,out);
#endif
	RealD nn=axpy_norm(out,-1.0,tmp2,tmp);
	std::cout << GridLogIterative << " HermOp.axpy_norm "<<std::endl;
	return nn;
      }
      virtual  RealD MpcDag   (const Field &in, Field &out){
	return Mpc(in,out);
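For orientation, a sketch in LaTeX of what the calls in Mpc above compose, assuming the input field lives on a single checkerboard so the two Meooe applications hop off and back onto that checkerboard:

  \hat M_{pc}\,\phi \;=\; \left( M_{ee} M_{ee} \;-\; M_{eo} M_{oe} \right) \phi

  n_2 \;=\; \lVert \hat M_{pc}\,\phi \rVert^2 , \qquad
  n_1 \;=\; \mathrm{Re}\,\langle \phi ,\, \hat M_{pc}\,\phi \rangle

The disabled #if 0 branch would instead form the conventional Schur complement M_{ee} - M_{eo} M_{oo}^{-1} M_{oe}, as its comment notes.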
@@ -123,11 +123,14 @@ namespace Grid {
      Field   tmp(grid);
      Field  Mtmp(grid);
      Field resid(fgrid);

      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve " <<std::endl;
      pickCheckerboard(Even,src_e,in);
      pickCheckerboard(Odd ,src_o,in);
      pickCheckerboard(Even,sol_e,out);
      pickCheckerboard(Odd ,sol_o,out);

      std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;

      /////////////////////////////////////////////////////
      // src_o = (source_o - Moe MeeInv source_e)
@@ -144,6 +147,7 @@ namespace Grid {
      //////////////////////////////////////////////////////////////
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called  the Mpc solver" <<std::endl;

      ///////////////////////////////////////////////////
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
@@ -152,15 +156,16 @@ namespace Grid {
      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);

      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver reconstructed other CB" <<std::endl;
      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
      std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver inserted solution" <<std::endl;

      // Verify the unprec residual
      _Matrix.M(out,resid); 
      resid = resid-in;
      RealD ns = norm2(in);
      RealD nr = norm2(resid);

      std::cout<<GridLogMessage << "SchurRedBlackStaggered solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
    }     
  };
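Schematically, the red-black staggered solve above follows the usual Schur decomposition of M x = b; in LaTeX:

  \text{src}_o = b_o - M_{oe} M_{ee}^{-1} b_e

  \hat M_{pc}\, x_o = \text{src}_o \quad (\text{solved by \_HermitianRBSolver acting with \_HermOpEO})

  x_e = M_{ee}^{-1}\left( b_e - M_{eo}\, x_o \right)

and the final lines report the true unpreconditioned residual \sqrt{\lVert M x - b \rVert^2 / \lVert b \rVert^2}.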
@@ -134,8 +134,18 @@ void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank) 
{
  _ndimension = processors.size();
  assert(_ndimension = parent._ndimension);

  int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
  std::vector<int> parent_processor_coor(_ndimension,0);
  std::vector<int> parent_processors    (_ndimension,1);

  // Can make 5d grid from 4d etc...
  int pad = _ndimension-parent_ndimension;
  for(int d=0;d<parent_ndimension;d++){
    parent_processor_coor[pad+d]=parent._processor_coor[d];
    parent_processors    [pad+d]=parent._processors[d];
  }

  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // split the communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -154,9 +164,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  std::vector<int> ssize(_ndimension); // coor of split within parent

  for(int d=0;d<_ndimension;d++){
    ccoor[d] = parent._processor_coor[d] % processors[d];
    scoor[d] = parent._processor_coor[d] / processors[d];
    ssize[d] = parent._processors[d]     / processors[d];
    ccoor[d] = parent_processor_coor[d] % processors[d];
    scoor[d] = parent_processor_coor[d] / processors[d];
    ssize[d] = parent_processors[d]     / processors[d];
  }
  int crank;  // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
  // Mpi uses the reverse Lexico convention to us
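A minimal standalone illustration of the coordinate split computed in the loop above (a hypothetical one-dimensional toy, not Grid code): ccoor is the rank's coordinate inside its child communicator, and scoor labels which of the ssize child communicators it falls in.

#include <cstdio>

int main(void) {
  // Toy example: 4 parent ranks along one dimension, child grids of extent 2,
  // so ssize = 4/2 = 2 child communicators stacked along that dimension.
  const int parent_extent = 4;  // stand-in for parent processor extent in dim d
  const int child_extent  = 2;  // processors[d] of the requested child grid
  const int ssize         = parent_extent / child_extent;
  for (int coor = 0; coor < parent_extent; coor++) {
    int ccoor = coor % child_extent;  // coordinate within the child communicator
    int scoor = coor / child_extent;  // which of the ssize children this rank joins
    std::printf("parent coor %d -> child coor %d, split block %d of %d\n",
                coor, ccoor, scoor, ssize);
  }
  return 0;
}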
@@ -166,38 +176,36 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  MPI_Comm comm_split;
  if ( Nchild > 1 ) { 

    /*
    std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
    std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"]    ";
    for(int d=0;d<parent._processors.size();d++)  std::cout << parent._processors[d] << " ";
    std::cout<<std::endl;

    std::cout << GridLogMessage<<" child grid["<< _ndimension <<"]    ";
    for(int d=0;d<processors.size();d++)  std::cout << processors[d] << " ";
    std::cout<<std::endl;

    std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< _ndimension <<"]    ";
    for(int d=0;d<processors.size();d++)  std::cout << parent._processor_coor[d] << " ";
    std::cout<<std::endl;

    std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"]    ";
    for(int d=0;d<processors.size();d++)  std::cout << ccoor[d] << " ";
    std::cout<<std::endl;

    std::cout << GridLogMessage<<" new coor ["<< _ndimension <<"]    ";
    for(int d=0;d<processors.size();d++)  std::cout << parent._processor_coor[d] << " ";
    std::cout<<std::endl;
    */
    if(0){
      std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
      std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"]    ";
      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processors[d] << " ";
      std::cout<<std::endl;

      std::cout << GridLogMessage<<" child grid["<< _ndimension <<"]    ";
      for(int d=0;d<processors.size();d++)  std::cout << processors[d] << " ";
      std::cout<<std::endl;

      std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"]    ";
      for(int d=0;d<parent._ndimension;d++)  std::cout << parent._processor_coor[d] << " ";
      std::cout<<std::endl;

      std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"]    ";
      for(int d=0;d<processors.size();d++)  std::cout << scoor[d] << " ";
      std::cout<<std::endl;

      std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"]    ";
      for(int d=0;d<processors.size();d++)  std::cout << ccoor[d] << " ";
      std::cout<<std::endl;
    }

    int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
    assert(ierr==0);
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // Declare victory
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    /*
    std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
	      << Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
    */
    //    std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
    //	      << Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
  } else {
    comm_split=parent.communicator;
    srank = 0;
@@ -207,6 +215,17 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
  // Set up from the new split communicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  InitFromMPICommunicator(processors,comm_split);

  if(0){ 
    std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
    for(int d=0;d<processors.size();d++){
      std::cout << d<< " " << _processor_coor[d] <<" " <<  ccoor[d]<<std::endl;
    }
  }
  for(int d=0;d<processors.size();d++){
    assert(_processor_coor[d] == ccoor[d] );
  }

}

//////////////////////////////////////////////////////////////////////////////////////////////////////
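In the MPI_Comm_split call above, srank acts as the colour (all ranks with the same srank land in the same child communicator) and crank as the key (it fixes the rank ordering inside that child). A minimal sketch outside of Grid, using only standard MPI and a simplified, hypothetical rank-to-child mapping rather than Grid's lexicographic one:

#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // Hypothetical split into two children: even world ranks form one child,
  // odd world ranks the other, each ordered by rank/2.
  int srank = rank % 2;   // colour: which child communicator to join
  int crank = rank / 2;   // key: desired rank inside that child
  MPI_Comm comm_split;
  MPI_Comm_split(MPI_COMM_WORLD, srank, crank, &comm_split);

  int child_rank;
  MPI_Comm_rank(comm_split, &child_rank);
  std::printf("world rank %d -> child %d, child rank %d\n", rank, srank, child_rank);

  MPI_Comm_free(&comm_split);
  MPI_Finalize();
  return 0;
}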
@@ -231,7 +250,7 @@ void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &proc
  MPI_Comm_rank(communicator,&_processor);
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);

  if ( communicator_base != communicator_world ) {
  if ( 0 && (communicator_base != communicator_world) ) {
    std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;

    std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";

@@ -606,7 +606,7 @@ CartesianCommunicator::~CartesianCommunicator()
  MPI_Finalized(&MPI_is_finalised);
  if (communicator && !MPI_is_finalised) {
    MPI_Comm_free(&communicator);
    for(int i=0;i<  communicator_halo.size();i++){
    for(int i=0;i<communicator_halo.size();i++){
      MPI_Comm_free(&communicator_halo[i]);
    }
  }  
@@ -50,26 +50,22 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
  ////////////////////////////////////////////////////////////////////////////////////////////
  template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
    half.checkerboard = cb;
    int ssh=0;
    //parallel_for
    for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;

    parallel_for(int ss=0;ss<full._grid->oSites();ss++){
      int cbos;

      std::vector<int> coor;
      full._grid->oCoorFromOindex(coor,ss);
      cbos=half._grid->CheckerBoard(coor);

      if (cbos==cb) {
	int ssh=half._grid->oIndex(coor);
	half._odata[ssh] = full._odata[ss];
	ssh++;
      }
    }
  }
  template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
    int cb = half.checkerboard;
    int ssh=0;
    //parallel_for
    for(int ss=0;ss<full._grid->oSites();ss++){
    parallel_for(int ss=0;ss<full._grid->oSites();ss++){
      std::vector<int> coor;
      int cbos;

@@ -77,8 +73,8 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
      cbos=half._grid->CheckerBoard(coor);

      if (cbos==cb) {
	int ssh=half._grid->oIndex(coor);
	full._odata[ss]=half._odata[ssh];
	ssh++;
      }
    }
  }
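The change from a serial loop with a running ssh counter to a parallel_for is safe because each site now derives its destination index directly from its coordinate via half._grid->oIndex(coor), so iterations no longer share state (the remaining ssh++ only touches that per-site local). A toy illustration of the same counter-free scatter pattern (hypothetical, not Grid code):

#include <cstdio>
#include <vector>

int main(void) {
  // Scatter the even-indexed entries of "full" into "half" without a shared
  // running counter: each destination index is computed from the source index,
  // so every iteration is independent and the loop could run in parallel.
  std::vector<int> full = {10, 11, 12, 13, 14, 15};
  std::vector<int> half(full.size() / 2);
  for (std::size_t ss = 0; ss < full.size(); ss++) {
    if (ss % 2 == 0) {
      std::size_t ssh = ss / 2;   // analogue of half._grid->oIndex(coor)
      half[ssh] = full[ss];
    }
  }
  for (int x : half) std::printf("%d ", x);
  std::printf("\n");
  return 0;
}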
@@ -698,30 +694,6 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
////////////////////////////////////////////////////////////////////////////////
// Communicate between grids
////////////////////////////////////////////////////////////////////////////////
//
// All to all plan
//
// Subvolume on fine grid is v.    Vectors a,b,c,d 
//
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// SIMPLEST CASE:
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Mesh of nodes (2) ; subdivide to  1 subdivisions
//
// Lex ord:   
//          N0 va0 vb0  N1 va1 vb1 
//
// For each dimension do an all to all
//
// full AllToAll(0)
//          N0 va0 va1    N1 vb0 vb1
//
// REARRANGE
//          N0 va01       N1 vb01
//
// Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract".
// NB: Easiest to programme if keep in lex order.
//
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// SIMPLE CASE:
///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -755,9 +727,17 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
//
// Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract".
// NB: Easiest to programme if keep in lex order.
//
/////////////////////////////////////////////////////////

/*
 *  Let chunk = (fvol*nvec)/sP be size of a chunk.         ( Divide lexico vol * nvec into fP/sP = M chunks )
 *  
 *  2nd A2A (over sP nodes; subdivide the fP into sP chunks of M)
 * 
 *     node 0     1st chunk of node 0M..(1M-1); 2nd chunk of node 0M..(1M-1)..   data chunk x M x sP = fL / sP * M * sP = fL * M growth
 *     node 1     1st chunk of node 1M..(2M-1); 2nd chunk of node 1M..(2M-1)..
 *     node 2     1st chunk of node 2M..(3M-1); 2nd chunk of node 2M..(3M-1)..
 *     node 3     1st chunk of node 3M..(3M-1); 2nd chunk of node 2M..(3M-1)..
 *  etc...
 */
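A toy check of the chunk addressing described in the comment above (hypothetical sizes: fvol = 4 lexicographic sites, nvec = 4 vectors, sP = 2, M = 2, hence chunk = (nvec*fvol)/sP = 8): the loops over (c, m, s) visit every post-all-to-all buffer index lex_c = c + chunk*m + chunk*M*s exactly once, while lex_fvol_vec = c + chunk*s splits into a local site (mod fvol) and a vector number (div fvol).

#include <cassert>
#include <cstdio>
#include <vector>

int main(void) {
  const int fvol = 4, nvec = 4, sP = 2, M = 2;  // hypothetical toy sizes
  const int chunk = (nvec * fvol) / sP;         // = 8; must divide exactly
  assert(chunk * sP == nvec * fvol);

  std::vector<int> hits(chunk * M * sP, 0);
  for (int c = 0; c < chunk; c++) {
    for (int m = 0; m < M; m++) {
      for (int s = 0; s < sP; s++) {
        int lex_c        = c + chunk * m + chunk * M * s; // index in the A2A buffer
        int lex_fvol_vec = c + chunk * s;                 // site+vector within one m slice
        int lex_fvol     = lex_fvol_vec % fvol;           // local lexicographic site
        int lex_vec      = lex_fvol_vec / fvol;           // which of the nvec vectors
        (void)lex_fvol; (void)lex_vec;
        hits[lex_c]++;
      }
    }
  }
  for (int h : hits) assert(h == 1);  // every buffer element is addressed exactly once
  std::printf("chunk addressing covers %d elements exactly once\n", (int)hits.size());
  return 0;
}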
template<class Vobj>
void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
{
@@ -816,57 +796,58 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)

  int nvec = nvector; // Counts down to 1 as we collapse dims
  std::vector<int> ldims = full_grid->_ldimensions;
  std::vector<int> lcoor(ndim);

  for(int d=ndim-1;d>=0;d--){

    if ( ratio[d] != 1 ) {

      full_grid ->AllToAll(d,alldata,tmpdata);
      //      std::cout << GridLogMessage << "Grid_split: dim " <<d<<" ratio "<<ratio[d]<<" nvec "<<nvec<<" procs "<<split_grid->_processors[d]<<std::endl;
      //      for(int v=0;v<nvec;v++){
      //	std::cout << "Grid_split: alldata["<<v<<"] " << alldata[v] <<std::endl;
      //	std::cout << "Grid_split: tmpdata["<<v<<"] " << tmpdata[v] <<std::endl;
      //      }
      //////////////////////////////////////////
      //Local volume for this dimension is expanded by ratio of processor extents
      // Number of vectors is decreased by same factor
      // Rearrange to lexico for bigger volume
      //////////////////////////////////////////
      nvec    /= ratio[d];
      if ( split_grid->_processors[d] > 1 ) {
	alldata=tmpdata;
	split_grid->AllToAll(d,alldata,tmpdata);
      }

      auto rdims = ldims; rdims[d]  *=   ratio[d];
      auto rsites= lsites*ratio[d];
      for(int v=0;v<nvec;v++){
      auto rdims = ldims; 
      auto     M = ratio[d];
      auto rsites= lsites*M;// increases rsites by M
      nvec      /= M;       // Reduce nvec by subdivision factor
      rdims[d]  *= M;       // increase local dim by same factor

	// For loop over each site within old subvol
	for(int lsite=0;lsite<lsites;lsite++){
      int sP =   split_grid->_processors[d];
      int fP =    full_grid->_processors[d];

	  Lexicographic::CoorFromIndex(lcoor, lsite, ldims);	  
      int fvol   = lsites;

      int chunk  = (nvec*fvol)/sP;          assert(chunk*sP == nvec*fvol);

	  for(int r=0;r<ratio[d];r++){ // ratio*nvec terms
      // Loop over reordered data post A2A
      parallel_for(int c=0;c<chunk;c++){
	for(int m=0;m<M;m++){
	  for(int s=0;s<sP;s++){

	    // addressing; use lexico
	    int lex_r;
	    uint64_t lex_c        = c+chunk*m+chunk*M*s;
	    uint64_t lex_fvol_vec = c+chunk*s;
	    uint64_t lex_fvol     = lex_fvol_vec%fvol;
	    uint64_t lex_vec      = lex_fvol_vec/fvol;

	    auto rcoor = lcoor;	    rcoor[d]  += r*ldims[d];
	    // which node sets an adder to the coordinate
	    std::vector<int> coor(ndim);
	    Lexicographic::CoorFromIndex(coor, lex_fvol, ldims);	  
	    coor[d] += m*ldims[d];
	    Lexicographic::IndexFromCoor(coor, lex_r, rdims);	  
	    lex_r += lex_vec * rsites;

	    int rsite; Lexicographic::IndexFromCoor(rcoor, rsite, rdims);	  
	    rsite += v * rsites;
	    // LexicoFind coordinate & vector number within split lattice
	    alldata[lex_r] = tmpdata[lex_c];

	    int rmul=nvec*lsites;
	    int vmul=     lsites;
	    alldata[rsite] = tmpdata[lsite+r*rmul+v*vmul];
	    //	    if ( lsite==0 ) {
	    //	      std::cout << "Grid_split: grow alldata["<<rsite<<"] " << alldata[rsite] << " <- tmpdata["<< lsite+r*rmul+v*vmul<<"] "<<tmpdata[lsite+r*rmul+v*vmul]  <<std::endl;
	    //	    }	      
	  }
	}
      }
      ldims[d]*= ratio[d];
      lsites  *= ratio[d];

      if ( split_grid->_processors[d] > 1 ) {
	tmpdata = alldata;
	split_grid->AllToAll(d,tmpdata,alldata);
      }
    }
  }
  vectorizeFromLexOrdArray(alldata,split);    
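Grid_split and Grid_unsplit lean on Lexicographic::CoorFromIndex / IndexFromCoor to move between flat site indices and coordinates as the local dimensions grow or shrink by the ratio M. A self-contained sketch of that round trip with my own helpers (assuming the fastest-running index is dimension 0; an illustrative stand-in, not Grid's implementation):

#include <cassert>
#include <cstdio>
#include <vector>

// Flat index -> coordinate, dimension 0 fastest.
static void coor_from_index(std::vector<int> &coor, int index, const std::vector<int> &dims) {
  coor.resize(dims.size());
  for (std::size_t d = 0; d < dims.size(); d++) { coor[d] = index % dims[d]; index /= dims[d]; }
}

// Coordinate -> flat index, dimension 0 fastest.
static int index_from_coor(const std::vector<int> &coor, const std::vector<int> &dims) {
  int index = 0;
  for (int d = (int)dims.size() - 1; d >= 0; d--) index = index * dims[d] + coor[d];
  return index;
}

int main(void) {
  std::vector<int> ldims = {4, 4, 4, 8};          // toy local dimensions
  std::vector<int> rdims = ldims; rdims[3] *= 2;  // grow dimension 3 by a ratio M = 2
  const int lsites = 4 * 4 * 4 * 8;
  std::vector<int> coor;
  for (int lsite = 0; lsite < lsites; lsite++) {
    coor_from_index(coor, lsite, ldims);
    assert(index_from_coor(coor, ldims) == lsite);  // round trip on the small volume
    coor[3] += 8;                                   // offset into the grown dimension
    int rsite = index_from_coor(coor, rdims);
    assert(rsite >= lsites && rsite < 2 * lsites);  // lands in the second half of rdims
  }
  std::printf("lexicographic round trip ok\n");
  return 0;
}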
@@ -937,59 +918,61 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
  /////////////////////////////////////////////////////////////////
  // Start from split grid and work towards full grid
  /////////////////////////////////////////////////////////////////
  std::vector<int> lcoor(ndim);
  std::vector<int> rcoor(ndim);

  int nvec = 1;
  lsites = split_grid->lSites();
  std::vector<int> ldims = split_grid->_ldimensions;
  uint64_t rsites        = split_grid->lSites();
  std::vector<int> rdims = split_grid->_ldimensions;

  //  for(int d=ndim-1;d>=0;d--){
  for(int d=0;d<ndim;d++){

    if ( ratio[d] != 1 ) {

      auto     M = ratio[d];

      if ( split_grid->_processors[d] > 1 ) {
	tmpdata = alldata;
	split_grid->AllToAll(d,tmpdata,alldata);
      }

      //////////////////////////////////////////
      //Local volume for this dimension is expanded by ratio of processor extents
      // Number of vectors is decreased by same factor
      // Rearrange to lexico for bigger volume
      //////////////////////////////////////////
      auto rsites= lsites/ratio[d];
      auto rdims = ldims; rdims[d]/=ratio[d];

      for(int v=0;v<nvec;v++){

	// rsite, rcoor --> smaller local volume
	// lsite, lcoor --> bigger original (single node?) volume
	// For loop over each site within smaller subvol
	for(int rsite=0;rsite<rsites;rsite++){

	  Lexicographic::CoorFromIndex(rcoor, rsite, rdims);	  
	  int lsite;

	  for(int r=0;r<ratio[d];r++){ 

	    lcoor = rcoor; lcoor[d] += r*rdims[d];
	    Lexicographic::IndexFromCoor(lcoor, lsite, ldims); lsite += v * lsites;

	    int rmul=nvec*rsites;
	    int vmul=     rsites;
	    tmpdata[rsite+r*rmul+v*vmul]=alldata[lsite];

      int sP =   split_grid->_processors[d];
      int fP =    full_grid->_processors[d];

      auto ldims = rdims;  ldims[d]  /= M;  // Decrease local dims by same factor
      auto lsites= rsites/M;                // Decreases rsites by M

      int fvol   = lsites;
      int chunk  = (nvec*fvol)/sP;          assert(chunk*sP == nvec*fvol);

      {
	// Loop over reordered data post A2A
	for(int c=0;c<chunk;c++){
	  for(int m=0;m<M;m++){
	    for(int s=0;s<sP;s++){

	      // addressing; use lexico
	      int lex_r;
	      uint64_t lex_c = c+chunk*m+chunk*M*s;
	      uint64_t lex_fvol_vec = c+chunk*s;
	      uint64_t lex_fvol     = lex_fvol_vec%fvol;
	      uint64_t lex_vec      = lex_fvol_vec/fvol;

	      // which node sets an adder to the coordinate
	      std::vector<int> coor(ndim);
	      Lexicographic::CoorFromIndex(coor, lex_fvol, ldims);	  
	      coor[d] += m*ldims[d];
	      Lexicographic::IndexFromCoor(coor, lex_r, rdims);	  
	      lex_r += lex_vec * rsites;

	      // LexicoFind coordinate & vector number within split lattice
	      tmpdata[lex_c] = alldata[lex_r];
	    }
	  }
	}
      }
      nvec   *= ratio[d];
      ldims[d]=rdims[d];
      lsites  =rsites;

      if ( split_grid->_processors[d] > 1 ) {
	split_grid->AllToAll(d,tmpdata,alldata);
	tmpdata=alldata;
      }
      full_grid ->AllToAll(d,tmpdata,alldata);
      rdims[d]/= M;
      rsites  /= M;
      nvec    *= M;       // Increase nvec by subdivision factor
    }
  }

@@ -997,12 +980,12 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj>   & split)
  for(int v=0;v<nvector;v++){
    assert(v<full.size());
    parallel_for(int site=0;site<lsites;site++){
      assert(v*lsites+site < alldata.size());
      scalardata[site] = alldata[v*lsites+site];
    }
    vectorizeFromLexOrdArray(scalardata,full[v]);    
  }
}

}
#endif