Merge branch 'feature/scidac-wp1' of https://github.com/paboyle/Grid into feature/scidac-wp1

Optimised parameters for 16^3
Eliminate older inefficient coarsening implementation
2025-06-21 17:22:03 +01:00 · 2023-10-05 16:57:59 -04:00 · 2023-10-05 16:56:55 -04:00 · 2023-10-05 16:56:15 -04:00 · 2023-10-05 16:55:48 -04:00 · 2023-10-05 16:55:19 -04:00
15 changed files with 432 additions and 183 deletions
--- a/Grid/algorithms/GeneralCoarsenedMatrix.h
+++ b/Grid/algorithms/GeneralCoarsenedMatrix.h
@ -226,6 +226,7 @@ public:
  void ProjectNearestNeighbour(RealD shift, GeneralCoarseOp &CopyMe)
  {
    int nfound=0;
    std::cout << " ProjectNearestNeighbour "<< CopyMe._A[0].Grid()<<std::endl;
    for(int p=0;p<geom.npoint;p++){
      for(int pp=0;pp<CopyMe.geom.npoint;pp++){
 	// Search for the same relative shift
@ -233,12 +234,12 @@ public:
 	if ( CopyMe.geom.shifts[pp]==geom.shifts[p] ) {
 	  _A[p] = CopyMe.Cell.Extract(CopyMe._A[pp]);
 	  _Adag[p] = CopyMe.Cell.Extract(CopyMe._Adag[pp]);
 	  ExchangeCoarseLinks();
 	  nfound++;
 	}
      }
    }
    assert(nfound==geom.npoint);
    ExchangeCoarseLinks();
  }
  GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
@ -271,7 +272,8 @@ public:
  }
  void Mdag (const CoarseVector &in, CoarseVector &out)
  {
-    Mult(_Adag,in,out);
+    if ( hermitian ) M(in,out);
    else Mult(_Adag,in,out);
  }
  void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
  {
@ -298,10 +300,10 @@ public:
    const int Nsimd = CComplex::Nsimd();
    int osites=pin.Grid()->oSites();
-    int gsites=pin.Grid()->gSites();
+    //    int gsites=pin.Grid()->gSites();
-    RealD flops = 1.0* npoint * nbasis * nbasis * 8 * gsites;
+    RealD flops = 1.0* npoint * nbasis * nbasis * 8 * osites;
-    RealD bytes = (1.0*osites*sizeof(siteMatrix)+2.0*osites*sizeof(siteVector))*npoint;
+    RealD bytes = (1.0*osites*sizeof(siteMatrix)*npoint+2.0*osites*sizeof(siteVector))*npoint;
    //    for(int point=0;point<npoint;point++){
    //      conformable(A[point],pin);
@ -343,20 +345,20 @@ public:
    text+=usecond();
    ttot+=usecond();
-    std::cout << GridLogMessage<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Mult ext  "<<text<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult ext  "<<text<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Kernel "<< flops/tmult<<" mflop/s"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Kernel "<< flops/tmult<<" mflop/s"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse Kernel "<< bytes/tmult<<" MB/s"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Kernel "<< bytes/tmult<<" MB/s"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
-    std::cout << GridLogMessage<<"Coarse bytes   "<< bytes/1e6<<" MB"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse bytes   "<< bytes/1e6<<" MB"<<std::endl;
  };
  void PopulateAdag(void)
  {
-    for(int bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
+    for(int64_t bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
      Coordinate bcoor;
      CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor);
@ -412,10 +414,10 @@ public:
     *
     *     Where q_k = delta_k . (2*M_PI/global_nb[mu])
     */
-  void CoarsenOperatorColoured(LinearOperatorBase<Lattice<Fobj> > &linop,
+  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
-			       Aggregation<Fobj,CComplex,nbasis> & Subspace)
+		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
  {
-    std::cout << GridLogMessage<< "CoarsenMatrixColoured "<< std::endl;
+    std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
    GridBase *grid = FineGrid();
    RealD tproj=0.0;
@ -540,9 +542,14 @@ public:
      auto sval = peekSite(_A[p],coor);
    }
-    PopulateAdag();
+    // Only needed if nonhermitian
    if ( ! hermitian ) {
      std::cout << GridLogMessage<<"PopulateAdag  "<<std::endl;
      PopulateAdag();
    }
    // Need to write something to populate Adag from A
    std::cout << GridLogMessage<<"ExchangeCoarseLinks  "<<std::endl;
    ExchangeCoarseLinks();
    std::cout << GridLogMessage<<"CoarsenOperator eigen  "<<teigen<<" us"<<std::endl;
    std::cout << GridLogMessage<<"CoarsenOperator phase  "<<tphase<<" us"<<std::endl;
@ -552,6 +559,7 @@ public:
  }
  void ExchangeCoarseLinks(void){
    for(int p=0;p<geom.npoint;p++){
      std::cout << "Exchange "<<p<<std::endl;
      _A[p] = Cell.Exchange(_A[p]);
      _Adag[p]= Cell.Exchange(_Adag[p]);
    }
--- a/Grid/algorithms/iterative/AdefGeneric.h
+++ b/Grid/algorithms/iterative/AdefGeneric.h
@ -42,46 +42,30 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
   */
 NAMESPACE_BEGIN(Grid);
-template<class Field, class CoarseField, class Aggregation>
+template<class Field>
-class TwoLevelADEF2 : public LinearFunction<Field>
+class TwoLevelCG : public LinearFunction<Field>
 {
 public:
  RealD   Tolerance;
  Integer MaxIterations;
  const int mmax = 1;
  GridBase *grid;
  GridBase *coarsegrid;
  // Fine operator, Smoother, CoarseSolver
  LinearOperatorBase<Field>   &_FineLinop;
  LinearFunction<Field>   &_Smoother;
  LinearFunction<CoarseField> &_CoarseSolver;
  LinearFunction<CoarseField> &_CoarseSolverPrecise;
  // Need something that knows how to get from Coarse to fine and back again
  //  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
  //  void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
  Aggregation &_Aggregates;                    
  // more most opertor functions
-  TwoLevelADEF2(RealD tol,
+  TwoLevelCG(RealD tol,
-		   Integer maxit,
+	     Integer maxit,
-		   LinearOperatorBase<Field>   &FineLinop,
+	     LinearOperatorBase<Field>   &FineLinop,
-		   LinearFunction<Field>   &Smoother,
+	     LinearFunction<Field>       &Smoother,
-		   LinearFunction<CoarseField>  &CoarseSolver,
+	     GridBase *fine) : 
 		   LinearFunction<CoarseField>  &CoarseSolverPrecise,
 		   Aggregation &Aggregates
 		   ) : 
      Tolerance(tol), 
      MaxIterations(maxit),
      _FineLinop(FineLinop),
-      _Smoother(Smoother),
+      _Smoother(Smoother)
      _CoarseSolver(CoarseSolver),
      _CoarseSolverPrecise(CoarseSolverPrecise),
      _Aggregates(Aggregates)
  {
-    coarsegrid = Aggregates.CoarseGrid;
+    grid       = fine;
    grid       = Aggregates.FineGrid;
  };
  virtual void operator() (const Field &src, Field &psi)
@ -195,47 +179,8 @@ class TwoLevelADEF2 : public LinearFunction<Field>
 public:
-  virtual void PcgM1(Field & in, Field & out)
+  virtual void PcgM1(Field & in, Field & out)     =0;
-  {
+  virtual void Vstart(Field & x,const Field & src)=0;
    // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
    Field tmp(grid);
    Field Min(grid);
    CoarseField PleftProj(coarsegrid);
    CoarseField PleftMss_proj(coarsegrid);
    GridStopWatch SmootherTimer;
    GridStopWatch MatrixTimer;
    SmootherTimer.Start();
    _Smoother(in,Min);
    SmootherTimer.Stop();
    MatrixTimer.Start();
    _FineLinop.HermOp(Min,out);
    MatrixTimer.Stop();
    axpy(tmp,-1.0,out,in);          // tmp  = in - A Min
    GridStopWatch ProjTimer;
    GridStopWatch CoarseTimer;
    GridStopWatch PromTimer;
    ProjTimer.Start();
    _Aggregates.ProjectToSubspace(PleftProj,tmp);     
    ProjTimer.Stop();
    CoarseTimer.Start();
    _CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
    CoarseTimer.Stop();
    PromTimer.Start();
    _Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]  
    PromTimer.Stop();
    std::cout << GridLogMessage << "PcgM1 breakdown "<<std::endl;
    std::cout << GridLogMessage << "\tSmoother   " << SmootherTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tProj       " << ProjTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tCoarse     " << CoarseTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tProm       " << PromTimer.Elapsed() <<std::endl;
    axpy(out,1.0,Min,tmp); // Min+tmp
  }
  virtual void PcgM2(const Field & in, Field & out) {
    out=in;
@ -249,6 +194,89 @@ class TwoLevelADEF2 : public LinearFunction<Field>
    return dd;
  }
  /////////////////////////////////////////////////////////////////////
  // Only Def1 has non-trivial Vout.
  /////////////////////////////////////////////////////////////////////
  virtual void   Vout  (Field & in, Field & out,Field & src){
    out = in;
  }
 };
 template<class Field, class CoarseField, class Aggregation>
 class TwoLevelADEF2 : public TwoLevelCG<Field>
 {
 public:
  ///////////////////////////////////////////////////////////////////////////////////
  // Need something that knows how to get from Coarse to fine and back again
  //  void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
  //  void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
  ///////////////////////////////////////////////////////////////////////////////////
  GridBase *coarsegrid;
  Aggregation &_Aggregates;                    
  LinearFunction<CoarseField> &_CoarseSolver;
  LinearFunction<CoarseField> &_CoarseSolverPrecise;
  ///////////////////////////////////////////////////////////////////////////////////
  // more most opertor functions
  TwoLevelADEF2(RealD tol,
 		Integer maxit,
 		LinearOperatorBase<Field>   &FineLinop,
 		LinearFunction<Field>   &Smoother,
 		LinearFunction<CoarseField>  &CoarseSolver,
 		LinearFunction<CoarseField>  &CoarseSolverPrecise,
 		Aggregation &Aggregates
 		) :
    TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,Aggregates.FineGrid),
      _CoarseSolver(CoarseSolver),
      _CoarseSolverPrecise(CoarseSolverPrecise),
      _Aggregates(Aggregates)
  {
    coarsegrid = Aggregates.CoarseGrid;
  };
  virtual void PcgM1(Field & in, Field & out)
  {
    // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
    Field tmp(this->grid);
    Field Min(this->grid);
    CoarseField PleftProj(this->coarsegrid);
    CoarseField PleftMss_proj(this->coarsegrid);
    GridStopWatch SmootherTimer;
    GridStopWatch MatrixTimer;
    SmootherTimer.Start();
    this->_Smoother(in,Min);
    SmootherTimer.Stop();
    MatrixTimer.Start();
    this->_FineLinop.HermOp(Min,out);
    MatrixTimer.Stop();
    axpy(tmp,-1.0,out,in);          // tmp  = in - A Min
    GridStopWatch ProjTimer;
    GridStopWatch CoarseTimer;
    GridStopWatch PromTimer;
    ProjTimer.Start();
    this->_Aggregates.ProjectToSubspace(PleftProj,tmp);     
    ProjTimer.Stop();
    CoarseTimer.Start();
    this->_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
    CoarseTimer.Stop();
    PromTimer.Start();
    this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]  
    PromTimer.Stop();
    std::cout << GridLogPerformance << "PcgM1 breakdown "<<std::endl;
    std::cout << GridLogPerformance << "\tSmoother   " << SmootherTimer.Elapsed() <<std::endl;
    std::cout << GridLogPerformance << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
    std::cout << GridLogPerformance << "\tProj       " << ProjTimer.Elapsed() <<std::endl;
    std::cout << GridLogPerformance << "\tCoarse     " << CoarseTimer.Elapsed() <<std::endl;
    std::cout << GridLogPerformance << "\tProm       " << PromTimer.Elapsed() <<std::endl;
    axpy(out,1.0,Min,tmp); // Min+tmp
  }
  virtual void Vstart(Field & x,const Field & src)
  {
    ///////////////////////////////////
@ -262,65 +290,69 @@ class TwoLevelADEF2 : public LinearFunction<Field>
    //                   = src_s - (A guess)_s - src_s  + (A guess)_s 
    //                   = 0 
    ///////////////////////////////////
-    Field r(grid);
+    Field r(this->grid);
-    Field mmp(grid);
+    Field mmp(this->grid);
-    CoarseField PleftProj(coarsegrid);
+    CoarseField PleftProj(this->coarsegrid);
-    CoarseField PleftMss_proj(coarsegrid);
+    CoarseField PleftMss_proj(this->coarsegrid);
-    _Aggregates.ProjectToSubspace(PleftProj,src);     
+    this->_Aggregates.ProjectToSubspace(PleftProj,src);     
-    _CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s
+    this->_CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s
-    _Aggregates.PromoteFromSubspace(PleftMss_proj,x);  
+    this->_Aggregates.PromoteFromSubspace(PleftMss_proj,x);  
  }
  /////////////////////////////////////////////////////////////////////
  // Only Def1 has non-trivial Vout.
  /////////////////////////////////////////////////////////////////////
  virtual void   Vout  (Field & in, Field & out,Field & src){
    out = in;
  }
 };
-template<class Field, class CoarseField, class Aggregation>
+template<class Field>
-class TwoLevelADEF1ev : public TwoLevelADEF2<Field,CoarseField,Aggregation>
+class TwoLevelADEF1defl : public TwoLevelCG<Field>
 {
-  TwoLevelADEF1ev(RealD tol,
+public:
  const std::vector<Field> &evec;
  const std::vector<RealD> &eval;
  TwoLevelADEF1defl(RealD tol,
 		   Integer maxit,
 		   LinearOperatorBase<Field>   &FineLinop,
 		   LinearFunction<Field>   &Smoother,
-		   LinearFunction<CoarseField>  &CoarseSolver,
+		   std::vector<Field> &_evec,
-		   LinearFunction<CoarseField>  &CoarseSolverPrecise,
+		   std::vector<RealD> &_eval) : 
-		   Aggregation &Aggregates ) : 
+    TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,_evec[0].Grid()),
-    TwoLevelADEF2<Field,CoarseField,Aggregation>(tol,maxit,FineLinop,Smoother,CoarseSolver,CoarseSolverPrecise,Aggregates)
+    evec(_evec),
    eval(_eval)
  {};
  // Can just inherit existing Vstart
  // Can just inherit existing Vout
  // Can just inherit existing M2
  // Can just inherit existing M3
  // Simple vstart - do nothing
  virtual void Vstart(Field & x,const Field & src){};
  // Override PcgM1
  virtual void PcgM1(Field & in, Field & out)
  {
-    CoarseField PleftProj(this->coarsegrid);
+    int N=evec.size();
    CoarseField PleftMss_proj(this->coarsegrid);
    Field tmp(this->grid);
    Field Pin(this->grid);
    Field Qin(this->grid);
    //MP  + Q = M(1-AQ) + Q = M
    // // If we are eigenvector deflating in coarse space
    // // Q   = Sum_i |phi_i> 1/lambda_i <phi_i|
    // // A Q = Sum_i |phi_i> <phi_i|
    // // M(1-AQ) = M(1-proj) + Q
-    this->_Aggregates.ProjectToSubspace(PleftProj,in);     
+    Qin.Checkerboard()=in.Checkerboard();
-    this->_Aggregates.PromoteFromSubspace(PleftProj,tmp);// tmp = Qin
+    Qin = Zero();
    Pin = in;
    for (int i=0;i<N;i++) {
      const Field& tmp = evec[i];
      auto ip = TensorRemove(innerProduct(tmp,in));
      axpy(Qin, ip / eval[i],tmp,Qin);
      axpy(Pin, -ip ,tmp,Pin);
    }
    Pin = in - tmp;
    this->_Smoother(Pin,out);
-    this->_CoarseSolver(PleftProj,PleftMss_proj); 
+    out = out + Qin;
    this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Qin
    out = out + tmp;
  }
 };
--- a/Grid/cartesian/Cartesian_base.h
+++ b/Grid/cartesian/Cartesian_base.h
@ -70,8 +70,8 @@ public:
  Coordinate _istride;    // Inner stride i.e. within simd lane
  int _osites;                  // _isites*_osites = product(dimensions).
  int _isites;
-  int _fsites;                  // _isites*_osites = product(dimensions).
+  int64_t _fsites;                  // _isites*_osites = product(dimensions).
-  int _gsites;
+  int64_t _gsites;
  Coordinate _slice_block;// subslice information
  Coordinate _slice_stride;
  Coordinate _slice_nblock;
@ -183,7 +183,7 @@ public:
  inline int Nsimd(void)  const { return _isites; };// Synonymous with iSites
  inline int oSites(void) const { return _osites; };
  inline int lSites(void) const { return _isites*_osites; }; 
-  inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; 
+  inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; }; 
  inline int Nd    (void) const { return _ndimension;};
  inline const Coordinate LocalStarts(void)             { return _lstart;    };
@ -214,7 +214,7 @@ public:
  ////////////////////////////////////////////////////////////////
  // Global addressing
  ////////////////////////////////////////////////////////////////
-  void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){
+  void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
    assert(gidx< gSites());
    Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
  }
@ -222,7 +222,7 @@ public:
    assert(lidx<lSites());
    Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
  }
-  void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
+  void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
    gidx=0;
    int mult=1;
    for(int mu=0;mu<_ndimension;mu++) {
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@ -360,7 +360,7 @@ public:
 template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
  typedef typename vobj::scalar_object sobj;
-  for(int g=0;g<o.Grid()->_gsites;g++){
+  for(int64_t g=0;g<o.Grid()->_gsites;g++){
    Coordinate gcoor;
    o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@ -432,7 +432,7 @@ public:
 #if 1
    thread_for( lidx, _grid->lSites(), {
-	int gidx;
+	int64_t gidx;
 	int o_idx;
 	int i_idx;
 	int rank;
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@ -471,13 +471,13 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
  vobj zz = Zero();
-  accelerator_for(sc,coarse->oSites(),1,{
+  accelerator_for(sc,coarse->oSites(),vobj::Nsimd(),{
      // One thread per sub block
      Coordinate coor_c(_ndimension);
      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate
-      vobj cd = zz;
+      auto cd = coalescedRead(zz);
      for(int sb=0;sb<blockVol;sb++){
@ -488,10 +488,10 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
 	for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
 	Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
-	cd=cd+fineData_p[sf];
+	cd=cd+coalescedRead(fineData_p[sf]);
      }
-      coarseData_p[sc] = cd;
+      coalescedWrite(coarseData_p[sc],cd);
    });
  return;
@ -1054,7 +1054,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
  Coordinate fcoor(nd);
  Coordinate ccoor(nd);
-  for(int g=0;g<fg->gSites();g++){
+  for(int64_t g=0;g<fg->gSites();g++){
    fg->GlobalIndexToGlobalCoor(g,fcoor);
    for(int d=0;d<nd;d++){
--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@ -86,7 +86,9 @@ public:
    // expand up one dim at a time
    for(int d=0;d<dims;d++){
-      plocal[d] += 2*depth; 
+      if ( processors[d] > 1 ) { 
 	plocal[d] += 2*depth; 
      }
      for(int d=0;d<dims;d++){
 	global[d] = plocal[d]*processors[d];
@ -98,11 +100,17 @@ public:
  template<class vobj>
  inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
  {
    Coordinate processors=unpadded_grid->_processors;
    Lattice<vobj> out(unpadded_grid);
    Coordinate local     =unpadded_grid->LocalDimensions();
-    Coordinate fll(dims,depth); // depends on the MPI spread
+    // depends on the MPI spread      
    Coordinate fll(dims,depth);
    Coordinate tll(dims,0); // depends on the MPI spread
    for(int d=0;d<dims;d++){
      if( processors[d]==1 ) fll[d]=0;
    }
    localCopyRegion(in,out,fll,tll,local);
    return out;
  }
@ -121,6 +129,7 @@ public:
  template<class vobj>
  inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
  {
    Coordinate processors=unpadded_grid->_processors;
    GridBase *old_grid = in.Grid();
    GridCartesian *new_grid = grids[dim];//These are new grids
    Lattice<vobj>  padded(new_grid);
@ -134,35 +143,47 @@ public:
    double tins=0, tshift=0;
-    // Middle bit
+    int islocal = 0 ;
-    double t = usecond();
+    if ( processors[dim] == 1 ) islocal = 1;
-    for(int x=0;x<local[dim];x++){
+
-      InsertSliceLocal(in,padded,x,depth+x,dim);
+    if ( islocal ) {
      double t = usecond();
      for(int x=0;x<local[dim];x++){
 	InsertSliceLocal(in,padded,x,x,dim);
      }
      tins += usecond() - t;
    } else { 
      // Middle bit
      double t = usecond();
      for(int x=0;x<local[dim];x++){
 	InsertSliceLocal(in,padded,x,depth+x,dim);
      }
      tins += usecond() - t;
      // High bit
      t = usecond();
      shifted = cshift.Cshift(in,dim,depth);
      tshift += usecond() - t;
      t=usecond();
      for(int x=0;x<depth;x++){
 	InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
      }
      tins += usecond() - t;
      // Low bit
      t = usecond();
      shifted = cshift.Cshift(in,dim,-depth);
      tshift += usecond() - t;
      t = usecond();
      for(int x=0;x<depth;x++){
 	InsertSliceLocal(shifted,padded,x,x,dim);
      }
      tins += usecond() - t;
    }
    tins += usecond() - t;
    // High bit
    t = usecond();
    shifted = cshift.Cshift(in,dim,depth);
    tshift += usecond() - t;
    t=usecond();
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
    }
    tins += usecond() - t;
    // Low bit
    t = usecond();
    shifted = cshift.Cshift(in,dim,-depth);
    tshift += usecond() - t;
    t = usecond();
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,x,x,dim);
    }
    tins += usecond() - t;
    std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
    return padded;
--- a/Grid/util/Lexicographic.h
+++ b/Grid/util/Lexicographic.h
@ -8,7 +8,7 @@ namespace Grid{
  public:
    template<class coor_t>
-    static accelerator_inline void CoorFromIndex (coor_t& coor,int index,const coor_t &dims){
+    static accelerator_inline void CoorFromIndex (coor_t& coor,int64_t index,const coor_t &dims){
      int nd= dims.size();
      coor.resize(nd);
      for(int d=0;d<nd;d++){
@ -18,28 +18,45 @@ namespace Grid{
    }
    template<class coor_t>
-    static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){
+    static accelerator_inline void IndexFromCoor (const coor_t& coor,int64_t &index,const coor_t &dims){
      int nd=dims.size();
      int stride=1;
      index=0;
      for(int d=0;d<nd;d++){
-	index = index+stride*coor[d];
+	index = index+(int64_t)stride*coor[d];
 	stride=stride*dims[d];
      }
    }
    template<class coor_t>
    static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){
      int64_t index64;
      IndexFromCoor(coor,index64,dims);
      assert(index64<2*1024*1024*1024LL);
      index = (int) index64;
    }
    template<class coor_t>
-    static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){
+    static inline void IndexFromCoorReversed (const coor_t& coor,int64_t &index,const coor_t &dims){
      int nd=dims.size();
      int stride=1;
      index=0;
      for(int d=nd-1;d>=0;d--){
-	index = index+stride*coor[d];
+	index = index+(int64_t)stride*coor[d];
 	stride=stride*dims[d];
      }
    }
    template<class coor_t>
-    static inline void CoorFromIndexReversed (coor_t& coor,int index,const coor_t &dims){
+    static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){
      int64_t index64;
      IndexFromCoorReversed(coor,index64,dims);
      if ( index64>=2*1024*1024*1024LL ){
 	std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
      }
      assert(index64<2*1024*1024*1024LL);
      index = (int) index64;
    }
    template<class coor_t>
    static inline void CoorFromIndexReversed (coor_t& coor,int64_t index,const coor_t &dims){
      int nd= dims.size();
      coor.resize(nd);
      for(int d=nd-1;d>=0;d--){
--- a/systems/Frontier/benchmarks/bench2.slurm
+++ b/systems/Frontier/benchmarks/bench2.slurm
@ -0,0 +1,43 @@
 #!/bin/bash -l
 #SBATCH --job-name=bench
 ##SBATCH --partition=small-g
 #SBATCH --nodes=2
 #SBATCH --ntasks-per-node=8
 #SBATCH --cpus-per-task=7
 #SBATCH --gpus-per-node=8
 #SBATCH --time=00:10:00
 #SBATCH --account=phy157_dwf
 #SBATCH --gpu-bind=none
 #SBATCH --exclusive
 #SBATCH --mem=0
 cat << EOF > select_gpu
 #!/bin/bash
 export GPU_MAP=(0 1 2 3 7 6 5 4)
 export NUMA_MAP=(3 3 1 1 2 2 0 0)
 export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
 export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
 export HIP_VISIBLE_DEVICES=\$GPU
 unset ROCR_VISIBLE_DEVICES
 echo RANK \$SLURM_LOCALID using GPU \$GPU    
 exec numactl -m \$NUMA -N \$NUMA \$*
 EOF
 chmod +x ./select_gpu
 root=$HOME/Frontier/Grid/systems/Frontier/
 source ${root}/sourceme.sh
 export OMP_NUM_THREADS=7
 export MPICH_GPU_SUPPORT_ENABLED=1
 export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
 for vol in 32.32.32.64
 do
 srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.ov.$vol
 srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol  > log.shm1.ov.$vol
 srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol  > log.shm0.seq.$vol
 srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
 done
--- a/systems/Frontier/config-command
+++ b/systems/Frontier/config-command
@ -0,0 +1,23 @@
 CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
 ../../configure --enable-comms=mpi-auto \
 --with-lime=$CLIME \
 --enable-unified=no \
 --enable-shm=nvlink \
 --enable-tracing=timer \
 --enable-accelerator=hip \
 --enable-gen-simd-width=64 \
 --disable-gparity \
 --disable-fermion-reps \
 --enable-simd=GPU \
 --enable-accelerator-cshift \
 --with-gmp=$OLCF_GMP_ROOT \
 --with-fftw=$FFTW_DIR/.. \
 --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
 --disable-fermion-reps \
 CXX=hipcc MPICXX=mpicxx \
 CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -std=c++14 -I${MPICH_DIR}/include -L/lib64 " \
 LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 "
--- a/systems/Frontier/mpiwrapper.sh
+++ b/systems/Frontier/mpiwrapper.sh
@ -0,0 +1,13 @@
 #!/bin/bash
 lrank=$SLURM_LOCALID
 lgpu=(0 1 2 3 7 6 5 4)
 export ROCR_VISIBLE_DEVICES=${lgpu[$lrank]}
 echo "`hostname` - $lrank device=$ROCR_VISIBLE_DEVICES "
 $*
--- a/systems/Frontier/sourceme.sh
+++ b/systems/Frontier/sourceme.sh
@ -0,0 +1,13 @@
 . /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
 spack load c-lime
 #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib
 module load emacs 
 module load PrgEnv-gnu
 module load rocm
 module load cray-mpich/8.1.23
 module load gmp
 module load cray-fftw
 module load craype-accel-amd-gfx90a
 export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
 #Hack for lib
 #export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
--- a/systems/Frontier/wrap.sh
+++ b/systems/Frontier/wrap.sh
@ -0,0 +1,9 @@
 #!/bin/sh
 export HIP_VISIBLE_DEVICES=$ROCR_VISIBLE_DEVICES
 unset ROCR_VISIBLE_DEVICES
 #rank=$SLURM_PROCID
 #rocprof -d rocprof.$rank -o rocprof.$rank/results.rank$SLURM_PROCID.csv --sys-trace $@
 $@
--- a/tests/debug/Test_general_coarse.cc
+++ b/tests/debug/Test_general_coarse.cc
@ -144,12 +144,7 @@ int main (int argc, char ** argv)
  HermOpAdaptor<LatticeFermionD> HOA(HermDefOp);
  int pp=16;
-  //  LittleDiracOpCol.CoarsenOperator(HOA,Aggregates);
+  LittleDiracOp.CoarsenOperator(HOA,Aggregates);
  //  std::cout << "LittleDiracOp old " << LittleDiracOpCol._A[pp]<<std::endl;
  LittleDiracOp.CoarsenOperatorColoured(HOA,Aggregates);
  //  LittleDiracOp.ExchangeCoarseLinks();
  //  std::cout << "LittleDiracOp new " << LittleDiracOp._A[pp]<<std::endl;
  ///////////////////////////////////////////////////
  // Test the operator
--- a/tests/debug/Test_general_coarse_hdcg.cc
+++ b/tests/debug/Test_general_coarse_hdcg.cc
@ -86,6 +86,12 @@ int main (int argc, char ** argv)
  Grid_init(&argc,&argv);
  const int Ls=16;
  const int nbasis = 40;
  const int cb = 0 ;
  RealD mass=0.01;
  RealD M5=1.8;
  RealD b=1.5;
  RealD c=0.5;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
 								   GridDefaultSimd(Nd,vComplex::Nsimd()),
@ -121,10 +127,6 @@ int main (int argc, char ** argv)
  NerscIO::readConfiguration(Umu,header,file);
  //////////////////////// Fermion action //////////////////////////////////
  RealD mass=0.01;
  RealD M5=1.8;
  RealD b=1.5;
  RealD c=0.5;
  MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
  SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
@ -143,8 +145,6 @@ int main (int argc, char ** argv)
  ////////////////////////////////////////////////////////////
  ///////////// Coarse basis and Little Dirac Operator ///////
  ////////////////////////////////////////////////////////////
  const int nbasis = 40;
  const int cb = 0 ;
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;
@ -154,6 +154,18 @@ int main (int argc, char ** argv)
  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace Aggregates(Coarse5d,FrbGrid,cb);
 #if 1
  Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
 				     95.0,0.1,
 				     //				     400,200,200 -- 48 iters
 				     //				     600,200,200 -- 38 iters, 162s
 				     //				     600,200,100 -- 38 iters, 169s
 				     //				     600,200,50  -- 88 iters. 370s 
 				     600,
 				     200,
 				     100,
 				     0.0);
 #else
  Aggregates.CreateSubspaceChebyshev(RNG5,
 				     HermOpEO,
 				     nbasis,
@ -172,18 +184,21 @@ int main (int argc, char ** argv)
 				     //				     0.1,  // nbasis 16 -- 142; sloppy solve
 				     0.1,  // nbasis 24 
 				     300);
 #endif
  ////////////////////////////////////////////////////////////
  // Need to check about red-black grid coarsening
  ////////////////////////////////////////////////////////////
  LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);
-  LittleDiracOp.CoarsenOperatorColoured(FineHermOp,Aggregates);
+  LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates);
  // Try projecting to one hop only
  std::cout << " projecting coarse matrix "<<std::endl;
  LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d);
  LittleDiracOpProj.ProjectNearestNeighbour(0.5,LittleDiracOp);
  typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix;
  HermMatrix CoarseOp (LittleDiracOp);
  HermMatrix CoarseOpProj (LittleDiracOpProj);
  //////////////////////////////////////////
  // Build a coarse lanczos
@ -200,12 +215,14 @@ int main (int argc, char ** argv)
  std::vector<RealD>            eval(Nm);
  std::vector<CoarseVector>     evec(Nm,Coarse5d);
  CoarseVector c_src(Coarse5d); c_src=1.0;
  CoarseVector c_res(Coarse5d); 
  PowerMethod<CoarseVector>       cPM;   cPM(CoarseOp,c_src);
  IRL.calc(eval,evec,c_src,Nconv);
  DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval);
  //////////////////////////////////////////
  // Build a coarse space solver
  //////////////////////////////////////////
@ -216,6 +233,52 @@ int main (int argc, char ** argv)
  //  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser);
  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser);
  c_res=Zero();
  HPDSolve(c_src,c_res); 
  //////////////////////////////////////////////////////////////////////////
  // Deflated (with real op EV's) solve for the projected coarse op
  // Work towards ADEF1 in the coarse space
  //////////////////////////////////////////////////////////////////////////
  HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser);
  c_res=Zero();
  HPDSolveProj(c_src,c_res);
  //////////////////////////////////////////////////////////////////////
  // Coarse ADEF1 with deflation space
  //////////////////////////////////////////////////////////////////////
  //  ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(2.0,40.,8,CoarseOpProj);  // 311
  ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(2.0,36.,12,CoarseOpProj);  // 311
  ////////////////////////////////////////////////////////
  // CG, Cheby mode spacing 200,200
  // Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s
  // Unprojected Coarse CG solve to 4e-2 :  33 iters, 0.8s
  // Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s
  ////////////////////////////////////////////////////////
  // CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs 
  ////////////////////////////////////////////////////////
  // ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s  2.1x gain
  // ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s
  // HDCG 38 iters 162s
  //
  // CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs 
  // ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s  2.1x gain
  // ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s
  // HDCG 38 iters 169s
  TwoLevelADEF1defl<CoarseVector>
    cADEF1(1.0e-8, 100,
 	   CoarseOp,
 	   CoarseSmoother,
 	   evec,eval);
  c_res=Zero();
  cADEF1(c_src,c_res);
  cADEF1.Tolerance = 4.0e-2;
  c_res=Zero();
  cADEF1(c_src,c_res);
  //////////////////////////////////////////
  // Build a smoother
@ -242,10 +305,6 @@ int main (int argc, char ** argv)
  //  std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults)
  std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)  
  // Standard CG
  //      result=Zero();
  //      CGfine(HermOpEO, src, result);
  for(int l=0;l<los.size();l++){
    RealD lo = los[l];
@ -271,9 +330,25 @@ int main (int argc, char ** argv)
      result=Zero();
      HDCG(src,result);
      TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
 	HDCGdefl(1.0e-8, 3000,
 	     FineHermOp,
 	     Smoother,
 	     cADEF1,
 	     HPDSolve,
 	     Aggregates);
      result=Zero();
      HDCGdefl(src,result);
    }
  }
  // Standard CG
  result=Zero();
  CGfine(HermOpEO, src, result);
  Grid_finalize();
  return 0;
 }
Author	SHA1	Message	Date
Peter Boyle	3bc2da5321	Merge branch 'feature/scidac-wp1' of https://github.com/paboyle/Grid into feature/scidac-wp1	2023-10-05 16:57:59 -04:00
Peter Boyle	2d710d6bfd	Optimised parameters for 16^3	2023-10-05 16:56:55 -04:00
Peter Boyle	6532b7f32b	Eliminate older inefficient coarsening implementation	2023-10-05 16:56:15 -04:00
Peter Boyle	7b41b92d99	Only need to bad non-local dimensions	2023-10-05 16:55:48 -04:00
Peter Boyle	dd557af84b	ADEF1 and ADEF2 2 level CG	2023-10-05 16:55:19 -04:00
Peter Boyle	59b9d0e030	coalesceRead the blockSum	2023-10-05 16:54:48 -04:00
Peter Boyle	b82eee4733	Hermitian dealing with	2023-10-05 16:54:14 -04:00
Peter Boyle	6a87487544	Running on Frontier, fix RNG big volume y2k, affecting 5D RNG	2023-10-05 16:50:59 -04:00
Peter Boyle	fcf5023845	Running on Frontier	2023-10-05 16:50:59 -04:00
Peter Boyle	c8adad6d8b	First runs on Summit. PopulateAdag needs work	2023-10-05 16:50:54 -04:00