Mirror of https://github.com/paboyle/Grid.git (synced 2024-11-09 23:45:36 +00:00)

Compare commits


10 Commits

Author | SHA1 | Message | Date
Peter Boyle | 3bc2da5321 | Merge branch 'feature/scidac-wp1' of https://github.com/paboyle/Grid into feature/scidac-wp1 | 2023-10-05 16:57:59 -04:00
Peter Boyle | 2d710d6bfd | Optimised parameters for 16^3 | 2023-10-05 16:56:55 -04:00
Peter Boyle | 6532b7f32b | Eliminate older inefficient coarsening implementation | 2023-10-05 16:56:15 -04:00
Peter Boyle | 7b41b92d99 | Only need to bad non-local dimensions | 2023-10-05 16:55:48 -04:00
Peter Boyle | dd557af84b | ADEF1 and ADEF2 2 level CG | 2023-10-05 16:55:19 -04:00
Peter Boyle | 59b9d0e030 | coalesceRead the blockSum | 2023-10-05 16:54:48 -04:00
Peter Boyle | b82eee4733 | Hermitian dealing with | 2023-10-05 16:54:14 -04:00
Peter Boyle | 6a87487544 | Running on Frontier, fix RNG big volume y2k, affecting 5D RNG | 2023-10-05 16:50:59 -04:00
Peter Boyle | fcf5023845 | Running on Frontier | 2023-10-05 16:50:59 -04:00
Peter Boyle | c8adad6d8b | First runs on Summit. PopulateAdag needs work | 2023-10-05 16:50:54 -04:00
15 changed files with 432 additions and 183 deletions

View File

@ -226,6 +226,7 @@ public:
void ProjectNearestNeighbour(RealD shift, GeneralCoarseOp &CopyMe)
{
int nfound=0;
std::cout << " ProjectNearestNeighbour "<< CopyMe._A[0].Grid()<<std::endl;
for(int p=0;p<geom.npoint;p++){
for(int pp=0;pp<CopyMe.geom.npoint;pp++){
// Search for the same relative shift
@ -233,12 +234,12 @@ public:
if ( CopyMe.geom.shifts[pp]==geom.shifts[p] ) {
_A[p] = CopyMe.Cell.Extract(CopyMe._A[pp]);
_Adag[p] = CopyMe.Cell.Extract(CopyMe._Adag[pp]);
ExchangeCoarseLinks();
nfound++;
}
}
}
assert(nfound==geom.npoint);
ExchangeCoarseLinks();
}
GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
@ -271,7 +272,8 @@ public:
}
void Mdag (const CoarseVector &in, CoarseVector &out)
{
Mult(_Adag,in,out);
if ( hermitian ) M(in,out);
else Mult(_Adag,in,out);
}
void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
{
@ -298,10 +300,10 @@ public:
const int Nsimd = CComplex::Nsimd();
int osites=pin.Grid()->oSites();
int gsites=pin.Grid()->gSites();
// int gsites=pin.Grid()->gSites();
RealD flops = 1.0* npoint * nbasis * nbasis * 8 * gsites;
RealD bytes = (1.0*osites*sizeof(siteMatrix)+2.0*osites*sizeof(siteVector))*npoint;
RealD flops = 1.0* npoint * nbasis * nbasis * 8 * osites;
RealD bytes = (1.0*osites*sizeof(siteMatrix)*npoint+2.0*osites*sizeof(siteVector))*npoint;
// for(int point=0;point<npoint;point++){
// conformable(A[point],pin);
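For orientation, a rough per-site operation count for the stencil application timed here; the nine-point stencil is purely illustrative and nbasis = 40 is taken from the test program later in this diff:

\[
\text{flops per coarse site} \;\approx\; 8 \times \text{npoint} \times \text{nbasis}^2 ,
\]

each complex multiply-add costing 8 real flops, so with npoint = 9 and nbasis = 40 this is 8*9*1600 = 115,200 flops per site. The corrected counter above multiplies this per-site count by the local osites rather than the global gsites.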
@ -343,20 +345,20 @@ public:
text+=usecond();
ttot+=usecond();
std::cout << GridLogMessage<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult ext "<<text<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Kernel "<< flops/tmult<<" mflop/s"<<std::endl;
std::cout << GridLogMessage<<"Coarse Kernel "<< bytes/tmult<<" MB/s"<<std::endl;
std::cout << GridLogMessage<<"Coarse flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
std::cout << GridLogMessage<<"Coarse bytes "<< bytes/1e6<<" MB"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult ext "<<text<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Kernel "<< flops/tmult<<" mflop/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Kernel "<< bytes/tmult<<" MB/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse bytes "<< bytes/1e6<<" MB"<<std::endl;
};
void PopulateAdag(void)
{
for(int bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
for(int64_t bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
Coordinate bcoor;
CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor);
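The switch of the loop index to int64_t (and of gSites()/GlobalIndexToGlobalCoor below) matters once the global 5D volume exceeds the 32-bit range. A minimal standalone illustration, with a hypothetical volume chosen only for the arithmetic:

// Illustrative only: why a 32-bit global site index overflows on large 5D RNG grids.
#include <cstdint>
#include <climits>
#include <cstdio>

int main() {
  // Hypothetical lattice: 96^3 x 192 with Ls = 16 (not a volume quoted in this diff).
  int64_t L = 96, T = 192, Ls = 16;
  int64_t gsites = L * L * L * T * Ls;   // 2,717,908,992 global sites
  std::printf("gSites  = %lld\n", (long long)gsites);
  std::printf("INT_MAX = %d  -> a plain int index would wrap negative\n", INT_MAX);
  return 0;
}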
@ -412,10 +414,10 @@ public:
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*/
void CoarsenOperatorColoured(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
std::cout << GridLogMessage<< "CoarsenMatrixColoured "<< std::endl;
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
RealD tproj=0.0;
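Reading the surviving comment fragment above, the probing phases used by this coarsening appear to be, per shift point k and direction mu (a hedged reconstruction from the comment, not verified against the full source):

\[
q_{k,\mu} \;=\; \delta_{k,\mu}\,\frac{2\pi}{\text{global\_nb}[\mu]} ,
\]

i.e. one plane wave per momentum point applied to the subspace vectors, from which the npoint stencil coefficients are presumably recovered by inverting the phase matrix \(e^{\,i\,q_k\cdot\delta_p}\).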
@ -540,9 +542,14 @@ public:
auto sval = peekSite(_A[p],coor);
}
PopulateAdag();
// Only needed if nonhermitian
if ( ! hermitian ) {
std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
PopulateAdag();
}
// Need to write something to populate Adag from A
std::cout << GridLogMessage<<"ExchangeCoarseLinks "<<std::endl;
ExchangeCoarseLinks();
std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
@ -552,6 +559,7 @@ public:
}
void ExchangeCoarseLinks(void){
for(int p=0;p<geom.npoint;p++){
std::cout << "Exchange "<<p<<std::endl;
_A[p] = Cell.Exchange(_A[p]);
_Adag[p]= Cell.Exchange(_Adag[p]);
}

View File

@ -42,46 +42,30 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*/
NAMESPACE_BEGIN(Grid);
template<class Field, class CoarseField, class Aggregation>
class TwoLevelADEF2 : public LinearFunction<Field>
template<class Field>
class TwoLevelCG : public LinearFunction<Field>
{
public:
RealD Tolerance;
Integer MaxIterations;
const int mmax = 1;
GridBase *grid;
GridBase *coarsegrid;
// Fine operator, Smoother, CoarseSolver
LinearOperatorBase<Field> &_FineLinop;
LinearFunction<Field> &_Smoother;
LinearFunction<CoarseField> &_CoarseSolver;
LinearFunction<CoarseField> &_CoarseSolverPrecise;
// Need something that knows how to get from Coarse to fine and back again
// void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
// void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
Aggregation &_Aggregates;
// more most operator functions
TwoLevelADEF2(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
LinearFunction<CoarseField> &CoarseSolver,
LinearFunction<CoarseField> &CoarseSolverPrecise,
Aggregation &Aggregates
) :
TwoLevelCG(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
GridBase *fine) :
Tolerance(tol),
MaxIterations(maxit),
_FineLinop(FineLinop),
_Smoother(Smoother),
_CoarseSolver(CoarseSolver),
_CoarseSolverPrecise(CoarseSolverPrecise),
_Aggregates(Aggregates)
_Smoother(Smoother)
{
coarsegrid = Aggregates.CoarseGrid;
grid = Aggregates.FineGrid;
grid = fine;
};
virtual void operator() (const Field &src, Field &psi)
@ -195,47 +179,8 @@ class TwoLevelADEF2 : public LinearFunction<Field>
public:
virtual void PcgM1(Field & in, Field & out)
{
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
Field tmp(grid);
Field Min(grid);
CoarseField PleftProj(coarsegrid);
CoarseField PleftMss_proj(coarsegrid);
GridStopWatch SmootherTimer;
GridStopWatch MatrixTimer;
SmootherTimer.Start();
_Smoother(in,Min);
SmootherTimer.Stop();
MatrixTimer.Start();
_FineLinop.HermOp(Min,out);
MatrixTimer.Stop();
axpy(tmp,-1.0,out,in); // tmp = in - A Min
GridStopWatch ProjTimer;
GridStopWatch CoarseTimer;
GridStopWatch PromTimer;
ProjTimer.Start();
_Aggregates.ProjectToSubspace(PleftProj,tmp);
ProjTimer.Stop();
CoarseTimer.Start();
_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
CoarseTimer.Stop();
PromTimer.Start();
_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
PromTimer.Stop();
std::cout << GridLogMessage << "PcgM1 breakdown "<<std::endl;
std::cout << GridLogMessage << "\tSmoother " << SmootherTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tProj " << ProjTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tCoarse " << CoarseTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tProm " << PromTimer.Elapsed() <<std::endl;
axpy(out,1.0,Min,tmp); // Min+tmp
}
virtual void PcgM1(Field & in, Field & out) =0;
virtual void Vstart(Field & x,const Field & src)=0;
virtual void PcgM2(const Field & in, Field & out) {
out=in;
@ -249,6 +194,89 @@ class TwoLevelADEF2 : public LinearFunction<Field>
return dd;
}
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout.
/////////////////////////////////////////////////////////////////////
virtual void Vout (Field & in, Field & out,Field & src){
out = in;
}
};
template<class Field, class CoarseField, class Aggregation>
class TwoLevelADEF2 : public TwoLevelCG<Field>
{
public:
///////////////////////////////////////////////////////////////////////////////////
// Need something that knows how to get from Coarse to fine and back again
// void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
// void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
///////////////////////////////////////////////////////////////////////////////////
GridBase *coarsegrid;
Aggregation &_Aggregates;
LinearFunction<CoarseField> &_CoarseSolver;
LinearFunction<CoarseField> &_CoarseSolverPrecise;
///////////////////////////////////////////////////////////////////////////////////
// more most operator functions
TwoLevelADEF2(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
LinearFunction<CoarseField> &CoarseSolver,
LinearFunction<CoarseField> &CoarseSolverPrecise,
Aggregation &Aggregates
) :
TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,Aggregates.FineGrid),
_CoarseSolver(CoarseSolver),
_CoarseSolverPrecise(CoarseSolverPrecise),
_Aggregates(Aggregates)
{
coarsegrid = Aggregates.CoarseGrid;
};
virtual void PcgM1(Field & in, Field & out)
{
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
Field tmp(this->grid);
Field Min(this->grid);
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
GridStopWatch SmootherTimer;
GridStopWatch MatrixTimer;
SmootherTimer.Start();
this->_Smoother(in,Min);
SmootherTimer.Stop();
MatrixTimer.Start();
this->_FineLinop.HermOp(Min,out);
MatrixTimer.Stop();
axpy(tmp,-1.0,out,in); // tmp = in - A Min
GridStopWatch ProjTimer;
GridStopWatch CoarseTimer;
GridStopWatch PromTimer;
ProjTimer.Start();
this->_Aggregates.ProjectToSubspace(PleftProj,tmp);
ProjTimer.Stop();
CoarseTimer.Start();
this->_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
CoarseTimer.Stop();
PromTimer.Start();
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
PromTimer.Stop();
std::cout << GridLogPerformance << "PcgM1 breakdown "<<std::endl;
std::cout << GridLogPerformance << "\tSmoother " << SmootherTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tProj " << ProjTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tCoarse " << CoarseTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tProm " << PromTimer.Elapsed() <<std::endl;
axpy(out,1.0,Min,tmp); // Min+tmp
}
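In the notation of the comment at the top of PcgM1, the ADEF2 preconditioner being timed here can be summarised as follows (a restatement of the in-code comments, not an independent derivation):

\[
M_1\,r \;=\; \underbrace{M\,r}_{\text{Min}} \;+\; Q\,\big(r - A\,M\,r\big),
\qquad
Q \;=\; P\,A_{ss}^{-1}\,P^{\dagger},
\]

where \(M\) is the smoother, \(A\) the fine Hermitian operator, \(P^{\dagger}\)/\(P\) the ProjectToSubspace/PromoteFromSubspace maps, and \(A_{ss}^{-1}\) the coarse solve supplied by _CoarseSolver.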
virtual void Vstart(Field & x,const Field & src)
{
///////////////////////////////////
@ -262,65 +290,69 @@ class TwoLevelADEF2 : public LinearFunction<Field>
// = src_s - (A guess)_s - src_s + (A guess)_s
// = 0
///////////////////////////////////
Field r(grid);
Field mmp(grid);
CoarseField PleftProj(coarsegrid);
CoarseField PleftMss_proj(coarsegrid);
Field r(this->grid);
Field mmp(this->grid);
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
_Aggregates.ProjectToSubspace(PleftProj,src);
_CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s
_Aggregates.PromoteFromSubspace(PleftMss_proj,x);
this->_Aggregates.ProjectToSubspace(PleftProj,src);
this->_CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,x);
}
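The comment above sketches why this start vector is consistent; spelled out in the same notation (assuming the coarse operator is exactly \(A_{ss}=P^{\dagger}AP\) and the coarse solve is exact):

\[
x_0 = P\,A_{ss}^{-1}P^{\dagger}\,\text{src}
\;\;\Rightarrow\;\;
P^{\dagger}\big(\text{src} - A\,x_0\big)
= P^{\dagger}\text{src} - \big(P^{\dagger} A P\big)\,A_{ss}^{-1}P^{\dagger}\text{src} = 0 .
\]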
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout.
/////////////////////////////////////////////////////////////////////
virtual void Vout (Field & in, Field & out,Field & src){
out = in;
}
};
template<class Field, class CoarseField, class Aggregation>
class TwoLevelADEF1ev : public TwoLevelADEF2<Field,CoarseField,Aggregation>
template<class Field>
class TwoLevelADEF1defl : public TwoLevelCG<Field>
{
TwoLevelADEF1ev(RealD tol,
public:
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
TwoLevelADEF1defl(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
LinearFunction<CoarseField> &CoarseSolver,
LinearFunction<CoarseField> &CoarseSolverPrecise,
Aggregation &Aggregates ) :
TwoLevelADEF2<Field,CoarseField,Aggregation>(tol,maxit,FineLinop,Smoother,CoarseSolver,CoarseSolverPrecise,Aggregates)
std::vector<Field> &_evec,
std::vector<RealD> &_eval) :
TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,_evec[0].Grid()),
evec(_evec),
eval(_eval)
{};
// Can just inherit existing Vstart
// Can just inherit existing Vout
// Can just inherit existing M2
// Can just inherit existing M3
// Simple vstart - do nothing
virtual void Vstart(Field & x,const Field & src){};
// Override PcgM1
virtual void PcgM1(Field & in, Field & out)
{
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
Field tmp(this->grid);
int N=evec.size();
Field Pin(this->grid);
Field Qin(this->grid);
//MP + Q = M(1-AQ) + Q = M
// // If we are eigenvector deflating in coarse space
// // Q = Sum_i |phi_i> 1/lambda_i <phi_i|
// // A Q = Sum_i |phi_i> <phi_i|
// // M(1-AQ) = M(1-proj) + Q
this->_Aggregates.ProjectToSubspace(PleftProj,in);
this->_Aggregates.PromoteFromSubspace(PleftProj,tmp);// tmp = Qin
Qin.Checkerboard()=in.Checkerboard();
Qin = Zero();
Pin = in;
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
auto ip = TensorRemove(innerProduct(tmp,in));
axpy(Qin, ip / eval[i],tmp,Qin);
axpy(Pin, -ip ,tmp,Pin);
}
Pin = in - tmp;
this->_Smoother(Pin,out);
this->_CoarseSolver(PleftProj,PleftMss_proj);
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Qin
out = out + tmp;
out = out + Qin;
}
};
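The deflated one-level preconditioner implemented in PcgM1 above can be written as follows, following the in-code comments, with \(\phi_i,\lambda_i\) the supplied evec/eval pairs:

\[
Q\,r=\sum_{i}\frac{\langle\phi_i,r\rangle}{\lambda_i}\,\phi_i,\qquad
P\,r=r-\sum_{i}\langle\phi_i,r\rangle\,\phi_i,\qquad
M_1\,r = M\,(P\,r)+Q\,r ,
\]

so that for exact eigenpairs \(A\,Q\,r=\sum_i\langle\phi_i,r\rangle\,\phi_i\) and \(M_1 = M(1-AQ)+Q\), as stated in the comment.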

View File

@ -70,8 +70,8 @@ public:
Coordinate _istride; // Inner stride i.e. within simd lane
int _osites; // _isites*_osites = product(dimensions).
int _isites;
int _fsites; // _isites*_osites = product(dimensions).
int _gsites;
int64_t _fsites; // _isites*_osites = product(dimensions).
int64_t _gsites;
Coordinate _slice_block;// subslice information
Coordinate _slice_stride;
Coordinate _slice_nblock;
@ -183,7 +183,7 @@ public:
inline int Nsimd(void) const { return _isites; };// Synonymous with iSites
inline int oSites(void) const { return _osites; };
inline int lSites(void) const { return _isites*_osites; };
inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; };
inline int Nd (void) const { return _ndimension;};
inline const Coordinate LocalStarts(void) { return _lstart; };
@ -214,7 +214,7 @@ public:
////////////////////////////////////////////////////////////////
// Global addressing
////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){
void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
assert(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
}
@ -222,7 +222,7 @@ public:
assert(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
}
void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
gidx=0;
int mult=1;
for(int mu=0;mu<_ndimension;mu++) {

View File

@ -360,7 +360,7 @@ public:
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
typedef typename vobj::scalar_object sobj;
for(int g=0;g<o.Grid()->_gsites;g++){
for(int64_t g=0;g<o.Grid()->_gsites;g++){
Coordinate gcoor;
o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);

View File

@ -432,7 +432,7 @@ public:
#if 1
thread_for( lidx, _grid->lSites(), {
int gidx;
int64_t gidx;
int o_idx;
int i_idx;
int rank;

View File

@ -471,13 +471,13 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
vobj zz = Zero();
accelerator_for(sc,coarse->oSites(),1,{
accelerator_for(sc,coarse->oSites(),vobj::Nsimd(),{
// One thread per sub block
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
vobj cd = zz;
auto cd = coalescedRead(zz);
for(int sb=0;sb<blockVol;sb++){
@ -488,10 +488,10 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
cd=cd+fineData_p[sf];
cd=cd+coalescedRead(fineData_p[sf]);
}
coarseData_p[sc] = cd;
coalescedWrite(coarseData_p[sc],cd);
});
return;
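For reference, the kernel above computes the same per-block reduction before and after the coalescedRead change; in coordinates:

\[
\text{coarseData}(y) \;=\; \sum_{b\,\in\,\text{block}} \text{fineData}(x),
\qquad
x_d = y_d\cdot \text{block\_r}_d + b_d ,
\]

and the rewrite is intended only to change how each SIMD lane accumulates (coalescedRead into a per-lane cd, coalescedWrite back), not the result.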
@ -1054,7 +1054,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
Coordinate fcoor(nd);
Coordinate ccoor(nd);
for(int g=0;g<fg->gSites();g++){
for(int64_t g=0;g<fg->gSites();g++){
fg->GlobalIndexToGlobalCoor(g,fcoor);
for(int d=0;d<nd;d++){

View File

@ -86,8 +86,10 @@ public:
// expand up one dim at a time
for(int d=0;d<dims;d++){
plocal[d] += 2*depth;
if ( processors[d] > 1 ) {
plocal[d] += 2*depth;
}
for(int d=0;d<dims;d++){
global[d] = plocal[d]*processors[d];
}
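A minimal standalone sketch of the padding rule introduced here (pad a dimension only when it is split across ranks); the depth and decomposition values are made up for illustration:

// Illustrative only: PaddedCell-style local volume, padding non-local dimensions.
#include <array>
#include <cstdio>

int main() {
  const int dims = 4, depth = 1;
  std::array<int,4> plocal     = {16, 16, 16, 16}; // local lattice per rank (example)
  std::array<int,4> processors = { 2,  2,  1,  1}; // MPI decomposition (example)
  std::array<int,4> global{};

  for (int d = 0; d < dims; d++) {
    if (processors[d] > 1) plocal[d] += 2 * depth; // halo only where neighbours are remote
  }
  for (int d = 0; d < dims; d++) {
    global[d] = plocal[d] * processors[d];
  }
  for (int d = 0; d < dims; d++) {
    std::printf("dim %d: padded local %d, padded global %d\n", d, plocal[d], global[d]);
  }
  return 0;
}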
@ -98,11 +100,17 @@ public:
template<class vobj>
inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
{
Coordinate processors=unpadded_grid->_processors;
Lattice<vobj> out(unpadded_grid);
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate fll(dims,depth); // depends on the MPI spread
// depends on the MPI spread
Coordinate fll(dims,depth);
Coordinate tll(dims,0); // depends on the MPI spread
for(int d=0;d<dims;d++){
if( processors[d]==1 ) fll[d]=0;
}
localCopyRegion(in,out,fll,tll,local);
return out;
}
@ -121,6 +129,7 @@ public:
template<class vobj>
inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
Coordinate processors=unpadded_grid->_processors;
GridBase *old_grid = in.Grid();
GridCartesian *new_grid = grids[dim];//These are new grids
Lattice<vobj> padded(new_grid);
@ -133,36 +142,48 @@ public:
std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
double tins=0, tshift=0;
// Middle bit
double t = usecond();
for(int x=0;x<local[dim];x++){
InsertSliceLocal(in,padded,x,depth+x,dim);
}
tins += usecond() - t;
// High bit
t = usecond();
shifted = cshift.Cshift(in,dim,depth);
tshift += usecond() - t;
t=usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
}
tins += usecond() - t;
// Low bit
t = usecond();
shifted = cshift.Cshift(in,dim,-depth);
tshift += usecond() - t;
t = usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,x,x,dim);
}
tins += usecond() - t;
int islocal = 0 ;
if ( processors[dim] == 1 ) islocal = 1;
if ( islocal ) {
double t = usecond();
for(int x=0;x<local[dim];x++){
InsertSliceLocal(in,padded,x,x,dim);
}
tins += usecond() - t;
} else {
// Middle bit
double t = usecond();
for(int x=0;x<local[dim];x++){
InsertSliceLocal(in,padded,x,depth+x,dim);
}
tins += usecond() - t;
// High bit
t = usecond();
shifted = cshift.Cshift(in,dim,depth);
tshift += usecond() - t;
t=usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
}
tins += usecond() - t;
// Low bit
t = usecond();
shifted = cshift.Cshift(in,dim,-depth);
tshift += usecond() - t;
t = usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,x,x,dim);
}
tins += usecond() - t;
}
std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
return padded;

View File

@ -8,7 +8,7 @@ namespace Grid{
public:
template<class coor_t>
static accelerator_inline void CoorFromIndex (coor_t& coor,int index,const coor_t &dims){
static accelerator_inline void CoorFromIndex (coor_t& coor,int64_t index,const coor_t &dims){
int nd= dims.size();
coor.resize(nd);
for(int d=0;d<nd;d++){
@ -18,28 +18,45 @@ namespace Grid{
}
template<class coor_t>
static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){
static accelerator_inline void IndexFromCoor (const coor_t& coor,int64_t &index,const coor_t &dims){
int nd=dims.size();
int stride=1;
index=0;
for(int d=0;d<nd;d++){
index = index+stride*coor[d];
index = index+(int64_t)stride*coor[d];
stride=stride*dims[d];
}
}
template<class coor_t>
static accelerator_inline void IndexFromCoor (const coor_t& coor,int &index,const coor_t &dims){
int64_t index64;
IndexFromCoor(coor,index64,dims);
assert(index64<2*1024*1024*1024LL);
index = (int) index64;
}
template<class coor_t>
static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){
static inline void IndexFromCoorReversed (const coor_t& coor,int64_t &index,const coor_t &dims){
int nd=dims.size();
int stride=1;
index=0;
for(int d=nd-1;d>=0;d--){
index = index+stride*coor[d];
index = index+(int64_t)stride*coor[d];
stride=stride*dims[d];
}
}
template<class coor_t>
static inline void CoorFromIndexReversed (coor_t& coor,int index,const coor_t &dims){
static inline void IndexFromCoorReversed (const coor_t& coor,int &index,const coor_t &dims){
int64_t index64;
IndexFromCoorReversed(coor,index64,dims);
if ( index64>=2*1024*1024*1024LL ){
std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
}
assert(index64<2*1024*1024*1024LL);
index = (int) index64;
}
template<class coor_t>
static inline void CoorFromIndexReversed (coor_t& coor,int64_t index,const coor_t &dims){
int nd= dims.size();
coor.resize(nd);
for(int d=nd-1;d>=0;d--){

View File

@ -0,0 +1,43 @@
#!/bin/bash -l
#SBATCH --job-name=bench
##SBATCH --partition=small-g
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=7
#SBATCH --gpus-per-node=8
#SBATCH --time=00:10:00
#SBATCH --account=phy157_dwf
#SBATCH --gpu-bind=none
#SBATCH --exclusive
#SBATCH --mem=0
cat << EOF > select_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3 7 6 5 4)
export NUMA_MAP=(3 3 1 1 2 2 0 0)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
export HIP_VISIBLE_DEVICES=\$GPU
unset ROCR_VISIBLE_DEVICES
echo RANK \$SLURM_LOCALID using GPU \$GPU
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./select_gpu
root=$HOME/Frontier/Grid/systems/Frontier/
source ${root}/sourceme.sh
export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
for vol in 32.32.32.64
do
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
done

View File

@ -0,0 +1,23 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=timer \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--enable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -std=c++14 -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 "

systems/Frontier/mpiwrapper.sh (new executable file, 13 lines)
View File

@ -0,0 +1,13 @@
#!/bin/bash
lrank=$SLURM_LOCALID
lgpu=(0 1 2 3 7 6 5 4)
export ROCR_VISIBLE_DEVICES=${lgpu[$lrank]}
echo "`hostname` - $lrank device=$ROCR_VISIBLE_DEVICES "
$*

View File

@ -0,0 +1,13 @@
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
spack load c-lime
#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib
module load emacs
module load PrgEnv-gnu
module load rocm
module load cray-mpich/8.1.23
module load gmp
module load cray-fftw
module load craype-accel-amd-gfx90a
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Hack for lib
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH

systems/Frontier/wrap.sh (new executable file, 9 lines)
View File

@ -0,0 +1,9 @@
#!/bin/sh
export HIP_VISIBLE_DEVICES=$ROCR_VISIBLE_DEVICES
unset ROCR_VISIBLE_DEVICES
#rank=$SLURM_PROCID
#rocprof -d rocprof.$rank -o rocprof.$rank/results.rank$SLURM_PROCID.csv --sys-trace $@
$@

View File

@ -144,12 +144,7 @@ int main (int argc, char ** argv)
HermOpAdaptor<LatticeFermionD> HOA(HermDefOp);
int pp=16;
// LittleDiracOpCol.CoarsenOperator(HOA,Aggregates);
// std::cout << "LittleDiracOp old " << LittleDiracOpCol._A[pp]<<std::endl;
LittleDiracOp.CoarsenOperatorColoured(HOA,Aggregates);
// LittleDiracOp.ExchangeCoarseLinks();
// std::cout << "LittleDiracOp new " << LittleDiracOp._A[pp]<<std::endl;
LittleDiracOp.CoarsenOperator(HOA,Aggregates);
///////////////////////////////////////////////////
// Test the operator

View File

@ -86,6 +86,12 @@ int main (int argc, char ** argv)
Grid_init(&argc,&argv);
const int Ls=16;
const int nbasis = 40;
const int cb = 0 ;
RealD mass=0.01;
RealD M5=1.8;
RealD b=1.5;
RealD c=0.5;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
@ -121,10 +127,6 @@ int main (int argc, char ** argv)
NerscIO::readConfiguration(Umu,header,file);
//////////////////////// Fermion action //////////////////////////////////
RealD mass=0.01;
RealD M5=1.8;
RealD b=1.5;
RealD c=0.5;
MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
@ -143,8 +145,6 @@ int main (int argc, char ** argv)
////////////////////////////////////////////////////////////
///////////// Coarse basis and Little Dirac Operator ///////
////////////////////////////////////////////////////////////
const int nbasis = 40;
const int cb = 0 ;
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
typedef LittleDiracOperator::CoarseVector CoarseVector;
@ -154,6 +154,18 @@ int main (int argc, char ** argv)
// Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
Subspace Aggregates(Coarse5d,FrbGrid,cb);
#if 1
Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
95.0,0.1,
// 400,200,200 -- 48 iters
// 600,200,200 -- 38 iters, 162s
// 600,200,100 -- 38 iters, 169s
// 600,200,50 -- 88 iters. 370s
600,
200,
100,
0.0);
#else
Aggregates.CreateSubspaceChebyshev(RNG5,
HermOpEO,
nbasis,
@ -172,19 +184,22 @@ int main (int argc, char ** argv)
// 0.1, // nbasis 16 -- 142; sloppy solve
0.1, // nbasis 24
300);
#endif
////////////////////////////////////////////////////////////
// Need to check about red-black grid coarsening
////////////////////////////////////////////////////////////
LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);
LittleDiracOp.CoarsenOperatorColoured(FineHermOp,Aggregates);
LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates);
// Try projecting to one hop only
std::cout << " projecting coarse matrix "<<std::endl;
LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d);
LittleDiracOpProj.ProjectNearestNeighbour(0.5,LittleDiracOp);
typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix;
HermMatrix CoarseOp (LittleDiracOp);
HermMatrix CoarseOpProj (LittleDiracOpProj);
//////////////////////////////////////////
// Build a coarse lanczos
//////////////////////////////////////////
@ -200,11 +215,13 @@ int main (int argc, char ** argv)
std::vector<RealD> eval(Nm);
std::vector<CoarseVector> evec(Nm,Coarse5d);
CoarseVector c_src(Coarse5d); c_src=1.0;
CoarseVector c_res(Coarse5d);
PowerMethod<CoarseVector> cPM; cPM(CoarseOp,c_src);
IRL.calc(eval,evec,c_src,Nconv);
DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval);
//////////////////////////////////////////
// Build a coarse space solver
@ -216,7 +233,53 @@ int main (int argc, char ** argv)
// HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser);
HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser);
c_res=Zero();
HPDSolve(c_src,c_res);
//////////////////////////////////////////////////////////////////////////
// Deflated (with real op EV's) solve for the projected coarse op
// Work towards ADEF1 in the coarse space
//////////////////////////////////////////////////////////////////////////
HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser);
c_res=Zero();
HPDSolveProj(c_src,c_res);
//////////////////////////////////////////////////////////////////////
// Coarse ADEF1 with deflation space
//////////////////////////////////////////////////////////////////////
// ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(2.0,40.,8,CoarseOpProj); // 311
ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(2.0,36.,12,CoarseOpProj); // 311
////////////////////////////////////////////////////////
// CG, Cheby mode spacing 200,200
// Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s
// Unprojected Coarse CG solve to 4e-2 : 33 iters, 0.8s
// Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s
////////////////////////////////////////////////////////
// CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs
////////////////////////////////////////////////////////
// ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s 2.1x gain
// ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s
// HDCG 38 iters 162s
//
// CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs
// ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s 2.1x gain
// ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s
// HDCG 38 iters 169s
TwoLevelADEF1defl<CoarseVector>
cADEF1(1.0e-8, 100,
CoarseOp,
CoarseSmoother,
evec,eval);
c_res=Zero();
cADEF1(c_src,c_res);
cADEF1.Tolerance = 4.0e-2;
c_res=Zero();
cADEF1(c_src,c_res);
//////////////////////////////////////////
// Build a smoother
//////////////////////////////////////////
@ -242,10 +305,6 @@ int main (int argc, char ** argv)
// std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults)
std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)
// Standard CG
// result=Zero();
// CGfine(HermOpEO, src, result);
for(int l=0;l<los.size();l++){
RealD lo = los[l];
@ -271,8 +330,24 @@ int main (int argc, char ** argv)
result=Zero();
HDCG(src,result);
TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
HDCGdefl(1.0e-8, 3000,
FineHermOp,
Smoother,
cADEF1,
HPDSolve,
Aggregates);
result=Zero();
HDCGdefl(src,result);
}
}
// Standard CG
result=Zero();
CGfine(HermOpEO, src, result);
Grid_finalize();
return 0;