Run at physical mass

Transfer code to Frontier now
Checkpoint restore the setup
2025-11-25 17:09:32 +00:00 · 2023-10-06 21:20:21 -04:00 · 2023-10-06 21:03:34 -04:00 · 2023-10-06 21:03:08 -04:00 · 2023-10-06 21:02:04 -04:00 · 2023-10-06 21:01:37 -04:00
10 changed files with 486 additions and 1724 deletions
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
--- a/Grid/algorithms/GeneralCoarsenedMatrix.h
+++ b/Grid/algorithms/GeneralCoarsenedMatrix.h
@@ -1,573 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <pboyle@bnl.gov>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#pragma once
-
-#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
-
-#include <Grid/lattice/PaddedCell.h>
-#include <Grid/stencil/GeneralLocalStencil.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// Fixme need coalesced read gpermute
-template<class vobj> void gpermute(vobj & inout,int perm){
-  vobj tmp=inout;
-  if (perm & 0x1 ) { permute(inout,tmp,0); tmp=inout;}
-  if (perm & 0x2 ) { permute(inout,tmp,1); tmp=inout;}
-  if (perm & 0x4 ) { permute(inout,tmp,2); tmp=inout;}
-  if (perm & 0x8 ) { permute(inout,tmp,3); tmp=inout;}
-}
-
-/////////////////////////////////////////////////////////////////
-// Reuse Aggregation class from CoarsenedMatrix for now
-// Might think about *smoothed* Aggregation
-// Equivalent of Geometry class in cartesian case
-/////////////////////////////////////////////////////////////////
-class NonLocalStencilGeometry {
-public:
-  int depth;
-  int hops;
-  int npoint;
-  std::vector<Coordinate> shifts;
-  Coordinate stencil_size;
-  Coordinate stencil_lo;
-  Coordinate stencil_hi;
-  GridCartesian *grid;
-  GridCartesian *Grid() {return grid;};
-  int Depth(void){return 1;};   // Ghost zone depth
-  int Hops(void){return hops;}; // # of hops=> level of corner fill in in stencil
-
-  virtual int DimSkip(void) =0;
-
-  virtual ~NonLocalStencilGeometry() {};
-
-  int  Reverse(int point)
-  {
-    int Nd = Grid()->Nd();
-    Coordinate shft = shifts[point];
-    Coordinate rev(Nd);
-    for(int mu=0;mu<Nd;mu++) rev[mu]= -shft[mu];
-    for(int p=0;p<npoint;p++){
-      if(rev==shifts[p]){
-	return p;
-      }
-    }
-    assert(0);
-    return -1;
-  }
-  void BuildShifts(void)
-  {
-    this->shifts.resize(0);
-    int Nd = this->grid->Nd();
-
-    int dd = this->DimSkip();
-    for(int s0=this->stencil_lo[dd+0];s0<=this->stencil_hi[dd+0];s0++){
-    for(int s1=this->stencil_lo[dd+1];s1<=this->stencil_hi[dd+1];s1++){
-    for(int s2=this->stencil_lo[dd+2];s2<=this->stencil_hi[dd+2];s2++){
-    for(int s3=this->stencil_lo[dd+3];s3<=this->stencil_hi[dd+3];s3++){
-      Coordinate sft(Nd,0);
-      sft[dd+0] = s0;
-      sft[dd+1] = s1;
-      sft[dd+2] = s2;
-      sft[dd+3] = s3;
-      int nhops = abs(s0)+abs(s1)+abs(s2)+abs(s3);
-      if(nhops<=this->hops) this->shifts.push_back(sft);
-    }}}}
-    this->npoint = this->shifts.size();
-    std::cout << GridLogMessage << "NonLocalStencilGeometry has "<< this->npoint << " terms in stencil "<<std::endl;
-  }
-  
-  NonLocalStencilGeometry(GridCartesian *_coarse_grid,int _hops) : grid(_coarse_grid), hops(_hops)
-  {
-    Coordinate latt = grid->GlobalDimensions();
-    stencil_size.resize(grid->Nd());
-    stencil_lo.resize(grid->Nd());
-    stencil_hi.resize(grid->Nd());
-    for(int d=0;d<grid->Nd();d++){
-     if ( latt[d] == 1 ) {
-      stencil_lo[d] = 0;
-      stencil_hi[d] = 0;
-      stencil_size[d]= 1;
-     } else if ( latt[d] == 2 ) {
-      stencil_lo[d] = -1;
-      stencil_hi[d] = 0;
-      stencil_size[d]= 2;
-     } else if ( latt[d] > 2 ) {
-       stencil_lo[d] = -1;
-       stencil_hi[d] =  1;
-       stencil_size[d]= 3;
-     }
-    }
-  };
-
-};
-
-// Need to worry about red-black now
-class NonLocalStencilGeometry4D : public NonLocalStencilGeometry {
-public:
-  virtual int DimSkip(void) { return 0;};
-  NonLocalStencilGeometry4D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops) { };
-  virtual ~NonLocalStencilGeometry4D() {};
-};
-class NonLocalStencilGeometry5D : public NonLocalStencilGeometry {
-public:
-  virtual int DimSkip(void) { return 1; }; 
-  NonLocalStencilGeometry5D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops)  { };
-  virtual ~NonLocalStencilGeometry5D() {};
-};
-/*
- * Bunch of different options classes
- */
-class NextToNextToNextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
-public:
-  NextToNextToNextToNearestStencilGeometry4D(GridCartesian *Coarse) :  NonLocalStencilGeometry4D(Coarse,4)
-  {
-    this->BuildShifts();
-  };
-};
-class NextToNextToNextToNearestStencilGeometry5D : public  NonLocalStencilGeometry5D {
-public:
-  NextToNextToNextToNearestStencilGeometry5D(GridCartesian *Coarse) :  NonLocalStencilGeometry5D(Coarse,4)
-  {
-    this->BuildShifts();
-  };
-};
-class NextToNearestStencilGeometry4D : public  NonLocalStencilGeometry4D {
-public:
-  NextToNearestStencilGeometry4D(GridCartesian *Coarse) :  NonLocalStencilGeometry4D(Coarse,2)
-  {
-    this->BuildShifts();
-  };
-};
-class NextToNearestStencilGeometry5D : public  NonLocalStencilGeometry5D {
-public:
-  NextToNearestStencilGeometry5D(GridCartesian *Coarse) :  NonLocalStencilGeometry5D(Coarse,2)
-  {
-    this->BuildShifts();
-  };
-};
-class NearestStencilGeometry4D : public  NonLocalStencilGeometry4D {
-public:
-  NearestStencilGeometry4D(GridCartesian *Coarse) :  NonLocalStencilGeometry4D(Coarse,1)
-  {
-    this->BuildShifts();
-  };
-};
-class NearestStencilGeometry5D : public  NonLocalStencilGeometry5D {
-public:
-  NearestStencilGeometry5D(GridCartesian *Coarse) :  NonLocalStencilGeometry5D(Coarse,1)
-  {
-    this->BuildShifts();
-  };
-};
-
-// Fine Object == (per site) type of fine field
-// nbasis      == number of deflation vectors
-template<class Fobj,class CComplex,int nbasis>
-class GeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
-public:
-
-  typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
-  typedef iVector<CComplex,nbasis >           siteVector;
-  typedef iMatrix<CComplex,nbasis >           siteMatrix;
-  typedef Lattice<iScalar<CComplex> >         CoarseComplexField;
-  typedef Lattice<siteVector>                 CoarseVector;
-  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
-  typedef iMatrix<CComplex,nbasis >  Cobj;
-  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
-  typedef Lattice<Fobj >        FineField;
-  typedef CoarseVector Field;
-  ////////////////////
-  // Data members
-  ////////////////////
-  int hermitian;
-  GridBase      *       _FineGrid; 
-  GridCartesian *       _CoarseGrid; 
-  NonLocalStencilGeometry &geom;
-  PaddedCell Cell;
-  GeneralLocalStencil Stencil;
-  
-  std::vector<CoarseMatrix> _A;
-  std::vector<CoarseMatrix> _Adag;
-
-  ///////////////////////
-  // Interface
-  ///////////////////////
-  GridBase      * Grid(void)           { return _FineGrid; };   // this is all the linalg routines need to know
-  GridBase      * FineGrid(void)       { return _FineGrid; };   // this is all the linalg routines need to know
-  GridCartesian * CoarseGrid(void)     { return _CoarseGrid; };   // this is all the linalg routines need to know
-
-
-  void ProjectNearestNeighbour(RealD shift, GeneralCoarseOp &CopyMe)
-  {
-    int nfound=0;
-    std::cout << " ProjectNearestNeighbour "<< CopyMe._A[0].Grid()<<std::endl;
-    for(int p=0;p<geom.npoint;p++){
-      for(int pp=0;pp<CopyMe.geom.npoint;pp++){
- 	// Search for the same relative shift
-	// Avoids brutal handling of Grid pointers
-	if ( CopyMe.geom.shifts[pp]==geom.shifts[p] ) {
-	  _A[p] = CopyMe.Cell.Extract(CopyMe._A[pp]);
-	  _Adag[p] = CopyMe.Cell.Extract(CopyMe._Adag[pp]);
-	  nfound++;
-	}
-      }
-    }
-    assert(nfound==geom.npoint);
-    ExchangeCoarseLinks();
-  }
-  
-  GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
-    : geom(_geom),
-      _FineGrid(FineGrid),
-      _CoarseGrid(CoarseGrid),
-      hermitian(1),
-      Cell(_geom.Depth(),_CoarseGrid),
-      Stencil(Cell.grids.back(),geom.shifts)
-  {
-    {
-      int npoint = _geom.npoint;
-      autoView( Stencil_v  , Stencil, AcceleratorRead);
-      int osites=Stencil.Grid()->oSites();
-      for(int ss=0;ss<osites;ss++){
-	for(int point=0;point<npoint;point++){
-	  auto SE = Stencil_v.GetEntry(point,ss);
-	  int o = SE->_offset;
-	  assert( o< osites);
-	}
-      }    
-    }
-
-    _A.resize(geom.npoint,CoarseGrid);
-    _Adag.resize(geom.npoint,CoarseGrid);
-  }
-  void M (const CoarseVector &in, CoarseVector &out)
-  {
-    Mult(_A,in,out);
-  }
-  void Mdag (const CoarseVector &in, CoarseVector &out)
-  {
-    if ( hermitian ) M(in,out);
-    else Mult(_Adag,in,out);
-  }
-  void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
-  {
-    RealD tviews=0;
-    RealD ttot=0;
-    RealD tmult=0;
-    RealD texch=0;
-    RealD text=0;
-    ttot=-usecond();
-    conformable(CoarseGrid(),in.Grid());
-    conformable(in.Grid(),out.Grid());
-    out.Checkerboard() = in.Checkerboard();
-    CoarseVector tin=in;
-
-    texch-=usecond();
-    CoarseVector pin  = Cell.Exchange(tin);
-    texch+=usecond();
-
-    CoarseVector pout(pin.Grid()); pout=Zero();
-
-    int npoint = geom.npoint;
-    typedef LatticeView<Cobj> Aview;
-      
-    const int Nsimd = CComplex::Nsimd();
-    
-    int osites=pin.Grid()->oSites();
-    //    int gsites=pin.Grid()->gSites();
-
-    RealD flops = 1.0* npoint * nbasis * nbasis * 8 * osites;
-    RealD bytes = (1.0*osites*sizeof(siteMatrix)*npoint+2.0*osites*sizeof(siteVector))*npoint;
-      
-    //    for(int point=0;point<npoint;point++){
-    //      conformable(A[point],pin);
-    //    }
-
-    {
-      tviews-=usecond();
-      autoView( in_v , pin, AcceleratorRead);
-      autoView( out_v , pout, AcceleratorWrite);
-      autoView( Stencil_v  , Stencil, AcceleratorRead);
-      tviews+=usecond();
-      
-      for(int point=0;point<npoint;point++){
-	tviews-=usecond();
-	autoView( A_v, A[point],AcceleratorRead);
-	tviews+=usecond();
-	tmult-=usecond();
-	accelerator_for(sss, osites*nbasis, Nsimd, {
-
-	    typedef decltype(coalescedRead(in_v[0]))    calcVector;
-
-	    int ss = sss/nbasis;
-	    int b  = sss%nbasis;
-
-	    auto SE  = Stencil_v.GetEntry(point,ss);
-	    auto nbr = coalescedReadGeneralPermute(in_v[SE->_offset],SE->_permute,Nd);
-	    auto res = out_v(ss)(b);
-	    for(int bb=0;bb<nbasis;bb++) {
-	      res = res + coalescedRead(A_v[ss](b,bb))*nbr(bb);
-	    }
-	    coalescedWrite(out_v[ss](b),res);
-	});
-
-	tmult+=usecond();
-      }
-    }
-    text-=usecond();
-    out = Cell.Extract(pout);
-    text+=usecond();
-    ttot+=usecond();
-
-    std::cout << GridLogPerformance<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult ext  "<<text<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Kernel "<< flops/tmult<<" mflop/s"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Kernel "<< bytes/tmult<<" MB/s"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse bytes   "<< bytes/1e6<<" MB"<<std::endl;
-  };
-
-  void PopulateAdag(void)
-  {
-    for(int64_t bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
-      Coordinate bcoor;
-      CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor);
-      
-      for(int p=0;p<geom.npoint;p++){
-	Coordinate scoor = bcoor;
-	for(int mu=0;mu<bcoor.size();mu++){
-	  int L = CoarseGrid()->GlobalDimensions()[mu];
-	  scoor[mu] = (bcoor[mu] - geom.shifts[p][mu] + L) % L; // Modulo arithmetic
-	}
-	// Flip to poke/peekLocalSite and not too bad
-	auto link = peekSite(_A[p],scoor);
-	int pp = geom.Reverse(p);
-	pokeSite(adj(link),_Adag[pp],bcoor);
-      }
-    }
-  }
-  /////////////////////////////////////////////////////////////
-  // 
-  // A) Only reduced flops option is to use a padded cell of depth 4
-  // and apply MpcDagMpc in the padded cell.
-  //
-  // Makes for ONE application of MpcDagMpc per vector instead of 30 or 80.
-  // With the effective cell size around (B+8)^4 perhaps 12^4/4^4 ratio
-  // Cost is 81x more, same as stencil size.
-  //
-  // But: can eliminate comms and do as local dirichlet.
-  //
-  // Local exchange gauge field once.
-  // Apply to all vectors, local only computation.
-  // Must exchange ghost subcells in reverse process of PaddedCell to take inner products
-  //
-  // B) Can reduce cost: pad by 1, apply Deo      (4^4+6^4+8^4+8^4 )/ (4x 4^4)
-  //                     pad by 2, apply Doe
-  //                     pad by 3, apply Deo
-  //                     then break out 8x directions; cost is ~10x MpcDagMpc per vector
-  //
-  // => almost factor of 10 in setup cost, excluding data rearrangement
-  //
-  // Intermediates -- ignore the corner terms, leave approximate and force Hermitian
-  // Intermediates -- pad by 2 and apply 1+8+24 = 33 times.
-  /////////////////////////////////////////////////////////////
-
-    //////////////////////////////////////////////////////////
-    // BFM HDCG style approach: Solve a system of equations to get Aij
-    //////////////////////////////////////////////////////////
-    /*
-     *     Here, k,l index which possible shift within the 3^Nd "ball" connected by MdagM.
-     *
-     *     conj(phases[block]) proj[k][ block*Nvec+j ] =  \sum_ball  e^{i q_k . delta} < phi_{block,j} | MdagM | phi_{(block+delta),i} > 
-     *                                                 =  \sum_ball e^{iqk.delta} A_ji
-     *
-     *     Must invert matrix M_k,l = e^[i q_k . delta_l]
-     *
-     *     Where q_k = delta_k . (2*M_PI/global_nb[mu])
-     */
-  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
-		       Aggregation<Fobj,CComplex,nbasis> & Subspace)
-  {
-    std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
-    GridBase *grid = FineGrid();
-
-    RealD tproj=0.0;
-    RealD teigen=0.0;
-    RealD tmat=0.0;
-    RealD tphase=0.0;
-    RealD tinv=0.0;
-
-    /////////////////////////////////////////////////////////////
-    // Orthogonalise the subblocks over the basis
-    /////////////////////////////////////////////////////////////
-    CoarseScalar InnerProd(CoarseGrid()); 
-    blockOrthogonalise(InnerProd,Subspace.subspace);
-
-    const int npoint = geom.npoint;
-      
-    Coordinate clatt = CoarseGrid()->GlobalDimensions();
-    int Nd = CoarseGrid()->Nd();
-
-      /*
-       *     Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
-       *     Matrix index i is mapped to this shift via 
-       *               geom.shifts[i]
-       *
-       *     conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block] 
-       *       =  \sum_{l in ball}  e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} > 
-       *       =  \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
-       *       = M_{kl} A_ji^{b.b+l}
-       *
-       *     Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
-       *  
-       *     Where q_k = delta_k . (2*M_PI/global_nb[mu])
-       *
-       *     Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
-       */
-    teigen-=usecond();
-    Eigen::MatrixXcd Mkl    = Eigen::MatrixXcd::Zero(npoint,npoint);
-    Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
-    ComplexD ci(0.0,1.0);
-    for(int k=0;k<npoint;k++){ // Loop over momenta
-
-      for(int l=0;l<npoint;l++){ // Loop over nbr relative
-	ComplexD phase(0.0,0.0);
-	for(int mu=0;mu<Nd;mu++){
-	  RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
-	  phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
-	}
-	phase=exp(phase*ci);
-	Mkl(k,l) = phase;
-      }
-    }
-    invMkl = Mkl.inverse();
-    teigen+=usecond();
-
-    ///////////////////////////////////////////////////////////////////////
-    // Now compute the matrix elements of linop between the orthonormal
-    // set of vectors.
-    ///////////////////////////////////////////////////////////////////////
-    FineField phaV(grid); // Phased block basis vector
-    FineField MphaV(grid);// Matrix applied
-    CoarseVector coarseInner(CoarseGrid());
-
-    std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
-    std::vector<CoarseVector>          FT(npoint,CoarseGrid());
-    for(int i=0;i<nbasis;i++){// Loop over basis vectors
-      std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
-      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
-	/////////////////////////////////////////////////////
-	// Stick a phase on every block
-	/////////////////////////////////////////////////////
-	tphase-=usecond();
-	CoarseComplexField coor(CoarseGrid());
-	CoarseComplexField pha(CoarseGrid());	pha=Zero();
-	for(int mu=0;mu<Nd;mu++){
-	  LatticeCoordinate(coor,mu);
-	  RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
-	  pha = pha + (TwoPiL * geom.shifts[p][mu]) * coor;
-	}
-	pha  =exp(pha*ci);
-	phaV=Zero();
-	blockZAXPY(phaV,pha,Subspace.subspace[i],phaV);
-	tphase+=usecond();
-
-	/////////////////////////////////////////////////////////////////////
-	// Multiple phased subspace vector by matrix and project to subspace
-	// Remove local bulk phase to leave relative phases
-	/////////////////////////////////////////////////////////////////////
-	tmat-=usecond();
-	linop.Op(phaV,MphaV);
-	tmat+=usecond();
-
-	tproj-=usecond();
-	blockProject(coarseInner,MphaV,Subspace.subspace);
-	coarseInner = conjugate(pha) * coarseInner;
-
-	ComputeProj[p] = coarseInner;
-	tproj+=usecond();
-
-      }
-
-      tinv-=usecond();
-      for(int k=0;k<npoint;k++){
-	FT[k] = Zero();
-	for(int l=0;l<npoint;l++){
-	  FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
-	}
-      
-	int osites=CoarseGrid()->oSites();
-	autoView( A_v  , _A[k], AcceleratorWrite);
-	autoView( FT_v  , FT[k], AcceleratorRead);
-	accelerator_for(sss, osites, 1, {
-	    for(int j=0;j<nbasis;j++){
-	      A_v[sss](j,i) = FT_v[sss](j);
-	    }
-        });
-      }
-      tinv+=usecond();
-    }
-
-    for(int p=0;p<geom.npoint;p++){
-      Coordinate coor({0,0,0,0,0});
-      auto sval = peekSite(_A[p],coor);
-    }
-
-    // Only needed if nonhermitian
-    if ( ! hermitian ) {
-      std::cout << GridLogMessage<<"PopulateAdag  "<<std::endl;
-      PopulateAdag();
-    }
-
-    // Need to write something to populate Adag from A
-    std::cout << GridLogMessage<<"ExchangeCoarseLinks  "<<std::endl;
-    ExchangeCoarseLinks();
-    std::cout << GridLogMessage<<"CoarsenOperator eigen  "<<teigen<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"CoarsenOperator phase  "<<tphase<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"CoarsenOperator mat    "<<tmat <<" us"<<std::endl;
-    std::cout << GridLogMessage<<"CoarsenOperator proj   "<<tproj<<" us"<<std::endl;
-    std::cout << GridLogMessage<<"CoarsenOperator inv    "<<tinv<<" us"<<std::endl;
-  }
-  void ExchangeCoarseLinks(void){
-    for(int p=0;p<geom.npoint;p++){
-      std::cout << "Exchange "<<p<<std::endl;
-      _A[p] = Cell.Exchange(_A[p]);
-      _Adag[p]= Cell.Exchange(_Adag[p]);
-    }
-  }
-  virtual  void Mdiag    (const Field &in, Field &out){ assert(0);};
-  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);};
-  virtual  void MdirAll  (const Field &in, std::vector<Field> &out){assert(0);};
-};
-
-
-NAMESPACE_END(Grid);
--- a/Grid/algorithms/iterative/AdefGeneric.h
+++ b/Grid/algorithms/iterative/AdefGeneric.h
@@ -38,7 +38,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
   * Vstart = P^Tx + Qb
   * M1 = P^TM + Q
   * M2=M3=1
-   * Vout = x
   */
 NAMESPACE_BEGIN(Grid);

@@ -68,14 +67,13 @@ class TwoLevelCG : public LinearFunction<Field>
    grid       = fine;
  };
  
-  virtual void operator() (const Field &src, Field &psi)
+  virtual void operator() (const Field &src, Field &x)
  {
    Field resid(grid);
    RealD f;
    RealD rtzp,rtz,a,d,b;
    RealD rptzp;
    
-    Field x(grid); 
    Field p(grid);
    Field z(grid);
    Field tmp(grid);
@@ -85,7 +83,6 @@ class TwoLevelCG : public LinearFunction<Field>
    Field rp (grid);
    
    //Initial residual computation & set up
-    RealD guess = norm2(psi);
    double tn;

    GridStopWatch HDCGTimer;
@@ -165,14 +162,22 @@ class TwoLevelCG : public LinearFunction<Field>
 	RealD  srcnorm = sqrt(norm2(src));
 	RealD  tmpnorm = sqrt(norm2(tmp));
 	RealD  true_residual = tmpnorm/srcnorm;
-	std::cout<<GridLogMessage<<"HDCG: true residual is "<<true_residual
-		 <<" solution "<<xnorm<<" source "<<srcnorm<<std::endl;
+	std::cout<<GridLogMessage
+		 <<"HDCG: true residual is "<<true_residual
+		 <<" solution "<<xnorm
+		 <<" source "<<srcnorm
+		 <<" mmp "<<mmpnorm	  
+		 <<std::endl;

 	return;
      }

    }
-    std::cout << "HDCG: Pcg not converged"<<std::endl;
+    std::cout<<GridLogMessage<<"HDCG: not converged"<<std::endl;
+    RealD  xnorm   = sqrt(norm2(x));
+    RealD  srcnorm = sqrt(norm2(src));
+    std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
+    
    return ;
  }
  
@@ -197,9 +202,6 @@ class TwoLevelCG : public LinearFunction<Field>
  /////////////////////////////////////////////////////////////////////
  // Only Def1 has non-trivial Vout.
  /////////////////////////////////////////////////////////////////////
-  virtual void   Vout  (Field & in, Field & out,Field & src){
-    out = in;
-  }

 };
  
@@ -221,13 +223,13 @@ class TwoLevelADEF2 : public TwoLevelCG<Field>
  // more most opertor functions
  TwoLevelADEF2(RealD tol,
 		Integer maxit,
-		LinearOperatorBase<Field>   &FineLinop,
-		LinearFunction<Field>   &Smoother,
+		LinearOperatorBase<Field>    &FineLinop,
+		LinearFunction<Field>        &Smoother,
 		LinearFunction<CoarseField>  &CoarseSolver,
 		LinearFunction<CoarseField>  &CoarseSolverPrecise,
 		Aggregation &Aggregates
 		) :
-    TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,Aggregates.FineGrid),
+      TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,Aggregates.FineGrid),
      _CoarseSolver(CoarseSolver),
      _CoarseSolverPrecise(CoarseSolverPrecise),
      _Aggregates(Aggregates)
@@ -321,12 +323,13 @@ public:
    eval(_eval)
  {};

-  // Can just inherit existing Vout
  // Can just inherit existing M2
  // Can just inherit existing M3

  // Simple vstart - do nothing
-  virtual void Vstart(Field & x,const Field & src){ x=src; };
+  virtual void Vstart(Field & x,const Field & src){
+    x=src; // Could apply Q
+  };

  // Override PcgM1
  virtual void PcgM1(Field & in, Field & out)
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -457,7 +457,7 @@ until convergence
 	    std::vector<Field>& evec,
 	    Field& w,int Nm,int k)
  {
-    std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl;
+    std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl;
    const RealD tiny = 1.0e-20;
    assert( k< Nm );

@@ -487,7 +487,7 @@ until convergence

    if(k < Nm-1) evec[k+1] = w;

-    std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
+    std::cout<<GridLogIRL << "Lanczos step alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
    if ( beta < tiny ) 
      std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;

--- a/Grid/algorithms/iterative/NormalEquations.h
+++ b/Grid/algorithms/iterative/NormalEquations.h
@@ -78,7 +78,7 @@ public:
  void operator() (const Field &in, Field &out){
 
    _Guess(in,out);
-    _HermitianSolver(_Matrix,in,out);  // Mdag M out = Mdag in
+    _HermitianSolver(_Matrix,in,out);  //M out = in

  }     
 };
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
@@ -191,16 +191,17 @@ public:
    out = Cell.Extract(pout);
    text+=usecond();
    ttot+=usecond();
+    
+    std::cout << GridLogDebug<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
+    std::cout << GridLogDebug<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
+    std::cout << GridLogDebug<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
+    std::cout << GridLogDebug<<"Coarse Mult ext  "<<text<<" us"<<std::endl;
+    std::cout << GridLogDebug<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
+    std::cout << GridLogDebug<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
+    std::cout << GridLogDebug<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
+    std::cout << GridLogDebug<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
+    std::cout << GridLogDebug<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;

-    std::cout << GridLogPerformance<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult ext  "<<text<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
-    std::cout << GridLogPerformance<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
  };

  void PopulateAdag(void)
--- a/Grid/algorithms/multigrid/MultiGrid.h
+++ b/Grid/algorithms/multigrid/MultiGrid.h
--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@@ -139,7 +139,7 @@ public:
    if(dim==0) conformable(old_grid,unpadded_grid);
    else       conformable(old_grid,grids[dim-1]);

-    std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
+    //    std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;

    double tins=0, tshift=0;

@@ -184,7 +184,7 @@ public:
      }
      tins += usecond() - t;
    }
-    std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
+    std::cout << GridLogDebug << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
    
    return padded;
  }
--- a/tests/debug/Test_general_coarse_hdcg.cc
+++ b/tests/debug/Test_general_coarse_hdcg.cc
@@ -50,7 +50,7 @@ void SaveOperator(Coarsened &Operator,std::string file)
 #endif
 }
 template<class Coarsened>
-void LoadOperator(Coarsened Operator,std::string file)
+void LoadOperator(Coarsened &Operator,std::string file)
 {
 #ifdef HAVE_LIME
  emptyUserRecord record;
@@ -219,7 +219,7 @@ int main (int argc, char ** argv)
  ////////////////////////////////////////////////////////////
  LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);

-  bool load=false;
+  bool load=true;
  if ( load ) {
    LoadBasis(Aggregates,"Subspace.scidac");
    LoadOperator(LittleDiracOp,"LittleDiracOp.scidac");
@@ -230,7 +230,7 @@ int main (int argc, char ** argv)
 				       //				     600,200,200 -- 38 iters, 162s
 				       //				     600,200,100 -- 38 iters, 169s
 				       //				     600,200,50  -- 88 iters. 370s 
-				       600,
+				       800,
 				       200,
 				       100,
 				       0.0);
@@ -241,16 +241,16 @@ int main (int argc, char ** argv)
  
  // Try projecting to one hop only
  LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d);
-  LittleDiracOpProj.ProjectNearestNeighbour(0.2,LittleDiracOp);
+  LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n

  typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix;
-  HermMatrix CoarseOp (LittleDiracOp);
+  HermMatrix CoarseOp     (LittleDiracOp);
  HermMatrix CoarseOpProj (LittleDiracOpProj);
  
  //////////////////////////////////////////
  // Build a coarse lanczos
  //////////////////////////////////////////
-  Chebyshev<CoarseVector>      IRLCheby(0.5,60.0,71);  // 1 iter
+  Chebyshev<CoarseVector>      IRLCheby(0.2,40.0,71);  // 1 iter
  FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp);
  PlainHermOp<CoarseVector>    IRLOp    (CoarseOp);
  int Nk=48;
@@ -270,7 +270,6 @@ int main (int argc, char ** argv)
  IRL.calc(eval,evec,c_src,Nconv);
  DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval);

-  
  //////////////////////////////////////////
  // Build a coarse space solver
  //////////////////////////////////////////
@@ -283,7 +282,8 @@ int main (int argc, char ** argv)
  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser);
  c_res=Zero();
  HPDSolve(c_src,c_res); c_ref = c_res;
-
+  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  std::cout << GridLogMessage<<"ref norm "<<norm2(c_ref)<<std::endl;
  //////////////////////////////////////////////////////////////////////////
  // Deflated (with real op EV's) solve for the projected coarse op
  // Work towards ADEF1 in the coarse space
@@ -291,13 +291,21 @@ int main (int argc, char ** argv)
  HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser);
  c_res=Zero();
  HPDSolveProj(c_src,c_res);
+  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl;
  c_res = c_res - c_ref;
  std::cout << "Projected solver error "<<norm2(c_res)<<std::endl;

  //////////////////////////////////////////////////////////////////////
  // Coarse ADEF1 with deflation space
  //////////////////////////////////////////////////////////////////////
-  ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(4.0,45.,16,CoarseOpProj);  // 311
+  ChebyshevSmoother<CoarseVector,HermMatrix >
+    CoarseSmoother(1.0,37.,8,CoarseOpProj);  // just go to sloppy 0.1 convergence
+    //  CoarseSmoother(0.1,37.,8,CoarseOpProj);  //
+  //  CoarseSmoother(0.5,37.,6,CoarseOpProj);  //  8 iter 0.36s
+  //    CoarseSmoother(0.5,37.,12,CoarseOpProj);  // 8 iter, 0.55s
+  //    CoarseSmoother(0.5,37.,8,CoarseOpProj);// 7-9 iter
+  //  CoarseSmoother(1.0,37.,8,CoarseOpProj); // 0.4 - 0.5s solve to 0.04, 7-9 iter
  //  ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(0.5,36.,10,CoarseOpProj);  // 311

  ////////////////////////////////////////////////////////
@@ -318,19 +326,25 @@ int main (int argc, char ** argv)
  // HDCG 38 iters 169s

  TwoLevelADEF1defl<CoarseVector>
-    cADEF1(1.0e-8, 100,
+    cADEF1(1.0e-8, 500,
 	   CoarseOp,
 	   CoarseSmoother,
 	   evec,eval);

  c_res=Zero();
  cADEF1(c_src,c_res);
+  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
  c_res = c_res - c_ref;
  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
  
-  cADEF1.Tolerance = 1.0e-9;
+  //  cADEF1.Tolerance = 4.0e-2;
+  //  cADEF1.Tolerance = 1.0e-1;
+  cADEF1.Tolerance = 5.0e-2;
  c_res=Zero();
  cADEF1(c_src,c_res);
+  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
  c_res = c_res - c_ref;
  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
  
@@ -375,16 +389,13 @@ int main (int argc, char ** argv)
      // Build a HDCG solver
      //////////////////////////////////////////
      TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
-	HDCG(1.0e-8, 3000,
+	HDCG(1.0e-8, 100,
 	     FineHermOp,
 	     Smoother,
 	     HPDSolveSloppy,
 	     HPDSolve,
 	     Aggregates);

-      result=Zero();
-      HDCG(src,result);
-
      TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
 	HDCGdefl(1.0e-8, 100,
 		 FineHermOp,
@@ -395,6 +406,10 @@ int main (int argc, char ** argv)
      
      result=Zero();
      HDCGdefl(src,result);
+
+      result=Zero();
+      HDCG(src,result);
+
      
    }
  }
--- a/tests/debug/Test_general_coarse_hdcg_phys.cc
+++ b/tests/debug/Test_general_coarse_hdcg_phys.cc
@@ -0,0 +1,423 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_general_coarse_hdcg.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+//#include <Grid/algorithms/GeneralCoarsenedMatrix.h>
+#include <Grid/algorithms/iterative/AdefGeneric.h>
+
+using namespace std;
+using namespace Grid;
+
+template<class Coarsened>
+void SaveOperator(Coarsened &Operator,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  ScidacWriter WR(Operator.Grid()->IsBoss());
+  assert(Operator._A.size()==Operator.geom.npoint);
+  WR.open(file);
+  for(int p=0;p<Operator._A.size();p++){
+    auto tmp = Operator.Cell.Extract(Operator._A[p]);
+    WR.writeScidacFieldRecord(tmp,record);
+  }
+  WR.close();
+#endif
+}
+template<class Coarsened>
+void LoadOperator(Coarsened &Operator,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  Grid::ScidacReader RD ;
+  RD.open(file);
+  assert(Operator._A.size()==Operator.geom.npoint);
+  for(int p=0;p<Operator.geom.npoint;p++){
+    conformable(Operator._A[p].Grid(),Operator.CoarseGrid());
+    RD.readScidacFieldRecord(Operator._A[p],record);
+  }    
+  RD.close();
+  Operator.ExchangeCoarseLinks();
+#endif
+}
+template<class aggregation>
+void SaveBasis(aggregation &Agg,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  ScidacWriter WR(Agg.FineGrid->IsBoss());
+  WR.open(file);
+  for(int b=0;b<Agg.subspace.size();b++){
+    WR.writeScidacFieldRecord(Agg.subspace[b],record);
+  }
+  WR.close();
+#endif
+}
+template<class aggregation>
+void LoadBasis(aggregation &Agg, std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  ScidacReader RD ;
+  RD.open(file);
+  for(int b=0;b<Agg.subspace.size();b++){
+    RD.readScidacFieldRecord(Agg.subspace[b],record);
+  }    
+  RD.close();
+#endif
+}
+
+
+template<class Field> class TestSolver : public LinearFunction<Field> {
+public:
+  TestSolver() {};
+  void operator() (const Field &in, Field &out){    out = Zero();  }     
+};
+
+
+RealD InverseApproximation(RealD x){
+  return 1.0/x;
+}
+
+// Want Op in CoarsenOp to call MatPcDagMatPc
+template<class Field>
+class HermOpAdaptor : public LinearOperatorBase<Field>
+{
+  LinearOperatorBase<Field> & wrapped;
+public:
+  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme)  {};
+  void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  }
+  void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); }
+  void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  }
+  void OpDiag (const Field &in, Field &out)                  {    assert(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  };
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
+};
+template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
+{
+public:
+  using LinearFunction<Field>::operator();
+  typedef LinearOperatorBase<Field> FineOperator;
+  FineOperator   & _SmootherOperator;
+  Chebyshev<Field> Cheby;
+  ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator) :
+    _SmootherOperator(SmootherOperator),
+    Cheby(_lo,_hi,_ord,InverseApproximation)
+  {
+    std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
+  };
+  void operator() (const Field &in, Field &out) 
+  {
+    Field tmp(in.Grid());
+    tmp = in;
+    Cheby(_SmootherOperator,tmp,out);
+  }
+};
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=24;
+  const int nbasis = 40;
+  const int cb = 0 ;
+  RealD mass=0.00078;
+  RealD M5=1.8;
+  RealD b=1.5;
+  RealD c=0.5;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  // Construct a coarsened grid with 4^4 cell
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/4;
+  }
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
+							    GridDefaultSimd(Nd,vComplex::Nsimd()),
+							    GridDefaultMpi());;
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
+
+  ///////////////////////// RNGs /////////////////////////////////
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  std::vector<int> cseeds({5,6,7,8});
+
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
+
+  ///////////////////////// Configuration /////////////////////////////////
+  LatticeGaugeField Umu(UGrid);
+
+  FieldMetaData header;
+  std::string file("ckpoint_lat.4000");
+  NerscIO::readConfiguration(Umu,header,file);
+
+  //////////////////////// Fermion action //////////////////////////////////
+  MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
+
+  SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
+
+  typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
+  HermFineMatrix FineHermOp(HermOpEO);
+  
+  LatticeFermion result(FrbGrid); result=Zero();
+
+  LatticeFermion    src(FrbGrid); random(RNG5,src);
+
+  // Run power method on FineHermOp
+  PowerMethod<LatticeFermion>       PM;   PM(HermOpEO,src);
+
+ 
+  ////////////////////////////////////////////////////////////
+  ///////////// Coarse basis and Little Dirac Operator ///////
+  ////////////////////////////////////////////////////////////
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
+  NearestStencilGeometry5D geom_nn(Coarse5d);
+  
+  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
+  Subspace Aggregates(Coarse5d,FrbGrid,cb);
+
+  ////////////////////////////////////////////////////////////
+  // Need to check about red-black grid coarsening
+  ////////////////////////////////////////////////////////////
+  LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);
+
+  bool load=true;
+  if ( load ) {
+    LoadBasis(Aggregates,"Subspace.scidac");
+    LoadOperator(LittleDiracOp,"LittleDiracOp.scidac");
+  } else {
+    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
+				       95.0,0.1,
+				       //				     400,200,200 -- 48 iters
+				       //				     600,200,200 -- 38 iters, 162s
+				       //				     600,200,100 -- 38 iters, 169s
+				       //				     600,200,50  -- 88 iters. 370s 
+				       800,
+				       200,
+				       100,
+				       0.0);
+    LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates);
+    SaveBasis(Aggregates,"Subspace.scidac");
+    SaveOperator(LittleDiracOp,"LittleDiracOp.scidac");
+  }
+  
+  // Try projecting to one hop only
+  LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d);
+  LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n
+
+  typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix;
+  HermMatrix CoarseOp     (LittleDiracOp);
+  HermMatrix CoarseOpProj (LittleDiracOpProj);
+  
+  //////////////////////////////////////////
+  // Build a coarse lanczos
+  //////////////////////////////////////////
+  Chebyshev<CoarseVector>      IRLCheby(0.2,40.0,71);  // 1 iter
+  FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp);
+  PlainHermOp<CoarseVector>    IRLOp    (CoarseOp);
+  int Nk=48;
+  int Nm=64;
+  int Nstop=Nk;
+  ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-5,20);
+
+  int Nconv;
+  std::vector<RealD>            eval(Nm);
+  std::vector<CoarseVector>     evec(Nm,Coarse5d);
+  CoarseVector c_src(Coarse5d); c_src=1.0;
+  CoarseVector c_res(Coarse5d); 
+  CoarseVector c_ref(Coarse5d); 
+
+  PowerMethod<CoarseVector>       cPM;   cPM(CoarseOp,c_src);
+
+  IRL.calc(eval,evec,c_src,Nconv);
+  DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval);
+
+  //////////////////////////////////////////
+  // Build a coarse space solver
+  //////////////////////////////////////////
+  int maxit=20000;
+  ConjugateGradient<CoarseVector>  CG(1.0e-8,maxit,false);
+  ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,10000,false);
+  ZeroGuesser<CoarseVector> CoarseZeroGuesser;
+
+  //  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser);
+  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser);
+  c_res=Zero();
+  HPDSolve(c_src,c_res); c_ref = c_res;
+  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  std::cout << GridLogMessage<<"ref norm "<<norm2(c_ref)<<std::endl;
+  //////////////////////////////////////////////////////////////////////////
+  // Deflated (with real op EV's) solve for the projected coarse op
+  // Work towards ADEF1 in the coarse space
+  //////////////////////////////////////////////////////////////////////////
+  HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser);
+  c_res=Zero();
+  HPDSolveProj(c_src,c_res);
+  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl;
+  c_res = c_res - c_ref;
+  std::cout << "Projected solver error "<<norm2(c_res)<<std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Coarse ADEF1 with deflation space
+  //////////////////////////////////////////////////////////////////////
+  ChebyshevSmoother<CoarseVector,HermMatrix >
+    CoarseSmoother(1.0,37.,8,CoarseOpProj);  // just go to sloppy 0.1 convergence
+    //  CoarseSmoother(0.1,37.,8,CoarseOpProj);  //
+  //  CoarseSmoother(0.5,37.,6,CoarseOpProj);  //  8 iter 0.36s
+  //    CoarseSmoother(0.5,37.,12,CoarseOpProj);  // 8 iter, 0.55s
+  //    CoarseSmoother(0.5,37.,8,CoarseOpProj);// 7-9 iter
+  //  CoarseSmoother(1.0,37.,8,CoarseOpProj); // 0.4 - 0.5s solve to 0.04, 7-9 iter
+  //  ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(0.5,36.,10,CoarseOpProj);  // 311
+
+  ////////////////////////////////////////////////////////
+  // CG, Cheby mode spacing 200,200
+  // Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s
+  // Unprojected Coarse CG solve to 4e-2 :  33 iters, 0.8s
+  // Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s
+  ////////////////////////////////////////////////////////
+  // CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs 
+  ////////////////////////////////////////////////////////
+  // ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s  2.1x gain
+  // ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s
+  // HDCG 38 iters 162s
+  //
+  // CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs 
+  // ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s  2.1x gain
+  // ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s
+  // HDCG 38 iters 169s
+
+  TwoLevelADEF1defl<CoarseVector>
+    cADEF1(1.0e-8, 500,
+	   CoarseOp,
+	   CoarseSmoother,
+	   evec,eval);
+
+  c_res=Zero();
+  cADEF1(c_src,c_res);
+  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
+  c_res = c_res - c_ref;
+  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
+  
+  //  cADEF1.Tolerance = 4.0e-2;
+  //  cADEF1.Tolerance = 1.0e-1;
+  cADEF1.Tolerance = 5.0e-2;
+  c_res=Zero();
+  cADEF1(c_src,c_res);
+  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
+  c_res = c_res - c_ref;
+  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
+  
+  //////////////////////////////////////////
+  // Build a smoother
+  //////////////////////////////////////////
+  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(10.0,100.0,10,FineHermOp); //499
+  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(3.0,100.0,10,FineHermOp);  //383
+  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(1.0,100.0,10,FineHermOp);  //328
+  //  std::vector<RealD> los({0.5,1.0,3.0}); // 147/142/146 nbasis 1
+  //  std::vector<RealD> los({1.0,2.0}); // Nbasis 24: 88,86 iterations
+  //  std::vector<RealD> los({2.0,4.0}); // Nbasis 32 == 52, iters
+  //  std::vector<RealD> los({2.0,4.0}); // Nbasis 40 == 36,36 iters
+
+  //
+  // Turns approx 2700 iterations into 340 fine multiplies with Nbasis 40
+  // Need to measure cost of coarse space.
+  //
+  // -- i) Reduce coarse residual   -- 0.04
+  // -- ii) Lanczos on coarse space -- done
+  // -- iii) Possible 1 hop project and/or preconditioning it - easy - PrecCG it and
+  //         use a limited stencil. Reread BFM code to check on evecs / deflation strategy with prec
+  //
+  std::vector<RealD> los({3.0}); // Nbasis 40 == 36,36 iters
+
+  //  std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults)
+  std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)  
+
+  for(int l=0;l<los.size();l++){
+
+    RealD lo = los[l];
+
+    for(int o=0;o<ords.size();o++){
+
+      ConjugateGradient<CoarseVector>  CGsloppy(4.0e-2,maxit,false);
+      HPDSolver<CoarseVector> HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser);
+      
+      //    ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,10,FineHermOp); // 36 best case
+      ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,ords[o],FineHermOp);  // 311
+
+      //////////////////////////////////////////
+      // Build a HDCG solver
+      //////////////////////////////////////////
+      TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
+	HDCG(1.0e-8, 100,
+	     FineHermOp,
+	     Smoother,
+	     HPDSolveSloppy,
+	     HPDSolve,
+	     Aggregates);
+
+      TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
+	HDCGdefl(1.0e-8, 100,
+		 FineHermOp,
+		 Smoother,
+		 cADEF1,
+		 HPDSolve,
+		 Aggregates);
+      
+      result=Zero();
+      HDCGdefl(src,result);
+
+      result=Zero();
+      HDCG(src,result);
+
+      
+    }
+  }
+
+  // Standard CG
+  result=Zero();
+  CGfine(HermOpEO, src, result);
+  
+  Grid_finalize();
+  return 0;
+}
Author	SHA1	Message	Date
Peter Boyle	2111e7ab5f	Run at physical mass	2023-10-06 21:20:21 -04:00
Peter Boyle	d29abfdcaf	Transfer code to Frontier now	2023-10-06 21:03:34 -04:00
Peter Boyle	a751c42cc5	Checkpoint restore the setup	2023-10-06 21:03:08 -04:00
Peter Boyle	6a3bc9865e	Verbose change	2023-10-06 21:02:04 -04:00
Peter Boyle	4d5f7e4377	Verbose change	2023-10-06 21:01:37 -04:00
Peter Boyle	78b117fb78	Comment fix	2023-10-06 21:01:15 -04:00
Peter Boyle	ded63a1319	Verbose change/pretty print	2023-10-06 21:00:53 -04:00
Peter Boyle	df3e4d1e9c	Return fix	2023-10-06 21:00:21 -04:00