Improved, works on 48^3 moving to multiRHS optimisations

Use 5.3.0
Allow expression as argument to norm2
2025-06-21 01:02:02 +01:00 · 2023-11-15 18:03:05 -05:00 · 2023-11-15 18:01:38 -05:00 · 2023-11-15 18:00:44 -05:00 · 2023-11-15 18:00:07 -05:00 · 2023-11-15 17:58:38 -05:00
12 changed files with 677 additions and 71 deletions
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@ -145,6 +145,44 @@ public:
  }
 };

+////////////////////////////////////////////////////////////////////
+// Create a shifted HermOp
+////////////////////////////////////////////////////////////////////
+template<class Field>
+class ShiftedHermOpLinearOperator : public LinearOperatorBase<Field> {
+  LinearOperatorBase<Field> &_Mat;
+  RealD _shift;
+public:
+  ShiftedHermOpLinearOperator(LinearOperatorBase<Field> &Mat,RealD shift): _Mat(Mat), _shift(shift){};
+  // Support for coarsening to a multigrid
+  void OpDiag (const Field &in, Field &out) {
+    assert(0);
+  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {
+    assert(0);
+  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out){
+    assert(0);
+  };
+  void Op     (const Field &in, Field &out){
+    assert(0);
+  }
+  void AdjOp     (const Field &in, Field &out){
+    assert(0);
+  }
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+    HermOp(in,out);
+    ComplexD dot = innerProduct(in,out);
+    n1=real(dot);
+    n2=norm2(out);
+  }
+  void HermOp(const Field &in, Field &out){
+    _Mat.HermOp(in,out);
+    out = out + _shift*in;
+  }
+};
+
+
 ////////////////////////////////////////////////////////////////////
 // Wrap an already herm matrix
 ////////////////////////////////////////////////////////////////////
--- a/Grid/algorithms/approx/MultiShiftFunction.h
+++ b/Grid/algorithms/approx/MultiShiftFunction.h
@ -40,7 +40,7 @@ public:
  RealD norm;
  RealD lo,hi;

-  MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), lo(_lo), hi(_hi) {;};
+  MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), tolerances(n), lo(_lo), hi(_hi) {;};
  RealD approx(RealD x);
  void csv(std::ostream &out);
  void gnuplot(std::ostream &out);
--- a/Grid/algorithms/iterative/AdefGeneric.h
+++ b/Grid/algorithms/iterative/AdefGeneric.h
@ -69,6 +69,7 @@ class TwoLevelCG : public LinearFunction<Field>
  
  virtual void operator() (const Field &src, Field &x)
  {
+#if 0
    Field resid(grid);
    RealD f;
    RealD rtzp,rtz,a,d,b;
@ -113,6 +114,7 @@ class TwoLevelCG : public LinearFunction<Field>
    RealD ssq =  norm2(src);
    RealD rsq =  ssq*Tolerance*Tolerance;

+    GRID_TRACE("MultiGrid TwoLevel ");
    std::cout<<GridLogMessage<<"HDCG: k=0 residual "<<rtzp<<" target rsq "<<rsq<<" ssq "<<ssq<<std::endl;
    
    for (int k=1;k<=MaxIterations;k++){
@ -179,6 +181,162 @@ class TwoLevelCG : public LinearFunction<Field>
    std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
    
    return ;
+#else
+  RealD f;
+  RealD rtzp,rtz,a,d,b;
+  RealD rptzp;
+
+  /////////////////////////////
+  // Set up history vectors
+  /////////////////////////////
+  int mmax = 20;
+  std::vector<Field> p(mmax,grid);
+  std::vector<Field> mmp(mmax,grid);
+  std::vector<RealD> pAp(mmax);
+  Field z(grid);
+  Field tmp(grid);
+  Field  mp (grid);
+  Field  r  (grid);
+  Field  mu (grid);
+
+  //Initial residual computation & set up
+  RealD guess   = norm2(x);
+  RealD src_nrm = norm2(src);
+
+  if ( src_nrm == 0.0 ) {
+    std::cout << GridLogMessage<<"HDCG: fPcg given trivial source norm "<<src_nrm<<std::endl;
+    x=Zero();
+  }
+  RealD tn;
+
+  GridStopWatch HDCGTimer;
+  HDCGTimer.Start();
+  //////////////////////////
+  // x0 = Vstart -- possibly modify guess
+  //////////////////////////
+  Vstart(x,src);
+
+  // r0 = b -A x0
+  _FineLinop.HermOp(x,mmp[0]);
+  axpy (r, -1.0,mmp[0], src);    // Recomputes r=src-Ax0
+  {
+    double n1 = norm2(x);
+    double n2 = norm2(mmp[0]);
+    double n3 = norm2(r);
+    std::cout<<GridLogMessage<<"x,vstart,r = "<<n1<<" "<<n2<<" "<<n3<<std::endl;
+  }
+
+  //////////////////////////////////
+  // Compute z = M1 x
+  //////////////////////////////////
+  PcgM1(r,z);
+  rtzp =real(innerProduct(r,z));
+
+  ///////////////////////////////////////
+  // Solve for Mss mu = P A z and set p = z-mu
+  // Def2: p = 1 - Q Az = Pright z 
+  // Other algos M2 is trivial
+  ///////////////////////////////////////
+  PcgM2(z,p[0]);
+
+  RealD ssq =  norm2(src);
+  RealD rsq =  ssq*Tolerance*Tolerance;
+
+  std::cout << GridLogMessage<<"HDCG: k=0 residual "<<rtzp<<" rsq "<<rsq<<"\n";
+
+  Field pp(grid);
+
+  for (int k=0;k<=MaxIterations;k++){
+    
+    int peri_k  = k % mmax;
+    int peri_kp = (k+1) % mmax;
+
+    rtz=rtzp;
+    d= PcgM3(p[peri_k],mmp[peri_k]);
+    a = rtz/d;
+    
+    // Memorise this
+    pAp[peri_k] = d;
+
+    
+    axpy(x,a,p[peri_k],x);
+    RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
+
+    // Compute z = M x
+    PcgM1(r,z);
+
+    {
+      RealD n1,n2;
+      n1=norm2(r);
+      n2=norm2(z);
+      std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : vector r,z "<<n1<<" "<<n2<<"\n";
+    }
+    rtzp =real(innerProduct(r,z));
+    std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : inner rtzp "<<rtzp<<"\n";
+
+    //    PcgM2(z,p[0]);
+    PcgM2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
+
+    p[peri_kp]=mu;
+
+    // Standard search direction  p -> z + b p    ; b = 
+    b = (rtzp)/rtz;
+
+    int northog;
+    // k=zero  <=> peri_kp=1;        northog = 1
+    // k=1     <=> peri_kp=2;        northog = 2
+    // ...               ...                  ...
+    // k=mmax-2<=> peri_kp=mmax-1;   northog = mmax-1
+    // k=mmax-1<=> peri_kp=0;        northog = 1
+
+    //    northog     = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
+    northog     = (k>mmax-1)?(mmax-1):k;        // This is the fCG-Tr(mmax-1) algorithm
+    
+    std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
+    for(int back=0; back < northog; back++){
+      int peri_back = (k-back)%mmax;
+      RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
+      RealD beta = -pbApk/pAp[peri_back];
+      axpy(p[peri_kp],beta,p[peri_back],p[peri_kp]);
+    }
+
+    RealD rrn=sqrt(rn/ssq);
+    RealD rtn=sqrt(rtz/ssq);
+    RealD rtnp=sqrt(rtzp/ssq);
+
+    std::cout<<GridLogMessage<<"HDCG: fPcg k= "<<k<<" residual = "<<rrn<<"\n";
+
+    // Stopping condition
+    if ( rn <= rsq ) { 
+
+      HDCGTimer.Stop();
+      std::cout<<GridLogMessage<<"HDCG: fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
+      
+      _FineLinop.HermOp(x,mmp[0]);			  
+      axpy(tmp,-1.0,src,mmp[0]);
+      
+      RealD  mmpnorm = sqrt(norm2(mmp[0]));
+      RealD  xnorm   = sqrt(norm2(x));
+      RealD  srcnorm = sqrt(norm2(src));
+      RealD  tmpnorm = sqrt(norm2(tmp));
+      RealD  true_residual = tmpnorm/srcnorm;
+      std::cout<<GridLogMessage
+	       <<"HDCG: true residual is "<<true_residual
+	       <<" solution "<<xnorm
+	       <<" source "<<srcnorm
+	       <<" mmp "<<mmpnorm	  
+	       <<std::endl;
+      
+      return;
+    }
+
+  }
+  HDCGTimer.Stop();
+  std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
+  RealD  xnorm   = sqrt(norm2(x));
+  RealD  srcnorm = sqrt(norm2(src));
+  std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
+#endif
  }
  

@ -239,6 +397,7 @@ class TwoLevelADEF2 : public TwoLevelCG<Field>

  virtual void PcgM1(Field & in, Field & out)
  {
+    GRID_TRACE("MultiGridPreconditioner ");
    // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]

    Field tmp(this->grid);
@ -334,6 +493,7 @@ public:
  // Override PcgM1
  virtual void PcgM1(Field & in, Field & out)
  {
+    GRID_TRACE("EvecPreconditioner ");
    int N=evec.size();
    Field Pin(this->grid);
    Field Qin(this->grid);
--- a/Grid/algorithms/iterative/ConjugateGradient.h
+++ b/Grid/algorithms/iterative/ConjugateGradient.h
@ -207,7 +207,8 @@ public:

    TrueResidual = sqrt(norm2(p)/ssq);

-    std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl;
+    std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations
+	      <<" residual "<< TrueResidual<< std::endl;

    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;
--- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h
@ -144,7 +144,7 @@ public:
    for(int s=0;s<nshift;s++){
      rsq[s] = cp * mresidual[s] * mresidual[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
-	       <<" target resid "<<rsq[s]<<std::endl;
+	       <<" target resid^2 "<<rsq[s]<<std::endl;
      ps[s] = src;
    }
    // r and p for primary
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@ -79,14 +79,16 @@ template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public Imp
    RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);

    std::cout.precision(13);
-    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
-	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
-	     <<std::endl;

    int conv=0;
    if( (vv<eresid*eresid) ) conv = 1;

+    std::cout<<GridLogIRL  << "[" << std::setw(3)<<j<<"] "
+	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
+	     <<" target " << eresid*eresid << " conv " <<conv
+	     <<std::endl;
+
    return conv;
  }
 };
--- a/Grid/algorithms/multigrid/Aggregates.h
+++ b/Grid/algorithms/multigrid/Aggregates.h
@ -32,6 +32,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

+inline RealD AggregatePowerLaw(RealD x)
+{
+  //  return std::pow(x,-4);
+  //  return std::pow(x,-3);
+  return std::pow(x,-5);
+}
+
 template<class Fobj,class CComplex,int nbasis>
 class Aggregation {
 public:
@ -246,9 +253,54 @@ public:
      // Initial matrix element
      hermop.Op(noise,Mn);
      if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
      // Filter
      Chebyshev<FineField> Cheb(lo,hi,orderfilter);
      Cheb(hermop,noise,Mn);
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+
+      // Refine
+      Chebyshev<FineField> PowerLaw(lo,hi,1000,AggregatePowerLaw);
+      noise = Mn;
+      PowerLaw(hermop,noise,Mn);
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+
+      // normalise
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+    }
+
+  }
+
+  virtual void CreateSubspaceChebyshevPowerLaw(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+					       int nn,
+					       double hi,
+					       int orderfilter
+					       ) {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+
+    // New normalised noise
+    std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" [0,"<<hi<<"]"<<std::endl;
+    std::cout << GridLogMessage<<" Chebyshev subspace pure noise  : nbasis "<<nn<<std::endl;
+
+    for(int b =0;b<nbasis;b++)
+    {
+      gaussian(RNG,noise);
+      scale = std::pow(norm2(noise),-0.5); 
+      noise=noise*scale;
+
+      // Initial matrix element
+      hermop.Op(noise,Mn);
+      if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+      // Filter
+      Chebyshev<FineField> Cheb(0.0,hi,orderfilter,AggregatePowerLaw);
+      Cheb(hermop,noise,Mn);
      // normalise
      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
      subspace[b]   = Mn;
@ -258,5 +310,72 @@ public:

  }

+  virtual void CreateSubspaceMultishift(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &hermop,
+					double Lo,double tol,int maxit)
+  {
+
+    RealD scale;
+
+    FineField noise(FineGrid);
+    FineField Mn(FineGrid);
+    FineField tmp(FineGrid);
+
+    // New normalised noise
+    std::cout << GridLogMessage<<" Multishift subspace : Lo "<<Lo<<std::endl;
+
+    // Filter
+    // [ 1/6(x+Lo)  - 1/2(x+2Lo) + 1/2(x+3Lo)  -1/6(x+4Lo) = Lo^3 /[ (x+1Lo)(x+2Lo)(x+3Lo)(x+4Lo) ]
+    //
+    // 1/(x+Lo)  - 1/(x+2 Lo)
+    double epsilon      = Lo/3;
+    std::vector<RealD> alpha({1.0/6.0,-1.0/2.0,1.0/2.0,-1.0/6.0});
+    std::vector<RealD> shifts({Lo,Lo+epsilon,Lo+2*epsilon,Lo+3*epsilon});
+    std::vector<RealD> tols({tol,tol,tol,tol});
+    std::cout << "sizes "<<alpha.size()<<" "<<shifts.size()<<" "<<tols.size()<<std::endl;
+
+    MultiShiftFunction msf(4,0.0,95.0);
+    std::cout << "msf constructed "<<std::endl;
+    msf.poles=shifts;
+    msf.residues=alpha;
+    msf.tolerances=tols;
+    msf.norm=0.0;
+    msf.order=alpha.size();
+    ConjugateGradientMultiShift<FineField> MSCG(maxit,msf);
+    
+    for(int b =0;b<nbasis;b++)
+    {
+      gaussian(RNG,noise);
+      scale = std::pow(norm2(noise),-0.5); 
+      noise=noise*scale;
+
+      // Initial matrix element
+      hermop.Op(noise,Mn);
+      if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
+
+      MSCG(hermop,noise,Mn);
+      scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
+      subspace[b]   = Mn;
+      hermop.Op(Mn,tmp); 
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
+
+    }
+
+  }
+  virtual void RefineSubspace(LinearOperatorBase<FineField> &hermop,
+			      double Lo,double tol,int maxit)
+  {
+    FineField tmp(FineGrid);
+    for(int b =0;b<nbasis;b++)
+    {
+      RealD MirsShift = Lo;
+      ConjugateGradient<FineField>  CGsloppy(tol,maxit,false);
+      ShiftedHermOpLinearOperator<FineField> ShiftedFineHermOp(hermop,MirsShift);
+      CGsloppy(hermop,subspace[b],tmp);
+      subspace[b]=tmp;
+    }
+  }
+
+  
+  
 };
 NAMESPACE_END(Grid);
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
@ -72,6 +72,17 @@ public:
  GridBase      * FineGrid(void)       { return _FineGrid; };   // this is all the linalg routines need to know
  GridCartesian * CoarseGrid(void)     { return _CoarseGrid; };   // this is all the linalg routines need to know

+  void ShiftMatrix(RealD shift)
+  {
+    int Nd=_FineGrid->Nd(); 
+    Coordinate zero_shift(Nd,0);
+    for(int p=0;p<geom.npoint;p++){
+      if ( zero_shift==geom.shifts[p] ) {
+	_A[p] = _A[p]+shift;
+	_Adag[p] = _Adag[p]+shift;
+      }
+    }    
+  }
  void ProjectNearestNeighbour(RealD shift, GeneralCoarseOp &CopyMe)
  {
    int nfound=0;
@ -101,17 +112,7 @@ public:
  {
    {
      int npoint = _geom.npoint;
-      autoView( Stencil_v  , Stencil, AcceleratorRead);
-      int osites=Stencil.Grid()->oSites();
-      for(int ss=0;ss<osites;ss++){
-	for(int point=0;point<npoint;point++){
-	  auto SE = Stencil_v.GetEntry(point,ss);
-	  int o = SE->_offset;
-	  assert( o< osites);
-	}
-      }    
    }
-
    _A.resize(geom.npoint,CoarseGrid);
    _Adag.resize(geom.npoint,CoarseGrid);
  }
@ -127,6 +128,7 @@ public:
  void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
  {
    RealD tviews=0;    RealD ttot=0;    RealD tmult=0;   RealD texch=0;    RealD text=0; RealD ttemps=0; RealD tcopy=0;
+    RealD tmult2=0;

    ttot=-usecond();
    conformable(CoarseGrid(),in.Grid());
@ -188,16 +190,17 @@ public:
 	  typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
 	  int32_t ss   = spb/(nbasis*npoint);
 	  int32_t bp   = spb%(nbasis*npoint);
-	  int32_t b    = bp/npoint;
-	  int32_t point= bp%npoint;
+	  int32_t point= bp/nbasis;
+	  int32_t b    = bp%nbasis;
 	  auto SE  = Stencil_v.GetEntry(point,ss);
 	  auto nbr = coalescedReadGeneralPermute(in_v[SE->_offset],SE->_permute,Nd);
-	  auto res = coalescedRead(Aview_p[point][ss](b,0))*nbr(0);
+	  auto res = coalescedRead(Aview_p[point][ss](0,b))*nbr(0);
 	  for(int bb=1;bb<nbasis;bb++) {
-	    res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
+	    res = res + coalescedRead(Aview_p[point][ss](bb,b))*nbr(bb);
 	  }
 	  coalescedWrite(Vview_p[point][ss](b),res);
      });
+      tmult2-=usecond();
      accelerator_for(sb, osites*nbasis, Nsimd, {
 	  int ss = sb/nbasis;
 	  int b  = sb%nbasis;
@ -207,6 +210,7 @@ public:
 	  }
 	  coalescedWrite(out_v[ss](b),res);
      });
+      tmult2+=usecond();
      tmult+=usecond();
      for(int p=0;p<npoint;p++) {
 	AcceleratorViewContainer_h[p].ViewClose();
@ -222,15 +226,16 @@ public:
    std::cout << GridLogPerformance<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
    std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
    std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<" of which mult2  "<<tmult2<<" us"<<std::endl;
    std::cout << GridLogPerformance<<"Coarse Mult ext  "<<text<<" us"<<std::endl;
    std::cout << GridLogPerformance<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
    std::cout << GridLogPerformance<<"Coarse Mult copy  "<<tcopy<<" us"<<std::endl;
    std::cout << GridLogPerformance<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
    //    std::cout << GridLogPerformance<<std::endl;
-    //    std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
-    //    std::cout << GridLogPerformance<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
-    //    std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
-    //    std::cout << GridLogPerformance<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;

  };

@ -408,7 +413,7 @@ public:
 	autoView( FT_v  , FT[k], AcceleratorRead);
 	accelerator_for(sss, osites, 1, {
 	    for(int j=0;j<nbasis;j++){
-	      A_v[sss](j,i) = FT_v[sss](j);
+	      A_v[sss](i,j) = FT_v[sss](j);
 	    }
        });
      }
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@ -175,8 +175,56 @@ template<class T> using cshiftAllocator = std::allocator<T>;

 template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;           
 template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;           
-template<class T> using commVector = std::vector<T,devAllocator<T> >;
-template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
+template<class T> using commVector    = std::vector<T,devAllocator<T> >;
+template<class T> using deviceVector  = std::vector<T,devAllocator<T> >;
+template<class T> using cshiftVector  = std::vector<T,cshiftAllocator<T> >;
+
+/*
+template<class T> class vecView
+{
+ protected:
+  T * data;
+  uint64_t size;
+  ViewMode mode;
+  void * cpu_ptr;
+ public:
+  accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
+  vecView(std::vector<T> &refer_to_me,ViewMode _mode)
+  {
+    cpu_ptr = &refer_to_me[0];
+    size = refer_to_me.size();
+    mode = _mode;
+    data =(T *) MemoryManager::ViewOpen(cpu_ptr,
+					size*sizeof(T),
+					mode,
+					AdviseDefault);
+  }
+  void ViewClose(void)
+  { // Inform the manager
+    MemoryManager::ViewClose(this->cpu_ptr,this->mode);    
+  }
+};
+
+template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode)
+{
+  vecView<T> ret(vec,_mode); // does the open
+  return ret;                // must be closed
+}
+
+// Little autoscope assister
+template<class View> 
+class VectorViewCloser
+{
+  View v;  // Take a copy of view and call view close when I go out of scope automatically
+ public:
+  VectorViewCloser(View &_v) : v(_v) {};
+  ~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose();  MemoryManager::NotifyDeletion(ptr);}
+};
+
+#define autoVecView(v_v,v,mode)					\
+  auto v_v = VectorView(v,mode);				\
+  ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
+*/

 NAMESPACE_END(Grid);

--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@ -203,6 +203,27 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
  return real(nrm); 
 }

+
+template<class Op,class T1>
+inline auto norm2(const LatticeUnaryExpression<Op,T1> & expr)  ->RealD
+{
+  return norm2(closure(expr));
+}
+
+template<class Op,class T1,class T2>
+inline auto norm2(const LatticeBinaryExpression<Op,T1,T2> & expr)      ->RealD
+{
+  return norm2(closure(expr));
+}
+
+
+template<class Op,class T1,class T2,class T3>
+inline auto norm2(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)      ->RealD
+{
+  return norm2(closure(expr));
+}
+
+
 //The global maximum of the site norm2
 template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
 {
--- a/systems/Frontier/sourceme.sh
+++ b/systems/Frontier/sourceme.sh
@ -3,7 +3,7 @@ spack load c-lime
 #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib
 module load emacs 
 module load PrgEnv-gnu
-module load rocm
+module load rocm/5.3.0
 module load cray-mpich/8.1.23
 module load gmp
 module load cray-fftw
--- a/tests/debug/Test_general_coarse_hdcg_phys.cc
+++ b/tests/debug/Test_general_coarse_hdcg_phys.cc
@ -67,6 +67,22 @@ void LoadOperator(Coarsened &Operator,std::string file)
  Operator.ExchangeCoarseLinks();
 #endif
 }
+template<class Coarsened>
+void ReLoadOperator(Coarsened &Operator,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  Grid::ScidacReader RD ;
+  RD.open(file);
+  assert(Operator._A.size()==Operator.geom.npoint);
+  for(int p=0;p<Operator.geom.npoint;p++){
+    auto tmp=Operator.Cell.Extract(Operator._A[p]);
+    RD.readScidacFieldRecord(tmp,record,0);
+    Operator._A[p] = Operator.Cell.ExchangePeriodic(tmp);
+  }    
+  RD.close();
+#endif
+}
 template<class aggregation>
 void SaveBasis(aggregation &Agg,std::string file)
 {
@ -96,14 +112,6 @@ void LoadBasis(aggregation &Agg, std::string file)
 #endif
 }

-
-template<class Field> class TestSolver : public LinearFunction<Field> {
-public:
-  TestSolver() {};
-  void operator() (const Field &in, Field &out){    out = Zero();  }     
-};
-
-
 RealD InverseApproximation(RealD x){
  return 1.0/x;
 }
@ -123,7 +131,7 @@ public:
  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  };
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
 };
-template<class Field,class Matrix> class ChebyshevSmoother : public LinearFunction<Field>
+template<class Field> class ChebyshevSmoother : public LinearFunction<Field>
 {
 public:
  using LinearFunction<Field>::operator();
@ -144,12 +152,35 @@ public:
  }
 };

+template<class Field> class CGSmoother : public LinearFunction<Field>
+{
+public:
+  using LinearFunction<Field>::operator();
+  typedef LinearOperatorBase<Field> FineOperator;
+  FineOperator   & _SmootherOperator;
+  int iters;
+  CGSmoother(int _iters, FineOperator &SmootherOperator) :
+    _SmootherOperator(SmootherOperator),
+    iters(_iters)
+  {
+    std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
+  };
+  void operator() (const Field &in, Field &out) 
+  {
+    ConjugateGradient<Field>  CG(0.0,iters,false); // non-converge is just fine in a smoother
+    CG(_SmootherOperator,in,out);
+  }
+};
+
+
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

  const int Ls=24;
-  const int nbasis = 40;
+  const int nbasis = 62;
+  //  const int nbasis = 56;
+  //  const int nbasis = 44;
  const int cb = 0 ;
  RealD mass=0.00078;
  RealD M5=1.8;
@ -164,10 +195,12 @@ int main (int argc, char ** argv)
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  // Construct a coarsened grid with 4^4 cell
+  Coordinate Block({4,4,6,4});
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
-    clatt[d] = clatt[d]/4;
+    clatt[d] = clatt[d]/Block[d];
  }
+
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
 							    GridDefaultSimd(Nd,vComplex::Nsimd()),
 							    GridDefaultMpi());;
@ -186,7 +219,7 @@ int main (int argc, char ** argv)
  LatticeGaugeField Umu(UGrid);

  FieldMetaData header;
-  std::string file("ckpoint_lat.2250");
+  std::string file("ckpoint_lat.1000");
  NerscIO::readConfiguration(Umu,header,file);

  //////////////////////// Fermion action //////////////////////////////////
@ -196,14 +229,13 @@ int main (int argc, char ** argv)

  typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
  HermFineMatrix FineHermOp(HermOpEO);
-  
+
  LatticeFermion result(FrbGrid); result=Zero();

  LatticeFermion    src(FrbGrid); random(RNG5,src);

  // Run power method on FineHermOp
  PowerMethod<LatticeFermion>       PM;   PM(HermOpEO,src);
-
 
  ////////////////////////////////////////////////////////////
  ///////////// Coarse basis and Little Dirac Operator ///////
@ -223,29 +255,110 @@ int main (int argc, char ** argv)
  ////////////////////////////////////////////////////////////
  LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);

-  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys.nolex.scidac");
-  std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys.nolex.scidac");
+  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.rat.scidac.62");
+  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.rat.scidac.62");
+  std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.rat.scidac.62");
  bool load_agg=true;
+  bool load_refine=true;
  bool load_mat=true;
  if ( load_agg ) {
    LoadBasis(Aggregates,subspace_file);
  } else {
+
+    // NBASIS=40
+    // Best so far: ord 2000 [0.01,95], 500,500  -- 466 iters
+    // slurm-398626.out:Grid : Message : 141.295253 s : 500 filt [1] <n|MdagM|n> 0.000103622063
+
+
+    //Grid : Message : 33.870465 s :  Chebyshev subspace pass-1 : ord 2000 [0.001,95]
+    //Grid : Message : 33.870485 s :  Chebyshev subspace pass-2 : nbasis40 min 1000 step 1000 lo0
+    //slurm-1482200.out : filt ~ 0.004 -- not as low mode projecting -- took 626 iters
+
+    // To try: 2000 [0.1,95]  ,2000,500,500 -- slurm-1482213.out 586 iterations
+
+    // To try: 2000 [0.01,95] ,2000,500,500 -- 469 (think I bumped 92 to 95) (??)
+    // To try: 2000 [0.025,95],2000,500,500
+    // To try: 2000 [0.005,95],2000,500,500
+
+    // NBASIS=44 -- HDCG paper was 64 vectors; AMD compiler craps out at 48
+    // To try: 2000 [0.01,95] ,2000,500,500 -- 419 lowest slurm-1482355.out
+    // To try: 2000 [0.025,95] ,2000,500,500 -- 487 
+    // To try: 2000 [0.005,95] ,2000,500,500
+    /*
+      Smoother [3,92] order 16
+slurm-1482355.out:Grid : Message : 35.239686 s :  Chebyshev subspace pass-1 : ord 2000 [0.01,95]
+slurm-1482355.out:Grid : Message : 35.239714 s :  Chebyshev subspace pass-2 : nbasis44 min 500 step 500 lo0
+slurm-1482355.out:Grid : Message : 5561.305552 s : HDCG: Pcg converged in 419 iterations and 2616.202598 s
+
+slurm-1482367.out:Grid : Message : 43.157235 s :  Chebyshev subspace pass-1 : ord 2000 [0.025,95]
+slurm-1482367.out:Grid : Message : 43.157257 s :  Chebyshev subspace pass-2 : nbasis44 min 500 step 500 lo0
+slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 iterations and 3131.185821 s
+    */
+		 /*
+		   Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
+				       95.0,0.0075,
+				       2500,
+				       500,
+				       500,
+				       0.0);
+		 */
+
+		 /*
+		   Aggregates.CreateSubspaceChebyshevPowerLaw(RNG5,HermOpEO,nbasis,
+							      95.0,
+							      2000);
+		 */
+
+    Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO,
+					0.0003,1.0e-5,2000); // Lo, tol, maxit
+  /*
    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
 				       95.0,0.05,
-				       1000,
-				       200,
-				       200,
+				       2000,
+				       500,
+				       500,
 				       0.0);
+ */
+    /*
+      Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
+				       95.0,0.01,
+				       2000,
+				       500,
+				       500,
+				       0.0);
+    */
+    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500); -- running slurm-1484934.out nbasis 56
+
+    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500); <== last run
    SaveBasis(Aggregates,subspace_file);
  }
+
+  int refine=1;
+  if(refine){
+    if ( load_refine ) {
+      LoadBasis(Aggregates,refine_file);
+    } else {
+      // HDCG used Pcg to refine
+      Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000);
+      SaveBasis(Aggregates,refine_file);
+    }
+  }
+
+  Aggregates.Orthogonalise();
  if ( load_mat ) {
    LoadOperator(LittleDiracOp,ldop_file);
  } else {
    LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates);
    SaveOperator(LittleDiracOp,ldop_file);
  }
-  
+
+  // I/O test:
+  CoarseVector c_src(Coarse5d);   random(CRNG,c_src);
+  CoarseVector c_res(Coarse5d); 
+  CoarseVector c_ref(Coarse5d);
+
  // Try projecting to one hop only
+  //  LittleDiracOp.ShiftMatrix(1.0e-4);
  LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d);
  LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n

@ -256,21 +369,28 @@ int main (int argc, char ** argv)
  //////////////////////////////////////////
  // Build a coarse lanczos
  //////////////////////////////////////////
-  //  Chebyshev<CoarseVector>      IRLCheby(0.01,44.0,201);  // 1 iter
-  Chebyshev<CoarseVector>      IRLCheby(0.005,44.0,401);  // 1 iter
+  //  Chebyshev<CoarseVector>      IRLCheby(0.012,40.0,201);  //500 HDCG iters
+  //  int Nk=512; // Didn't save much
+  //  int Nm=640;
+  //  int Nstop=400;
+
+  //  Chebyshev<CoarseVector>      IRLCheby(0.005,40.0,201);  //319 HDCG iters @ 128//160 nk.
+  //  int Nk=128;
+  //  int Nm=160;
+  Chebyshev<CoarseVector>      IRLCheby(0.005,40.0,201);  //319 HDCG iters @ 128//160 nk.
+  int Nk=192;
+  int Nm=256;
+  int Nstop=Nk;
+  
+  //  Chebyshev<CoarseVector>      IRLCheby(0.010,45.0,201);  // 1 iter
  FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp);
  PlainHermOp<CoarseVector>    IRLOp    (CoarseOp);
-  int Nk=160;
-  int Nm=240;
-  int Nstop=Nk;
-  ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20);
+  
+  ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1e-5,10);

  int Nconv;
  std::vector<RealD>            eval(Nm);
  std::vector<CoarseVector>     evec(Nm,Coarse5d);
-  CoarseVector c_src(Coarse5d); c_src=1.0;
-  CoarseVector c_res(Coarse5d); 
-  CoarseVector c_ref(Coarse5d); 

  PowerMethod<CoarseVector>       cPM;   cPM(CoarseOp,c_src);

@ -280,9 +400,9 @@ int main (int argc, char ** argv)
  //////////////////////////////////////////
  // Build a coarse space solver
  //////////////////////////////////////////
-  int maxit=20000;
-  ConjugateGradient<CoarseVector>  CG(1.0e-8,maxit,false);
-  ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,10000,false);
+  int maxit=30000;
+  ConjugateGradient<CoarseVector>  CG(1.0e-10,maxit,false);
+  ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false);
  ZeroGuesser<CoarseVector> CoarseZeroGuesser;

  //  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser);
@ -306,8 +426,7 @@ int main (int argc, char ** argv)
  //////////////////////////////////////////////////////////////////////
  // Coarse ADEF1 with deflation space
  //////////////////////////////////////////////////////////////////////
-  ChebyshevSmoother<CoarseVector,HermMatrix >
-    CoarseSmoother(1.0,37.,8,CoarseOpProj);  // just go to sloppy 0.1 convergence
+  ChebyshevSmoother<CoarseVector >  CoarseSmoother(1.0,37.,8,CoarseOpProj);  // just go to sloppy 0.1 convergence
    //  CoarseSmoother(0.1,37.,8,CoarseOpProj);  //
  //  CoarseSmoother(0.5,37.,6,CoarseOpProj);  //  8 iter 0.36s
  //    CoarseSmoother(0.5,37.,12,CoarseOpProj);  // 8 iter, 0.55s
@ -375,41 +494,134 @@ int main (int argc, char ** argv)
  // -- iii) Possible 1 hop project and/or preconditioning it - easy - PrecCG it and
  //         use a limited stencil. Reread BFM code to check on evecs / deflation strategy with prec
  //
-  std::vector<RealD> los({3.0}); // Nbasis 40 == 36,36 iters
+  //
+  //
+  //
+  
+  std::vector<RealD> los({2.0,2.5}); // Nbasis 40 == 36,36 iters

  //  std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults)
-  std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)  
+  //  std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)
+  std::vector<int> ords({9}); // Nbasis 40 == 40 iters (320 mults)  

+ /*
+   Smoother opt @56 nbasis, 0.04 convergence, 192 evs
+ ord lo
+
+ 16   0.1  no converge -- likely sign indefinite
+ 32   0.1  no converge -- likely sign indefinite(?)
+
+ 16   0.5  422
+ 32   0.5  302
+ 
+ 8   1.0  575
+ 12  1.0  449
+ 16  1.0  375
+ 32  1.0  302
+
+ 12  3.0  476
+ 16  3.0  319
+ 32  3.0  306
+
+ Powerlaw setup 62 vecs
+slurm-1494943.out:Grid : Message : 4874.186617 s : HDCG: Pcg converged in 171 iterations and 1706.548006 s 1.0 32
+slurm-1494943.out:Grid : Message : 6490.121648 s : HDCG: Pcg converged in 194 iterations and 1616.219654 s 1.0 16
+
+ Cheby setup: 56vecs
+ -- CG smoother O(16): 487
+ 
+Power law setup, 56 vecs -- lambda^-5
+slurm-1494383.out:Grid : Message : 4377.173265 s : HDCG: Pcg converged in 204 iterations and 1153.548935 s 1.0 32
+
+Power law setup, 56 vecs -- lambda^-3
+
+slurm-1494242.out:Grid : Message : 4370.464814 s : HDCG: Pcg converged in 204 iterations and 1143.494776 s  1.0 32
+slurm-1494242.out:Grid : Message : 5432.414820 s : HDCG: Pcg converged in 237 iterations and 1061.455882 s  1.0 16
+slurm-1494242.out:Grid : Message : 6588.727977 s : HDCG: Pcg converged in 205 iterations and 1156.565210 s  0.5 32
+
+ Power law setup, 56 vecs -- lambda^-4
+ -- CG smoother    O(16): 290
+ -- Cheby smoother O(16): 218 -- getting close to the deflation level I expect 169 from BFM paper @O(7) smoother and 64 nbasis
+
+Grid : Message : 2790.797194 s : HDCG: Pcg converged in 190 iterations and 1049.563182 s 1.0 32
+Grid : Message : 3766.374396 s : HDCG: Pcg converged in 218 iterations and 975.455668 s  1.0 16
+Grid : Message : 4888.746190 s : HDCG: Pcg converged in 191 iterations and 1122.252055 s 0.5 32
+Grid : Message : 5956.679661 s : HDCG: Pcg converged in 231 iterations and 1067.812850 s 0.5 16
+
+Grid : Message : 2767.405829 s : HDCG: Pcg converged in 218 iterations and 967.214067 s -- 16
+Grid : Message : 3816.165905 s : HDCG: Pcg converged in 251 iterations and 1048.636269 s -- 12
+Grid : Message : 5121.206572 s : HDCG: Pcg converged in 318 iterations and 1304.916168 s -- 8
+
+ 
+[paboyle@login2.crusher debug]$ grep -v Memory slurm-402426.out  | grep converged | grep HDCG -- [1.0,16] cheby
+Grid : Message : 5185.521063 s : HDCG: Pcg converged in 377 iterations and 1595.843529 s
+
+[paboyle@login2.crusher debug]$ grep HDCG  slurm-402184.out | grep onver
+Grid : Message : 3760.438160 s : HDCG: Pcg converged in 422 iterations and 2129.243141 s
+Grid : Message : 5660.588015 s : HDCG: Pcg converged in 308 iterations and 1900.026821 s
+
+ 
+Grid : Message : 4238.206528 s : HDCG: Pcg converged in 575 iterations and 2657.430676 s
+Grid : Message : 6345.880344 s : HDCG: Pcg converged in 449 iterations and 2108.505208 s
+
+grep onverg slurm-401663.out | grep HDCG
+Grid : Message : 3900.817781 s : HDCG: Pcg converged in 476 iterations and 1992.591311 s
+Grid : Message : 5647.202699 s : HDCG: Pcg converged in 306 iterations and 1746.838660 s
+
+
+[paboyle@login2.crusher debug]$ grep converged slurm-401775.out | grep HDCG
+Grid : Message : 3583.177025 s : HDCG: Pcg converged in 375 iterations and 1800.896037 s
+Grid : Message : 5348.342243 s : HDCG: Pcg converged in 302 iterations and 1765.045018 s
+
+Conclusion: higher order smoother is doing better. Much better. Use a Krylov smoother instead Mirs as in BFM version.
+
+ */
+				      //
  for(int l=0;l<los.size();l++){

    RealD lo = los[l];

    for(int o=0;o<ords.size();o++){

-      ConjugateGradient<CoarseVector>  CGsloppy(5.0e-2,maxit,false);
+      ConjugateGradient<CoarseVector>  CGsloppy(4.0e-2,maxit,false);
      HPDSolver<CoarseVector> HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser);
      
      //    ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,10,FineHermOp); // 36 best case
-      ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,ords[o],FineHermOp);  // 311
+      ChebyshevSmoother<LatticeFermionD > ChebySmooth(lo,95,ords[o],FineHermOp);  // 311

+      /*
+       * CG smooth 11 iter: 
+       slurm-403825.out:Grid : Message : 4369.824339 s : HDCG: fPcg converged in 215 iterations 3.0
+       slurm-403908.out:Grid : Message : 3955.897470 s : HDCG: fPcg converged in 236 iterations 1.0
+       slurm-404273.out:Grid : Message : 3843.792191 s : HDCG: fPcg converged in 210 iterations 2.0
+       * CG smooth 9 iter: 
+      */
+      //
+      RealD MirsShift = lo;
+      ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
+      CGSmoother<LatticeFermionD> CGsmooth(ords[o],ShiftedFineHermOp) ;
+  
      //////////////////////////////////////////
      // Build a HDCG solver
      //////////////////////////////////////////
      TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
-	HDCG(1.0e-8, 100,
+	HDCG(1.0e-8, 700,
 	     FineHermOp,
-	     Smoother,
+	     //	     ChebySmooth,
+	     CGsmooth,
 	     HPDSolveSloppy,
 	     HPDSolve,
 	     Aggregates);

-      TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
-	HDCGdefl(1.0e-8, 100,
+      /*
+	TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
+	HDCGdefl(1.0e-8, 700,
 		 FineHermOp,
 		 Smoother,
 		 cADEF1,
 		 HPDSolve,
 		 Aggregates);
+      */
      
      //      result=Zero();
      //      HDCGdefl(src,result);
Author	SHA1	Message	Date
Peter Boyle	09946cf1ba	Improved, works on 48^3 moving to multiRHS optimisations	2023-11-15 18:03:05 -05:00
Peter Boyle	f4fa95e7cb	Use 5.3.0	2023-11-15 18:01:38 -05:00
Peter Boyle	100e29e35e	Allow expression as argument to norm2	2023-11-15 18:00:44 -05:00
Peter Boyle	4cbe471a83	devVector	2023-11-15 18:00:07 -05:00
Peter Boyle	8bece1f861	Faster to transpose the matrix and apply with column major order	2023-11-15 17:58:38 -05:00
Peter Boyle	a3ca71ec01	Lots more setup options, still working on them	2023-11-15 17:58:04 -05:00
Peter Boyle	e0543e8af5	Implement flexible preconditioned CG	2023-11-15 17:57:39 -05:00
Peter Boyle	c1eb80d01a	Print which have converged	2023-11-15 17:57:08 -05:00
Peter Boyle	a26121d97b	Better printing	2023-11-15 17:56:45 -05:00
Peter Boyle	043031a757	Report resid on failed convergence	2023-11-15 17:56:22 -05:00
Peter Boyle	807aeebe4c	Resize tol in constructor	2023-11-15 17:55:57 -05:00
Peter Boyle	8aa1a37aad	For Mirs preconditioner solver	2023-11-15 17:55:32 -05:00