MultiRHS coarse

MultiRHS coarse space
Add 48^3 rtest
2025-07-14 03:57:06 +01:00 · 2024-01-04 12:01:17 -05:00 · 2024-01-04 12:00:53 -05:00 · 2024-01-04 12:00:01 -05:00
3 changed files with 1114 additions and 211 deletions
--- a/Grid/algorithms/iterative/AdefGeneric.h
+++ b/Grid/algorithms/iterative/AdefGeneric.h
@ -69,279 +69,339 @@ class TwoLevelCG : public LinearFunction<Field>
  
  virtual void operator() (const Field &src, Field &x)
  {
-#if 0
-    Field resid(grid);
+    std::cout << GridLogMessage<<"HDCG: fPcg starting"<<std::endl;
    RealD f;
    RealD rtzp,rtz,a,d,b;
    RealD rptzp;
-    
-    Field p(grid);
+
+    /////////////////////////////
+    // Set up history vectors
+    /////////////////////////////
+    int mmax = 5;
+    std::cout << GridLogMessage<<"HDCG: fPcg allocating"<<std::endl;
+    std::vector<Field> p(mmax,grid);
+    std::vector<Field> mmp(mmax,grid);
+    std::vector<RealD> pAp(mmax);
    Field z(grid);
    Field tmp(grid);
-    Field mmp(grid);
-    Field r  (grid);
-    Field mu (grid);
-    Field rp (grid);
+    Field  mp (grid);
+    Field  r  (grid);
+    Field  mu (grid);
    
+    std::cout << GridLogMessage<<"HDCG: fPcg allocated"<<std::endl;
    //Initial residual computation & set up
-    double tn;
-
+    RealD guess   = norm2(x);
+    std::cout << GridLogMessage<<"HDCG: fPcg guess nrm "<<guess<<std::endl;
+    RealD src_nrm = norm2(src);
+    std::cout << GridLogMessage<<"HDCG: fPcg src nrm "<<src_nrm<<std::endl;
+    
+    if ( src_nrm == 0.0 ) {
+      std::cout << GridLogMessage<<"HDCG: fPcg given trivial source norm "<<src_nrm<<std::endl;
+      x=Zero();
+    }
+    RealD tn;
+    
    GridStopWatch HDCGTimer;
    HDCGTimer.Start();
    //////////////////////////
    // x0 = Vstart -- possibly modify guess
    //////////////////////////
-    x=Zero();
    Vstart(x,src);
-
+    
    // r0 = b -A x0
-    _FineLinop.HermOp(x,mmp);
-
-    axpy(r, -1.0, mmp, src);    // Recomputes r=src-x0
-    rp=r;
+    _FineLinop.HermOp(x,mmp[0]);
+    axpy (r, -1.0,mmp[0], src);    // Recomputes r=src-Ax0
+    {
+      double n1 = norm2(x);
+      double n2 = norm2(mmp[0]);
+      double n3 = norm2(r);
+      std::cout<<GridLogMessage<<"x,vstart,r = "<<n1<<" "<<n2<<" "<<n3<<std::endl;
+    }

    //////////////////////////////////
    // Compute z = M1 x
    //////////////////////////////////
    PcgM1(r,z);
    rtzp =real(innerProduct(r,z));
-
+    
    ///////////////////////////////////////
-    // Except Def2, M2 is trivial
+    // Solve for Mss mu = P A z and set p = z-mu
+    // Def2 p = 1 - Q Az = Pright z
+    // Other algos M2 is trivial
    ///////////////////////////////////////
-    p=z;
+    PcgM2(z,p[0]);

    RealD ssq =  norm2(src);
    RealD rsq =  ssq*Tolerance*Tolerance;

-    GRID_TRACE("MultiGrid TwoLevel ");
-    std::cout<<GridLogMessage<<"HDCG: k=0 residual "<<rtzp<<" target rsq "<<rsq<<" ssq "<<ssq<<std::endl;
+    std::cout << GridLogMessage<<"HDCG: k=0 residual "<<rtzp<<" rsq "<<rsq<<"\n";
+
+    Field pp(grid);
+
+    for (int k=0;k<=MaxIterations;k++){
    
-    for (int k=1;k<=MaxIterations;k++){
+      int peri_k  = k % mmax;
+      int peri_kp = (k+1) % mmax;

      rtz=rtzp;
-      d= PcgM3(p,mmp);
+      d= PcgM3(p[peri_k],mmp[peri_k]);
      a = rtz/d;
+    
+      // Memorise this
+      pAp[peri_k] = d;
+      
+      axpy(x,a,p[peri_k],x);
+      RealD rn = axpy_norm(r,-a,mmp[peri_k],r);

-      axpy(x,a,p,x);
-      RealD rn = axpy_norm(r,-a,mmp,r);
-
+      // Compute z = M x
      PcgM1(r,z);
-
-      rtzp =real(innerProduct(r,z));
-
-      int ipcg=1; // almost free inexact preconditioned CG
-      if (ipcg) {
-	rptzp =real(innerProduct(rp,z));
-      } else {
-	rptzp =0;
+      
+      {
+	RealD n1,n2;
+	n1=norm2(r);
+	n2=norm2(z);
+	std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : vector r,z "<<n1<<" "<<n2<<"\n";
      }
-      b = (rtzp-rptzp)/rtz;
+      rtzp =real(innerProduct(r,z));
+      std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : inner rtzp "<<rtzp<<"\n";

+      //    PcgM2(z,p[0]);
      PcgM2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
+      
+      p[peri_kp]=mu;

-      axpy(p,b,p,mu);  // mu = A r
+      // Standard search direction  p -> z + b p    
+      b = (rtzp)/rtz;
+      
+      int northog;
+      // k=zero  <=> peri_kp=1;        northog = 1
+      // k=1     <=> peri_kp=2;        northog = 2
+      // ...               ...                  ...
+      // k=mmax-2<=> peri_kp=mmax-1;   northog = mmax-1
+      // k=mmax-1<=> peri_kp=0;        northog = 1
+
+      //    northog     = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
+      northog     = (k>mmax-1)?(mmax-1):k;        // This is the fCG-Tr(mmax-1) algorithm
+    
+      std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
+      for(int back=0; back < northog; back++){
+	int peri_back = (k-back)%mmax;
+	RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
+	RealD beta = -pbApk/pAp[peri_back];
+	axpy(p[peri_kp],beta,p[peri_back],p[peri_kp]);
+      }

      RealD rrn=sqrt(rn/ssq);
      RealD rtn=sqrt(rtz/ssq);
-      std::cout<<GridLogMessage<<"HDCG: Pcg k= "<<k<<" residual = "<<rrn<<std::endl;
+      RealD rtnp=sqrt(rtzp/ssq);

-      if ( ipcg ) {
-	axpy(rp,0.0,r,r);
-      }
+      std::cout<<GridLogMessage<<"HDCG: fPcg k= "<<k<<" residual = "<<rrn<<"\n";

      // Stopping condition
      if ( rn <= rsq ) { 

 	HDCGTimer.Stop();
-	std::cout<<GridLogMessage<<"HDCG: Pcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
-
-	_FineLinop.HermOp(x,mmp);			  
-	axpy(tmp,-1.0,src,mmp);
-
-	RealD  mmpnorm = sqrt(norm2(mmp));
+	std::cout<<GridLogMessage<<"HDCG: fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
+	
+	_FineLinop.HermOp(x,mmp[0]);			  
+	axpy(tmp,-1.0,src,mmp[0]);
+	
+	RealD  mmpnorm = sqrt(norm2(mmp[0]));
 	RealD  xnorm   = sqrt(norm2(x));
 	RealD  srcnorm = sqrt(norm2(src));
 	RealD  tmpnorm = sqrt(norm2(tmp));
 	RealD  true_residual = tmpnorm/srcnorm;
 	std::cout<<GridLogMessage
-		 <<"HDCG: true residual is "<<true_residual
-		 <<" solution "<<xnorm
-		 <<" source "<<srcnorm
-		 <<" mmp "<<mmpnorm	  
-		 <<std::endl;
-
-	return;
-      }
-
-    }
-    std::cout<<GridLogMessage<<"HDCG: not converged"<<std::endl;
-    RealD  xnorm   = sqrt(norm2(x));
-    RealD  srcnorm = sqrt(norm2(src));
-    std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
-    
-    return ;
-#else
-  RealD f;
-  RealD rtzp,rtz,a,d,b;
-  RealD rptzp;
-
-  /////////////////////////////
-  // Set up history vectors
-  /////////////////////////////
-  int mmax = 20;
-  std::vector<Field> p(mmax,grid);
-  std::vector<Field> mmp(mmax,grid);
-  std::vector<RealD> pAp(mmax);
-  Field z(grid);
-  Field tmp(grid);
-  Field  mp (grid);
-  Field  r  (grid);
-  Field  mu (grid);
-
-  //Initial residual computation & set up
-  RealD guess   = norm2(x);
-  RealD src_nrm = norm2(src);
-
-  if ( src_nrm == 0.0 ) {
-    std::cout << GridLogMessage<<"HDCG: fPcg given trivial source norm "<<src_nrm<<std::endl;
-    x=Zero();
-  }
-  RealD tn;
-
-  GridStopWatch HDCGTimer;
-  HDCGTimer.Start();
-  //////////////////////////
-  // x0 = Vstart -- possibly modify guess
-  //////////////////////////
-  Vstart(x,src);
-
-  // r0 = b -A x0
-  _FineLinop.HermOp(x,mmp[0]);
-  axpy (r, -1.0,mmp[0], src);    // Recomputes r=src-Ax0
-  {
-    double n1 = norm2(x);
-    double n2 = norm2(mmp[0]);
-    double n3 = norm2(r);
-    std::cout<<GridLogMessage<<"x,vstart,r = "<<n1<<" "<<n2<<" "<<n3<<std::endl;
-  }
-
-  //////////////////////////////////
-  // Compute z = M1 x
-  //////////////////////////////////
-  PcgM1(r,z);
-  rtzp =real(innerProduct(r,z));
-
-  ///////////////////////////////////////
-  // Solve for Mss mu = P A z and set p = z-mu
-  // Def2: p = 1 - Q Az = Pright z 
-  // Other algos M2 is trivial
-  ///////////////////////////////////////
-  PcgM2(z,p[0]);
-
-  RealD ssq =  norm2(src);
-  RealD rsq =  ssq*Tolerance*Tolerance;
-
-  std::cout << GridLogMessage<<"HDCG: k=0 residual "<<rtzp<<" rsq "<<rsq<<"\n";
-
-  Field pp(grid);
-
-  for (int k=0;k<=MaxIterations;k++){
-    
-    int peri_k  = k % mmax;
-    int peri_kp = (k+1) % mmax;
-
-    rtz=rtzp;
-    d= PcgM3(p[peri_k],mmp[peri_k]);
-    a = rtz/d;
-    
-    // Memorise this
-    pAp[peri_k] = d;
-
-    
-    axpy(x,a,p[peri_k],x);
-    RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
-
-    // Compute z = M x
-    PcgM1(r,z);
-
-    {
-      RealD n1,n2;
-      n1=norm2(r);
-      n2=norm2(z);
-      std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : vector r,z "<<n1<<" "<<n2<<"\n";
-    }
-    rtzp =real(innerProduct(r,z));
-    std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : inner rtzp "<<rtzp<<"\n";
-
-    //    PcgM2(z,p[0]);
-    PcgM2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
-
-    p[peri_kp]=mu;
-
-    // Standard search direction  p -> z + b p    ; b = 
-    b = (rtzp)/rtz;
-
-    int northog;
-    // k=zero  <=> peri_kp=1;        northog = 1
-    // k=1     <=> peri_kp=2;        northog = 2
-    // ...               ...                  ...
-    // k=mmax-2<=> peri_kp=mmax-1;   northog = mmax-1
-    // k=mmax-1<=> peri_kp=0;        northog = 1
-
-    //    northog     = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
-    northog     = (k>mmax-1)?(mmax-1):k;        // This is the fCG-Tr(mmax-1) algorithm
-    
-    std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
-    for(int back=0; back < northog; back++){
-      int peri_back = (k-back)%mmax;
-      RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
-      RealD beta = -pbApk/pAp[peri_back];
-      axpy(p[peri_kp],beta,p[peri_back],p[peri_kp]);
-    }
-
-    RealD rrn=sqrt(rn/ssq);
-    RealD rtn=sqrt(rtz/ssq);
-    RealD rtnp=sqrt(rtzp/ssq);
-
-    std::cout<<GridLogMessage<<"HDCG: fPcg k= "<<k<<" residual = "<<rrn<<"\n";
-
-    // Stopping condition
-    if ( rn <= rsq ) { 
-
-      HDCGTimer.Stop();
-      std::cout<<GridLogMessage<<"HDCG: fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
-      
-      _FineLinop.HermOp(x,mmp[0]);			  
-      axpy(tmp,-1.0,src,mmp[0]);
-      
-      RealD  mmpnorm = sqrt(norm2(mmp[0]));
-      RealD  xnorm   = sqrt(norm2(x));
-      RealD  srcnorm = sqrt(norm2(src));
-      RealD  tmpnorm = sqrt(norm2(tmp));
-      RealD  true_residual = tmpnorm/srcnorm;
-      std::cout<<GridLogMessage
 	       <<"HDCG: true residual is "<<true_residual
 	       <<" solution "<<xnorm
 	       <<" source "<<srcnorm
 	       <<" mmp "<<mmpnorm	  
 	       <<std::endl;
      
-      return;
+	return;
+      }
+
+    }
+    HDCGTimer.Stop();
+    std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
+    RealD  xnorm   = sqrt(norm2(x));
+    RealD  srcnorm = sqrt(norm2(src));
+    std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
+  }
+
+
+
+  virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
+  {
+    std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
+    src[0].Grid()->Barrier();
+    int nrhs = src.size();
+    std::vector<RealD> f(nrhs);
+    std::vector<RealD> rtzp(nrhs);
+    std::vector<RealD> rtz(nrhs);
+    std::vector<RealD> a(nrhs);
+    std::vector<RealD> d(nrhs);
+    std::vector<RealD> b(nrhs);
+    std::vector<RealD> rptzp(nrhs);
+    /////////////////////////////
+    // Set up history vectors
+    /////////////////////////////
+    int mmax = 2;
+    std::cout << GridLogMessage<<"HDCG: fPcg allocating"<<std::endl;
+    src[0].Grid()->Barrier();
+    std::vector<std::vector<Field> > p(nrhs);   for(int r=0;r<nrhs;r++)  p[r].resize(mmax,grid);
+    std::cout << GridLogMessage<<"HDCG: fPcg allocated p"<<std::endl;
+    src[0].Grid()->Barrier();
+    std::vector<std::vector<Field> > mmp(nrhs); for(int r=0;r<nrhs;r++) mmp[r].resize(mmax,grid);
+    std::cout << GridLogMessage<<"HDCG: fPcg allocated mmp"<<std::endl;
+    src[0].Grid()->Barrier();
+    std::vector<std::vector<RealD> > pAp(nrhs); for(int r=0;r<nrhs;r++) pAp[r].resize(mmax);
+    std::cout << GridLogMessage<<"HDCG: fPcg allocated pAp"<<std::endl;
+    src[0].Grid()->Barrier();
+    std::vector<Field> z(nrhs,grid);
+    std::vector<Field>  mp (nrhs,grid);
+    std::vector<Field>  r  (nrhs,grid);
+    std::vector<Field>  mu (nrhs,grid);
+    std::cout << GridLogMessage<<"HDCG: fPcg allocated z,mp,r,mu"<<std::endl;
+    src[0].Grid()->Barrier();
+
+    //Initial residual computation & set up
+    std::vector<RealD> src_nrm(nrhs);
+    for(int rhs=0;rhs<nrhs;rhs++) {
+      src_nrm[rhs]=norm2(src[rhs]);
+      assert(src_nrm[rhs]!=0.0);
+    }
+    std::vector<RealD> tn(nrhs);
+
+    GridStopWatch HDCGTimer;
+    HDCGTimer.Start();
+    //////////////////////////
+    // x0 = Vstart -- possibly modify guess
+    //////////////////////////
+    for(int rhs=0;rhs<nrhs;rhs++){
+      Vstart(x[rhs],src[rhs]);
+
+      // r0 = b -A x0
+      _FineLinop.HermOp(x[rhs],mmp[rhs][0]);
+      axpy (r[rhs], -1.0,mmp[rhs][0], src[rhs]);    // Recomputes r=src-Ax0
    }

-  }
-  HDCGTimer.Stop();
-  std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
-  RealD  xnorm   = sqrt(norm2(x));
-  RealD  srcnorm = sqrt(norm2(src));
-  std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
-#endif
+    //////////////////////////////////
+    // Compute z = M1 x
+    //////////////////////////////////
+    // This needs a multiRHS version for acceleration
+    PcgM1(r,z);
+
+    std::vector<RealD> ssq(nrhs);
+    std::vector<RealD> rsq(nrhs);
+    std::vector<Field> pp(nrhs,grid);
+
+    for(int rhs=0;rhs<nrhs;rhs++){
+      rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
+      p[rhs][0]=z[rhs];
+      ssq[rhs]=norm2(src[rhs]);
+      rsq[rhs]=  ssq[rhs]*Tolerance*Tolerance;
+      std::cout << GridLogMessage<<"mrhs HDCG: "<<rhs<<" k=0 residual "<<rtzp[rhs]<<" rsq "<<rsq[rhs]<<"\n";
+    }
+
+    std::vector<RealD> rn(nrhs);
+    for (int k=0;k<=MaxIterations;k++){
+    
+      int peri_k  = k % mmax;
+      int peri_kp = (k+1) % mmax;
+
+      for(int rhs=0;rhs<nrhs;rhs++){
+	rtz[rhs]=rtzp[rhs];
+	d[rhs]= PcgM3(p[rhs][peri_k],mmp[rhs][peri_k]);
+	a[rhs] = rtz[rhs]/d[rhs];
+    
+	// Memorise this
+	pAp[rhs][peri_k] = d[rhs];
+
+	axpy(x[rhs],a[rhs],p[rhs][peri_k],x[rhs]);
+	rn[rhs] = axpy_norm(r[rhs],-a[rhs],mmp[rhs][peri_k],r[rhs]);
+      }
+
+      // Compute z = M x (for *all* RHS)
+      PcgM1(r,z);
+
+      RealD max_rn=0.0;
+      for(int rhs=0;rhs<nrhs;rhs++){
+
+	rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
+
+	std::cout << GridLogMessage<<"HDCG::fPcg rhs"<<rhs<<" iteration "<<k<<" : inner rtzp "<<rtzp[rhs]<<"\n";
+	
+	mu[rhs]=z[rhs];
+
+	p[rhs][peri_kp]=mu[rhs];
+
+	// Standard search direction p == z + b p 
+	b[rhs] = (rtzp[rhs])/rtz[rhs];
+
+	int northog = (k>mmax-1)?(mmax-1):k;        // This is the fCG-Tr(mmax-1) algorithm
+	std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
+	for(int back=0; back < northog; back++){
+	  int peri_back = (k-back)%mmax;
+	  RealD pbApk= real(innerProduct(mmp[rhs][peri_back],p[rhs][peri_kp]));
+	  RealD beta = -pbApk/pAp[rhs][peri_back];
+	  axpy(p[rhs][peri_kp],beta,p[rhs][peri_back],p[rhs][peri_kp]);
+	}
+
+	RealD rrn=sqrt(rn[rhs]/ssq[rhs]);
+	RealD rtn=sqrt(rtz[rhs]/ssq[rhs]);
+	RealD rtnp=sqrt(rtzp[rhs]/ssq[rhs]);
+	
+	std::cout<<GridLogMessage<<"HDCG: rhs "<<rhs<<"fPcg k= "<<k<<" residual = "<<rrn<<"\n";
+	if ( rrn > max_rn ) max_rn = rrn;
+      }
+
+      // Stopping condition based on worst case
+      if ( max_rn <= Tolerance ) { 
+
+	HDCGTimer.Stop();
+	std::cout<<GridLogMessage<<"HDCG: mrhs fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
+
+	for(int rhs=0;rhs<nrhs;rhs++){
+	  _FineLinop.HermOp(x[rhs],mmp[rhs][0]);			  
+	  Field tmp(grid);
+	  axpy(tmp,-1.0,src[rhs],mmp[rhs][0]);
+      
+	  RealD  mmpnorm = sqrt(norm2(mmp[rhs][0]));
+	  RealD  xnorm   = sqrt(norm2(x[rhs]));
+	  RealD  srcnorm = sqrt(norm2(src[rhs]));
+	  RealD  tmpnorm = sqrt(norm2(tmp));
+	  RealD  true_residual = tmpnorm/srcnorm;
+	  std::cout<<GridLogMessage
+		   <<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
+		   <<" solution "<<xnorm
+		   <<" source "<<srcnorm
+		   <<" mmp "<<mmpnorm	  
+		   <<std::endl;
+	}
+	return;
+      }
+      
+    }
+    HDCGTimer.Stop();
+    std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
+    for(int rhs=0;rhs<nrhs;rhs++){
+      RealD  xnorm   = sqrt(norm2(x[rhs]));
+      RealD  srcnorm = sqrt(norm2(src[rhs]));
+      std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
+    }
  }
  

 public:

+  virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out)
+  {
+    std::cout << "PcgM1 default (cheat) mrhs versoin"<<std::endl;
+    for(int rhs=0;rhs<in.size();rhs++){
+      this->PcgM1(in[rhs],out[rhs]);
+    }
+  }
  virtual void PcgM1(Field & in, Field & out)     =0;
  virtual void Vstart(Field & x,const Field & src)=0;

@ -440,6 +500,7 @@ class TwoLevelADEF2 : public TwoLevelCG<Field>

  virtual void Vstart(Field & x,const Field & src)
  {
+    std::cout << GridLogMessage<<"HDCG: fPcg Vstart "<<std::endl;
    ///////////////////////////////////
    // Choose x_0 such that 
    // x_0 = guess +  (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
@ -456,14 +517,97 @@ class TwoLevelADEF2 : public TwoLevelCG<Field>
    CoarseField PleftProj(this->coarsegrid);
    CoarseField PleftMss_proj(this->coarsegrid);

+    std::cout << GridLogMessage<<"HDCG: fPcg Vstart projecting "<<std::endl;
    this->_Aggregates.ProjectToSubspace(PleftProj,src);     
+    std::cout << GridLogMessage<<"HDCG: fPcg Vstart coarse solve "<<std::endl;
    this->_CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s
+    std::cout << GridLogMessage<<"HDCG: fPcg Vstart promote "<<std::endl;
    this->_Aggregates.PromoteFromSubspace(PleftMss_proj,x);  

  }

 };

+template<class Field, class CoarseField, class Aggregation>
+class TwoLevelADEF2mrhs : public TwoLevelADEF2<Field,CoarseField,Aggregation>
+{
+public:
+  GridBase *coarsegridmrhs;
+  LinearFunction<CoarseField> &_CoarseSolverMrhs;
+  LinearFunction<CoarseField> &_CoarseGuesser;
+  TwoLevelADEF2mrhs(RealD tol,
+		    Integer maxit,
+		    LinearOperatorBase<Field>    &FineLinop,
+		    LinearFunction<Field>        &Smoother,
+		    LinearFunction<CoarseField>  &CoarseSolver,
+		    LinearFunction<CoarseField>  &CoarseSolverPrecise,
+		    LinearFunction<CoarseField>  &CoarseSolverMrhs,
+		    LinearFunction<CoarseField>  &CoarseGuesser,
+		    GridBase *rhsgrid,
+		    Aggregation &Aggregates) :
+    TwoLevelADEF2<Field,CoarseField,Aggregation>(tol, maxit,FineLinop,Smoother,CoarseSolver,CoarseSolverPrecise,Aggregates),
+    _CoarseSolverMrhs(CoarseSolverMrhs),
+    _CoarseGuesser(CoarseGuesser)
+  {
+    coarsegridmrhs = rhsgrid;
+  };
+  
+  virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out){
+
+    int nrhs=in.size();
+    std::cout << " mrhs PcgM1 for "<<nrhs<<" right hand sides"<<std::endl;
+    // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
+    Field tmp(this->grid);
+    std::vector<Field> Min(nrhs,this->grid);
+    CoarseField PleftProj(this->coarsegrid);
+    CoarseField PleftMss_proj(this->coarsegrid);
+
+    CoarseField PleftProjMrhs(this->coarsegridmrhs);
+    CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
+
+    for(int rhs=0;rhs<nrhs;rhs++) {
+      this->grid->Barrier();
+      std::cout << " Calling smoother for "<<rhs<<std::endl;
+      this->grid->Barrier();
+      this->_Smoother(in[rhs],Min[rhs]);
+      this->grid->Barrier();
+      std::cout << " smoother done "<<rhs<<std::endl;
+      this->grid->Barrier();
+      this->_FineLinop.HermOp(Min[rhs],out[rhs]);
+      this->grid->Barrier();
+      std::cout << " Hermop for "<<rhs<<std::endl;
+      this->grid->Barrier();
+      axpy(tmp,-1.0,out[rhs],in[rhs]);          // tmp  = in - A Min
+      this->grid->Barrier();
+      std::cout << " axpy "<<rhs<<std::endl;
+      this->grid->Barrier();
+      this->_Aggregates.ProjectToSubspace(PleftProj,tmp);     // can optimise later
+      this->grid->Barrier();
+      std::cout << " project "<<rhs<<std::endl;
+      this->grid->Barrier();
+      InsertSlice(PleftProj,PleftProjMrhs,rhs,0);
+      this->grid->Barrier();
+      std::cout << " insert rhs "<<rhs<<std::endl;
+      this->grid->Barrier();
+      this->_CoarseGuesser(PleftProj,PleftMss_proj);
+      this->grid->Barrier();
+      std::cout << " insert guess "<<rhs<<std::endl;
+      this->grid->Barrier();
+      InsertSlice(PleftMss_proj,PleftMss_projMrhs,rhs,0);
+    }
+
+    std::cout << " Coarse solve "<<std::endl;
+    this->_CoarseSolverMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} [in - A Min]_s
+
+    for(int rhs=0;rhs<nrhs;rhs++) {
+      ExtractSlice(PleftMss_proj,PleftMss_projMrhs,rhs,0);
+      this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]  
+      axpy(out[rhs],1.0,Min[rhs],tmp); // Min+tmp
+    }
+    std::cout << " Extracted "<<std::endl;
+  }
+};
+  
 template<class Field>
 class TwoLevelADEF1defl : public TwoLevelCG<Field>
 {
--- a/tests/debug/Test_general_coarse.cc
+++ b/tests/debug/Test_general_coarse.cc
@ -243,9 +243,14 @@ int main (int argc, char ** argv)
  Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});

  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi); 
+
  
  MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs);
-
+  typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t;
+  
+  //////////////////////////////////////////
+  // Test against single RHS
+  //////////////////////////////////////////
  {
    GridParallelRNG          rh_CRNG(CoarseMrhs);rh_CRNG.SeedFixedIntegers(cseeds);
    CoarseVector rh_phi(CoarseMrhs);
@ -287,7 +292,22 @@ int main (int argc, char ** argv)
    std::cout << nrhs<< " srhs " << t1/ncall/nrhs <<" us"<<std::endl;
  }

+  //////////////////////////////////////////
+  // Test against single RHS
+  //////////////////////////////////////////
+  {
+    typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> HermMatrix;
+    HermMatrix MrhsCoarseOp     (mrhs);

+    GridParallelRNG          rh_CRNG(CoarseMrhs);rh_CRNG.SeedFixedIntegers(cseeds);
+    ConjugateGradient<CoarseVector>  mrhsCG(1.0e-8,2000,true);
+    CoarseVector rh_res(CoarseMrhs);
+    CoarseVector rh_src(CoarseMrhs);
+    random(rh_CRNG,rh_src);
+    rh_res= Zero();
+    mrhsCG(MrhsCoarseOp,rh_src,rh_res);
+  }
+  
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
--- a/tests/debug/Test_general_coarse_hdcg_phys48.cc
+++ b/tests/debug/Test_general_coarse_hdcg_phys48.cc
@ -0,0 +1,739 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_general_coarse_hdcg.cc
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+#include <Grid/lattice/PaddedCell.h>
+#include <Grid/stencil/GeneralLocalStencil.h>
+//#include <Grid/algorithms/GeneralCoarsenedMatrix.h>
+#include <Grid/algorithms/iterative/AdefGeneric.h>
+#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
+
+using namespace std;
+using namespace Grid;
+
+template<class Coarsened>
+void SaveOperator(Coarsened &Operator,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  ScidacWriter WR(Operator.Grid()->IsBoss());
+  assert(Operator._A.size()==Operator.geom.npoint);
+  WR.open(file);
+  for(int p=0;p<Operator._A.size();p++){
+    auto tmp = Operator.Cell.Extract(Operator._A[p]);
+    WR.writeScidacFieldRecord(tmp,record,0,0);
+    //    WR.writeScidacFieldRecord(tmp,record,0,BINARYIO_LEXICOGRAPHIC);
+  }
+  WR.close();
+#endif
+}
+template<class Coarsened>
+void LoadOperator(Coarsened &Operator,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  Grid::ScidacReader RD ;
+  RD.open(file);
+  assert(Operator._A.size()==Operator.geom.npoint);
+  for(int p=0;p<Operator.geom.npoint;p++){
+    conformable(Operator._A[p].Grid(),Operator.CoarseGrid());
+    //    RD.readScidacFieldRecord(Operator._A[p],record,BINARYIO_LEXICOGRAPHIC);
+    RD.readScidacFieldRecord(Operator._A[p],record,0);
+  }    
+  RD.close();
+  Operator.ExchangeCoarseLinks();
+#endif
+}
+template<class Coarsened>
+void ReLoadOperator(Coarsened &Operator,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  Grid::ScidacReader RD ;
+  RD.open(file);
+  assert(Operator._A.size()==Operator.geom.npoint);
+  for(int p=0;p<Operator.geom.npoint;p++){
+    auto tmp=Operator.Cell.Extract(Operator._A[p]);
+    RD.readScidacFieldRecord(tmp,record,0);
+    Operator._A[p] = Operator.Cell.ExchangePeriodic(tmp);
+  }    
+  RD.close();
+#endif
+}
+template<class aggregation>
+void SaveBasis(aggregation &Agg,std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  ScidacWriter WR(Agg.FineGrid->IsBoss());
+  WR.open(file);
+  for(int b=0;b<Agg.subspace.size();b++){
+    //WR.writeScidacFieldRecord(Agg.subspace[b],record,0,BINARYIO_LEXICOGRAPHIC);
+    WR.writeScidacFieldRecord(Agg.subspace[b],record,0,0);
+  }
+  WR.close();
+#endif
+}
+template<class aggregation>
+void LoadBasis(aggregation &Agg, std::string file)
+{
+#ifdef HAVE_LIME
+  emptyUserRecord record;
+  ScidacReader RD ;
+  RD.open(file);
+  for(int b=0;b<Agg.subspace.size();b++){
+    //    RD.readScidacFieldRecord(Agg.subspace[b],record,BINARYIO_LEXICOGRAPHIC);
+    RD.readScidacFieldRecord(Agg.subspace[b],record,0);
+  }    
+  RD.close();
+#endif
+}
+
+RealD InverseApproximation(RealD x){
+  return 1.0/x;
+}
+
+// Want Op in CoarsenOp to call MatPcDagMatPc
+template<class Field>
+class HermOpAdaptor : public LinearOperatorBase<Field>
+{
+  LinearOperatorBase<Field> & wrapped;
+public:
+  HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme)  {};
+  void Op     (const Field &in, Field &out)   { wrapped.HermOp(in,out);  }
+  void HermOp(const Field &in, Field &out)    { wrapped.HermOp(in,out); }
+  void AdjOp     (const Field &in, Field &out){ wrapped.HermOp(in,out);  }
+  void OpDiag (const Field &in, Field &out)                  {    assert(0);  }
+  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
+  void OpDirAll  (const Field &in, std::vector<Field> &out)  {    assert(0);  };
+  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
+};
+template<class Field> class ChebyshevSmoother : public LinearFunction<Field>
+{
+public:
+  using LinearFunction<Field>::operator();
+  typedef LinearOperatorBase<Field> FineOperator;
+  FineOperator   & _SmootherOperator;
+  Chebyshev<Field> Cheby;
+  ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator) :
+    _SmootherOperator(SmootherOperator),
+    Cheby(_lo,_hi,_ord,InverseApproximation)
+  {
+    std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
+  };
+  void operator() (const Field &in, Field &out) 
+  {
+    Field tmp(in.Grid());
+    tmp = in;
+    Cheby(_SmootherOperator,tmp,out);
+  }
+};
+
+template<class Field> class CGSmoother : public LinearFunction<Field>
+{
+public:
+  using LinearFunction<Field>::operator();
+  typedef LinearOperatorBase<Field> FineOperator;
+  FineOperator   & _SmootherOperator;
+  int iters;
+  CGSmoother(int _iters, FineOperator &SmootherOperator) :
+    _SmootherOperator(SmootherOperator),
+    iters(_iters)
+  {
+    std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
+  };
+  void operator() (const Field &in, Field &out) 
+  {
+    ConjugateGradient<Field>  CG(0.0,iters,false); // non-converge is just fine in a smoother
+    CG(_SmootherOperator,in,out);
+  }
+};
+
+
+gridblasHandle_t GridBLAS::gridblasHandle;
+int            GridBLAS::gridblasInit;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  const int Ls=24;
+  //  const int nbasis = 62;
+  //  const int nbasis = 56;
+  const int nbasis = 36;
+  const int cb = 0 ;
+  RealD mass=0.00078;
+  RealD M5=1.8;
+  RealD b=1.5;
+  RealD c=0.5;
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+								   GridDefaultSimd(Nd,vComplex::Nsimd()),
+								   GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  // Construct a coarsened grid with 4^4 cell
+  Coordinate Block({4,4,6,4});
+  Coordinate clatt = GridDefaultLatt();
+  for(int d=0;d<clatt.size();d++){
+    clatt[d] = clatt[d]/Block[d];
+  }
+
+  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt,
+							    GridDefaultSimd(Nd,vComplex::Nsimd()),
+							    GridDefaultMpi());;
+  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
+
+  ///////////////////////// RNGs /////////////////////////////////
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+  std::vector<int> cseeds({5,6,7,8});
+
+  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
+  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
+
+  ///////////////////////// Configuration /////////////////////////////////
+  LatticeGaugeField Umu(UGrid);
+  MemoryManager::Print();
+
+  FieldMetaData header;
+  std::string file("ckpoint_lat.1000");
+  NerscIO::readConfiguration(Umu,header,file);
+  MemoryManager::Print();
+
+  //////////////////////// Fermion action //////////////////////////////////
+  MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
+
+  SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
+
+  typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
+  HermFineMatrix FineHermOp(HermOpEO);
+
+  LatticeFermion result(FrbGrid); result=Zero();
+
+  LatticeFermion    src(FrbGrid); random(RNG5,src);
+
+  // Run power method on FineHermOp
+  PowerMethod<LatticeFermion>       PM;   PM(HermOpEO,src);
+ 
+  ////////////////////////////////////////////////////////////
+  ///////////// Coarse basis and Little Dirac Operator ///////
+  ////////////////////////////////////////////////////////////
+  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
+  typedef LittleDiracOperator::CoarseVector CoarseVector;
+
+  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
+  NearestStencilGeometry5D geom_nn(Coarse5d);
+  
+  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
+  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
+  Subspace Aggregates(Coarse5d,FrbGrid,cb);
+
+  ////////////////////////////////////////////////////////////
+  // Need to check about red-black grid coarsening
+  ////////////////////////////////////////////////////////////
+  LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);
+
+  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.rat.scidac.62");
+  std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.rat.scidac.62");
+  std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.rat.scidac.62");
+  bool load_agg=true;
+  bool load_refine=true;
+  bool load_mat=false;
+  MemoryManager::Print();
+  if ( load_agg ) {
+    LoadBasis(Aggregates,subspace_file);
+  } else {
+
+    // NBASIS=40
+    // Best so far: ord 2000 [0.01,95], 500,500  -- 466 iters
+    // slurm-398626.out:Grid : Message : 141.295253 s : 500 filt [1] <n|MdagM|n> 0.000103622063
+
+
+    //Grid : Message : 33.870465 s :  Chebyshev subspace pass-1 : ord 2000 [0.001,95]
+    //Grid : Message : 33.870485 s :  Chebyshev subspace pass-2 : nbasis40 min 1000 step 1000 lo0
+    //slurm-1482200.out : filt ~ 0.004 -- not as low mode projecting -- took 626 iters
+
+    // To try: 2000 [0.1,95]  ,2000,500,500 -- slurm-1482213.out 586 iterations
+
+    // To try: 2000 [0.01,95] ,2000,500,500 -- 469 (think I bumped 92 to 95) (??)
+    // To try: 2000 [0.025,95],2000,500,500
+    // To try: 2000 [0.005,95],2000,500,500
+
+    // NBASIS=44 -- HDCG paper was 64 vectors; AMD compiler craps out at 48
+    // To try: 2000 [0.01,95] ,2000,500,500 -- 419 lowest slurm-1482355.out
+    // To try: 2000 [0.025,95] ,2000,500,500 -- 487 
+    // To try: 2000 [0.005,95] ,2000,500,500
+    /*
+      Smoother [3,92] order 16
+slurm-1482355.out:Grid : Message : 35.239686 s :  Chebyshev subspace pass-1 : ord 2000 [0.01,95]
+slurm-1482355.out:Grid : Message : 35.239714 s :  Chebyshev subspace pass-2 : nbasis44 min 500 step 500 lo0
+slurm-1482355.out:Grid : Message : 5561.305552 s : HDCG: Pcg converged in 419 iterations and 2616.202598 s
+
+slurm-1482367.out:Grid : Message : 43.157235 s :  Chebyshev subspace pass-1 : ord 2000 [0.025,95]
+slurm-1482367.out:Grid : Message : 43.157257 s :  Chebyshev subspace pass-2 : nbasis44 min 500 step 500 lo0
+slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 iterations and 3131.185821 s
+    */
+		 /*
+		   Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
+				       95.0,0.0075,
+				       2500,
+				       500,
+				       500,
+				       0.0);
+		 */
+
+		 /*
+		   Aggregates.CreateSubspaceChebyshevPowerLaw(RNG5,HermOpEO,nbasis,
+							      95.0,
+							      2000);
+		 */
+
+    Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO,
+					0.0003,1.0e-5,2000); // Lo, tol, maxit
+  /*
+    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
+				       95.0,0.05,
+				       2000,
+				       500,
+				       500,
+				       0.0);
+ */
+    /*
+      Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
+				       95.0,0.01,
+				       2000,
+				       500,
+				       500,
+				       0.0);
+    */
+    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500); -- running slurm-1484934.out nbasis 56
+
+    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500); <== last run
+    SaveBasis(Aggregates,subspace_file);
+  }
+  MemoryManager::Print();
+
+  int refine=1;
+  if(refine){
+    if ( load_refine ) {
+      LoadBasis(Aggregates,refine_file);
+    } else {
+      // HDCG used Pcg to refine
+      Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000);
+      SaveBasis(Aggregates,refine_file);
+    }
+  }
+  MemoryManager::Print();
+  Aggregates.Orthogonalise();
+  if ( load_mat ) {
+    LoadOperator(LittleDiracOp,ldop_file);
+  } else {
+    LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates);
+    SaveOperator(LittleDiracOp,ldop_file);
+  }
+
+  // I/O test:
+  CoarseVector c_src(Coarse5d);   random(CRNG,c_src);
+  CoarseVector c_res(Coarse5d); 
+  CoarseVector c_ref(Coarse5d);
+
+  // Try projecting to one hop only
+  //  LittleDiracOp.ShiftMatrix(1.0e-4);
+  LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d);
+  LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n
+
+  typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix;
+  HermMatrix CoarseOp     (LittleDiracOp);
+  HermMatrix CoarseOpProj (LittleDiracOpProj);
+  
+  MemoryManager::Print();
+  //////////////////////////////////////////
+  // Build a coarse lanczos
+  //////////////////////////////////////////
+  //  Chebyshev<CoarseVector>      IRLCheby(0.012,40.0,201);  //500 HDCG iters
+  //  int Nk=512; // Didn't save much
+  //  int Nm=640;
+  //  int Nstop=400;
+
+  //  Chebyshev<CoarseVector>      IRLCheby(0.005,40.0,201);  //319 HDCG iters @ 128//160 nk.
+  //  int Nk=128;
+  //  int Nm=160;
+
+  //  Chebyshev<CoarseVector>      IRLCheby(0.005,40.0,201);  //319 HDCG iters @ 128//160 nk.
+  Chebyshev<CoarseVector>      IRLCheby(0.04,40.0,201);  //319 HDCG iters @ 128//160 nk.
+  int Nk=192;
+  int Nm=256;
+  int Nstop=Nk;
+  
+  //  Chebyshev<CoarseVector>      IRLCheby(0.010,45.0,201);  // 1 iter
+  FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp);
+  PlainHermOp<CoarseVector>    IRLOp    (CoarseOp);
+  
+  ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1e-5,10);
+
+  int Nconv;
+  std::vector<RealD>            eval(Nm);
+  std::vector<CoarseVector>     evec(Nm,Coarse5d);
+
+  PowerMethod<CoarseVector>       cPM;   cPM(CoarseOp,c_src);
+
+  IRL.calc(eval,evec,c_src,Nconv);
+  DeflatedGuesser<CoarseVector> DeflCoarseGuesser(evec,eval);
+
+  //////////////////////////////////////////
+  // Build a coarse space solver
+  //////////////////////////////////////////
+  int maxit=30000;
+  ConjugateGradient<CoarseVector>  CG(1.0e-8,maxit,false);
+  ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false);
+  ZeroGuesser<CoarseVector> CoarseZeroGuesser;
+  
+  
+  //  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser);
+  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser);
+  c_res=Zero();
+  //  HPDSolve(c_src,c_res); c_ref = c_res;
+  //  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  //  std::cout << GridLogMessage<<"ref norm "<<norm2(c_ref)<<std::endl;
+  //////////////////////////////////////////////////////////////////////////
+  // Deflated (with real op EV's) solve for the projected coarse op
+  // Work towards ADEF1 in the coarse space
+  //////////////////////////////////////////////////////////////////////////
+  HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser);
+  c_res=Zero();
+  //  HPDSolveProj(c_src,c_res);
+  //  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  //  std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl;
+  //  c_res = c_res - c_ref;
+  //  std::cout << "Projected solver error "<<norm2(c_res)<<std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Coarse ADEF1 with deflation space
+  //////////////////////////////////////////////////////////////////////
+  ChebyshevSmoother<CoarseVector >  CoarseSmoother(1.0,37.,8,CoarseOpProj);  // just go to sloppy 0.1 convergence
+    //  CoarseSmoother(0.1,37.,8,CoarseOpProj);  //
+  //  CoarseSmoother(0.5,37.,6,CoarseOpProj);  //  8 iter 0.36s
+  //    CoarseSmoother(0.5,37.,12,CoarseOpProj);  // 8 iter, 0.55s
+  //    CoarseSmoother(0.5,37.,8,CoarseOpProj);// 7-9 iter
+  //  CoarseSmoother(1.0,37.,8,CoarseOpProj); // 0.4 - 0.5s solve to 0.04, 7-9 iter
+  //  ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(0.5,36.,10,CoarseOpProj);  // 311
+
+  ////////////////////////////////////////////////////////
+  // CG, Cheby mode spacing 200,200
+  // Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s
+  // Unprojected Coarse CG solve to 4e-2 :  33 iters, 0.8s
+  // Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s
+  ////////////////////////////////////////////////////////
+  // CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs 
+  ////////////////////////////////////////////////////////
+  // ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s  2.1x gain
+  // ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s
+  // HDCG 38 iters 162s
+  //
+  // CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs 
+  // ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s  2.1x gain
+  // ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s
+  // HDCG 38 iters 169s
+
+					       /*
+  TwoLevelADEF1defl<CoarseVector>
+    cADEF1(1.0e-8, 500,
+	   CoarseOp,
+	   CoarseSmoother,
+	   evec,eval);
+					       */
+  //  c_res=Zero();
+  //  cADEF1(c_src,c_res);
+  //  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  //  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
+  //  c_res = c_res - c_ref;
+  //  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
+  
+  //  cADEF1.Tolerance = 4.0e-2;
+  //  cADEF1.Tolerance = 1.0e-1;
+  //  cADEF1.Tolerance = 5.0e-2;
+  //  c_res=Zero();
+  //  cADEF1(c_src,c_res);
+  //  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  //  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
+  //  c_res = c_res - c_ref;
+  //  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
+  
+  //////////////////////////////////////////
+  // Build a smoother
+  //////////////////////////////////////////
+  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(10.0,100.0,10,FineHermOp); //499
+  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(3.0,100.0,10,FineHermOp);  //383
+  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(1.0,100.0,10,FineHermOp);  //328
+  //  std::vector<RealD> los({0.5,1.0,3.0}); // 147/142/146 nbasis 1
+  //  std::vector<RealD> los({1.0,2.0}); // Nbasis 24: 88,86 iterations
+  //  std::vector<RealD> los({2.0,4.0}); // Nbasis 32 == 52, iters
+  //  std::vector<RealD> los({2.0,4.0}); // Nbasis 40 == 36,36 iters
+
+  //
+  // Turns approx 2700 iterations into 340 fine multiplies with Nbasis 40
+  // Need to measure cost of coarse space.
+  //
+  // -- i) Reduce coarse residual   -- 0.04
+  // -- ii) Lanczos on coarse space -- done
+  // -- iii) Possible 1 hop project and/or preconditioning it - easy - PrecCG it and
+  //         use a limited stencil. Reread BFM code to check on evecs / deflation strategy with prec
+  //
+  //
+  //
+  //
+
+  MemoryManager::Print();
+  //////////////////////////////////////
+  // mrhs coarse solve
+  //  Create a higher dim coarse grid
+  //////////////////////////////////////////////////////////////////////////////////////
+  ConjugateGradient<CoarseVector>  coarseCG(2.0e-2,20000,true);
+    
+  const int nrhs=vComplex::Nsimd();
+    
+  Coordinate mpi=GridDefaultMpi();
+  Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
+  Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
+  Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});
+    
+  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi); 
+  MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs);
+  typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t;
+  typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
+  MrhsHermMatrix MrhsCoarseOp     (mrhs);
+  MemoryManager::Print();
+
+  { 
+    CoarseVector rh_res(CoarseMrhs);
+    CoarseVector rh_guess(CoarseMrhs);
+    CoarseVector rh_src(CoarseMrhs);
+
+    rh_res= Zero();
+    rh_guess= Zero();
+    for(int r=0;r<nrhs;r++){
+      random(CRNG,c_src);
+      DeflCoarseGuesser(c_src,c_res);
+      InsertSlice(c_res,rh_res,r,0);
+      InsertSlice(c_res,rh_guess,r,0);
+      InsertSlice(c_src,rh_src,r,0);
+    }
+
+  MemoryManager::Print();
+    coarseCG(MrhsCoarseOp,rh_src,rh_res);
+  MemoryManager::Print();
+    //redo with block CG
+    
+    for(int r=0;r<nrhs;r++){
+      std::cout << " compare to single RHS "<<r<<"/"<<nrhs<<std::endl;
+      ExtractSlice(c_src,rh_src,r,0);
+      ExtractSlice(c_res,rh_res,r,0);
+      ExtractSlice(c_ref,rh_guess,r,0);
+      coarseCG(CoarseOp,c_src,c_ref);
+      std::cout << " mrhs [" <<r <<"] "<< norm2(c_res)<<std::endl;
+      std::cout << " srhs [" <<r <<"] "<< norm2(c_ref)<<std::endl;
+      c_ref=c_ref-c_res;
+      RealD diff =norm2(c_ref)/norm2(c_src);
+      std::cout << r << " diff " << diff<<std::endl;
+      assert(diff < 1.0e-1);
+    }
+  }
+  MemoryManager::Print();
+  //////////////////////////////////////
+  // fine solve
+  //////////////////////////////////////
+
+  
+  //  std::vector<RealD> los({2.0,2.5}); // Nbasis 40 == 36,36 iters
+  std::vector<RealD> los({2.0}); // Nbasis 40 == 36,36 iters
+
+  //  std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults)
+  //  std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)
+  std::vector<int> ords({9}); // Nbasis 40 == 40 iters (320 mults)  
+
+ /*
+   Smoother opt @56 nbasis, 0.04 convergence, 192 evs
+ ord lo
+
+ 16   0.1  no converge -- likely sign indefinite
+ 32   0.1  no converge -- likely sign indefinite(?)
+
+ 16   0.5  422
+ 32   0.5  302
+ 
+ 8   1.0  575
+ 12  1.0  449
+ 16  1.0  375
+ 32  1.0  302
+
+ 12  3.0  476
+ 16  3.0  319
+ 32  3.0  306
+
+ Powerlaw setup 62 vecs
+slurm-1494943.out:Grid : Message : 4874.186617 s : HDCG: Pcg converged in 171 iterations and 1706.548006 s 1.0 32
+slurm-1494943.out:Grid : Message : 6490.121648 s : HDCG: Pcg converged in 194 iterations and 1616.219654 s 1.0 16
+
+ Cheby setup: 56vecs
+ -- CG smoother O(16): 487
+ 
+Power law setup, 56 vecs -- lambda^-5
+slurm-1494383.out:Grid : Message : 4377.173265 s : HDCG: Pcg converged in 204 iterations and 1153.548935 s 1.0 32
+
+Power law setup, 56 vecs -- lambda^-3
+
+slurm-1494242.out:Grid : Message : 4370.464814 s : HDCG: Pcg converged in 204 iterations and 1143.494776 s  1.0 32
+slurm-1494242.out:Grid : Message : 5432.414820 s : HDCG: Pcg converged in 237 iterations and 1061.455882 s  1.0 16
+slurm-1494242.out:Grid : Message : 6588.727977 s : HDCG: Pcg converged in 205 iterations and 1156.565210 s  0.5 32
+
+ Power law setup, 56 vecs -- lambda^-4
+ -- CG smoother    O(16): 290
+ -- Cheby smoother O(16): 218 -- getting close to the deflation level I expect 169 from BFM paper @O(7) smoother and 64 nbasis
+
+Grid : Message : 2790.797194 s : HDCG: Pcg converged in 190 iterations and 1049.563182 s 1.0 32
+Grid : Message : 3766.374396 s : HDCG: Pcg converged in 218 iterations and 975.455668 s  1.0 16
+Grid : Message : 4888.746190 s : HDCG: Pcg converged in 191 iterations and 1122.252055 s 0.5 32
+Grid : Message : 5956.679661 s : HDCG: Pcg converged in 231 iterations and 1067.812850 s 0.5 16
+
+Grid : Message : 2767.405829 s : HDCG: Pcg converged in 218 iterations and 967.214067 s -- 16
+Grid : Message : 3816.165905 s : HDCG: Pcg converged in 251 iterations and 1048.636269 s -- 12
+Grid : Message : 5121.206572 s : HDCG: Pcg converged in 318 iterations and 1304.916168 s -- 8
+
+ 
+[paboyle@login2.crusher debug]$ grep -v Memory slurm-402426.out  | grep converged | grep HDCG -- [1.0,16] cheby
+Grid : Message : 5185.521063 s : HDCG: Pcg converged in 377 iterations and 1595.843529 s
+
+[paboyle@login2.crusher debug]$ grep HDCG  slurm-402184.out | grep onver
+Grid : Message : 3760.438160 s : HDCG: Pcg converged in 422 iterations and 2129.243141 s
+Grid : Message : 5660.588015 s : HDCG: Pcg converged in 308 iterations and 1900.026821 s
+
+ 
+Grid : Message : 4238.206528 s : HDCG: Pcg converged in 575 iterations and 2657.430676 s
+Grid : Message : 6345.880344 s : HDCG: Pcg converged in 449 iterations and 2108.505208 s
+
+grep onverg slurm-401663.out | grep HDCG
+Grid : Message : 3900.817781 s : HDCG: Pcg converged in 476 iterations and 1992.591311 s
+Grid : Message : 5647.202699 s : HDCG: Pcg converged in 306 iterations and 1746.838660 s
+
+
+[paboyle@login2.crusher debug]$ grep converged slurm-401775.out | grep HDCG
+Grid : Message : 3583.177025 s : HDCG: Pcg converged in 375 iterations and 1800.896037 s
+Grid : Message : 5348.342243 s : HDCG: Pcg converged in 302 iterations and 1765.045018 s
+
+Conclusion: higher order smoother is doing better. Much better. Use a Krylov smoother instead Mirs as in BFM version.
+
+ */
+				      //
+  MemoryManager::Print();
+  for(int l=0;l<los.size();l++){
+
+    RealD lo = los[l];
+
+    for(int o=0;o<ords.size();o++){
+
+      ConjugateGradient<CoarseVector>  CGsloppy(4.0e-2,maxit,false);
+      HPDSolver<CoarseVector> HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser);
+      
+      //    ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,10,FineHermOp); // 36 best case
+      ChebyshevSmoother<LatticeFermionD > ChebySmooth(lo,95,ords[o],FineHermOp);  // 311
+
+      /*
+       * CG smooth 11 iter: 
+       slurm-403825.out:Grid : Message : 4369.824339 s : HDCG: fPcg converged in 215 iterations 3.0
+       slurm-403908.out:Grid : Message : 3955.897470 s : HDCG: fPcg converged in 236 iterations 1.0
+       slurm-404273.out:Grid : Message : 3843.792191 s : HDCG: fPcg converged in 210 iterations 2.0
+       * CG smooth 9 iter: 
+      */
+      //
+      RealD MirsShift = lo;
+      ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
+      CGSmoother<LatticeFermionD> CGsmooth(ords[o],ShiftedFineHermOp) ;
+  
+      //////////////////////////////////////////
+      // Build a HDCG solver
+      //////////////////////////////////////////
+      /*      TwoLevelADEF2<LatticeFermion,CoarseVector,Subspace>
+	HDCG(1.0e-8, 700,
+	     FineHermOp,
+	     CGsmooth,
+	     HPDSolveSloppy,
+	     HPDSolve,
+	     Aggregates);
+      result=Zero();
+      */
+      //      std::cout << "Calling HDCG"<<std::endl;
+      //      HDCG(src,result);
+
+      //////////////////////////////////////////
+      // Build a HDCG mrhs solver
+      //////////////////////////////////////////
+  MemoryManager::Print();
+      DoNothingGuesser<CoarseVector> DoNothing;
+      HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,coarseCG,DoNothing);
+      TwoLevelADEF2mrhs<LatticeFermion,CoarseVector,Subspace>
+	HDCGmrhs(1.0e-8, 700,
+		 FineHermOp,
+		 CGsmooth,
+		 HPDSolveSloppy, // Never used
+		 HPDSolve,       // Used in Vstart
+		 HPDSolveMrhs,    // Used in M1
+		 DeflCoarseGuesser, // single RHS guess used in M1
+		 CoarseMrhs,        // Grid needed to Mrhs grid
+		 Aggregates);
+
+  MemoryManager::Print();
+      std::cout << "Calling mRHS HDCG"<<std::endl;
+      FrbGrid->Barrier();
+      
+  MemoryManager::Print();
+      std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid);
+      std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid);
+  MemoryManager::Print();
+      for(int r=0;r<nrhs;r++){
+	random(RNG5,src_mrhs[r]);
+	res_mrhs[r]=Zero();
+	std::cout << "Setup mrhs source "<<r<<std::endl;
+      }
+      std::cout << "Calling the mRHS HDCG"<<std::endl;
+  MemoryManager::Print();
+      HDCGmrhs(src_mrhs,res_mrhs);
+  MemoryManager::Print();
+
+    }
+  }
+
+  // Standard CG
+  result=Zero();
+  CGfine(HermOpEO, src, result);
+  
+  Grid_finalize();
+  return 0;
+}
Author	SHA1	Message	Date
Peter Boyle	25f71913b7	MultiRHS coarse	2024-01-04 12:01:17 -05:00
Peter Boyle	34ddd2b7b1	MultiRHS coarse space	2024-01-04 12:00:53 -05:00
Peter Boyle	d5fd90b2f3	Add 48^3 rtest	2024-01-04 12:00:01 -05:00