Simplifying the MultiRHS solver to make it do SRHS *and* MRHS

2025-09-18 01:01:04 +01:00 · 2024-03-06 14:04:33 -05:00
parent ee3b3c4c56
commit 070b61f08f
5 changed files with 287 additions and 478 deletions
--- a/Grid/algorithms/iterative/Deflation.h
+++ b/Grid/algorithms/iterative/Deflation.h
@@ -1,157 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./Grid/algorithms/iterative/Deflation.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_DEFLATION_H
 #define GRID_DEFLATION_H
 namespace Grid { 
 /// Initial-guess functor that overwrites the guess with Zero(); the source is not read.
 template<class Field>
 class ZeroGuesser: public LinearFunction<Field> {
 public:
   // Keep the base-class operator() overloads visible next to the one defined here.
   using LinearFunction<Field>::operator();

   /// Assigns Zero() to guess.
   virtual void operator()(const Field &src, Field &guess)
   {
     guess = Zero();
   }
 };
 /// Initial-guess functor that leaves the guess exactly as the caller supplied it.
 template<class Field>
 class DoNothingGuesser: public LinearFunction<Field> {
 public:
   // Keep the base-class operator() overloads visible next to the one defined here.
   using LinearFunction<Field>::operator();

   /// Intentionally empty: neither src nor guess is touched.
   virtual void operator()(const Field &src, Field &guess)
   {
   }
 };
 /// Initial-guess functor that uses the source itself as the guess.
 template<class Field>
 class SourceGuesser: public LinearFunction<Field> {
 public:
   // Keep the base-class operator() overloads visible next to the one defined here.
   using LinearFunction<Field>::operator();

   /// Copies src into guess.
   virtual void operator()(const Field &src, Field &guess)
   {
     guess = src;
   }
 };
 ////////////////////////////////
 // Fine grid deflation
 ////////////////////////////////
 /// Initial-guess functor built from eigenpairs.
 ///
 /// For each of the first N eigenvectors the guess accumulates, through axpy,
 /// evec[i] scaled by innerProduct(evec[i],src) / eval[i].
 ///
 /// Only references to the eigenvector and eigenvalue containers are stored,
 /// so the caller's vectors must outlive this object.
 template<class Field>
 class DeflatedGuesser: public LinearFunction<Field> {
 private:
   const std::vector<Field> &evec;
   const std::vector<RealD> &eval;
   const unsigned int       N;      // number of leading eigenpairs used
 public:
   using LinearFunction<Field>::operator();

   /// Use every eigenpair in _evec / _eval.
   DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
   : DeflatedGuesser(_evec, _eval, _evec.size())
   {}

   /// Use only the first _N eigenpairs.
   /// Asserts that _evec and _eval have equal size and that _N <= _evec.size().
   DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
   : evec(_evec), eval(_eval), N(_N)
   {
     assert(evec.size()==eval.size());
     assert(N <= evec.size());
   }

   /// Overwrites guess with the deflated guess for src and copies src's
   /// checkerboard onto the result.
   virtual void operator()(const Field &src,Field &guess) {
     guess = Zero();
     // The index type matches N (unsigned int), so the loop bound comparison
     // is not a signed/unsigned mix.
     for (unsigned int i=0;i<N;i++) {
       const Field& tmp = evec[i];
       axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
     }
     guess.Checkerboard() = src.Checkerboard();
   }
 };
 /// Initial-guess functor that deflates in a coarse space.
 ///
 /// The fine source is block-projected onto `subspace`, combined with the
 /// coarse eigenpairs (each evec_coarse[i] is accumulated through axpy with
 /// weight innerProduct(evec_coarse[i],src_coarse) / eval_coarse[i]), and the
 /// coarse result is block-promoted back to the fine field.
 ///
 /// Only references to the three containers are stored, so they must outlive
 /// this object.  evec_coarse[0] supplies the coarse grid, so at least one
 /// coarse eigenvector must be present when operator() is called.
 template<class FineField, class CoarseField>
 class LocalCoherenceDeflatedGuesser: public LinearFunction<FineField> {
 private:
   const std::vector<FineField>   &subspace;
   const std::vector<CoarseField> &evec_coarse;
   const std::vector<RealD>       &eval_coarse;
 public:
   using LinearFunction<FineField>::operator();

   LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
                                 const std::vector<CoarseField> &_evec_coarse,
                                 const std::vector<RealD>       &_eval_coarse)
     : subspace(_subspace),
       evec_coarse(_evec_coarse),
       eval_coarse(_eval_coarse)
   {
   }

   /// Single right-hand side: overwrites guess with the deflated guess for src.
   void operator()(const FineField &src,FineField &guess) {
     // Preconditions that the indexing below relies on.
     assert(!evec_coarse.empty());
     assert(eval_coarse.size() >= evec_coarse.size());

     int N = (int)evec_coarse.size();
     CoarseField src_coarse(evec_coarse[0].Grid());
     CoarseField guess_coarse(evec_coarse[0].Grid());    guess_coarse = Zero();
     blockProject(src_coarse,src,subspace);
     for (int i=0;i<N;i++) {
       const CoarseField & tmp = evec_coarse[i];
       axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
     }
     blockPromote(guess_coarse,guess,subspace);
     guess.Checkerboard() = src.Checkerboard();
   };

   /// Multiple right-hand sides: guess[j] receives the deflated guess for src[j].
   /// guess must already hold at least src.size() fields.
   void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
     // Preconditions that the indexing below relies on.
     assert(!evec_coarse.empty());
     assert(eval_coarse.size() >= evec_coarse.size());
     assert(guess.size() >= src.size());

     int Nevec = (int)evec_coarse.size();
     int Nsrc = (int)src.size();
     // make temp variables
     std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
     std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());
     // Preprocessing: zero the coarse guesses and project every source
     std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
     for (int j=0;j<Nsrc;j++)
     {
       guess_coarse[j] = Zero();
       std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
       blockProject(src_coarse[j],src[j],subspace);
     }
     // Deflation: one eigenvector at a time (outer loop), applied to every source (inner loop)
     std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
     for (int i=0;i<Nevec;i++)
     {
       std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
       const CoarseField & tmp = evec_coarse[i];
       for (int j=0;j<Nsrc;j++)
       {
         axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
       }
     }
     // Postprocessing: promote every coarse guess back to the fine field
     std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
     for (int j=0;j<Nsrc;j++)
     {
       // Message names the promote step (it previously said "BlockProject").
       std::cout << GridLogMessage << "BlockPromote iter: " << j << std::endl;
       blockPromote(guess_coarse[j],guess[j],subspace);
       guess[j].Checkerboard() = src[j].Checkerboard();
     }
   };
 };
 }
 #endif
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
@@ -73,7 +73,7 @@ public:
  GridBase      * FineGrid(void)       { return _FineGrid; };   // this is all the linalg routines need to know
  GridCartesian * CoarseGrid(void)     { return _CoarseGrid; };   // this is all the linalg routines need to know
-  void ShiftMatrix(RealD shift)
+  /*  void ShiftMatrix(RealD shift)
  {
    int Nd=_FineGrid->Nd(); 
    Coordinate zero_shift(Nd,0);
@@ -102,6 +102,7 @@ public:
    assert(nfound==geom.npoint);
    ExchangeCoarseLinks();
  }
  */
  GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
    : geom(_geom),
@@ -459,6 +460,9 @@ public:
    CoarseScalar InnerProd(CoarseGrid()); 
    blockOrthogonalise(InnerProd,Subspace.subspace);
    for(int s=0;s<Subspace.subspace.size();s++){
      std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
    }
    const int npoint = geom.npoint;
    Coordinate clatt = CoarseGrid()->GlobalDimensions();
@@ -494,6 +498,7 @@ public:
 	}
 	phase=exp(phase*ci);
 	Mkl(k,l) = phase;
 	std::cout<<" Mkl "<<k<<" "<<l<<" "<<phase<<std::endl;
      }
    }
    invMkl = Mkl.inverse();
@@ -548,6 +553,7 @@ public:
 	tmat-=usecond();
 	linop.Op(phaV,MphaV);
 	tmat+=usecond();
 	std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
 	tproj-=usecond();
 	blockProject(coarseInner,MphaV,Subspace.subspace);
@@ -555,6 +561,7 @@ public:
 	ComputeProj[p] = coarseInner;
 	tproj+=usecond();
 	std::cout << i << " " <<p << " ComputeProj "<<norm2(ComputeProj[p])<<std::endl;
      }
@@ -563,6 +570,7 @@ public:
 	FT[k] = Zero();
 	for(int l=0;l<npoint;l++){
 	  FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
 	  std::cout << i << " " <<k <<" "<<l<< " FT "<<norm2(FT[k])<<" "<<invMkl(l,k)<<std::endl;
 	}
 	int osites=CoarseGrid()->oSites();
@@ -583,6 +591,10 @@ public:
      //      PopulateAdag();
    }
    for(int p=0;p<geom.npoint;p++){
      std::cout << " _A["<<p<<"] "<<norm2(_A[p])<<std::endl;
    }
    // Need to write something to populate Adag from A
    ExchangeCoarseLinks();
    std::cout << GridLogMessage<<"CoarsenOperator eigen  "<<teigen<<" us"<<std::endl;
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
@@ -51,15 +51,15 @@ public:
  typedef iVector<CComplex,nbasis >  Cvec;
  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj >        FineField;
  typedef Lattice<CComplex >    FineComplexField;
  typedef CoarseVector Field;
  ////////////////////
  // Data members
  ////////////////////
  GridCartesian *       _CoarseGridMulti; 
  GridCartesian *       _CoarseGrid;
  GeneralCoarseOp &     _Op;
  NonLocalStencilGeometry geom;
  NonLocalStencilGeometry geom_srhs;
  PaddedCell Cell;
  GeneralLocalStencil Stencil;
@@ -77,20 +77,57 @@ public:
  GridBase      * Grid(void)           { return _CoarseGridMulti; };   // this is all the linalg routines need to know
  GridCartesian * CoarseGrid(void)     { return _CoarseGridMulti; };   // this is all the linalg routines need to know
-  MultiGeneralCoarsenedMatrix(GeneralCoarseOp & Op,GridCartesian *CoarseGridMulti) :
+  // Can be used to do I/O on the operator matrices externally
-    _Op(Op),
+  void SetMatrix (int p,CoarseMatrix & A)
-    _CoarseGrid(Op.CoarseGrid()),
+  {
    assert(A.size()==geom_srhs.npoint);
    GridtoBLAS(A[p],BLAS_A[p]);
  }
  // Reads the operator matrix for point index p back out of the BLAS-layout
  // storage: BLAS_A[p] is the input and A[p] is the output of BLAStoGrid.
  // Asserts that A has geom_srhs.npoint entries; p itself is not range-checked.
  // NOTE(review): A is declared as a single CoarseMatrix yet is used through
  // size() and operator[] -- confirm the intended parameter type.
  void GetMatrix (int p,CoarseMatrix & A)
  {
    assert(A.size()==geom_srhs.npoint);
    BLAStoGrid(A[p],BLAS_A[p]);
  }
  /*
  void CopyMatrix (GeneralCoarseOp &_Op)
  {
    for(int p=0;p<geom.npoint;p++){
      auto Aup = _Op.Cell.Extract(_Op._A[p]);
      //Unpadded
      GridtoBLAS(Aup,BLAS_A[p]);
    }
  }
  void CheckMatrix (GeneralCoarseOp &_Op)
  {
    std::cout <<"************* Checking the little direc operator mRHS"<<std::endl;
    for(int p=0;p<geom.npoint;p++){
      //Unpadded
      auto Aup = _Op.Cell.Extract(_Op._A[p]);
      auto Ack = Aup;
      BLAStoGrid(Ack,BLAS_A[p]);
      std::cout << p<<" Ack "<<norm2(Ack)<<std::endl;
      std::cout << p<<" Aup "<<norm2(Aup)<<std::endl;
    }
    std::cout <<"************* "<<std::endl;
  }
  */
  MultiGeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridCartesian *CoarseGridMulti) :
    _CoarseGridMulti(CoarseGridMulti),
-    geom(_CoarseGridMulti,Op.geom.hops,Op.geom.skip+1),
+    geom_srhs(_geom),
-    Cell(Op.geom.Depth(),_CoarseGridMulti),
+    geom(_CoarseGridMulti,_geom.hops,_geom.skip+1),
    Cell(geom.Depth(),_CoarseGridMulti),
    Stencil(Cell.grids.back(),geom.shifts) // padded cell stencil
  {
-    int32_t padded_sites   = _Op._A[0].Grid()->lSites();
+    int32_t padded_sites   = Cell.grids.back()->lSites();
-    int32_t unpadded_sites = _CoarseGrid->lSites();
+    int32_t unpadded_sites = CoarseGridMulti->lSites();
    int32_t nrhs  = CoarseGridMulti->FullDimensions()[0];  // # RHS
    int32_t orhs  = nrhs/CComplex::Nsimd();
    padded_sites   = padded_sites/nrhs;
    unpadded_sites = unpadded_sites/nrhs;
    /////////////////////////////////////////////////
    // Device data vector storage
    /////////////////////////////////////////////////
@@ -98,9 +135,9 @@ public:
    for(int p=0;p<geom.npoint;p++){
      BLAS_A[p].resize (unpadded_sites); // no ghost zone, npoint elements
    }
    BLAS_B.resize(nrhs *padded_sites);   // includes ghost zone
    BLAS_C.resize(nrhs *unpadded_sites); // no ghost zone
    BLAS_AP.resize(geom.npoint);
    BLAS_BP.resize(geom.npoint);
    for(int p=0;p<geom.npoint;p++){
@@ -113,21 +150,20 @@ public:
    // Pointers to data
    /////////////////////////////////////////////////
-    // Site identity mapping for A, C
+    // Site identity mapping for A
    for(int p=0;p<geom.npoint;p++){
      for(int ss=0;ss<unpadded_sites;ss++){
 	ComplexD *ptr = (ComplexD *)&BLAS_A[p][ss];
 	acceleratorPut(BLAS_AP[p][ss],ptr);
      }
    }
    // Site identity mapping for C
    for(int ss=0;ss<unpadded_sites;ss++){
      ComplexD *ptr = (ComplexD *)&BLAS_C[ss*nrhs];
      acceleratorPut(BLAS_CP[ss],ptr);
    }
    /////////////////////////////////////////////////
    // Neighbour table is more complicated
    /////////////////////////////////////////////////
    int32_t j=0; // Interior point counter (unpadded)
    for(int32_t s=0;s<padded_sites;s++){ // 4 volume, padded
      int ghost_zone=0;
@@ -150,18 +186,9 @@ public:
      }
    }
    assert(j==unpadded_sites);
    CopyMatrix();
  }
  template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
  {
 #if 0
    std::vector<typename vobj::scalar_object> tmp;
    unvectorizeToLexOrdArray(tmp,from);
    assert(tmp.size()==from.Grid()->lSites());
    assert(tmp.size()==to.size());
    to.resize(tmp.size());
    acceleratorCopyToDevice(&tmp[0],&to[0],sizeof(typename vobj::scalar_object)*tmp.size());
 #else
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
@@ -206,17 +233,9 @@ public:
 	to[w] = stmp;
      }
    });
 #endif
  }    
  template<class vobj> void BLAStoGrid(Lattice<vobj> &grid,deviceVector<typename vobj::scalar_object> &in)
  {
 #if 0
    std::vector<typename vobj::scalar_object> tmp;
    tmp.resize(in.size());
    assert(in.size()==grid.Grid()->lSites());
    acceleratorCopyFromDevice(&in[0],&tmp[0],sizeof(typename vobj::scalar_object)*in.size());
    vectorizeFromLexOrdArray(tmp,grid);
 #else
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
@@ -261,15 +280,152 @@ public:
 	putlane(to[w], stmp, to_lane);
      }
    });
 #endif
  }
-  void CopyMatrix (void)
+  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
 		       Aggregation<Fobj,CComplex,nbasis> & Subspace,
 		       GridBase *CoarseGrid)
  {
-    for(int p=0;p<geom.npoint;p++){
+    std::cout << GridLogMessage<< "GeneralCoarsenMatrixMrhs "<< std::endl;
-      //Unpadded
+
-      auto Aup = _Op.Cell.Extract(_Op._A[p]);
+    GridBase *grid = Subspace.FineGrid;
-      GridtoBLAS(Aup,BLAS_A[p]);
+
    /////////////////////////////////////////////////////////////
    // Orthogonalise the subblocks over the basis
    /////////////////////////////////////////////////////////////
    CoarseScalar InnerProd(CoarseGrid); 
    blockOrthogonalise(InnerProd,Subspace.subspace);
    const int npoint = geom_srhs.npoint;
    Coordinate clatt = CoarseGrid->GlobalDimensions();
    int Nd = CoarseGrid->Nd();
      /*
       *     Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
       *     Matrix index i is mapped to this shift via 
       *               geom.shifts[i]
       *
       *     conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block] 
       *       =  \sum_{l in ball}  e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} > 
       *       =  \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
       *       = M_{kl} A_ji^{b.b+l}
       *
       *     Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
       *  
       *     Where q_k = delta_k . (2*M_PI/global_nb[mu])
       *
       *     Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
       */
    Eigen::MatrixXcd Mkl    = Eigen::MatrixXcd::Zero(npoint,npoint);
    Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
    ComplexD ci(0.0,1.0);
    for(int k=0;k<npoint;k++){ // Loop over momenta
      for(int l=0;l<npoint;l++){ // Loop over nbr relative
 	ComplexD phase(0.0,0.0);
 	for(int mu=0;mu<Nd;mu++){
 	  RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
 	  phase=phase+TwoPiL*geom_srhs.shifts[k][mu]*geom_srhs.shifts[l][mu];
 	}
 	phase=exp(phase*ci);
 	Mkl(k,l) = phase;
      }
    }
    invMkl = Mkl.inverse();
    ///////////////////////////////////////////////////////////////////////
    // Now compute the matrix elements of linop between the orthonormal
    // set of vectors.
    ///////////////////////////////////////////////////////////////////////
    FineField phaV(grid); // Phased block basis vector
    FineField MphaV(grid);// Matrix applied
    std::vector<FineComplexField> phaF(npoint,grid);
    std::vector<CoarseComplexField> pha(npoint,CoarseGrid);
    CoarseVector coarseInner(CoarseGrid);
    typedef typename CComplex::scalar_type SComplex;
    FineComplexField one(grid); one=SComplex(1.0);
    FineComplexField zz(grid); zz = Zero();
    for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
      /////////////////////////////////////////////////////
      // Stick a phase on every block
      /////////////////////////////////////////////////////
      CoarseComplexField coor(CoarseGrid);
      pha[p]=Zero();
      for(int mu=0;mu<Nd;mu++){
 	LatticeCoordinate(coor,mu);
 	RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
 	pha[p] = pha[p] + (TwoPiL * geom_srhs.shifts[p][mu]) * coor;
      }
      pha[p]  =exp(pha[p]*ci);	
      blockZAXPY(phaF[p],pha[p],one,zz);
    }
    // Could save on storage here
    std::vector<CoarseMatrix> _A;
    _A.resize(geom_srhs.npoint,CoarseGrid);
    std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid);
    CoarseVector          FT(CoarseGrid);
    for(int i=0;i<nbasis;i++){// Loop over basis vectors
      std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
 	phaV = phaF[p]*Subspace.subspace[i];
 	/////////////////////////////////////////////////////////////////////
 	// Multiply phased subspace vector by matrix and project to subspace
 	// Remove local bulk phase to leave relative phases
 	/////////////////////////////////////////////////////////////////////
 	linop.Op(phaV,MphaV);
 	// Fixme, could use batched block projector here
 	blockProject(coarseInner,MphaV,Subspace.subspace);
 	coarseInner = conjugate(pha[p]) * coarseInner;
 	ComputeProj[p] = coarseInner;
      }
      for(int k=0;k<npoint;k++){
 	FT = Zero();
 	for(int l=0;l<npoint;l++){
 	  FT= FT+ invMkl(l,k)*ComputeProj[l];
 	}
 	int osites=CoarseGrid->oSites();
 	autoView( A_v  , _A[k], AcceleratorWrite);
 	autoView( FT_v  , FT, AcceleratorRead);
 	accelerator_for(sss, osites, 1, {
 	    for(int j=0;j<nbasis;j++){
 	      A_v[sss](i,j) = FT_v[sss](j);
 	    }
        });
      }
    }
    // Only needed if nonhermitian
    //    if ( ! hermitian ) {
    //      std::cout << GridLogMessage<<"PopulateAdag  "<<std::endl;
    //      PopulateAdag();
    //    }
    // Need to write something to populate Adag from A
    for(int p=0;p<geom_srhs.npoint;p++){
      GridtoBLAS(_A[p],BLAS_A[p]);
    }
    /*
 Grid : Message : 11698.730546 s : CoarsenOperator eigen  1334 us
 Grid : Message : 11698.730563 s : CoarsenOperator phase  34729 us
 Grid : Message : 11698.730565 s : CoarsenOperator phaseBZ 2423814 us
 Grid : Message : 11698.730566 s : CoarsenOperator mat    127890998 us
 Grid : Message : 11698.730567 s : CoarsenOperator proj   515840840 us
 Grid : Message : 11698.730568 s : CoarsenOperator inv    103948313 us
 Takes 600s to compute matrix elements, DOMINATED by the block project.
 Easy to speed up with the batched block project.
 Store npoint vectors, get npoint x Nbasis block projection, and 81 fold faster.
     */
  }
  void Mdag(const CoarseVector &in, CoarseVector &out)
  {
@@ -302,16 +458,17 @@ public:
    const int Nsimd = CComplex::Nsimd();
    int64_t nrhs  =pin.Grid()->GlobalDimensions()[0];
    assert(nrhs>=1);
    RealD flops,bytes;
    int64_t osites=in.Grid()->oSites(); // unpadded
-    int64_t unpadded_vol = _CoarseGrid->lSites();
+    int64_t unpadded_vol = CoarseGrid()->lSites()/nrhs;
    flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
    bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
          + 2.0*osites*sizeof(siteVector)*npoint;
    int64_t nrhs  =pin.Grid()->GlobalDimensions()[0];
    assert(nrhs>=1);
    t_GtoB=-usecond();
    GridtoBLAS(pin,BLAS_B);
@@ -339,7 +496,7 @@ public:
    BLAStoGrid(out,BLAS_C);
    t_BtoG+=usecond();
    t_tot+=usecond();
-
+    /*
    std::cout << GridLogMessage << "New Mrhs coarse DONE "<<std::endl;
    std::cout << GridLogMessage<<"Coarse Mult exch "<<t_exch<<" us"<<std::endl;
    std::cout << GridLogMessage<<"Coarse Mult mult "<<t_mult<<" us"<<std::endl;
@@ -351,12 +508,12 @@ public:
    std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/t_mult<<" mflop/s"<<std::endl;
    std::cout << GridLogMessage<<"Coarse Kernel bytes/s "<< bytes/t_mult/1000<<" GB/s"<<std::endl;
    std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
    */
    //    std::cout << GridLogMessage<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
  };
  // Not implemented for this operator: the body is only assert(0), which
  // compiles away when NDEBUG is defined, leaving out untouched.
  virtual void Mdiag(const Field &in, Field &out)
  {
    assert(0);
  }
  // Not implemented for this operator: the body is only assert(0), which
  // compiles away when NDEBUG is defined, leaving out untouched.
  virtual void Mdir(const Field &in, Field &out, int dir, int disp)
  {
    assert(0);
  }
  // Not implemented for this operator: the body is only assert(0), which
  // compiles away when NDEBUG is defined, leaving out untouched.
  virtual void MdirAll(const Field &in, std::vector<Field> &out)
  {
    assert(0);
  }
 };
 NAMESPACE_END(Grid);
--- a/43
+++ b/43
@@ -1,6 +1,44 @@
- - Slice sum optimisation & A2A - atomic addition
+i) Clean up CoarsenedMatrix, GeneralCoarsenedMatrix, GeneralCoarsenedMatrixMultiRHS
 -- Ideally want a SINGLE implementation that does MultiRHS **AND** works with one RHS.
 -- -- Getting there. One RHS is hard due to vectorisation & hardwired coarse5d layout
 -- Compromise: Wrap it in a copy in/out for a slice.
 -- Bad for Lanczos: need to do a BLOCK Lanczos instead. Longer term.
 -- **** Make the test do ONLY the single RHS. ****
 -- I/O for the matrix elements required.
 -- Make the Adef2 build an eigenvector deflater and a block projector
 -- 
 -- Work with Regensburg on tests.
 -- Plan interface preserving the coarsened matrix interface (??)
 -- Move functionality from GeneralCoarsenedMatrix INTO GeneralCoarsenedMatrixMultiRHS -- DONE
   -- Don't immediately delete original
   -- Instead make the new one self contained, then delete.
   -- New DWF inverter test.
  // void PopulateAdag(void)
  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop, Aggregation<Fobj,CComplex,nbasis> & Subspace) -- DONE
  ExchangeCoarseLinks();
 iii) Aurora -- Christoph's problem -- DONE
     Aurora -- Carleton's problem staggered.
 iv) Dennis merge and test Aurora -- DONE (save test)
 v) Merge Ed Bennet's request --DONE 
 vi) Repro CG  -- get down to the level of single node testing via split grid test 
 =========================
 ===============
 - - Slice sum optimisation & A2A - atomic addition -- Dennis
 - - Also faster non-atomic reduction
 - - Remaining PRs
 - - DDHMC
  - - MixedPrec is the action eval, high precision
  - - MixedPrecCleanup is the force eval, low precision
@@ -17,7 +55,6 @@ DDHMC
 -- Multishift Mixed Precision - DONE
 -- Pole dependent residual  - DONE
 =======
 -- comms threads issue??
 -- Part done: Staggered kernel performance on GPU
--- a/tests/debug/Test_general_coarse_hdcg_phys48.cc
+++ b/tests/debug/Test_general_coarse_hdcg_phys48.cc
@@ -208,9 +208,6 @@ public:
 };
 gridblasHandle_t GridBLAS::gridblasHandle;
 int            GridBLAS::gridblasInit;
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
@@ -281,7 +278,6 @@ int main (int argc, char ** argv)
  typedef LittleDiracOperator::CoarseVector CoarseVector;
  NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
  NearestStencilGeometry5D geom_nn(Coarse5d);
  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
@@ -309,75 +305,12 @@ int main (int argc, char ** argv)
      LoadBasis(Aggregates,subspace_file);
    }
  } else {
    // NBASIS=40
    // Best so far: ord 2000 [0.01,95], 500,500  -- 466 iters
    // slurm-398626.out:Grid : Message : 141.295253 s : 500 filt [1] <n|MdagM|n> 0.000103622063
    //Grid : Message : 33.870465 s :  Chebyshev subspace pass-1 : ord 2000 [0.001,95]
    //Grid : Message : 33.870485 s :  Chebyshev subspace pass-2 : nbasis40 min 1000 step 1000 lo0
    //slurm-1482200.out : filt ~ 0.004 -- not as low mode projecting -- took 626 iters
    // To try: 2000 [0.1,95]  ,2000,500,500 -- slurm-1482213.out 586 iterations
    // To try: 2000 [0.01,95] ,2000,500,500 -- 469 (think I bumped 92 to 95) (??)
    // To try: 2000 [0.025,95],2000,500,500
    // To try: 2000 [0.005,95],2000,500,500
    // NBASIS=44 -- HDCG paper was 64 vectors; AMD compiler craps out at 48
    // To try: 2000 [0.01,95] ,2000,500,500 -- 419 lowest slurm-1482355.out
    // To try: 2000 [0.025,95] ,2000,500,500 -- 487 
    // To try: 2000 [0.005,95] ,2000,500,500
    /*
      Smoother [3,92] order 16
 slurm-1482355.out:Grid : Message : 35.239686 s :  Chebyshev subspace pass-1 : ord 2000 [0.01,95]
 slurm-1482355.out:Grid : Message : 35.239714 s :  Chebyshev subspace pass-2 : nbasis44 min 500 step 500 lo0
 slurm-1482355.out:Grid : Message : 5561.305552 s : HDCG: Pcg converged in 419 iterations and 2616.202598 s
 slurm-1482367.out:Grid : Message : 43.157235 s :  Chebyshev subspace pass-1 : ord 2000 [0.025,95]
 slurm-1482367.out:Grid : Message : 43.157257 s :  Chebyshev subspace pass-2 : nbasis44 min 500 step 500 lo0
 slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 iterations and 3131.185821 s
    */
 		 /*
 		   Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
 				       95.0,0.0075,
 				       2500,
 				       500,
 				       500,
 				       0.0);
 		 */
 		 /*
 		   Aggregates.CreateSubspaceChebyshevPowerLaw(RNG5,HermOpEO,nbasis,
 							      95.0,
 							      2000);
 		 */
    Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO,
 					0.0003,1.0e-5,2000); // Lo, tol, maxit
  /*
    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
 				       95.0,0.05,
 				       2000,
 				       500,
 				       500,
 				       0.0);
 */
    /*
      Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
 				       95.0,0.01,
 				       2000,
 				       500,
 				       500,
 				       0.0);
    */
    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500); -- running slurm-1484934.out nbasis 56
    //    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500); <== last run
    SaveBasis(Aggregates,subspace_file);
  }
  MemoryManager::Print();
  if(refine){
    if ( load_refine ) {
@@ -388,15 +321,15 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it
      SaveBasis(Aggregates,refine_file);
    }
  }
-  MemoryManager::Print();
+
  Aggregates.Orthogonalise();
  if ( load_mat ) {
    LoadOperator(LittleDiracOp,ldop_file);
  } else {
    LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates);
-    SaveOperator(LittleDiracOp,ldop_file);
+    //    SaveOperator(LittleDiracOp,ldop_file);
  }
-
+  
  // I/O test:
  CoarseVector c_src(Coarse5d);   random(CRNG,c_src);
  CoarseVector c_res(Coarse5d); 
@@ -428,31 +361,42 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it
    std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
  }
-  
+  //////////////////////////////////////
-  // Try projecting to one hop only
+  // mrhs coarse operator
-  //  LittleDiracOp.ShiftMatrix(1.0e-4);
+  //  Create a higher dim coarse grid
-  //  LittleDiracOperator LittleDiracOpProj(geom_nn,FrbGrid,Coarse5d);
+  //////////////////////////////////////////////////////////////////////////////////////
  //  LittleDiracOpProj.ProjectNearestNeighbour(0.01,LittleDiracOp); // smaller shift 0.02? n
-  typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix;
+  std::cout << "**************************************"<<std::endl;
-  HermMatrix CoarseOp     (LittleDiracOp);
+  std::cout << "Building MultiRHS Coarse operator"<<std::endl;
-  //  HermMatrix CoarseOpProj (LittleDiracOpProj);
+  std::cout << "**************************************"<<std::endl;
  ConjugateGradient<CoarseVector>  coarseCG(4.0e-2,20000,true);
  const int nrhs=vComplex::Nsimd()*3;
  Coordinate mpi=GridDefaultMpi();
  Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
  Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
  Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});
  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi); 
  //  MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs);
  typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t;
  MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs);
  //  mrhs.CopyMatrix(LittleDiracOp);
  //  mrhs.SetMatrix(LittleDiracOp.);
  mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d);
  //  mrhs.CheckMatrix(LittleDiracOp);
  MemoryManager::Print();
  //////////////////////////////////////////
  // Build a coarse lanczos
  //////////////////////////////////////////
-  //  Chebyshev<CoarseVector>      IRLCheby(0.012,40.0,201);  //500 HDCG iters
+  std::cout << "**************************************"<<std::endl;
-  //  int Nk=512; // Didn't save much
+  std::cout << "Building Coarse Lanczos               "<<std::endl;
-  //  int Nm=640;
+  std::cout << "**************************************"<<std::endl;
  //  int Nstop=400;
-  //  Chebyshev<CoarseVector>      IRLCheby(0.005,40.0,201);  //319 HDCG iters @ 128//160 nk.
+  typedef HermitianLinearOperator<LittleDiracOperator,CoarseVector> HermMatrix;
-  //  int Nk=128;
+  HermMatrix CoarseOp     (LittleDiracOp);
  //  int Nm=160;
  //  Chebyshev<CoarseVector>      IRLCheby(0.005,40.0,201);  //319 HDCG iters @ 128//160 nk.
  //  Chebyshev<CoarseVector>      IRLCheby(0.04,40.0,201); 
  int Nk=192;
  int Nm=256;
  int Nstop=Nk;
@@ -491,121 +435,13 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it
  ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false);
  ZeroGuesser<CoarseVector> CoarseZeroGuesser;
  //  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser);
  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser);
  c_res=Zero();
  //  HPDSolve(c_src,c_res); c_ref = c_res;
  //  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
  //  std::cout << GridLogMessage<<"ref norm "<<norm2(c_ref)<<std::endl;
  //////////////////////////////////////////////////////////////////////////
  // Deflated (with real op EV's) solve for the projected coarse op
  // Work towards ADEF1 in the coarse space
  //////////////////////////////////////////////////////////////////////////
  //  HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser);
  //  c_res=Zero();
  //  HPDSolveProj(c_src,c_res);
  //  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
  //  std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl;
  //  c_res = c_res - c_ref;
  //  std::cout << "Projected solver error "<<norm2(c_res)<<std::endl;
-  //////////////////////////////////////////////////////////////////////
+  /////////// MRHS test .////////////
  // Coarse ADEF1 with deflation space
  //////////////////////////////////////////////////////////////////////
  //  ChebyshevSmoother<CoarseVector >  CoarseSmoother(1.0,37.,8,CoarseOpProj);  // just go to sloppy 0.1 convergence
    //  CoarseSmoother(0.1,37.,8,CoarseOpProj);  //
  //  CoarseSmoother(0.5,37.,6,CoarseOpProj);  //  8 iter 0.36s
  //    CoarseSmoother(0.5,37.,12,CoarseOpProj);  // 8 iter, 0.55s
  //    CoarseSmoother(0.5,37.,8,CoarseOpProj);// 7-9 iter
  //  CoarseSmoother(1.0,37.,8,CoarseOpProj); // 0.4 - 0.5s solve to 0.04, 7-9 iter
  //  ChebyshevSmoother<CoarseVector,HermMatrix > CoarseSmoother(0.5,36.,10,CoarseOpProj);  // 311
  ////////////////////////////////////////////////////////
  // CG, Cheby mode spacing 200,200
  // Unprojected Coarse CG solve to 1e-8 : 190 iters, 4.9s
  // Unprojected Coarse CG solve to 4e-2 :  33 iters, 0.8s
  // Projected Coarse CG solve to 1e-8 : 100 iters, 0.36s
  ////////////////////////////////////////////////////////
  // CoarseSmoother(1.0,48.,8,CoarseOpProj); 48 evecs 
  ////////////////////////////////////////////////////////
  // ADEF1 Coarse solve to 1e-8 : 44 iters, 2.34s  2.1x gain
  // ADEF1 Coarse solve to 4e-2 : 7 iters, 0.4s
  // HDCG 38 iters 162s
  //
  // CoarseSmoother(1.0,40.,8,CoarseOpProj); 48 evecs 
  // ADEF1 Coarse solve to 1e-8 : 37 iters, 2.0s  2.1x gain
  // ADEF1 Coarse solve to 4e-2 : 6 iters, 0.36s
  // HDCG 38 iters 169s
 					       /*
  TwoLevelADEF1defl<CoarseVector>
    cADEF1(1.0e-8, 500,
 	   CoarseOp,
 	   CoarseSmoother,
 	   evec,eval);
 					       */
  //  c_res=Zero();
  //  cADEF1(c_src,c_res);
  //  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
  //  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
  //  c_res = c_res - c_ref;
  //  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
  //  cADEF1.Tolerance = 4.0e-2;
  //  cADEF1.Tolerance = 1.0e-1;
  //  cADEF1.Tolerance = 5.0e-2;
  //  c_res=Zero();
  //  cADEF1(c_src,c_res);
  //  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
  //  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
  //  c_res = c_res - c_ref;
  //  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
  //////////////////////////////////////////
  // Build a smoother
  //////////////////////////////////////////
  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(10.0,100.0,10,FineHermOp); //499
  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(3.0,100.0,10,FineHermOp);  //383
  //  ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(1.0,100.0,10,FineHermOp);  //328
  //  std::vector<RealD> los({0.5,1.0,3.0}); // 147/142/146 nbasis 1
  //  std::vector<RealD> los({1.0,2.0}); // Nbasis 24: 88,86 iterations
  //  std::vector<RealD> los({2.0,4.0}); // Nbasis 32 == 52, iters
  //  std::vector<RealD> los({2.0,4.0}); // Nbasis 40 == 36,36 iters
  //
  // Turns approx 2700 iterations into 340 fine multiplies with Nbasis 40
  // Need to measure cost of coarse space.
  //
  // -- i) Reduce coarse residual   -- 0.04
  // -- ii) Lanczos on coarse space -- done
  // -- iii) Possible 1 hop project and/or preconditioning it - easy - PrecCG it and
  //         use a limited stencil. Reread BFM code to check on evecs / deflation strategy with prec
  //
  //
  //
  //
  MemoryManager::Print();
  //////////////////////////////////////
  // mrhs coarse solve
  //  Create a higher dim coarse grid
  //////////////////////////////////////////////////////////////////////////////////////
  ConjugateGradient<CoarseVector>  coarseCG(4.0e-2,20000,true);
  const int nrhs=vComplex::Nsimd()*3;
  Coordinate mpi=GridDefaultMpi();
  Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
  Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
  Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});
  GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi); 
  MultiGeneralCoarsenedMatrix mrhs(LittleDiracOp,CoarseMrhs);
  typedef decltype(mrhs) MultiGeneralCoarsenedMatrix_t;
  typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
  MrhsHermMatrix MrhsCoarseOp     (mrhs);
-  MemoryManager::Print();
+
 #if 1
  { 
    CoarseVector rh_res(CoarseMrhs);
@@ -644,6 +480,7 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it
      InsertSlice(c_src,rh_src,r,0);
    }
    std::cout << " Calling the multiRHS coarse CG"<<std::endl;
    coarseCG(MrhsCoarseOp,rh_src,rh_res);
    //redo with block CG ?
@@ -666,47 +503,11 @@ slurm-1482367.out:Grid : Message : 6169.469330 s : HDCG: Pcg converged in 487 it
  //////////////////////////////////////
  // fine solve
  //////////////////////////////////////
  //  std::vector<RealD> los({2.0,2.5}); // Nbasis 40 == 36,36 iters
  //  std::vector<RealD> los({2.0});
  //  std::vector<RealD> los({2.5});
  //  std::vector<int> ords({7,8,10}); // Nbasis 40 == 40,38,36 iters (320,342,396 mults)
  //  std::vector<int> ords({7}); // Nbasis 40 == 40 iters (320 mults)
  //  std::vector<int> ords({9}); // Nbasis 40 == 40 iters (320 mults)  
  // 148 outer				       
       //  std::vector<RealD> los({1.0});
       //  std::vector<int> ords({24}); 
  // 162 outer				       
       //  std::vector<RealD> los({2.5});
       //  std::vector<int> ords({9}); 
  // ??? outer				       
  std::vector<RealD> los({2.0});
  std::vector<int> ords({7}); 
 /*
   Smoother opt @56 nbasis, 0.04 convergence, 192 evs
 ord lo
 16   0.1  no converge -- likely sign indefinite
 32   0.1  no converge -- likely sign indefinite(?)
 16   0.5  422
 32   0.5  302
 8   1.0  575
 12  1.0  449
 16  1.0  375
 32  1.0  302
 12  3.0  476
 16  3.0  319
 32  3.0  306
 Powerlaw setup 62 vecs
 slurm-1494943.out:Grid : Message : 4874.186617 s : HDCG: Pcg converged in 171 iterations and 1706.548006 s 1.0 32
 slurm-1494943.out:Grid : Message : 6490.121648 s : HDCG: Pcg converged in 194 iterations and 1616.219654 s 1.0 16
@@ -727,38 +528,7 @@ slurm-1494242.out:Grid : Message : 6588.727977 s : HDCG: Pcg converged in 205 it
 -- CG smoother    O(16): 290
 -- Cheby smoother O(16): 218 -- getting close to the deflation level I expect 169 from BFM paper @O(7) smoother and 64 nbasis
 Grid : Message : 2790.797194 s : HDCG: Pcg converged in 190 iterations and 1049.563182 s 1.0 32
 Grid : Message : 3766.374396 s : HDCG: Pcg converged in 218 iterations and 975.455668 s  1.0 16
 Grid : Message : 4888.746190 s : HDCG: Pcg converged in 191 iterations and 1122.252055 s 0.5 32
 Grid : Message : 5956.679661 s : HDCG: Pcg converged in 231 iterations and 1067.812850 s 0.5 16
 Grid : Message : 2767.405829 s : HDCG: Pcg converged in 218 iterations and 967.214067 s -- 16
 Grid : Message : 3816.165905 s : HDCG: Pcg converged in 251 iterations and 1048.636269 s -- 12
 Grid : Message : 5121.206572 s : HDCG: Pcg converged in 318 iterations and 1304.916168 s -- 8
 [paboyle@login2.crusher debug]$ grep -v Memory slurm-402426.out  | grep converged | grep HDCG -- [1.0,16] cheby
 Grid : Message : 5185.521063 s : HDCG: Pcg converged in 377 iterations and 1595.843529 s
 [paboyle@login2.crusher debug]$ grep HDCG  slurm-402184.out | grep onver
 Grid : Message : 3760.438160 s : HDCG: Pcg converged in 422 iterations and 2129.243141 s
 Grid : Message : 5660.588015 s : HDCG: Pcg converged in 308 iterations and 1900.026821 s
 Grid : Message : 4238.206528 s : HDCG: Pcg converged in 575 iterations and 2657.430676 s
 Grid : Message : 6345.880344 s : HDCG: Pcg converged in 449 iterations and 2108.505208 s
 grep onverg slurm-401663.out | grep HDCG
 Grid : Message : 3900.817781 s : HDCG: Pcg converged in 476 iterations and 1992.591311 s
 Grid : Message : 5647.202699 s : HDCG: Pcg converged in 306 iterations and 1746.838660 s
 [paboyle@login2.crusher debug]$ grep converged slurm-401775.out | grep HDCG
 Grid : Message : 3583.177025 s : HDCG: Pcg converged in 375 iterations and 1800.896037 s
 Grid : Message : 5348.342243 s : HDCG: Pcg converged in 302 iterations and 1765.045018 s
 Conclusion: higher order smoother is doing better. Much better. Use a Krylov smoother instead (Mirs, as in the BFM version).
 */
 				      //
  MemoryManager::Print();
@@ -774,14 +544,6 @@ Conclusion: higher order smoother is doing better. Much better. Use a Krylov smo
      //    ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,10,FineHermOp); // 36 best case
      ChebyshevSmoother<LatticeFermionD > ChebySmooth(lo,95,ords[o],FineHermOp);  // 311
      /*
       * CG smooth 11 iter: 
       slurm-403825.out:Grid : Message : 4369.824339 s : HDCG: fPcg converged in 215 iterations 3.0
       slurm-403908.out:Grid : Message : 3955.897470 s : HDCG: fPcg converged in 236 iterations 1.0
       slurm-404273.out:Grid : Message : 3843.792191 s : HDCG: fPcg converged in 210 iterations 2.0
       * CG smooth 9 iter: 
      */
      //
      RealD MirsShift = lo;
      ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
      CGSmoother<LatticeFermionD> CGsmooth(ords[o],ShiftedFineHermOp) ;
@@ -820,16 +582,14 @@ Conclusion: higher order smoother is doing better. Much better. Use a Krylov smo
 		 CoarseMrhs,        // Grid needed to Mrhs grid
 		 Aggregates);
  MemoryManager::Print();
      std::cout << "Calling mRHS HDCG"<<std::endl;
      FrbGrid->Barrier();
  MemoryManager::Print();
      std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid);
      std::cout << " mRHS source"<<std::endl;
      std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid);
      std::cout << " mRHS result"<<std::endl;
-  MemoryManager::Print();
+
  random(RNG5,src_mrhs[0]);
  for(int r=0;r<nrhs;r++){
 	if(r>0)src_mrhs[r]=src_mrhs[0];