Simplifying the MultiRHS solver to make it do SRHS *and* MRHS

2025-08-18 12:11:53 +01:00 · 2024-03-06 14:04:33 -05:00
parent ee3b3c4c56
commit 070b61f08f
5 changed files with 287 additions and 478 deletions
--- a/Grid/algorithms/iterative/Deflation.h
+++ b/Grid/algorithms/iterative/Deflation.h
@@ -1,157 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_DEFLATION_H
-#define GRID_DEFLATION_H
-
-namespace Grid { 
-
-template<class Field>
-class ZeroGuesser: public LinearFunction<Field> {
-public:
-  using LinearFunction<Field>::operator();
-    virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
-};
-template<class Field>
-class DoNothingGuesser: public LinearFunction<Field> {
-public:
-  using LinearFunction<Field>::operator();
-  virtual void operator()(const Field &src, Field &guess) {  };
-};
-template<class Field>
-class SourceGuesser: public LinearFunction<Field> {
-public:
-  using LinearFunction<Field>::operator();
-  virtual void operator()(const Field &src, Field &guess) { guess = src; };
-};
-
-////////////////////////////////
-// Fine grid deflation
-////////////////////////////////
-template<class Field>
-class DeflatedGuesser: public LinearFunction<Field> {
-private:
-  const std::vector<Field> &evec;
-  const std::vector<RealD> &eval;
-  const unsigned int       N;
-
-public:
-  using LinearFunction<Field>::operator();
-
-  DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
-  : DeflatedGuesser(_evec, _eval, _evec.size())
-  {}
-
-  DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
-  : evec(_evec), eval(_eval), N(_N)
-  {
-    assert(evec.size()==eval.size());
-    assert(N <= evec.size());
-  } 
-
-  virtual void operator()(const Field &src,Field &guess) {
-    guess = Zero();
-    for (int i=0;i<N;i++) {
-      const Field& tmp = evec[i];
-      axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
-    }
-    guess.Checkerboard() = src.Checkerboard();
-  }
-};
-
-template<class FineField, class CoarseField>
-class LocalCoherenceDeflatedGuesser: public LinearFunction<FineField> {
-private:
-  const std::vector<FineField>   &subspace;
-  const std::vector<CoarseField> &evec_coarse;
-  const std::vector<RealD>       &eval_coarse;
-public:
-  
-  using LinearFunction<FineField>::operator();
-  LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace,
-				const std::vector<CoarseField> &_evec_coarse,
-				const std::vector<RealD>       &_eval_coarse)
-    : subspace(_subspace), 
-      evec_coarse(_evec_coarse), 
-      eval_coarse(_eval_coarse)  
-  {
-  }
-  
-  void operator()(const FineField &src,FineField &guess) { 
-    int N = (int)evec_coarse.size();
-    CoarseField src_coarse(evec_coarse[0].Grid());
-    CoarseField guess_coarse(evec_coarse[0].Grid());    guess_coarse = Zero();
-    blockProject(src_coarse,src,subspace);    
-    for (int i=0;i<N;i++) {
-      const CoarseField & tmp = evec_coarse[i];
-      axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
-    }
-    blockPromote(guess_coarse,guess,subspace);
-    guess.Checkerboard() = src.Checkerboard();
-  };
-
-  void operator()(const std::vector<FineField> &src,std::vector<FineField> &guess) {
-    int Nevec = (int)evec_coarse.size();
-    int Nsrc = (int)src.size();
-    // make temp variables
-    std::vector<CoarseField> src_coarse(Nsrc,evec_coarse[0].Grid());
-    std::vector<CoarseField> guess_coarse(Nsrc,evec_coarse[0].Grid());    
-    //Preporcessing
-    std::cout << GridLogMessage << "Start BlockProject for loop" << std::endl;
-    for (int j=0;j<Nsrc;j++)
-    {
-    guess_coarse[j] = Zero();
-    std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
-    blockProject(src_coarse[j],src[j],subspace);
-    }
-    //deflation set up for eigen vector batchsize 1 and source batch size equal number of sources
-    std::cout << GridLogMessage << "Start ProjectAccum for loop" << std::endl;
-    for (int i=0;i<Nevec;i++)
-    {
-      std::cout << GridLogMessage << "ProjectAccum Nvec: " << i << std::endl;
-      const CoarseField & tmp = evec_coarse[i];
-      for (int j=0;j<Nsrc;j++)
-      {
-        axpy(guess_coarse[j],TensorRemove(innerProduct(tmp,src_coarse[j])) / eval_coarse[i],tmp,guess_coarse[j]);
-      }
-    }
-    //postprocessing
-    std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
-    for (int j=0;j<Nsrc;j++)
-    {
-    std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
-    blockPromote(guess_coarse[j],guess[j],subspace);
-    guess[j].Checkerboard() = src[j].Checkerboard();
-    }
-  };
-
-  };
-
-
-
-}
-#endif
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
@@ -73,7 +73,7 @@ public:
  GridBase      * FineGrid(void)       { return _FineGrid; };   // this is all the linalg routines need to know
  GridCartesian * CoarseGrid(void)     { return _CoarseGrid; };   // this is all the linalg routines need to know

-  void ShiftMatrix(RealD shift)
+  /*  void ShiftMatrix(RealD shift)
  {
    int Nd=_FineGrid->Nd(); 
    Coordinate zero_shift(Nd,0);
@@ -102,6 +102,7 @@ public:
    assert(nfound==geom.npoint);
    ExchangeCoarseLinks();
  }
+  */
  
  GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
    : geom(_geom),
@@ -459,6 +460,9 @@ public:
    CoarseScalar InnerProd(CoarseGrid()); 
    blockOrthogonalise(InnerProd,Subspace.subspace);

+    for(int s=0;s<Subspace.subspace.size();s++){
+      std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
+    }
    const int npoint = geom.npoint;
      
    Coordinate clatt = CoarseGrid()->GlobalDimensions();
@@ -494,6 +498,7 @@ public:
 	}
 	phase=exp(phase*ci);
 	Mkl(k,l) = phase;
+	std::cout<<" Mkl "<<k<<" "<<l<<" "<<phase<<std::endl;
      }
    }
    invMkl = Mkl.inverse();
@@ -548,6 +553,7 @@ public:
 	tmat-=usecond();
 	linop.Op(phaV,MphaV);
 	tmat+=usecond();
+	std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;

 	tproj-=usecond();
 	blockProject(coarseInner,MphaV,Subspace.subspace);
@@ -555,6 +561,7 @@ public:

 	ComputeProj[p] = coarseInner;
 	tproj+=usecond();
+	std::cout << i << " " <<p << " ComputeProj "<<norm2(ComputeProj[p])<<std::endl;

      }

@@ -563,6 +570,7 @@ public:
 	FT[k] = Zero();
 	for(int l=0;l<npoint;l++){
 	  FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
+	  std::cout << i << " " <<k <<" "<<l<< " FT "<<norm2(FT[k])<<" "<<invMkl(l,k)<<std::endl;
 	}
      
 	int osites=CoarseGrid()->oSites();
@@ -583,6 +591,10 @@ public:
      //      PopulateAdag();
    }

+    for(int p=0;p<geom.npoint;p++){
+      std::cout << " _A["<<p<<"] "<<norm2(_A[p])<<std::endl;
+    }
+
    // Need to write something to populate Adag from A
    ExchangeCoarseLinks();
    std::cout << GridLogMessage<<"CoarsenOperator eigen  "<<teigen<<" us"<<std::endl;
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h
@@ -51,15 +51,15 @@ public:
  typedef iVector<CComplex,nbasis >  Cvec;
  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj >        FineField;
+  typedef Lattice<CComplex >    FineComplexField;
  typedef CoarseVector Field;

  ////////////////////
  // Data members
  ////////////////////
  GridCartesian *       _CoarseGridMulti; 
-  GridCartesian *       _CoarseGrid;
-  GeneralCoarseOp &     _Op;
  NonLocalStencilGeometry geom;
+  NonLocalStencilGeometry geom_srhs;
  PaddedCell Cell;
  GeneralLocalStencil Stencil;

@@ -77,20 +77,57 @@ public:
  GridBase      * Grid(void)           { return _CoarseGridMulti; };   // this is all the linalg routines need to know
  GridCartesian * CoarseGrid(void)     { return _CoarseGridMulti; };   // this is all the linalg routines need to know

-  MultiGeneralCoarsenedMatrix(GeneralCoarseOp & Op,GridCartesian *CoarseGridMulti) :
-    _Op(Op),
-    _CoarseGrid(Op.CoarseGrid()),
+  // Can be used to do I/O on the operator matrices externally
+  void SetMatrix (int p,CoarseMatrix & A)
+  {
+    assert(A.size()==geom_srhs.npoint);
+    GridtoBLAS(A[p],BLAS_A[p]);
+  }
+  void GetMatrix (int p,CoarseMatrix & A)
+  {
+    assert(A.size()==geom_srhs.npoint);
+    BLAStoGrid(A[p],BLAS_A[p]);
+  }
+  /*
+  void CopyMatrix (GeneralCoarseOp &_Op)
+  {
+    for(int p=0;p<geom.npoint;p++){
+      auto Aup = _Op.Cell.Extract(_Op._A[p]);
+      //Unpadded
+      GridtoBLAS(Aup,BLAS_A[p]);
+    }
+  }
+  void CheckMatrix (GeneralCoarseOp &_Op)
+  {
+    std::cout <<"************* Checking the little direc operator mRHS"<<std::endl;
+    for(int p=0;p<geom.npoint;p++){
+      //Unpadded
+      auto Aup = _Op.Cell.Extract(_Op._A[p]);
+      auto Ack = Aup;
+      BLAStoGrid(Ack,BLAS_A[p]);
+      std::cout << p<<" Ack "<<norm2(Ack)<<std::endl;
+      std::cout << p<<" Aup "<<norm2(Aup)<<std::endl;
+    }
+    std::cout <<"************* "<<std::endl;
+  }
+  */
+  
+  MultiGeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridCartesian *CoarseGridMulti) :
    _CoarseGridMulti(CoarseGridMulti),
-    geom(_CoarseGridMulti,Op.geom.hops,Op.geom.skip+1),
-    Cell(Op.geom.Depth(),_CoarseGridMulti),
+    geom_srhs(_geom),
+    geom(_CoarseGridMulti,_geom.hops,_geom.skip+1),
+    Cell(geom.Depth(),_CoarseGridMulti),
    Stencil(Cell.grids.back(),geom.shifts) // padded cell stencil
  {
-    int32_t padded_sites   = _Op._A[0].Grid()->lSites();
-    int32_t unpadded_sites = _CoarseGrid->lSites();
+    int32_t padded_sites   = Cell.grids.back()->lSites();
+    int32_t unpadded_sites = CoarseGridMulti->lSites();
    
    int32_t nrhs  = CoarseGridMulti->FullDimensions()[0];  // # RHS
    int32_t orhs  = nrhs/CComplex::Nsimd();

+    padded_sites   = padded_sites/nrhs;
+    unpadded_sites = unpadded_sites/nrhs;
+    
    /////////////////////////////////////////////////
    // Device data vector storage
    /////////////////////////////////////////////////
@@ -98,9 +135,9 @@ public:
    for(int p=0;p<geom.npoint;p++){
      BLAS_A[p].resize (unpadded_sites); // no ghost zone, npoint elements
    }
+    
    BLAS_B.resize(nrhs *padded_sites);   // includes ghost zone
    BLAS_C.resize(nrhs *unpadded_sites); // no ghost zone
-
    BLAS_AP.resize(geom.npoint);
    BLAS_BP.resize(geom.npoint);
    for(int p=0;p<geom.npoint;p++){
@@ -113,21 +150,20 @@ public:
    // Pointers to data
    /////////////////////////////////////////////////

-    // Site identity mapping for A, C
+    // Site identity mapping for A
    for(int p=0;p<geom.npoint;p++){
      for(int ss=0;ss<unpadded_sites;ss++){
 	ComplexD *ptr = (ComplexD *)&BLAS_A[p][ss];
 	acceleratorPut(BLAS_AP[p][ss],ptr);
      }
    }
+    // Site identity mapping for C
    for(int ss=0;ss<unpadded_sites;ss++){
      ComplexD *ptr = (ComplexD *)&BLAS_C[ss*nrhs];
      acceleratorPut(BLAS_CP[ss],ptr);
    }

-    /////////////////////////////////////////////////
    // Neighbour table is more complicated
-    /////////////////////////////////////////////////
    int32_t j=0; // Interior point counter (unpadded)
    for(int32_t s=0;s<padded_sites;s++){ // 4 volume, padded
      int ghost_zone=0;
@@ -150,18 +186,9 @@ public:
      }
    }
    assert(j==unpadded_sites);
-    CopyMatrix();
  }
  template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
  {
-#if 0
-    std::vector<typename vobj::scalar_object> tmp;
-    unvectorizeToLexOrdArray(tmp,from);
-    assert(tmp.size()==from.Grid()->lSites());
-    assert(tmp.size()==to.size());
-    to.resize(tmp.size());
-    acceleratorCopyToDevice(&tmp[0],&to[0],sizeof(typename vobj::scalar_object)*tmp.size());
-#else
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
@@ -206,17 +233,9 @@ public:
 	to[w] = stmp;
      }
    });
-#endif
  }    
  template<class vobj> void BLAStoGrid(Lattice<vobj> &grid,deviceVector<typename vobj::scalar_object> &in)
  {
-#if 0
-    std::vector<typename vobj::scalar_object> tmp;
-    tmp.resize(in.size());
-    assert(in.size()==grid.Grid()->lSites());
-    acceleratorCopyFromDevice(&in[0],&tmp[0],sizeof(typename vobj::scalar_object)*in.size());
-    vectorizeFromLexOrdArray(tmp,grid);
-#else
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
@@ -261,15 +280,152 @@ public:
 	putlane(to[w], stmp, to_lane);
      }
    });
-#endif
  }
-  void CopyMatrix (void)
+  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
+		       Aggregation<Fobj,CComplex,nbasis> & Subspace,
+		       GridBase *CoarseGrid)
  {
-    for(int p=0;p<geom.npoint;p++){
-      //Unpadded
-      auto Aup = _Op.Cell.Extract(_Op._A[p]);
-      GridtoBLAS(Aup,BLAS_A[p]);
+    std::cout << GridLogMessage<< "GeneralCoarsenMatrixMrhs "<< std::endl;
+
+    GridBase *grid = Subspace.FineGrid;
+
+    /////////////////////////////////////////////////////////////
+    // Orthogonalise the subblocks over the basis
+    /////////////////////////////////////////////////////////////
+    CoarseScalar InnerProd(CoarseGrid); 
+    blockOrthogonalise(InnerProd,Subspace.subspace);
+
+    const int npoint = geom_srhs.npoint;
+
+    Coordinate clatt = CoarseGrid->GlobalDimensions();
+    int Nd = CoarseGrid->Nd();
+      /*
+       *     Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
+       *     Matrix index i is mapped to this shift via 
+       *               geom.shifts[i]
+       *
+       *     conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block] 
+       *       =  \sum_{l in ball}  e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} > 
+       *       =  \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
+       *       = M_{kl} A_ji^{b.b+l}
+       *
+       *     Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
+       *  
+       *     Where q_k = delta_k . (2*M_PI/global_nb[mu])
+       *
+       *     Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
+       */
+    Eigen::MatrixXcd Mkl    = Eigen::MatrixXcd::Zero(npoint,npoint);
+    Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
+    ComplexD ci(0.0,1.0);
+    for(int k=0;k<npoint;k++){ // Loop over momenta
+
+      for(int l=0;l<npoint;l++){ // Loop over nbr relative
+	ComplexD phase(0.0,0.0);
+	for(int mu=0;mu<Nd;mu++){
+	  RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
+	  phase=phase+TwoPiL*geom_srhs.shifts[k][mu]*geom_srhs.shifts[l][mu];
+	}
+	phase=exp(phase*ci);
+	Mkl(k,l) = phase;
+      }
    }
+    invMkl = Mkl.inverse();
+
+    ///////////////////////////////////////////////////////////////////////
+    // Now compute the matrix elements of linop between the orthonormal
+    // set of vectors.
+    ///////////////////////////////////////////////////////////////////////
+    FineField phaV(grid); // Phased block basis vector
+    FineField MphaV(grid);// Matrix applied
+    std::vector<FineComplexField> phaF(npoint,grid);
+    std::vector<CoarseComplexField> pha(npoint,CoarseGrid);
+    
+    CoarseVector coarseInner(CoarseGrid);
+    
+    typedef typename CComplex::scalar_type SComplex;
+    FineComplexField one(grid); one=SComplex(1.0);
+    FineComplexField zz(grid); zz = Zero();
+    for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
+      /////////////////////////////////////////////////////
+      // Stick a phase on every block
+      /////////////////////////////////////////////////////
+      CoarseComplexField coor(CoarseGrid);
+      pha[p]=Zero();
+      for(int mu=0;mu<Nd;mu++){
+	LatticeCoordinate(coor,mu);
+	RealD TwoPiL =  M_PI * 2.0/ clatt[mu];
+	pha[p] = pha[p] + (TwoPiL * geom_srhs.shifts[p][mu]) * coor;
+      }
+      pha[p]  =exp(pha[p]*ci);	
+
+      blockZAXPY(phaF[p],pha[p],one,zz);
+    }
+
+    // Could save on storage here
+    std::vector<CoarseMatrix> _A;
+    _A.resize(geom_srhs.npoint,CoarseGrid);
+
+    std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid);
+    CoarseVector          FT(CoarseGrid);
+    for(int i=0;i<nbasis;i++){// Loop over basis vectors
+      std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
+      for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
+
+	phaV = phaF[p]*Subspace.subspace[i];
+
+	/////////////////////////////////////////////////////////////////////
+	// Multiple phased subspace vector by matrix and project to subspace
+	// Remove local bulk phase to leave relative phases
+	/////////////////////////////////////////////////////////////////////
+	linop.Op(phaV,MphaV);
+
+	// Fixme, could use batched block projector here
+	blockProject(coarseInner,MphaV,Subspace.subspace);
+
+	coarseInner = conjugate(pha[p]) * coarseInner;
+
+	ComputeProj[p] = coarseInner;
+      }
+
+      for(int k=0;k<npoint;k++){
+	FT = Zero();
+	for(int l=0;l<npoint;l++){
+	  FT= FT+ invMkl(l,k)*ComputeProj[l];
+	}
+      
+	int osites=CoarseGrid->oSites();
+	autoView( A_v  , _A[k], AcceleratorWrite);
+	autoView( FT_v  , FT, AcceleratorRead);
+	accelerator_for(sss, osites, 1, {
+	    for(int j=0;j<nbasis;j++){
+	      A_v[sss](i,j) = FT_v[sss](j);
+	    }
+        });
+      }
+    }
+
+    // Only needed if nonhermitian
+    //    if ( ! hermitian ) {
+    //      std::cout << GridLogMessage<<"PopulateAdag  "<<std::endl;
+    //      PopulateAdag();
+    //    }
+    // Need to write something to populate Adag from A
+    
+    for(int p=0;p<geom_srhs.npoint;p++){
+      GridtoBLAS(_A[p],BLAS_A[p]);
+    }
+    /*
+Grid : Message : 11698.730546 s : CoarsenOperator eigen  1334 us
+Grid : Message : 11698.730563 s : CoarsenOperator phase  34729 us
+Grid : Message : 11698.730565 s : CoarsenOperator phaseBZ 2423814 us
+Grid : Message : 11698.730566 s : CoarsenOperator mat    127890998 us
+Grid : Message : 11698.730567 s : CoarsenOperator proj   515840840 us
+Grid : Message : 11698.730568 s : CoarsenOperator inv    103948313 us
+Takes 600s to compute matrix elements, DOMINATED by the block project.
+Easy to speed up with the batched block project.
+Store npoint vectors, get npoint x Nbasis block projection, and 81 fold faster.
+     */
  }
  void Mdag(const CoarseVector &in, CoarseVector &out)
  {
@@ -302,16 +458,17 @@ public:
      
    const int Nsimd = CComplex::Nsimd();

+    int64_t nrhs  =pin.Grid()->GlobalDimensions()[0];
+    assert(nrhs>=1);
+
    RealD flops,bytes;
    int64_t osites=in.Grid()->oSites(); // unpadded
-    int64_t unpadded_vol = _CoarseGrid->lSites();
+    int64_t unpadded_vol = CoarseGrid()->lSites()/nrhs;
    
    flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
    bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
          + 2.0*osites*sizeof(siteVector)*npoint;
    
-    int64_t nrhs  =pin.Grid()->GlobalDimensions()[0];
-    assert(nrhs>=1);

    t_GtoB=-usecond();
    GridtoBLAS(pin,BLAS_B);
@@ -339,7 +496,7 @@ public:
    BLAStoGrid(out,BLAS_C);
    t_BtoG+=usecond();
    t_tot+=usecond();
-
+    /*
    std::cout << GridLogMessage << "New Mrhs coarse DONE "<<std::endl;
    std::cout << GridLogMessage<<"Coarse Mult exch "<<t_exch<<" us"<<std::endl;
    std::cout << GridLogMessage<<"Coarse Mult mult "<<t_mult<<" us"<<std::endl;
@@ -351,12 +508,12 @@ public:
    std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/t_mult<<" mflop/s"<<std::endl;
    std::cout << GridLogMessage<<"Coarse Kernel bytes/s "<< bytes/t_mult/1000<<" GB/s"<<std::endl;
    std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
+    */
    //    std::cout << GridLogMessage<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
  };
  virtual  void Mdiag    (const Field &in, Field &out){ assert(0);};
  virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);};
  virtual  void MdirAll  (const Field &in, std::vector<Field> &out){assert(0);};
-
 };
  
 NAMESPACE_END(Grid);