From e9cc21900f00b81a17ab87d649e014edc99c636b Mon Sep 17 00:00:00 2001 From: Azusa Yamaguchi Date: Tue, 20 Jun 2017 12:37:41 +0100 Subject: [PATCH] Block solver complete for staggered. Now stable on mass 0.003 and gives 8x (!) speed up on Haswell laptop vs. standard CG for 8 RHS solves. 166 iterations vs. 537 iterations so algorithmic gain + 2x in flop rate gain. Better than a slap in the face with a wet kipper. --- .../iterative/BlockConjugateGradient.h | 295 ++++++++++++++++-- lib/lattice/Lattice_reduction.h | 235 +++----------- .../solver/Test_staggered_block_cg_unprec.cc | 13 +- 3 files changed, 321 insertions(+), 222 deletions(-) diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h index 53e11fa7..f8b83b1f 100644 --- a/lib/algorithms/iterative/BlockConjugateGradient.h +++ b/lib/algorithms/iterative/BlockConjugateGradient.h @@ -33,6 +33,8 @@ directory namespace Grid { +enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS }; + ////////////////////////////////////////////////////////////////////////// // Block conjugate gradient. Dimension zero should be the block direction ////////////////////////////////////////////////////////////////////////// @@ -40,24 +42,274 @@ template class BlockConjugateGradient : public OperatorFunction { public: + typedef typename Field::scalar_type scomplex; int blockDim ; - int Nblock; + + BlockCGtype CGtype; bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. // Defaults true. RealD Tolerance; Integer MaxIterations; Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion - BlockConjugateGradient(int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) + BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) : Tolerance(tol), + CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv){}; +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Thin QR factorisation (google it) +//////////////////////////////////////////////////////////////////////////////////////////////////// +void ThinQRfact (Eigen::MatrixXcd &m_rr, + Eigen::MatrixXcd &C, + Eigen::MatrixXcd &Cinv, + Field & Q, + const Field & R) +{ + int Orthog = blockDim; // First dimension is block dim; this is an assumption + //////////////////////////////////////////////////////////////////////////////////////////////////// + //Dimensions + // R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock + // + // Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen) + // + // Q C = R => Q = R C^{-1} + // + // Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock} + // + // Set C = L^{dag}, and then Q^dag Q = ident + // + // Checks: + // Cdag C = Rdag R ; passes. 
+ // QdagQ = 1 ; passes + //////////////////////////////////////////////////////////////////////////////////////////////////// + sliceInnerProductMatrix(m_rr,R,R,Orthog); + + //////////////////////////////////////////////////////////////////////////////////////////////////// + // Cholesky from Eigen + // There exists a ldlt that is documented as more stable + //////////////////////////////////////////////////////////////////////////////////////////////////// + Eigen::MatrixXcd L = m_rr.llt().matrixL(); + + C = L.adjoint(); + Cinv = C.inverse(); + + //////////////////////////////////////////////////////////////////////////////////////////////////// + // Q = R C^{-1} + // + // Q_j = R_i Cinv(i,j) + // + // NB maddMatrix conventions are Right multiplication X[j] a[j,i] already + //////////////////////////////////////////////////////////////////////////////////////////////////// + // FIXME:: make a sliceMulMatrix to avoid zero vector + sliceMulMatrix(Q,Cinv,R,Orthog); +} +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Call one of several implementations +//////////////////////////////////////////////////////////////////////////////////////////////////// void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) +{ + if ( CGtype == BlockCGrQ ) { + BlockCGrQsolve(Linop,Src,Psi); + } else if (CGtype == BlockCG ) { + BlockCGsolve(Linop,Src,Psi); + } else if (CGtype == CGmultiRHS ) { + CGmultiRHSsolve(Linop,Src,Psi); + } else { + assert(0); + } +} + +//////////////////////////////////////////////////////////////////////////// +// BlockCGrQ implementation: +//-------------------------- +// X is guess/Solution +// B is RHS +// Solve A X_i = B_i ; i refers to Nblock index +//////////////////////////////////////////////////////////////////////////// +void BlockCGrQsolve(LinearOperatorBase &Linop, const Field &B, Field &X) +{ + int Orthog = blockDim; // First dimension is block dim; this is an assumption + Nblock = B._grid->_fdimensions[Orthog]; + + std::cout< residuals(Nblock); + std::vector ssq(Nblock); + + sliceNorm(ssq,B,Orthog); + RealD sssum=0; + for(int b=0;b Thin QR factorisation (google it) + * for k: + * Z = AD + * M = [D^dag Z]^{-1} + * X = X + D MC + * QS = Q - ZM + * D = Q + D S^dag + * C = S C + */ + /////////////////////////////////////// + // Initial block: initial search dir is guess + /////////////////////////////////////// + std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " < Thin QR factorisation (google it) + + Linop.HermOp(X, AD); + tmp = B - AD; + ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); + D=Q; + + std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " < max_resid ) max_resid = rr; + } + + std::cout << GridLogIterative << "\titeration "< &Linop, const Field &Src, Field &Psi) { int Orthog = blockDim; // First dimension is block dim; this is an assumption Nblock = Src._grid->_fdimensions[Orthog]; @@ -163,8 +415,9 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) ********************* */ RealD max_resid=0; + RealD rr; for(int b=0;b max_resid ) max_resid = rr; } @@ -174,13 +427,14 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) std::cout << GridLogMessage<<"BlockCG converged in "< &Linop, const Field &Src, Field &Psi) if (ErrorOnNoConverge) assert(0); IterationsToComplete = k; } -}; - - ////////////////////////////////////////////////////////////////////////// // multiRHS conjugate gradient. 
Dimension zero should be the block direction +// Use this for spread out across nodes ////////////////////////////////////////////////////////////////////////// -template -class MultiRHSConjugateGradient : public OperatorFunction { - public: - - typedef typename Field::scalar_type scomplex; - - int blockDim; - int Nblock; - bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. - // Defaults true. - RealD Tolerance; - Integer MaxIterations; - Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion - - MultiRHSConjugateGradient(int Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) - : Tolerance(tol), - blockDim(Orthog), - MaxIterations(maxit), - ErrorOnNoConverge(err_on_no_conv){}; - -void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) +void CGmultiRHSsolve(LinearOperatorBase &Linop, const Field &Src, Field &Psi) { int Orthog = blockDim; // First dimension is block dim Nblock = Src._grid->_fdimensions[Orthog]; @@ -331,7 +563,7 @@ void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) std::cout << GridLogMessage<<"MultiRHS solver converged in " < &Linop, const Field &Src, Field &Psi) if (ErrorOnNoConverge) assert(0); IterationsToComplete = k; } + }; - - } #endif diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h index 78f88ce3..c5b20f3c 100644 --- a/lib/lattice/Lattice_reduction.h +++ b/lib/lattice/Lattice_reduction.h @@ -369,71 +369,6 @@ static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice } }; - -/* -template -static void sliceMaddVectorSlow (Lattice &R,std::vector &a,const Lattice &X,const Lattice &Y, - int Orthog,RealD scale=1.0) -{ - // FIXME: Implementation is slow - // Best base the linear combination by constructing a - // set of vectors of size grid->_rdimensions[Orthog]. - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - int Nblock = X._grid->GlobalDimensions()[Orthog]; - - GridBase *FullGrid = X._grid; - GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); - - Lattice Xslice(SliceGrid); - Lattice Rslice(SliceGrid); - // If we based this on Cshift it would work for spread out - // but it would be even slower - for(int i=0;i -static void sliceInnerProductVectorSlow( std::vector & vec, const Lattice &lhs,const Lattice &rhs,int Orthog) - { - // FIXME: Implementation is slow - // Look at localInnerProduct implementation, - // and do inside a site loop with block strided iterators - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - typedef typename vobj::tensor_reduced scalar; - typedef typename scalar::scalar_object scomplex; - - int Nblock = lhs._grid->GlobalDimensions()[Orthog]; - vec.resize(Nblock); - std::vector sip(Nblock); - Lattice IP(lhs._grid); - IP=localInnerProduct(lhs,rhs); - sliceSum(IP,sip,Orthog); - - for(int ss=0;ss_rdimensions[Orthog]. 
-////////////////////////////////////////////////////////////////////////////////////////// - inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) { int NN = BlockSolverGrid->_ndimension; @@ -453,7 +388,6 @@ inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); } - template static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,const Lattice &Y,int Orthog,RealD scale=1.0) { @@ -469,64 +403,10 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice Lattice Xslice(SliceGrid); Lattice Rslice(SliceGrid); -#if 0 - // R[i] = Y[i] + X[j] a(j,i) - for(int i=0;i_ndimension; - int nl = SliceGrid->_ndimension; - -#pragma omp parallel -{ - - std::vector lcoor(nl); // sliced coor - std::vector hcoor(nh); // unsliced coor - std::vector s_x(Nblock); - -#pragma omp for - for(int idx=0;idxlSites();idx++){ - - SliceGrid->LocalIndexToLocalCoor(idx,lcoor); - - int ddl=0; - for(int d=0;d_simd_layout[Orthog]==1); int nh = FullGrid->_ndimension; int nl = SliceGrid->_ndimension; - //FIXME package in a convenient iterator //Should loop over a plane orthogonal to direction "Orthog" int stride=FullGrid->_slice_stride[Orthog]; @@ -535,7 +415,6 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice int ostride=FullGrid->_ostride[Orthog]; #pragma omp parallel { - std::vector s_x(Nblock); #pragma omp for collapse(2) @@ -543,13 +422,11 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice for(int b=0;b &R,Eigen::MatrixXcd &aa,const Lattice } }} } -#endif +}; + +template +static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice &X,int Orthog,RealD scale=1.0) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Nblock = X._grid->GlobalDimensions()[Orthog]; + + GridBase *FullGrid = X._grid; + GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); + + Lattice Xslice(SliceGrid); + Lattice Rslice(SliceGrid); + + assert( FullGrid->_simd_layout[Orthog]==1); + int nh = FullGrid->_ndimension; + int nl = SliceGrid->_ndimension; + + //FIXME package in a convenient iterator + //Should loop over a plane orthogonal to direction "Orthog" + int stride=FullGrid->_slice_stride[Orthog]; + int block =FullGrid->_slice_block [Orthog]; + int nblock=FullGrid->_slice_nblock[Orthog]; + int ostride=FullGrid->_ostride[Orthog]; +#pragma omp parallel + { + std::vector s_x(Nblock); + +#pragma omp for collapse(2) + for(int n=0;n static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice &lhs,const Lattice &rhs,int Orthog) { - // FIXME: Implementation is slow - // Not sure of best solution.. 
think about it typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; @@ -582,63 +507,6 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); -#if 0 - for(int i=0;i_ndimension; - int nl = SliceGrid->_ndimension; - -#pragma omp parallel -{ - std::vector lcoor(nl); // sliced coor - std::vector hcoor(nh); // unsliced coor - std::vector Left(Nblock); - std::vector Right(Nblock); - Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock); - -#pragma omp for - for(int idx=0;idxlSites();idx++){ - - SliceGrid->LocalIndexToLocalCoor(idx,lcoor); - - int ddl=0; - for(int d=0;d ip = innerProduct(Left[i],Right[j]); - mat_thread(i,j) += ip; - }} - } - -#pragma omp critical - { - mat += mat_thread; - } - -} -#endif - -#if 1 assert( FullGrid->_simd_layout[Orthog]==1); int nh = FullGrid->_ndimension; int nl = SliceGrid->_ndimension; @@ -681,7 +549,6 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice mat += mat_thread; } } -#endif return; } diff --git a/tests/solver/Test_staggered_block_cg_unprec.cc b/tests/solver/Test_staggered_block_cg_unprec.cc index 8da93195..8db41e98 100644 --- a/tests/solver/Test_staggered_block_cg_unprec.cc +++ b/tests/solver/Test_staggered_block_cg_unprec.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) typedef typename ImprovedStaggeredFermion5DR::ComplexField ComplexField; typename ImprovedStaggeredFermion5DR::ImplParams params; - const int Ls=4; + const int Ls=8; Grid_init(&argc,&argv); @@ -80,12 +80,13 @@ int main (int argc, char ** argv) ConjugateGradient CG(1.0e-8,10000); int blockDim = 0; - BlockConjugateGradient BCG(blockDim,1.0e-8,10000); - MultiRHSConjugateGradient mCG(blockDim,1.0e-8,10000); + BlockConjugateGradient BCGrQ(BlockCGrQ,blockDim,1.0e-8,10000); + BlockConjugateGradient BCG (BlockCG,blockDim,1.0e-8,10000); + BlockConjugateGradient mCG (CGmultiRHS,blockDim,1.0e-8,10000); - std::cout << GridLogMessage << "************************************************************************ "< HermOp4d(Ds4d); FermionField src4d(UGrid); random(pRNG,src4d); @@ -112,7 +113,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << " Calling Block CG for "<
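
For reference, the ThinQRfact step above builds the thin QR factorisation R = Q C from a Cholesky decomposition of the small Nblock x Nblock Gram matrix R^dag R, rather than from a Householder QR of the full fermion field. Below is a minimal dense-matrix sketch of the same construction in plain Eigen; the tall random matrix stands in for the fermion field, and every size and name is illustrative only, not part of Grid's API.

// Sketch: Cholesky-based thin QR, as in ThinQRfact (illustrative only).
// Build with e.g.:  g++ -std=c++11 -I<path-to-eigen> thinqr_sketch.cc
#include <iostream>
#include <Eigen/Dense>

int main(void)
{
  const int Nferm  = 1000;   // stands in for the fermion-field volume
  const int Nblock = 8;      // number of right-hand sides (the block dimension)

  // R plays the role of the block residual, ferm x Nblock
  Eigen::MatrixXcd R = Eigen::MatrixXcd::Random(Nferm, Nblock);

  // m_rr = R^dag R is Hermitian positive definite: Cholesky m_rr = L L^dag
  Eigen::MatrixXcd m_rr = R.adjoint() * R;          // sliceInnerProductMatrix, densely
  Eigen::MatrixXcd L    = m_rr.llt().matrixL();

  // Set C = L^dag so that R = Q C with Q^dag Q = 1
  Eigen::MatrixXcd C    = L.adjoint();
  Eigen::MatrixXcd Cinv = C.inverse();
  Eigen::MatrixXcd Q    = R * Cinv;                 // sliceMulMatrix, densely

  // The two checks quoted in the ThinQRfact comments
  std::cout << "|Q^dag Q - 1| = "
            << (Q.adjoint()*Q - Eigen::MatrixXcd::Identity(Nblock,Nblock)).norm() << std::endl;
  std::cout << "|Q C - R|     = " << (Q*C - R).norm() << std::endl;
  return 0;
}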
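
The BlockCGrQ iteration itself is the recurrence written out in the comment block of BlockCGrQsolve: QC = B - AX and D = Q initially, then per step Z = AD, M = [D^dag Z]^{-1}, X = X + D M C, thin-QR factorise Q - Z M into Q S, D = Q + D S^dag, C = S C. The sketch below runs that recurrence on a dense Hermitian positive-definite matrix standing in for MdagM of the staggered operator; it illustrates the update pattern only and is not the Grid implementation.

// Sketch: the BlockCGrQ recurrence on a dense Hermitian positive-definite
// system (illustrative only; assumes Eigen is available).
#include <iostream>
#include <Eigen/Dense>

// Cholesky-based thin QR:  R = Q C  with  Q^dag Q = 1  (as in ThinQRfact)
static void thinQR(const Eigen::MatrixXcd &R, Eigen::MatrixXcd &Q, Eigen::MatrixXcd &C)
{
  Eigen::MatrixXcd G = R.adjoint() * R;
  Eigen::MatrixXcd L = G.llt().matrixL();
  C = L.adjoint();
  Q = R * C.inverse();
}

int main(void)
{
  const int N = 200, Nblock = 8;
  const double Tolerance = 1.0e-8;

  // Hermitian positive-definite A stands in for MdagM
  Eigen::MatrixXcd T = Eigen::MatrixXcd::Random(N, N);
  Eigen::MatrixXcd A = T.adjoint()*T + Eigen::MatrixXcd::Identity(N, N);

  Eigen::MatrixXcd B = Eigen::MatrixXcd::Random(N, Nblock);  // Nblock right-hand sides
  Eigen::MatrixXcd X = Eigen::MatrixXcd::Zero(N, Nblock);    // zero initial guess

  // QC = R0 = B - A X ;  D = Q
  Eigen::MatrixXcd Q, C, S;
  thinQR(B - A*X, Q, C);
  Eigen::MatrixXcd D = Q;

  const double ssq = B.squaredNorm();

  for (int k = 1; k <= 1000; k++) {
    Eigen::MatrixXcd Z  = A * D;                    // Z = A D
    Eigen::MatrixXcd DZ = D.adjoint() * Z;
    Eigen::MatrixXcd M  = DZ.inverse();             // M = [D^dag Z]^{-1}

    X += D * (M * C);                               // X = X + D M C
    thinQR(Q - Z*M, Q, S);                          // Q S = Q - Z M
    D  = Q + D * S.adjoint();                       // D = Q + D S^dag
    C  = S * C;                                     // C = S C

    // R_k = Q_k C_k, so the residual norm is carried entirely by C
    double rr = C.squaredNorm() / ssq;
    if (rr < Tolerance*Tolerance) {
      std::cout << "converged in " << k << " iterations; true |B - A X|/|B| = "
                << (B - A*X).norm() / B.norm() << std::endl;
      break;
    }
  }
  return 0;
}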
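
Finally, the sliceMaddMatrix / sliceMulMatrix helpers added to Lattice_reduction.h apply the small Nblock x Nblock matrix from the right across the slices of the block dimension, R_i = Y_i + scale * X_j a(j,i), which is the convention flagged in the "NB maddMatrix conventions are Right multiplication" comment above. A dense-matrix analogue of that index convention (illustrative only, nothing here is Grid API):

// Sketch: the right-multiplication convention of sliceMaddMatrix / sliceMulMatrix,
// with columns standing in for the slices of the block dimension (illustrative only).
#include <complex>
#include <iostream>
#include <Eigen/Dense>

int main(void)
{
  const int Nferm = 6, Nblock = 3;
  const std::complex<double> scale = 2.0;  // RealD in Grid; complex here to keep Eigen's scalar types uniform

  Eigen::MatrixXcd X  = Eigen::MatrixXcd::Random(Nferm, Nblock);
  Eigen::MatrixXcd Y  = Eigen::MatrixXcd::Random(Nferm, Nblock);
  Eigen::MatrixXcd aa = Eigen::MatrixXcd::Random(Nblock, Nblock);

  // sliceMaddMatrix(R, aa, X, Y, Orthog, scale):  R_i = Y_i + scale * X_j aa(j,i)
  Eigen::MatrixXcd R = Y + scale * (X * aa);

  // the same thing written out slice by slice
  Eigen::MatrixXcd Rcheck = Y;
  for (int i = 0; i < Nblock; i++)
    for (int j = 0; j < Nblock; j++)
      Rcheck.col(i) += X.col(j) * (scale * aa(j,i));

  std::cout << "|R - Rcheck| = " << (R - Rcheck).norm() << std::endl;

  // sliceMulMatrix(Q, Cinv, R, Orthog) is the same with Y = 0:  Q = R * Cinv
  return 0;
}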