Grid/lib/algorithms/iterative/BlockConjugateGradient.h

/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h

Copyright (C) 2017

Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/*  END LEGAL */
#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H
#define GRID_BLOCK_CONJUGATE_GRADIENT_H

#include <memory>

#include <Grid/Eigen/Dense>

namespace Grid {

//////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Build a new Cartesian grid with one dimension fewer than BlockSolverGrid: dimension Orthog is
// dropped and the full lattice extent, SIMD layout and processor layout of every remaining
// dimension are copied across in order.
//
//   BlockSolverGrid : grid to take the layout from (not modified)
//   Orthog          : index of the dimension to omit
//
// Returns a heap-allocated GridCartesian (as a GridBase pointer). The caller owns it and is
// responsible for deleting it once no Lattice refers to it any more.
//
// Declared inline because this is a non-template function defined in a header; without it,
// including the header from more than one translation unit produces duplicate definitions.
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{
  const int NN = BlockSolverGrid->_ndimension;

  std::vector<int> latt_phys;
  std::vector<int> simd_phys;
  std::vector<int>  mpi_phys;

  for(int d=0;d<NN;d++){
    if( d==Orthog ) continue; // the block direction is not part of the slice grid
    latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
    simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
     mpi_phys.push_back(BlockSolverGrid->_processors[d]);
  }
  return static_cast<GridBase *>(new GridCartesian(latt_phys,simd_phys,mpi_phys));
}
  //////////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum
  //////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Slice-wise multiply-add with a small dense matrix. For every slice index i along Orthog:
//
//     R[i] = Y[i] + sum_j  X[j] * ( scale * aa(j,i) )
//
//   R      : output lattice; only written through InsertSlice, one slice at a time
//   aa     : Nblock x Nblock coefficient matrix (read only)
//   X, Y   : input lattices on the same grid as R
//   Orthog : dimension whose coordinate labels the slices
//   scale  : overall factor applied to every matrix element (default 1.0)
//
// R may be the same object as Y (slice i of Y is read before slice i of R is written and is not
// read again), but must NOT be the same object as X: every slice of X is re-read for every i.
//
// Cost note: X slices are extracted Nblock*Nblock times.
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
{    
  GridBase *FullGrid = X._grid;
  const int Nblock   = FullGrid->GlobalDimensions()[Orthog];

  // makeSubSliceGrid hands back an owning pointer to a freshly allocated GridCartesian.
  // Hold it in a unique_ptr so it is released on exit instead of leaking on every call.
  // The owner is declared before the slice lattices so that they are destroyed first.
  // The cast restores the concrete type so the delete does not depend on GridBase
  // having a virtual destructor.
  std::unique_ptr<GridCartesian> SliceGridOwner(
      static_cast<GridCartesian *>(makeSubSliceGrid(FullGrid,Orthog)));
  GridBase *SliceGrid = SliceGridOwner.get();

  Lattice<vobj> Xslice(SliceGrid);
  Lattice<vobj> Rslice(SliceGrid);
  // If we based this on Cshift it would work for spread out
  // but it would be even slower
  for(int i=0;i<Nblock;i++){
    ExtractSlice(Rslice,Y,i,Orthog);          // start from Y[i]
    for(int j=0;j<Nblock;j++){
      ExtractSlice(Xslice,X,j,Orthog);
      Rslice = Rslice + Xslice*(scale*aa(j,i)); // accumulate column i of aa
    }
    InsertSlice(Rslice,R,i,Orthog);
  }
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Matrix of inner products between slices of two lattices:
//
//     mat(i,j) = innerProduct( lhs[i] , rhs[j] )      for i,j in [0,Nblock)
//
// where lhs[i] / rhs[j] are the slices at coordinate i / j along dimension Orthog and Nblock is
// the global extent of that dimension.
//
//   mat    : output; resized to Nblock x Nblock and fully overwritten
//   lhs    : lattice supplying the first argument of innerProduct
//   rhs    : lattice supplying the second argument of innerProduct
//   Orthog : dimension whose coordinate labels the slices
//
// Cost note: rhs slices are extracted Nblock*Nblock times.
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
{
  GridBase *FullGrid = lhs._grid;
  const int Nblock   = FullGrid->GlobalDimensions()[Orthog];

  // makeSubSliceGrid hands back an owning pointer to a freshly allocated GridCartesian.
  // Hold it in a unique_ptr so it is released on exit instead of leaking on every call.
  // The owner is declared before the slice lattices so that they are destroyed first.
  // The cast restores the concrete type so the delete does not depend on GridBase
  // having a virtual destructor.
  std::unique_ptr<GridCartesian> SliceGridOwner(
      static_cast<GridCartesian *>(makeSubSliceGrid(FullGrid,Orthog)));
  GridBase *SliceGrid = SliceGridOwner.get();

  Lattice<vobj> Lslice(SliceGrid);
  Lattice<vobj> Rslice(SliceGrid);

  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);

  for(int i=0;i<Nblock;i++){
    ExtractSlice(Lslice,lhs,i,Orthog);
    for(int j=0;j<Nblock;j++){
      ExtractSlice(Rslice,rhs,j,Orthog);
      mat(i,j) = innerProduct(Lslice,Rslice);
    }
  }
  // Debug switch: zero the off-diagonal elements of mat.
  // Always disabled as written, because FORCE_DIAG is undefined on the line before the test.
#undef FORCE_DIAG
#ifdef FORCE_DIAG
  for(int i=0;i<Nblock;i++){
    for(int j=0;j<Nblock;j++){
      if ( i != j ) mat(i,j)=0.0;
    }
  }
#endif
}
template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::tensor_reduced scalar;
  typedef typename scalar::scalar_object  scomplex;
  
  int Nblock = lhs._grid->GlobalDimensions()[Orthog];

  vec.resize(Nblock);
  std::vector<scomplex> sip(Nblock);
  Lattice<scalar> IP(lhs._grid); 

  IP=localInnerProduct(lhs,rhs);
  sliceSum(IP,sip,Orthog);
  
  for(int ss=0;ss<Nblock;ss++){
    vec[ss] = TensorRemove(sip[ss]);
  }
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Per-slice squared norm: sn[s] is the real part of the slice inner product of rhs with itself,
// as returned by sliceInnerProductVector.
//
//   sn     : output; resized to the global extent of dimension Orthog
//   rhs    : lattice whose slices are measured
//   Orthog : dimension whose coordinate labels the slices
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog) {

  const int nslice = rhs._grid->GlobalDimensions()[Orthog];

  std::vector<ComplexD> selfProduct(nslice);
  sn.resize(nslice);

  sliceInnerProductVector(selfProduct,rhs,rhs,Orthog);

  // Only the real part is kept.
  for(int s=0; s!=nslice; ++s){
    sn[s] = real(selfProduct[s]);
  }
}
/*
template<class vobj>
static void sliceInnerProductMatrixOld(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
{
  typedef typename vobj::scalar_object  sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::tensor_reduced scalar;
  typedef typename scalar::scalar_object  scomplex;

  int Nblock = lhs._grid->GlobalDimensions()[Orthog];

  std::cout << " sliceInnerProductMatrix Dim "<<Orthog<<" Nblock " << Nblock<<std::endl;

  Lattice<scalar> IP(lhs._grid); 
  std::vector<scomplex> sip(Nblock);
    
  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);

  Lattice<vobj> tmp = rhs;
  
  for(int s1=0;s1<Nblock;s1++){
    
    IP=localInnerProduct(lhs,tmp);
    sliceSum(IP,sip,Orthog);

    std::cout << "InnerProductMatrix ["<<s1<<"] = ";
    for(int ss=0;ss<Nblock;ss++){
      std::cout << TensorRemove(sip[ss])<<" ";
    }
    std::cout << std::endl;

    for(int ss=0;ss<Nblock;ss++){
      mat(ss,(s1+ss)%Nblock) = TensorRemove(sip[ss]);
    }
    if ( s1!=(Nblock-1) ) { 
      tmp = Cshift(tmp,Orthog,1);
    }
  }
}
*/

//////////////////////////////////////////////////////////////////////////
// Block conjugate gradient. Dimension zero should be the block direction
//////////////////////////////////////////////////////////////////////////
// Conjugate-gradient solver that treats the slices of a lattice along dimension `blockDim`
// as simultaneous right-hand sides and couples them through Nblock x Nblock matrices
// (O'Leary's block CG, see the algorithm summary inside operator()).
//
// Usage: construct with a tolerance and iteration limit, then call
//     solver(Linop, Src, Psi);
// Psi is used as the initial guess and is overwritten with the solution.
template <class Field>
class BlockConjugateGradient : public OperatorFunction<Field> {
 public:

  typedef typename Field::scalar_type scomplex;

  const int blockDim = 0;  // Lattice dimension whose coordinate enumerates the right-hand sides

  int Nblock;              // Extent of blockDim on the source grid; set at the start of each solve
  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                           // Defaults true.
  RealD Tolerance;         // Converged when max_b |r_b|^2/|src_b|^2 < Tolerance^2
  Integer MaxIterations;   // Upper bound on the number of iterations
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion

  // tol             : relative residual target (see Tolerance)
  // maxit           : iteration limit
  // err_on_no_conv  : if true, assert(0) when the iteration limit is reached without converging
  //
  // Initialisers are listed in member declaration order.
  BlockConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
    : Nblock(0),
      ErrorOnNoConverge(err_on_no_conv),
      Tolerance(tol),
      MaxIterations(maxit),
      IterationsToComplete(0) {}

  // Solve Linop.HermOp applied to Psi = Src for all slices along blockDim at once.
  //
  //   Linop : operator; only HermOp(in,out) is used
  //   Src   : right-hand sides (not modified)
  //   Psi   : initial guess on entry, solution on exit; its checkerboard is set to Src's
  //
  // On convergence IterationsToComplete holds the iteration at which the test passed.
  // If the limit is reached it holds MaxIterations+1 (the loop counter after the final
  // increment), and assert(0) fires first when ErrorOnNoConverge is set.
  void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) 
  {
    const int Orthog = blockDim; // First dimension is block dim
    Nblock = Src._grid->_fdimensions[Orthog];
    std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<std::endl;
    std::cout<<GridLogMessage<<" Block Conjugate Gradient : Nblock "<<Nblock<<std::endl;

    Psi.checkerboard = Src.checkerboard;
    conformable(Psi, Src);

    Field P(Src);
    Field AP(Src);
    Field R(Src);
  
    Eigen::MatrixXcd m_pAp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
    Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock);
    Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
    Eigen::MatrixXcd m_rr_inv = Eigen::MatrixXcd::Zero(Nblock,Nblock);

    Eigen::MatrixXcd m_alpha  = Eigen::MatrixXcd::Zero(Nblock,Nblock);
    Eigen::MatrixXcd m_beta   = Eigen::MatrixXcd::Zero(Nblock,Nblock);

    // Per-block squared norm of the source: used to normalise the residuals and,
    // together with the guess norms below, to reject NaN input up front.
    std::vector<RealD> ssq(Nblock);
    sliceNorm(ssq,Src,Orthog);
    RealD sssum=0;
    for(int b=0;b<Nblock;b++){
      assert(std::isnan(ssq[b])==0);
      sssum+=ssq[b];
    }

    std::vector<RealD> guess_norm(Nblock);
    sliceNorm(guess_norm,Psi,Orthog);
    for(int b=0;b<Nblock;b++){ assert(std::isnan(guess_norm[b])==0); }

    // Initial search dir is guess
    Linop.HermOp(Psi, AP);

    /************************************************************************
     * Block conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980)
     ************************************************************************
     * O'Leary : R = B - A X
     * O'Leary : P = M R ; preconditioner M = 1
     * O'Leary : alpha = PAP^{-1} RMR
     * O'Leary : beta  = RMR^{-1}_old RMR_new
     * O'Leary : X=X+Palpha
     * O'Leary : R_new=R_old-AP alpha
     * O'Leary : P=MR_new+P beta
     */

    R = Src - AP;  
    P = R;
    sliceInnerProductMatrix(m_rr,R,R,Orthog);

    int k;
    for (k = 1; k <= MaxIterations; k++) {

      RealD rrsum=0;
      for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b));

      std::cout << GridLogIterative << " iteration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
	        <<" / "<<std::sqrt(rrsum/sssum) <<std::endl;

      Linop.HermOp(P, AP);

      // Alpha
      sliceInnerProductMatrix(m_pAp,P,AP,Orthog);
      m_pAp_inv = m_pAp.inverse();
      m_alpha   = m_pAp_inv * m_rr ;

      // Psi, R update
      sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog);     // add alpha *  P to psi
      sliceMaddMatrix(R  ,m_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid

      // Beta: the old m_rr must be inverted before it is overwritten with the new one
      m_rr_inv = m_rr.inverse();
      sliceInnerProductMatrix(m_rr,R,R,Orthog);
      m_beta = m_rr_inv *m_rr;

      /*********************
       * convergence monitor
       *********************
       * Tested before the search direction is updated: once converged the new
       * direction would never be used, so building it would be wasted work.
       */
      RealD max_resid=0;
      for(int b=0;b<Nblock;b++){
        RealD rr = real(m_rr(b,b))/ssq[b];
        if ( rr > max_resid ) max_resid = rr;
      }
    
      if ( max_resid < Tolerance*Tolerance ) { 
        std::cout << GridLogMessage<<" Block solver has converged in "
		  <<k<<" iterations; max residual is "<<std::sqrt(max_resid)<<std::endl;
        for(int b=0;b<Nblock;b++){
	  std::cout << GridLogMessage<< " block "<<b<<" resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
        }

        // Recompute the residual from scratch; AP is free to use as scratch space here.
        Linop.HermOp(Psi, AP);
        AP = AP-Src;
        std::cout << GridLogMessage << " Block solver true residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
        IterationsToComplete = k;
        return;
      }

      // Search update
      sliceMaddMatrix(AP,m_beta,P,R,Orthog);
      P= AP;
    }
    std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl;

    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;
  }
};
}
#endif
Start of blockCG 2017-04-09 15:42:10 +01:00			`/*************************************************************************************`

			`Grid physics library, www.github.com/paboyle/Grid`

			`Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h`

			`Copyright (C) 2017`

			`Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>`
			`Author: Peter Boyle <paboyle@ph.ed.ac.uk>`

			`This program is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; either version 2 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License along`
			`with this program; if not, write to the Free Software Foundation, Inc.,`
			`51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.`

			`See the full license in the file "LICENSE" in the top level distribution`
			`directory`
			`*************************************************************************************/`
			`/* END LEGAL */`
			`#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H`
			`#define GRID_BLOCK_CONJUGATE_GRADIENT_H`

			`#include <Grid/Eigen/Dense>`

			`namespace Grid {`

			`GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)`
			`{`
			`int NN = BlockSolverGrid->_ndimension;`
			`int nsimd = BlockSolverGrid->Nsimd();`

			`std::vector<int> latt_phys(0);`
			`std::vector<int> simd_phys(0);`
			`std::vector<int> mpi_phys(0);`

			`for(int d=0;d<NN;d++){`
			`if( d!=Orthog ) {`
			`latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);`
			`simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);`
			`mpi_phys.push_back(BlockSolverGrid->_processors[d]);`
			`}`
			`}`
			`return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);`
			`}`
			`//////////////////////////////////////////////////////////////////////////////////////////////////////////////`
			`// Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum`
			`//////////////////////////////////////////////////////////////////////////////////////////////////////////////`
			`template<class vobj>`
			`static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)`
			`{`
			`typedef typename vobj::scalar_object sobj;`
			`typedef typename vobj::scalar_type scalar_type;`
			`typedef typename vobj::vector_type vector_type;`

			`int Nblock = X._grid->GlobalDimensions()[Orthog];`

			`GridBase *FullGrid = X._grid;`
			`GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);`

			`Lattice<vobj> Xslice(SliceGrid);`
			`Lattice<vobj> Rslice(SliceGrid);`
			`// If we based this on Cshift it would work for spread out`
			`// but it would be even slower`
			`for(int i=0;i<Nblock;i++){`
			`ExtractSlice(Rslice,Y,i,Orthog);`
			`for(int j=0;j<Nblock;j++){`
			`ExtractSlice(Xslice,X,j,Orthog);`
			`Rslice = Rslice + Xslice*(scale*aa(j,i));`
			`}`
			`InsertSlice(Rslice,R,i,Orthog);`
			`}`
			`};`
			`template<class vobj>`
			`static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)`
			`{`
			`typedef typename vobj::scalar_object sobj;`
			`typedef typename vobj::scalar_type scalar_type;`
			`typedef typename vobj::vector_type vector_type;`

			`GridBase *FullGrid = lhs._grid;`
			`GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);`

			`int Nblock = FullGrid->GlobalDimensions()[Orthog];`

			`Lattice<vobj> Lslice(SliceGrid);`
			`Lattice<vobj> Rslice(SliceGrid);`

			`mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);`

			`for(int i=0;i<Nblock;i++){`
			`ExtractSlice(Lslice,lhs,i,Orthog);`
			`for(int j=0;j<Nblock;j++){`
			`ExtractSlice(Rslice,rhs,j,Orthog);`
			`mat(i,j) = innerProduct(Lslice,Rslice);`
			`}`
			`}`
			`#undef FORCE_DIAG`
			`#ifdef FORCE_DIAG`
			`for(int i=0;i<Nblock;i++){`
			`for(int j=0;j<Nblock;j++){`
			`if ( i != j ) mat(i,j)=0.0;`
			`}`
			`}`
			`#endif`
			`return;`
			`}`
			`template<class vobj>`
			`static void sliceInnerProductVector( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)`
			`{`
			`typedef typename vobj::scalar_object sobj;`
			`typedef typename vobj::scalar_type scalar_type;`
			`typedef typename vobj::vector_type vector_type;`
			`typedef typename vobj::tensor_reduced scalar;`
			`typedef typename scalar::scalar_object scomplex;`

			`int Nblock = lhs._grid->GlobalDimensions()[Orthog];`

			`vec.resize(Nblock);`
			`std::vector<scomplex> sip(Nblock);`
			`Lattice<scalar> IP(lhs._grid);`

			`IP=localInnerProduct(lhs,rhs);`
			`sliceSum(IP,sip,Orthog);`

			`for(int ss=0;ss<Nblock;ss++){`
			`vec[ss] = TensorRemove(sip[ss]);`
			`}`
			`}`
			`template<class vobj>`
			`static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog) {`

			`typedef typename vobj::scalar_object sobj;`
			`typedef typename vobj::scalar_type scalar_type;`
			`typedef typename vobj::vector_type vector_type;`

			`int Nblock = rhs._grid->GlobalDimensions()[Orthog];`
			`std::vector<ComplexD> ip(Nblock);`
			`sn.resize(Nblock);`

			`sliceInnerProductVector(ip,rhs,rhs,Orthog);`
			`for(int ss=0;ss<Nblock;ss++){`
			`sn[ss] = real(ip[ss]);`
			`}`
			`};`
			`/*`
			`template<class vobj>`
			`static void sliceInnerProductMatrixOld( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)`
			`{`
			`typedef typename vobj::scalar_object sobj;`
			`typedef typename vobj::scalar_type scalar_type;`
			`typedef typename vobj::vector_type vector_type;`
			`typedef typename vobj::tensor_reduced scalar;`
			`typedef typename scalar::scalar_object scomplex;`

			`int Nblock = lhs._grid->GlobalDimensions()[Orthog];`

			`std::cout << " sliceInnerProductMatrix Dim "<<Orthog<<" Nblock " << Nblock<<std::endl;`

			`Lattice<scalar> IP(lhs._grid);`
			`std::vector<scomplex> sip(Nblock);`

			`mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);`

			`Lattice<vobj> tmp = rhs;`

			`for(int s1=0;s1<Nblock;s1++){`

			`IP=localInnerProduct(lhs,tmp);`
			`sliceSum(IP,sip,Orthog);`

			`std::cout << "InnerProductMatrix ["<<s1<<"] = ";`
			`for(int ss=0;ss<Nblock;ss++){`
			`std::cout << TensorRemove(sip[ss])<<" ";`
			`}`
			`std::cout << std::endl;`

			`for(int ss=0;ss<Nblock;ss++){`
			`mat(ss,(s1+ss)%Nblock) = TensorRemove(sip[ss]);`
			`}`
			`if ( s1!=(Nblock-1) ) {`
			`tmp = Cshift(tmp,Orthog,1);`
			`}`
			`}`
			`}`
			`*/`

			`//////////////////////////////////////////////////////////////////////////`
			`// Block conjugate gradient. Dimension zero should be the block direction`
			`//////////////////////////////////////////////////////////////////////////`
			`template <class Field>`
			`class BlockConjugateGradient : public OperatorFunction<Field> {`
			`public:`

			`typedef typename Field::scalar_type scomplex;`

			`const int blockDim = 0;`

			`int Nblock;`
			`bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.`
			`// Defaults true.`
			`RealD Tolerance;`
			`Integer MaxIterations;`
			`Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion`

			`BlockConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)`
			`: Tolerance(tol),`
			`MaxIterations(maxit),`
			`ErrorOnNoConverge(err_on_no_conv){};`

			`void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)`
			`{`
			`int Orthog = 0; // First dimension is block dim`
			`Nblock = Src._grid->_fdimensions[Orthog];`
			`std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<std::endl;`
			`std::cout<<GridLogMessage<<" Block Conjugate Gradient : Nblock "<<Nblock<<std::endl;`

			`Psi.checkerboard = Src.checkerboard;`
			`conformable(Psi, Src);`

			`Field P(Src);`
			`Field AP(Src);`
			`Field R(Src);`

			`Eigen::MatrixXcd m_pAp = Eigen::MatrixXcd::Identity(Nblock,Nblock);`
			`Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock);`
			`Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock);`
			`Eigen::MatrixXcd m_rr_inv = Eigen::MatrixXcd::Zero(Nblock,Nblock);`

			`Eigen::MatrixXcd m_alpha = Eigen::MatrixXcd::Zero(Nblock,Nblock);`
			`Eigen::MatrixXcd m_beta = Eigen::MatrixXcd::Zero(Nblock,Nblock);`

			`// Initial residual computation & set up`
			`std::vector<RealD> residuals(Nblock);`
			`std::vector<RealD> ssq(Nblock);`

			`sliceNorm(ssq,Src,Orthog);`
			`RealD sssum=0;`
			`for(int b=0;b<Nblock;b++) sssum+=ssq[b];`

			`sliceNorm(residuals,Src,Orthog);`
			`for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }`

			`sliceNorm(residuals,Psi,Orthog);`
			`for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }`

			`// Initial search dir is guess`
			`Linop.HermOp(Psi, AP);`


			`/************************************************************************`
			`* Block conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980)`
			`************************************************************************`
			`* O'Leary : R = B - A X`
			`* O'Leary : P = M R ; preconditioner M = 1`
			`* O'Leary : alpha = PAP^{-1} RMR`
			`* O'Leary : beta = RMR^{-1}_old RMR_new`
			`* O'Leary : X=X+Palpha`
			`* O'Leary : R_new=R_old-AP alpha`
			`* O'Leary : P=MR_new+P beta`
			`*/`

			`R = Src - AP;`
			`P = R;`
			`sliceInnerProductMatrix(m_rr,R,R,Orthog);`

			`int k;`
			`for (k = 1; k <= MaxIterations; k++) {`

			`RealD rrsum=0;`
			`for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b));`

			`std::cout << GridLogIterative << " iteration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum`
			`<<" / "<<std::sqrt(rrsum/sssum) <<std::endl;`

			`Linop.HermOp(P, AP);`

			`// Alpha`
			`sliceInnerProductMatrix(m_pAp,P,AP,Orthog);`
			`m_pAp_inv = m_pAp.inverse();`
			`m_alpha = m_pAp_inv * m_rr ;`

			`// Psi, R update`
			`sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog); // add alpha * P to psi`
			`sliceMaddMatrix(R ,m_alpha,AP, R,Orthog,-1.0);// sub alpha * AP to resid`

			`// Beta`
			`m_rr_inv = m_rr.inverse();`
			`sliceInnerProductMatrix(m_rr,R,R,Orthog);`
			`m_beta = m_rr_inv *m_rr;`

			`// Search update`
			`sliceMaddMatrix(AP,m_beta,P,R,Orthog);`
			`P= AP;`

			`/*********************`
			`* convergence monitor`
			`*********************`
			`*/`
			`RealD max_resid=0;`
			`for(int b=0;b<Nblock;b++){`
			`RealD rr = real(m_rr(b,b))/ssq[b];`
			`if ( rr > max_resid ) max_resid = rr;`
			`}`

			`if ( max_resid < Tolerance*Tolerance ) {`
			`std::cout << GridLogMessage<<" Block solver has converged in "`
			`<<k<<" iterations; max residual is "<<std::sqrt(max_resid)<<std::endl;`
			`for(int b=0;b<Nblock;b++){`
			`std::cout << GridLogMessage<< " block "<<b<<" resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;`
			`}`

			`Linop.HermOp(Psi, AP);`
			`AP = AP-Src;`
			`std::cout << " Block solver true residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;`
			`IterationsToComplete = k;`
			`return;`
			`}`

			`}`
			`std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl;`

			`if (ErrorOnNoConverge) assert(0);`
			`IterationsToComplete = k;`
			`}`
			`};`
			`}`
			`#endif`