Works with CPS evolution

2026-02-09 00:13:27 +00:00 · 2017-03-21 12:40:43 -04:00
parent e9712bc7fb
commit 0d5af667d8
8 changed files with 597 additions and 27 deletions
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@@ -39,6 +39,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/approx/MultiShiftFunction.h>
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientShifted.h>
 #include <Grid/algorithms/iterative/ConjugateResidual.h>
 #include <Grid/algorithms/iterative/NormalEquations.h>
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@@ -45,8 +45,6 @@ class ConjugateGradient : public OperatorFunction<Field> {
                           // Defaults true.
  RealD Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
      : Tolerance(tol),
        MaxIterations(maxit),
@@ -157,14 +155,13 @@ class ConjugateGradient : public OperatorFunction<Field> {
        std::cout << std::endl;
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
-	IterationsToComplete = k;	
+
        return;
      }
    }
    std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
              << std::endl;
    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;
  }
 };
 }
--- a/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
@@ -35,7 +35,6 @@ namespace Grid {
  class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
  public:                                                
    RealD   Tolerance;
    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid; //Grid for single-precision fields
@@ -43,16 +42,12 @@ namespace Grid {
    LinearOperatorBase<FieldF> &Linop_f;
    LinearOperatorBase<FieldD> &Linop_d;
    Integer TotalInnerIterations; //Number of inner CG iterations
    Integer TotalOuterIterations; //Number of restarts
    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
    LinearFunction<FieldF> *guesser;
    MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
      Linop_f(_Linop_f), Linop_d(_Linop_d),
-      Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
+      Tolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
      OuterLoopNormMult(100.), guesser(NULL){ };
    void useGuesser(LinearFunction<FieldF> &g){
@@ -60,8 +55,9 @@ namespace Grid {
    }
    void operator() (const FieldD &src_d_in, FieldD &sol_d){
-      TotalInnerIterations = 0;
+	(*this)(src_d_in,sol_d,NULL);
-	
+    }
    void operator() (const FieldD &src_d_in, FieldD &sol_d, RealD *shift){
      GridStopWatch TotalTimer;
      TotalTimer.Start();
@@ -81,7 +77,7 @@ namespace Grid {
      FieldD src_d(DoublePrecGrid);
      src_d = src_d_in; //source for next inner iteration, computed from residual during operation
-      RealD inner_tol = InnerTolerance;
+      RealD inner_tol = Tolerance;
      FieldF src_f(SinglePrecGrid);
      src_f.checkerboard = cb;
@@ -89,18 +85,17 @@ namespace Grid {
      FieldF sol_f(SinglePrecGrid);
      sol_f.checkerboard = cb;
-      ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
+      ConjugateGradientShifted<FieldF> CG_f(inner_tol, MaxInnerIterations);
      CG_f.ErrorOnNoConverge = false;
      GridStopWatch InnerCGtimer;
      GridStopWatch PrecChangeTimer;
-      Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
+      for(Integer outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
      for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
 	//Compute double precision rsd and also new RHS vector.
 	Linop_d.HermOp(sol_d, tmp_d);
 	if(shift) axpy(tmp_d,*shift,sol_d,tmp_d);
 	RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
 	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
@@ -124,9 +119,8 @@ namespace Grid {
 	//Inner CG
 	CG_f.Tolerance = inner_tol;
 	InnerCGtimer.Start();
-	CG_f(Linop_f, src_f, sol_f);
+	CG_f(Linop_f, src_f, sol_f,shift);
 	InnerCGtimer.Stop();
 	TotalInnerIterations += CG_f.IterationsToComplete;
 	//Convert sol back to double and add to double prec solution
 	PrecChangeTimer.Start();
@@ -139,13 +133,11 @@ namespace Grid {
      //Final trial CG
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
-      ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
+      ConjugateGradientShifted<FieldD> CG_d(Tolerance, MaxInnerIterations);
-      CG_d(Linop_d, src_d_in, sol_d);
+      CG_d(Linop_d, src_d_in, sol_d,shift);
      TotalFinalStepIterations = CG_d.IterationsToComplete;
      TotalTimer.Stop();
-      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
    }
  };
--- a/lib/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/lib/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -45,6 +45,7 @@ public:
    Integer MaxIterations;
    int verbose;
    MultiShiftFunction shifts;
    int iter;
    ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
 	MaxIterations(maxit),
@@ -60,6 +61,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
  std::vector<Field> results(nshift,grid);
  (*this)(Linop,src,results,psi);
 }
 void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
 {
  int nshift = shifts.order;
@@ -105,11 +107,12 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  RealD a,b,c,d;
  RealD cp,bp,qq; //prev
  int cb=src.checkerboard;
  // Matrix mult fields
  Field r(grid);
-  Field p(grid);
+  Field p(grid); p.checkerboard = src.checkerboard;
  Field tmp(grid);
-  Field mmp(grid);
+  Field mmp(grid);mmp.checkerboard = src.checkerboard;
  // Check lightest mass
  for(int s=0;s<nshift;s++){
@@ -132,6 +135,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
  p=src;
  //MdagM+m[0]
  std::cout << "p.checkerboard " << p.checkerboard
  << "mmp.checkerboard " << mmp.checkerboard << std::endl;
  Linop.HermOpAndNorm(p,mmp,d,qq);
  axpy(mmp,mass[0],p,mmp);
  RealD rn = norm2(p);
@@ -269,6 +275,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 	RealD cn = norm2(src);
 	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
      }
      iter = k;
      return;
    }
  }
--- a/lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+++ b/lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@@ -0,0 +1,404 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
    Copyright (C) 2015
 Author: Chulwoo Jung <chulwoo@quark.phy.bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END/ LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_MULTI_MIXED_PREC_H
 #define GRID_CONJUGATE_GRADIENT_MULTI_MIXED_PREC_H
 namespace Grid {
  //Mixed precision restarted defect correction CG
  template<class FieldD,class FieldF
 //, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0
 //, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0
 > 
  class MixedPrecisionConjugateGradientMultiShift : public LinearFunction<FieldD> {
  public:                                                
 //    RealD   Tolerance;
    Integer MaxInnerIterations;
    Integer MaxOuterIterations;
    GridBase* SinglePrecGrid; //Grid for single-precision fields
    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
    LinearOperatorBase<FieldF> &Linop_f;
    LinearOperatorBase<FieldD> &Linop_d;
    MultiShiftFunction shifts;
    Integer iter;
    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
 //    LinearFunction<FieldF> *guesser;
    MixedPrecisionConjugateGradientMultiShift(GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, 
 Integer maxinnerit,	MultiShiftFunction &_shifts ) :
      Linop_f(_Linop_f), Linop_d(_Linop_d),
      MaxInnerIterations(maxinnerit), SinglePrecGrid(_sp_grid),
      OuterLoopNormMult(100.), shifts(_shifts) {};
    void operator() (const FieldD &src_d_in, FieldD &sol_d){
 	assert(0); // not yet implemented
    }
    void operator() (const FieldD &src_d_in, std::vector<FieldD> &sol_d){
      GridStopWatch TotalTimer;
      TotalTimer.Start();
      int cb = src_d_in.checkerboard;
      int nshift = shifts.order;
      assert(nshift == sol_d.size());
      for(int i=0;i<nshift;i++) sol_d[i].checkerboard = cb;
      RealD src_norm = norm2(src_d_in);
 //      RealD stop = src_norm * Tolerance*Tolerance;
      GridBase* DoublePrecGrid = src_d_in._grid;
      FieldD tmp_d(DoublePrecGrid); tmp_d.checkerboard = cb;
      FieldD tmp2_d(DoublePrecGrid); tmp2_d.checkerboard = cb;
      FieldD src_d(DoublePrecGrid);
      src_d = src_d_in; //source for next inner iteration, computed from residual during operation
 //      RealD inner_tol = Tolerance;
  	FieldD psi_d(DoublePrecGrid);psi_d.checkerboard = cb;
      FieldF src_f(SinglePrecGrid);
      src_f.checkerboard = cb;
      std::vector<FieldF> sol_f(nshift,SinglePrecGrid);
      for(int i=0;i<nshift;i++) sol_f[i].checkerboard = cb;
 //      ConjugateGradientShifted<FieldF> CG_f(inner_tol, MaxInnerIterations);
      ConjugateGradientMultiShift<FieldF> MSCG(MaxInnerIterations,shifts);
 //      CG_f.ErrorOnNoConverge = false;
      GridStopWatch InnerCGtimer;
      GridStopWatch PrecChangeTimer;
 {
 //	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
 //	if(norm < OuterLoopNormMult * stop){
 //	  std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
 //	  break;
 //	}
 //	while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
 	PrecChangeTimer.Start();
 	precisionChange(src_f, src_d);
 	PrecChangeTimer.Stop();
 //	zeroit(sol_f);
 	//Inner CG
 	InnerCGtimer.Start();
  int if_relup = 0;
 #if 0
        MSCG(Linop_f,src_f,sol_f);
 #else
 {
  GridBase *grid = SinglePrecGrid;
  ////////////////////////////////////////////////////////////////////////
  // Convenience references to the info stored in "MultiShiftFunction"
  ////////////////////////////////////////////////////////////////////////
  int nshift = shifts.order;
  std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
  std::vector<RealD> &mresidual(shifts.tolerances);
  std::vector<RealD> alpha(nshift,1.);
  std::vector<FieldF>   ps(nshift,grid);// Search directions
  assert(sol_f.size()==nshift);
  assert(mass.size()==nshift);
  assert(mresidual.size()==nshift);
  // dynamic sized arrays on stack; 2d is a pain with vector
  RealD  bs[nshift];
  RealD  rsq[nshift];
  RealD  z[nshift][2];
  int     converged[nshift];
  const int       primary =0;
  //Primary shift fields CG iteration
  RealD a,b,c,d;
  RealD cp,bp,qq; //prev
  int cb=src_f.checkerboard;
  // Matrix mult fields
  FieldF r(grid); r.checkerboard = src_f.checkerboard;
  FieldF p(grid); p.checkerboard = src_f.checkerboard;
  FieldF tmp(grid); tmp.checkerboard = src_f.checkerboard;
  FieldF mmp(grid);mmp.checkerboard = src_f.checkerboard;
  FieldF psi(grid);psi.checkerboard = src_f.checkerboard;
    std::cout.precision(12);
    std::cout<<GridLogMessage<<"norm2(psi_d)= "<<norm2(psi_d)<<std::endl;
    std::cout<<GridLogMessage<<"norm2(psi)= "<<norm2(psi)<<std::endl;
  // Check lightest mass
  for(int s=0;s<nshift;s++){
    assert( mass[s]>= mass[primary] );
    converged[s]=0;
  }
  // Wire guess to zero
  // Residuals "r" are src
  // First search direction "p" is also src
  cp = norm2(src_f);
  Real c_relup = cp;
  for(int s=0;s<nshift;s++){
    rsq[s] = cp * mresidual[s] * mresidual[s];
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientMultiShift: shift "<<s
 	     <<" target resid "<<rsq[s]<<std::endl;
    ps[s] = src_f;
  }
  // r and p for primary
  r=src_f;
  p=src_f;
  //MdagM+m[0]
  std::cout << "p.checkerboard " << p.checkerboard
  << "mmp.checkerboard " << mmp.checkerboard << std::endl;
  Linop_f.HermOpAndNorm(p,mmp,d,qq);
  axpy(mmp,mass[0],p,mmp);
  RealD rn = norm2(p);
  d += rn*mass[0];
  // have verified that inner product of 
  // p and mmp is equal to d after this since
  // the d computation is tricky
  //  qq = real(innerProduct(p,mmp));
  //  std::cout<<GridLogMessage << "debug equal ?  qq "<<qq<<" d "<< d<<std::endl;
  b = -cp /d;
  // Set up the various shift variables
  int       iz=0;
  z[0][1-iz] = 1.0;
  z[0][iz]   = 1.0;
  bs[0]      = b;
  for(int s=1;s<nshift;s++){
    z[s][1-iz] = 1.0;
    z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
    bs[s]      = b*z[s][iz]; 
  }
  // r += b[0] A.p[0]
  // c= norm(r)
  c=axpy_norm(r,b,mmp,r);
 axpby(psi,0.,-bs[0],src_f,src_f);
  for(int s=0;s<nshift;s++) {
    axpby(sol_f[s],0.,-bs[s]*alpha[s],src_f,src_f);
  }
  // Iteration loop
  int k;
 // inefficient zeroing, please replace!
 //  RealD sol_norm = axpy_norm(sol_d[0],-1.,sol_d[0],sol_d[0]);
  zeroit(sol_d[0]);
  std::cout<<GridLogMessage<<"norm(sol_d[0])= "<<norm2(sol_d[0])<<std::endl;
  int all_converged = 1;
 	RealD tmp1,tmp2;
  for (k=1;k<=MaxOuterIterations;k++){
    a = c /cp;
    axpy(p,a,p,r);
    // Note to self - direction ps is iterated seperately
    // for each shift. Does not appear to have any scope
    // for avoiding linear algebra in "single" case.
    // 
    // However SAME r is used. Could load "r" and update
    // ALL ps[s]. 2/3 Bandwidth saving
    // New Kernel: Load r, vector of coeffs, vector of pointers ps
    for(int s=0;s<nshift;s++){
      if ( ! converged[s] ) { 
 	if (s==0){
 	  axpy(ps[s],a,ps[s],r);
 	} else{
 	  RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
 	  axpby(ps[s],z[s][iz],as,r,ps[s]);
 	}
      }
    }
    cp=c;
    Linop_f.HermOpAndNorm(p,mmp,d,qq);
    axpy(mmp,mass[0],p,mmp);
    RealD rn = norm2(p);
    d += rn*mass[0];
    bp=b;
    b=-cp/d;
    c=axpy_norm(r,b,mmp,r);
    // Toggle the recurrence history
    bs[0] = b;
    iz = 1-iz;
    for(int s=1;s<nshift;s++){
      if((!converged[s])){
 	RealD z0 = z[s][1-iz];
 	RealD z1 = z[s][iz];
 	z[s][iz] = z0*z1*bp
 	  / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
 	bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
      }
    }
    axpy(psi,-bs[0],ps[0],psi);
    for(int s=0;s<nshift;s++){
      int ss = s;
      // Scope for optimisation here in case of "single".
      // Could load sol_f[0] and pull all ps[s] in.
      //      if ( single ) ss=primary;
      // Bandwith saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving
      // Pipelined CG gain:
      //
      // New Kernel: Load r, vector of coeffs, vector of pointers ps
      // New Kernel: Load sol_f[0], vector of coeffs, vector of pointers ps
      // If can predict the coefficient bs then we can fuse these and avoid write reread cyce
      //  on ps[s].
      // Before:  3 x npole  + 3 x npole
      // After :  2 x npole (ps[s])        => 3x speed up of multishift CG.
      if( (!converged[s]) ) { 
 	axpy(sol_f[ss],-bs[s]*alpha[s],ps[s],sol_f[ss]);
      }
    }
    if (k%MaxInnerIterations==0){
 //    if (c < 1e-4*c_relup){
       RealD c_f=c;
       precisionChange(tmp_d,psi);
       RealD sol_norm =axpy_norm (psi_d,1.,tmp_d,psi_d);
       tmp1 = norm2(psi);
       zeroit(psi);
       tmp2 = norm2(psi);
       std::cout<<GridLogMessage<<"k= "<<k<<" norm2(sol)= "<<sol_norm<<" "<<tmp1<<" "<<tmp2<<std::endl;
 //       precisionChange(sol_d[0],sol_f[0]);
       Linop_d.HermOpAndNorm(psi_d,tmp_d,tmp1,tmp2);
       axpy(tmp2_d,mass[0],psi_d,tmp_d);
       axpy(tmp_d,-1.,tmp2_d,src_d);
       precisionChange(r,tmp_d);
 	c_relup = norm2(r);
       std::cout<<GridLogMessage<<"k= "<<k<<" norm2(r)= "<<c<<" "<<c_relup<<" "<<c_f<<std::endl;
 	if_relup=1;
    }
    // Convergence checks
  all_converged=1;
    for(int s=0;s<nshift;s++){
      if ( (!converged[s]) ){
 	RealD css  = c * z[s][iz]* z[s][iz];
 	if(css<rsq[s]){
 	  if ( ! converged[s] )
 	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
 	      converged[s]=1;
 	} else {
 		if (k%MaxInnerIterations==0)
 	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has not converged "<<css<<"<"<<rsq[s]<<std::endl;
 	  all_converged=0;
 	}
      }
    }
 #if 0
    if ( all_converged ){
      std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
 #else
    if ( converged[0] ){
      std::cout<<GridLogMessage<< "CGMultiShift: Shift 0 have converged iteration, terminating  "<<k<<std::endl;
 #endif
 #if 1
      for(int s=1; s < nshift; s++) { 
 	Linop_f.HermOpAndNorm(sol_f[s],mmp,d,qq);
 	axpy(tmp,mass[s],sol_f[s],mmp);
 	axpy(r,-alpha[s],src_f,tmp);
 	RealD rn = norm2(r);
 	RealD cn = norm2(src_f);
 	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
      }
 #endif
     iter = k;
      break;
    }
  }
  // ugly hack
  if ( !all_converged )
  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
 //  assert(0);
 }
 #endif
 	InnerCGtimer.Stop();
 	//Convert sol back to double and add to double prec solution
 	PrecChangeTimer.Start();
 	sol_d[0]=psi_d;
 	for(int i=1;i<nshift;i++)precisionChange(sol_d[i], sol_f[i]);
      std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
      // Check answers 
      for(int s=0; s < nshift; s++) { 
 	RealD tmp1,tmp2;
       Linop_d.HermOpAndNorm(sol_d[s],tmp_d,tmp1,tmp2);
       axpy(tmp2_d,shifts.poles[s],sol_d[s],tmp_d);
       axpy(tmp_d,-1.,src_d,tmp2_d);
 	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(norm2(tmp_d)/norm2(src_d))<<std::endl;
      }
 	PrecChangeTimer.Stop();
 }
      //Final trial CG
 //     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
      TotalTimer.Stop();
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
    }
  };
 }
 #endif
--- a/lib/algorithms/iterative/ConjugateGradientShifted.h
+++ b/lib/algorithms/iterative/ConjugateGradientShifted.h
@@ -0,0 +1,168 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ConjugateGradient.h
    Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_SHIFTED_H
 #define GRID_CONJUGATE_GRADIENT_SHIFTED_H
 namespace Grid {
    /////////////////////////////////////////////////////////////
    // Base classes for iterative processes based on operators
    // single input vec, single output vec.
    /////////////////////////////////////////////////////////////
  template<class Field> 
    class ConjugateGradientShifted : public OperatorFunction<Field> {
 public:                                                
    bool ErrorOnNoConverge; //throw an assert when the CG fails to converge. Defaults true.
    RealD   Tolerance;
    Integer MaxIterations;
    ConjugateGradientShifted(RealD tol,Integer maxit, bool err_on_no_conv = true) : Tolerance(tol), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv) { 
    };
    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi ){
 	(*this)(Linop,src,psi,NULL);
    }
    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi, RealD *shift){
      psi.checkerboard = src.checkerboard;
      conformable(psi,src);
      RealD cp,c,a,d,b,ssq,qq,b_pred;
      Field   p(src);
      Field mmp(src);
      Field   r(src);
      //Initial residual computation & set up
      RealD guess = norm2(psi);
      assert(std::isnan(guess)==0);
      Linop.HermOpAndNorm(psi,mmp,d,b);
 	if(shift) axpy(mmp,*shift,psi,mmp);
 	RealD rn = norm2(psi);
 	if(shift) d += rn*(*shift);
 	RealD d2 = real(innerProduct(psi,mmp));
 	b= norm2(mmp);
      RealD src_norm=norm2(src);
      r= src-mmp;
      p= r;
      a  =norm2(p);
      cp =a;
      ssq=norm2(src);
      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl;
      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl;
      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl;
      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:  cp,r "<<cp   <<std::endl;
      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl;
      RealD rsq =  Tolerance* Tolerance*ssq;
      //Check if guess is really REALLY good :)
      if ( cp <= rsq ) {
 	return;
      }
      std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" target "<<rsq<<std::endl;
      GridStopWatch LinalgTimer;
      GridStopWatch MatrixTimer;
      GridStopWatch SolverTimer;
      SolverTimer.Start();
      int k;
      for (k=1;k<=MaxIterations;k++){
 	c=cp;
 	MatrixTimer.Start();
 	Linop.HermOpAndNorm(p,mmp,d,qq);
 	MatrixTimer.Stop();
 	LinalgTimer.Start();
 	if(shift) axpy(mmp,*shift,p,mmp);
 	RealD rn = norm2(p);
 	if(shift) d += rn*(*shift);
 	RealD d2 = real(innerProduct(p,mmp));
 	qq = norm2(mmp);
      if (k%10==1) std::cout<< std::setprecision(4)<< "d:  "<<d<<" d2= "<<d2<<std::endl;
 	//	RealD    qqck = norm2(mmp);
 	//	ComplexD dck  = innerProduct(p,mmp);
 	a      = c/d;
 	b_pred = a*(a*qq-d)/c;
 	cp = axpy_norm(r,-a,mmp,r);
 	b = cp/c;
      if (k%10==1) std::cout<< std::setprecision(4)<<"k= "<<k<<" src:  "<<src_norm<<" r= "<<cp<<std::endl;
 	// Fuse these loops ; should be really easy
 	psi= a*p+psi;
 	p  = p*b+r;
 	LinalgTimer.Stop();
 	std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target "<< rsq<<std::endl;
 	// Stopping condition
 	if ( cp <= rsq ) { 
 	  SolverTimer.Stop();
 	  Linop.HermOpAndNorm(psi,mmp,d,qq);
 	  if(shift) mmp = mmp + (*shift) * psi;
 	  p=mmp-src;
 	  RealD mmpnorm = sqrt(norm2(mmp));
 	  RealD psinorm = sqrt(norm2(psi));
 	  RealD srcnorm = sqrt(norm2(src));
 	  RealD resnorm = sqrt(norm2(p));
 	  RealD true_residual = resnorm/srcnorm;
 	  std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
 		   <<" computed residual "<<sqrt(cp/ssq)
 		   <<" true residual "    <<true_residual
 		   <<" target "<<Tolerance<<std::endl;
 	  std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix  "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
 	  std::cout<<std::endl;
 	if(ErrorOnNoConverge)
 	  assert(true_residual/Tolerance < 1000.0);
 	  return;
 	}
      }
      std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl;
 //      assert(0);
    }
  };
 }
 #endif
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -8,6 +8,7 @@
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Chulwoo Jung <chulwoo@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
--- a/lib/algorithms/iterative/Matrix.h
+++ b/lib/algorithms/iterative/Matrix.h
@@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <iomanip>
 #include <complex>
 #include <typeinfo>
-#include <Grid/Grid.h>
+#include <Grid.h>
 /** Sign function **/