From 0d5af667d87459a24a969979a922ba16d50719d7 Mon Sep 17 00:00:00 2001
From: Chulwoo Jung <chulwoo@bnl.gov>
Date: Tue, 21 Mar 2017 12:40:43 -0400
Subject: [PATCH] Works with CPS evolution

---
 lib/Algorithms.h                              |   1 +
 lib/algorithms/iterative/ConjugateGradient.h  |   5 +-
 .../iterative/ConjugateGradientMixedPrec.h    |  32 +-
 .../iterative/ConjugateGradientMultiShift.h   |  11 +-
 .../ConjugateGradientMultiShiftMixedPrec.h    | 404 ++++++++++++++++++
 .../iterative/ConjugateGradientShifted.h      | 168 ++++++++
 .../iterative/ImplicitlyRestartedLanczos.h    |   1 +
 lib/algorithms/iterative/Matrix.h             |   2 +-
 8 files changed, 597 insertions(+), 27 deletions(-)
 create mode 100644 lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
 create mode 100644 lib/algorithms/iterative/ConjugateGradientShifted.h
diff --git a/lib/Algorithms.h b/lib/Algorithms.h
index 67eb11c3..d586df9d 100644
--- a/lib/Algorithms.h
+++ b/lib/Algorithms.h
@@ -39,6 +39,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/approx/MultiShiftFunction.h>
 
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
+#include <Grid/algorithms/iterative/ConjugateGradientShifted.h>
 #include <Grid/algorithms/iterative/ConjugateResidual.h>
 #include <Grid/algorithms/iterative/NormalEquations.h>
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
diff --git a/lib/algorithms/iterative/ConjugateGradient.h b/lib/algorithms/iterative/ConjugateGradient.h
index 0f24ae94..cf3872c8 100644
--- a/lib/algorithms/iterative/ConjugateGradient.h
+++ b/lib/algorithms/iterative/ConjugateGradient.h
@@ -45,8 +45,6 @@ class ConjugateGradient : public OperatorFunction<Field> {
                            // Defaults true.
   RealD Tolerance;
   Integer MaxIterations;
-  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
-  
   ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
       : Tolerance(tol),
         MaxIterations(maxit),
@@ -157,14 +155,13 @@ class ConjugateGradient : public OperatorFunction<Field> {
         std::cout << std::endl;
 
         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
-	IterationsToComplete = k;	
+
         return;
       }
     }
     std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
               << std::endl;
     if (ErrorOnNoConverge) assert(0);
-    IterationsToComplete = k;
   }
 };
 }
diff --git a/lib/algorithms/iterative/ConjugateGradientMixedPrec.h b/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
index c7332455..641d58bb 100644
--- a/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
+++ b/lib/algorithms/iterative/ConjugateGradientMixedPrec.h
@@ -35,7 +35,6 @@ namespace Grid {
   class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
   public:                                                
     RealD   Tolerance;
-    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
     Integer MaxInnerIterations;
     Integer MaxOuterIterations;
     GridBase* SinglePrecGrid; //Grid for single-precision fields
@@ -43,16 +42,12 @@ namespace Grid {
     LinearOperatorBase<FieldF> &Linop_f;
     LinearOperatorBase<FieldD> &Linop_d;
 
-    Integer TotalInnerIterations; //Number of inner CG iterations
-    Integer TotalOuterIterations; //Number of restarts
-    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
-
     //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
     LinearFunction<FieldF> *guesser;
     
     MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
       Linop_f(_Linop_f), Linop_d(_Linop_d),
-      Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
+      Tolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
       OuterLoopNormMult(100.), guesser(NULL){ };
 
     void useGuesser(LinearFunction<FieldF> &g){
@@ -60,8 +55,9 @@ namespace Grid {
     }
   
     void operator() (const FieldD &src_d_in, FieldD &sol_d){
-      TotalInnerIterations = 0;
-	
+	(*this)(src_d_in,sol_d,NULL);
+    }
+    void operator() (const FieldD &src_d_in, FieldD &sol_d, RealD *shift){
       GridStopWatch TotalTimer;
       TotalTimer.Start();
     
@@ -81,7 +77,7 @@ namespace Grid {
       FieldD src_d(DoublePrecGrid);
       src_d = src_d_in; //source for next inner iteration, computed from residual during operation
     
-      RealD inner_tol = InnerTolerance;
+      RealD inner_tol = Tolerance;
     
       FieldF src_f(SinglePrecGrid);
       src_f.checkerboard = cb;
@@ -89,18 +85,17 @@ namespace Grid {
       FieldF sol_f(SinglePrecGrid);
       sol_f.checkerboard = cb;
     
-      ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
+      ConjugateGradientShifted<FieldF> CG_f(inner_tol, MaxInnerIterations);
       CG_f.ErrorOnNoConverge = false;
 
       GridStopWatch InnerCGtimer;
 
       GridStopWatch PrecChangeTimer;
     
-      Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
-      
-      for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
+      for(Integer outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
 	//Compute double precision rsd and also new RHS vector.
 	Linop_d.HermOp(sol_d, tmp_d);
+	if(shift) axpy(tmp_d,*shift,sol_d,tmp_d);
 	RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
       
 	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
@@ -124,9 +119,8 @@ namespace Grid {
 	//Inner CG
 	CG_f.Tolerance = inner_tol;
 	InnerCGtimer.Start();
-	CG_f(Linop_f, src_f, sol_f);
+	CG_f(Linop_f, src_f, sol_f,shift);
 	InnerCGtimer.Stop();
-	TotalInnerIterations += CG_f.IterationsToComplete;
       
 	//Convert sol back to double and add to double prec solution
 	PrecChangeTimer.Start();
@@ -139,13 +133,11 @@ namespace Grid {
       //Final trial CG
       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
     
-      ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
-      CG_d(Linop_d, src_d_in, sol_d);
-      TotalFinalStepIterations = CG_d.IterationsToComplete;
+      ConjugateGradientShifted<FieldD> CG_d(Tolerance, MaxInnerIterations);
+      CG_d(Linop_d, src_d_in, sol_d,shift);
 
       TotalTimer.Stop();
-      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
-      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
     }
   };
 
diff --git a/lib/algorithms/iterative/ConjugateGradientMultiShift.h b/lib/algorithms/iterative/ConjugateGradientMultiShift.h
index a9ccfd2c..890a7392 100644
--- a/lib/algorithms/iterative/ConjugateGradientMultiShift.h
+++ b/lib/algorithms/iterative/ConjugateGradientMultiShift.h
@@ -45,6 +45,7 @@ public:
     Integer MaxIterations;
     int verbose;
     MultiShiftFunction shifts;
+    int iter;
 
     ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : 
 	MaxIterations(maxit),
@@ -60,6 +61,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
   std::vector<Field> results(nshift,grid);
   (*this)(Linop,src,results,psi);
 }
+
 void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
 {
   int nshift = shifts.order;
@@ -105,11 +107,12 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
   RealD a,b,c,d;
   RealD cp,bp,qq; //prev
   
+  int cb=src.checkerboard;
   // Matrix mult fields
   Field r(grid);
-  Field p(grid);
+  Field p(grid); p.checkerboard = src.checkerboard;
   Field tmp(grid);
-  Field mmp(grid);
+  Field mmp(grid);mmp.checkerboard = src.checkerboard;
   
   // Check lightest mass
   for(int s=0;s<nshift;s++){
@@ -132,6 +135,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
   p=src;
   
   //MdagM+m[0]
+  std::cout << "p.checkerboard " << p.checkerboard
+  << "mmp.checkerboard " << mmp.checkerboard << std::endl;
+
   Linop.HermOpAndNorm(p,mmp,d,qq);
   axpy(mmp,mass[0],p,mmp);
   RealD rn = norm2(p);
@@ -269,6 +275,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 	RealD cn = norm2(src);
 	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
       }
+      iter = k;
       return;
     }
   }
diff --git a/lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h b/lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
new file mode 100644
index 00000000..fefbec90
--- /dev/null
+++ b/lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
@@ -0,0 +1,404 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
+
+    Copyright (C) 2015
+
+Author: Chulwoo Jung <chulwoo@quark.phy.bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END/ LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_MULTI_MIXED_PREC_H
+#define GRID_CONJUGATE_GRADIENT_MULTI_MIXED_PREC_H
+
+namespace Grid {
+
+  //Mixed precision restarted defect correction CG
+  template<class FieldD,class FieldF
+//, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0
+//, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0
+> 
+  class MixedPrecisionConjugateGradientMultiShift : public LinearFunction<FieldD> {
+  public:                                                
+//    RealD   Tolerance;
+    Integer MaxInnerIterations;
+    Integer MaxOuterIterations;
+    GridBase* SinglePrecGrid; //Grid for single-precision fields
+    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+    LinearOperatorBase<FieldF> &Linop_f;
+    LinearOperatorBase<FieldD> &Linop_d;
+    MultiShiftFunction shifts;
+    Integer iter;
+
+    //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
+//    LinearFunction<FieldF> *guesser;
+    
+    MixedPrecisionConjugateGradientMultiShift(GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, 
+Integer maxinnerit,	MultiShiftFunction &_shifts ) :
+      Linop_f(_Linop_f), Linop_d(_Linop_d),
+      MaxInnerIterations(maxinnerit), SinglePrecGrid(_sp_grid),
+      OuterLoopNormMult(100.), shifts(_shifts) {};
+
+  
+    void operator() (const FieldD &src_d_in, FieldD &sol_d){
+	assert(0); // not yet implemented
+    }
+    void operator() (const FieldD &src_d_in, std::vector<FieldD> &sol_d){
+      GridStopWatch TotalTimer;
+      TotalTimer.Start();
+    
+      int cb = src_d_in.checkerboard;
+
+      int nshift = shifts.order;
+      assert(nshift == sol_d.size());
+      for(int i=0;i<nshift;i++) sol_d[i].checkerboard = cb;
+    
+      RealD src_norm = norm2(src_d_in);
+//      RealD stop = src_norm * Tolerance*Tolerance;
+
+      GridBase* DoublePrecGrid = src_d_in._grid;
+      FieldD tmp_d(DoublePrecGrid); tmp_d.checkerboard = cb;
+    
+      FieldD tmp2_d(DoublePrecGrid); tmp2_d.checkerboard = cb;
+    
+      FieldD src_d(DoublePrecGrid);
+      src_d = src_d_in; //source for next inner iteration, computed from residual during operation
+    
+//      RealD inner_tol = Tolerance;
+  	FieldD psi_d(DoublePrecGrid);psi_d.checkerboard = cb;
+    
+      FieldF src_f(SinglePrecGrid);
+      src_f.checkerboard = cb;
+    
+      std::vector<FieldF> sol_f(nshift,SinglePrecGrid);
+      for(int i=0;i<nshift;i++) sol_f[i].checkerboard = cb;
+    
+//      ConjugateGradientShifted<FieldF> CG_f(inner_tol, MaxInnerIterations);
+      ConjugateGradientMultiShift<FieldF> MSCG(MaxInnerIterations,shifts);
+//      CG_f.ErrorOnNoConverge = false;
+
+      GridStopWatch InnerCGtimer;
+
+      GridStopWatch PrecChangeTimer;
+    
+{
+//	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
+
+//	if(norm < OuterLoopNormMult * stop){
+//	  std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
+//	  break;
+//	}
+//	while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
+
+	PrecChangeTimer.Start();
+	precisionChange(src_f, src_d);
+	PrecChangeTimer.Stop();
+      
+//	zeroit(sol_f);
+
+
+	//Inner CG
+	InnerCGtimer.Start();
+  int if_relup = 0;
+#if 0
+        MSCG(Linop_f,src_f,sol_f);
+#else
+{
+  
+  GridBase *grid = SinglePrecGrid;
+  
+  ////////////////////////////////////////////////////////////////////////
+  // Convenience references to the info stored in "MultiShiftFunction"
+  ////////////////////////////////////////////////////////////////////////
+  int nshift = shifts.order;
+
+
+  std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
+  std::vector<RealD> &mresidual(shifts.tolerances);
+  std::vector<RealD> alpha(nshift,1.);
+  std::vector<FieldF>   ps(nshift,grid);// Search directions
+
+  assert(sol_f.size()==nshift);
+  assert(mass.size()==nshift);
+  assert(mresidual.size()==nshift);
+  
+  // dynamic sized arrays on stack; 2d is a pain with vector
+  RealD  bs[nshift];
+  RealD  rsq[nshift];
+  RealD  z[nshift][2];
+  int     converged[nshift];
+  
+  const int       primary =0;
+  
+  //Primary shift fields CG iteration
+  RealD a,b,c,d;
+  RealD cp,bp,qq; //prev
+  
+  int cb=src_f.checkerboard;
+  // Matrix mult fields
+  FieldF r(grid); r.checkerboard = src_f.checkerboard;
+  FieldF p(grid); p.checkerboard = src_f.checkerboard;
+  FieldF tmp(grid); tmp.checkerboard = src_f.checkerboard;
+  FieldF mmp(grid);mmp.checkerboard = src_f.checkerboard;
+  FieldF psi(grid);psi.checkerboard = src_f.checkerboard;
+    std::cout.precision(12);
+    std::cout<<GridLogMessage<<"norm2(psi_d)= "<<norm2(psi_d)<<std::endl;
+    std::cout<<GridLogMessage<<"norm2(psi)= "<<norm2(psi)<<std::endl;
+  
+  
+  // Check lightest mass
+  for(int s=0;s<nshift;s++){
+    assert( mass[s]>= mass[primary] );
+    converged[s]=0;
+  }
+  
+  // Wire guess to zero
+  // Residuals "r" are src
+  // First search direction "p" is also src
+  cp = norm2(src_f);
+  Real c_relup = cp;
+  for(int s=0;s<nshift;s++){
+    rsq[s] = cp * mresidual[s] * mresidual[s];
+    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientMultiShift: shift "<<s
+	     <<" target resid "<<rsq[s]<<std::endl;
+    ps[s] = src_f;
+  }
+  // r and p for primary
+  r=src_f;
+  p=src_f;
+  
+  //MdagM+m[0]
+  std::cout << "p.checkerboard " << p.checkerboard
+  << "mmp.checkerboard " << mmp.checkerboard << std::endl;
+
+  Linop_f.HermOpAndNorm(p,mmp,d,qq);
+  axpy(mmp,mass[0],p,mmp);
+  RealD rn = norm2(p);
+  d += rn*mass[0];
+  
+  // have verified that inner product of 
+  // p and mmp is equal to d after this since
+  // the d computation is tricky
+  //  qq = real(innerProduct(p,mmp));
+  //  std::cout<<GridLogMessage << "debug equal ?  qq "<<qq<<" d "<< d<<std::endl;
+  
+  b = -cp /d;
+  
+  // Set up the various shift variables
+  int       iz=0;
+  z[0][1-iz] = 1.0;
+  z[0][iz]   = 1.0;
+  bs[0]      = b;
+  for(int s=1;s<nshift;s++){
+    z[s][1-iz] = 1.0;
+    z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0]));
+    bs[s]      = b*z[s][iz]; 
+  }
+  
+  // r += b[0] A.p[0]
+  // c= norm(r)
+  c=axpy_norm(r,b,mmp,r);
+  
+ axpby(psi,0.,-bs[0],src_f,src_f);
+  for(int s=0;s<nshift;s++) {
+    axpby(sol_f[s],0.,-bs[s]*alpha[s],src_f,src_f);
+  }
+  
+  
+  // Iteration loop
+  int k;
+ // inefficient zeroing, please replace!
+//  RealD sol_norm = axpy_norm(sol_d[0],-1.,sol_d[0],sol_d[0]);
+  zeroit(sol_d[0]);
+  std::cout<<GridLogMessage<<"norm(sol_d[0])= "<<norm2(sol_d[0])<<std::endl;
+  
+
+  int all_converged = 1;
+	RealD tmp1,tmp2;
+  for (k=1;k<=MaxOuterIterations;k++){
+    
+    a = c /cp;
+    axpy(p,a,p,r);
+    
+    // Note to self - direction ps is iterated seperately
+    // for each shift. Does not appear to have any scope
+    // for avoiding linear algebra in "single" case.
+    // 
+    // However SAME r is used. Could load "r" and update
+    // ALL ps[s]. 2/3 Bandwidth saving
+    // New Kernel: Load r, vector of coeffs, vector of pointers ps
+    for(int s=0;s<nshift;s++){
+      if ( ! converged[s] ) { 
+	if (s==0){
+	  axpy(ps[s],a,ps[s],r);
+	} else{
+	  RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
+	  axpby(ps[s],z[s][iz],as,r,ps[s]);
+	}
+      }
+    }
+    
+    cp=c;
+    
+    Linop_f.HermOpAndNorm(p,mmp,d,qq);
+    axpy(mmp,mass[0],p,mmp);
+    RealD rn = norm2(p);
+    d += rn*mass[0];
+    
+    bp=b;
+    b=-cp/d;
+    
+    c=axpy_norm(r,b,mmp,r);
+
+
+    // Toggle the recurrence history
+    bs[0] = b;
+    iz = 1-iz;
+    for(int s=1;s<nshift;s++){
+      if((!converged[s])){
+	RealD z0 = z[s][1-iz];
+	RealD z1 = z[s][iz];
+	z[s][iz] = z0*z1*bp
+	  / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); 
+	bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike
+      }
+    }
+    
+    axpy(psi,-bs[0],ps[0],psi);
+    for(int s=0;s<nshift;s++){
+      int ss = s;
+      // Scope for optimisation here in case of "single".
+      // Could load sol_f[0] and pull all ps[s] in.
+      //      if ( single ) ss=primary;
+      // Bandwith saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving
+      // Pipelined CG gain:
+      //
+      // New Kernel: Load r, vector of coeffs, vector of pointers ps
+      // New Kernel: Load sol_f[0], vector of coeffs, vector of pointers ps
+      // If can predict the coefficient bs then we can fuse these and avoid write reread cyce
+      //  on ps[s].
+      // Before:  3 x npole  + 3 x npole
+      // After :  2 x npole (ps[s])        => 3x speed up of multishift CG.
+      
+      if( (!converged[s]) ) { 
+	axpy(sol_f[ss],-bs[s]*alpha[s],ps[s],sol_f[ss]);
+      }
+    }
+
+
+    if (k%MaxInnerIterations==0){
+//    if (c < 1e-4*c_relup){
+       RealD c_f=c;
+       precisionChange(tmp_d,psi);
+       RealD sol_norm =axpy_norm (psi_d,1.,tmp_d,psi_d);
+       tmp1 = norm2(psi);
+       zeroit(psi);
+       tmp2 = norm2(psi);
+       std::cout<<GridLogMessage<<"k= "<<k<<" norm2(sol)= "<<sol_norm<<" "<<tmp1<<" "<<tmp2<<std::endl;
+//       precisionChange(sol_d[0],sol_f[0]);
+       Linop_d.HermOpAndNorm(psi_d,tmp_d,tmp1,tmp2);
+       axpy(tmp2_d,mass[0],psi_d,tmp_d);
+       axpy(tmp_d,-1.,tmp2_d,src_d);
+       precisionChange(r,tmp_d);
+	c_relup = norm2(r);
+       std::cout<<GridLogMessage<<"k= "<<k<<" norm2(r)= "<<c<<" "<<c_relup<<" "<<c_f<<std::endl;
+	if_relup=1;
+    }
+    
+    // Convergence checks
+  all_converged=1;
+    for(int s=0;s<nshift;s++){
+      
+      if ( (!converged[s]) ){
+	
+	RealD css  = c * z[s][iz]* z[s][iz];
+	
+	if(css<rsq[s]){
+	  if ( ! converged[s] )
+	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
+	      converged[s]=1;
+	} else {
+		if (k%MaxInnerIterations==0)
+	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has not converged "<<css<<"<"<<rsq[s]<<std::endl;
+	  all_converged=0;
+	}
+
+      }
+    }
+    
+#if 0
+    if ( all_converged ){
+      std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
+#else
+    if ( converged[0] ){
+      std::cout<<GridLogMessage<< "CGMultiShift: Shift 0 have converged iteration, terminating  "<<k<<std::endl;
+#endif
+      
+#if 1
+      for(int s=1; s < nshift; s++) { 
+	Linop_f.HermOpAndNorm(sol_f[s],mmp,d,qq);
+	axpy(tmp,mass[s],sol_f[s],mmp);
+	axpy(r,-alpha[s],src_f,tmp);
+	RealD rn = norm2(r);
+	RealD cn = norm2(src_f);
+	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
+      }
+#endif
+     iter = k;
+      break;
+    }
+  }
+  // ugly hack
+  if ( !all_converged )
+  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
+//  assert(0);
+}
+	
+#endif
+	InnerCGtimer.Stop();
+      
+	//Convert sol back to double and add to double prec solution
+	PrecChangeTimer.Start();
+	sol_d[0]=psi_d;
+	for(int i=1;i<nshift;i++)precisionChange(sol_d[i], sol_f[i]);
+      std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
+      // Check answers 
+      for(int s=0; s < nshift; s++) { 
+	RealD tmp1,tmp2;
+       Linop_d.HermOpAndNorm(sol_d[s],tmp_d,tmp1,tmp2);
+       axpy(tmp2_d,shifts.poles[s],sol_d[s],tmp_d);
+       axpy(tmp_d,-1.,src_d,tmp2_d);
+	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(norm2(tmp_d)/norm2(src_d))<<std::endl;
+      }
+	PrecChangeTimer.Stop();
+      
+}
+    
+      //Final trial CG
+ //     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
+    
+      TotalTimer.Stop();
+      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
+    }
+  };
+
+}
+
+#endif
diff --git a/lib/algorithms/iterative/ConjugateGradientShifted.h b/lib/algorithms/iterative/ConjugateGradientShifted.h
new file mode 100644
index 00000000..85804753
--- /dev/null
+++ b/lib/algorithms/iterative/ConjugateGradientShifted.h
@@ -0,0 +1,168 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ConjugateGradient.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_CONJUGATE_GRADIENT_SHIFTED_H
+#define GRID_CONJUGATE_GRADIENT_SHIFTED_H
+
+namespace Grid {
+
+    /////////////////////////////////////////////////////////////
+    // Base classes for iterative processes based on operators
+    // single input vec, single output vec.
+    /////////////////////////////////////////////////////////////
+
+  template<class Field> 
+    class ConjugateGradientShifted : public OperatorFunction<Field> {
+public:                                                
+    bool ErrorOnNoConverge; //throw an assert when the CG fails to converge. Defaults true.
+    RealD   Tolerance;
+    Integer MaxIterations;
+    ConjugateGradientShifted(RealD tol,Integer maxit, bool err_on_no_conv = true) : Tolerance(tol), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv) { 
+    };
+
+    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi ){
+	(*this)(Linop,src,psi,NULL);
+    }
+
+    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi, RealD *shift){
+
+      psi.checkerboard = src.checkerboard;
+      conformable(psi,src);
+
+      RealD cp,c,a,d,b,ssq,qq,b_pred;
+      
+      Field   p(src);
+      Field mmp(src);
+      Field   r(src);
+      
+      //Initial residual computation & set up
+      RealD guess = norm2(psi);
+      assert(std::isnan(guess)==0);
+
+      Linop.HermOpAndNorm(psi,mmp,d,b);
+	if(shift) axpy(mmp,*shift,psi,mmp);
+	RealD rn = norm2(psi);
+	if(shift) d += rn*(*shift);
+	RealD d2 = real(innerProduct(psi,mmp));
+	b= norm2(mmp);
+      RealD src_norm=norm2(src);
+      r= src-mmp;
+      p= r;
+      
+      a  =norm2(p);
+      cp =a;
+      ssq=norm2(src);
+
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:  cp,r "<<cp   <<std::endl;
+      std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl;
+
+      RealD rsq =  Tolerance* Tolerance*ssq;
+      
+      //Check if guess is really REALLY good :)
+      if ( cp <= rsq ) {
+	return;
+      }
+      
+      std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" target "<<rsq<<std::endl;
+
+      GridStopWatch LinalgTimer;
+      GridStopWatch MatrixTimer;
+      GridStopWatch SolverTimer;
+
+      SolverTimer.Start();
+      int k;
+      for (k=1;k<=MaxIterations;k++){
+	
+	c=cp;
+
+	MatrixTimer.Start();
+	Linop.HermOpAndNorm(p,mmp,d,qq);
+	MatrixTimer.Stop();
+	LinalgTimer.Start();
+	if(shift) axpy(mmp,*shift,p,mmp);
+	RealD rn = norm2(p);
+	if(shift) d += rn*(*shift);
+	RealD d2 = real(innerProduct(p,mmp));
+	qq = norm2(mmp);
+      if (k%10==1) std::cout<< std::setprecision(4)<< "d:  "<<d<<" d2= "<<d2<<std::endl;
+
+	//	RealD    qqck = norm2(mmp);
+	//	ComplexD dck  = innerProduct(p,mmp);
+      
+	a      = c/d;
+	b_pred = a*(a*qq-d)/c;
+
+	cp = axpy_norm(r,-a,mmp,r);
+	b = cp/c;
+      if (k%10==1) std::cout<< std::setprecision(4)<<"k= "<<k<<" src:  "<<src_norm<<" r= "<<cp<<std::endl;
+	
+	// Fuse these loops ; should be really easy
+	psi= a*p+psi;
+	p  = p*b+r;
+	  
+	LinalgTimer.Stop();
+	std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target "<< rsq<<std::endl;
+	
+	// Stopping condition
+	if ( cp <= rsq ) { 
+	  
+	  SolverTimer.Stop();
+	  Linop.HermOpAndNorm(psi,mmp,d,qq);
+	  if(shift) mmp = mmp + (*shift) * psi;
+	  p=mmp-src;
+	  
+	  RealD mmpnorm = sqrt(norm2(mmp));
+	  RealD psinorm = sqrt(norm2(psi));
+	  RealD srcnorm = sqrt(norm2(src));
+	  RealD resnorm = sqrt(norm2(p));
+	  RealD true_residual = resnorm/srcnorm;
+
+	  std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
+		   <<" computed residual "<<sqrt(cp/ssq)
+		   <<" true residual "    <<true_residual
+		   <<" target "<<Tolerance<<std::endl;
+	  std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix  "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
+	  std::cout<<std::endl;
+	  
+	if(ErrorOnNoConverge)
+	  assert(true_residual/Tolerance < 1000.0);
+
+	  return;
+	}
+      }
+      std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl;
+//      assert(0);
+    }
+  };
+}
+#endif
diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
index 233eb8f5..ff2c26bb 100644
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -8,6 +8,7 @@
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Chulwoo Jung <chulwoo@bnl.gov>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
diff --git a/lib/algorithms/iterative/Matrix.h b/lib/algorithms/iterative/Matrix.h
index cb78a598..afdda2d5 100644
--- a/lib/algorithms/iterative/Matrix.h
+++ b/lib/algorithms/iterative/Matrix.h
@@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <iomanip>
 #include <complex>
 #include <typeinfo>
-#include <Grid/Grid.h>
+#include <Grid.h>
 
 
 /** Sign function **/