From 0486ff8e7901dccd53f47031cececf04af70f1fd Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Tue, 20 Jun 2017 18:46:01 +0100
Subject: [PATCH 1/5] Improved the Lanczos implementation

---
 TODO                                          |  28 +-
 lib/algorithms/densematrix/DenseMatrix.h      | 137 ---
 lib/algorithms/densematrix/Francis.h          | 525 ----------
 lib/algorithms/densematrix/Householder.h      | 242 -----
 .../iterative/ImplicitlyRestartedLanczos.h    | 987 ++++--------------
 lib/qcd/hmc/checkpointers/ILDGCheckpointer.h  |   2 +-
 tests/solver/Test_dwf_lanczos.cc              |   2 +-
 7 files changed, 211 insertions(+), 1712 deletions(-)
 delete mode 100644 lib/algorithms/densematrix/DenseMatrix.h
 delete mode 100644 lib/algorithms/densematrix/Francis.h
 delete mode 100644 lib/algorithms/densematrix/Householder.h

diff --git a/TODO b/TODO
index a5d4cabd..eeb7dfa5 100644
--- a/TODO
+++ b/TODO
@@ -1,24 +1,28 @@
 TODO:
 ---------------
 
-Peter's work list:
-1)- Precision conversion and sort out localConvert      <-- 
-2)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- 
-
--- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet
--- Physical propagator interface
--- Conserved currents
--- GaugeFix into central location
--- Multigrid Wilson and DWF, compare to other Multigrid implementations
--- HDCR resume
+Large item work list:
+1)- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- 
+2)- MultiRHS with spread out extra dim
+3)- BG/Q port and check
+4)- Precision conversion and sort out localConvert      <-- partial
+  - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
+5)- Physical propagator interface
+6)- Conserved currents
+7)- Multigrid Wilson and DWF, compare to other Multigrid implementations
+8)- HDCR resume
 
 Recent DONE 
+-- GaugeFix into central location                      <-- DONE
+-- Scidac and Ildg metadata handling                   <-- DONE
+-- Binary I/O MPI2 IO                                  <-- DONE
 -- Binary I/O speed up & x-strips                      <-- DONE
 -- Cut down the exterior overhead                      <-- DONE
 -- Interior legs from SHM comms                        <-- DONE
 -- Half-precision comms                                <-- DONE
--- Merge high precision reduction into develop        
--- multiRHS DWF; benchmark on Cori/BNL for comms elimination
+-- Merge high precision reduction into develop         <-- DONE
+-- BlockCG, BCGrQ                                      <-- DONE
+-- multiRHS DWF; benchmark on Cori/BNL for comms elimination <-- DONE
    -- slice* linalg routines for multiRHS, BlockCG    
 
 -----
diff --git a/lib/algorithms/densematrix/DenseMatrix.h b/lib/algorithms/densematrix/DenseMatrix.h
deleted file mode 100644
index d86add21..00000000
--- a/lib/algorithms/densematrix/DenseMatrix.h
+++ /dev/null
@@ -1,137 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/DenseMatrix.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_DENSE_MATRIX_H
-#define GRID_DENSE_MATRIX_H
-
-namespace Grid {
-    /////////////////////////////////////////////////////////////
-    // Matrix untils
-    /////////////////////////////////////////////////////////////
-
-template<class T> using DenseVector = std::vector<T>;
-template<class T> using DenseMatrix = DenseVector<DenseVector<T> >;
-
-template<class T> void Size(DenseVector<T> & vec, int &N) 
-{ 
-  N= vec.size();
-}
-template<class T> void Size(DenseMatrix<T> & mat, int &N,int &M) 
-{ 
-  N= mat.size();
-  M= mat[0].size();
-}
-
-template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N) 
-{ 
-  int M; Size(mat,N,M);
-  assert(N==M);
-}
-
-template<class T> void Resize(DenseVector<T > & mat, int N) { 
-  mat.resize(N);
-}
-template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) { 
-  mat.resize(N);
-  for(int i=0;i<N;i++){
-    mat[i].resize(M);
-  }
-}
-template<class T> void Fill(DenseMatrix<T> & mat, T&val) { 
-  int N,M;
-  Size(mat,N,M);
-  for(int i=0;i<N;i++){
-  for(int j=0;j<M;j++){
-    mat[i][j] = val;
-  }}
-}
-
-/** Transpose of a matrix **/
-template<class T> DenseMatrix<T> Transpose(DenseMatrix<T> & mat){
-  int N,M;
-  Size(mat,N,M);
-  DenseMatrix<T> C; Resize(C,M,N);
-  for(int i=0;i<M;i++){
-  for(int j=0;j<N;j++){
-    C[i][j] = mat[j][i];
-  }} 
-  return C;
-}
-/** Set DenseMatrix to unit matrix **/
-template<class T> void Unity(DenseMatrix<T> &A){
-  int N;  SizeSquare(A,N);
-  for(int i=0;i<N;i++){
-    for(int j=0;j<N;j++){
-      if ( i==j ) A[i][j] = 1;
-      else        A[i][j] = 0;
-    } 
-  } 
-}
-
-/** Add C * I to matrix **/
-template<class T>
-void PlusUnit(DenseMatrix<T> & A,T c){
-  int dim;  SizeSquare(A,dim);
-  for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;} 
-}
-
-/** return the Hermitian conjugate of matrix **/
-template<class T>
-DenseMatrix<T> HermitianConj(DenseMatrix<T> &mat){
-
-  int dim; SizeSquare(mat,dim);
-
-  DenseMatrix<T> C; Resize(C,dim,dim);
-
-  for(int i=0;i<dim;i++){
-    for(int j=0;j<dim;j++){
-      C[i][j] = conj(mat[j][i]);
-    } 
-  } 
-  return C;
-}
-/**Get a square submatrix**/
-template <class T>
-DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st, int col_end)
-{
-  DenseMatrix<T> H; Resize(H,row_end - row_st,col_end-col_st);
-
-  for(int i = row_st; i<row_end; i++){
-  for(int j = col_st; j<col_end; j++){
-    H[i-row_st][j-col_st]=A[i][j];
-  }}
-  return H;
-}
-
-}
-
-#include "Householder.h"
-#include "Francis.h"
-
-#endif
-
diff --git a/lib/algorithms/densematrix/Francis.h b/lib/algorithms/densematrix/Francis.h
deleted file mode 100644
index 08ecbd7b..00000000
--- a/lib/algorithms/densematrix/Francis.h
+++ /dev/null
@@ -1,525 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/Francis.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef FRANCIS_H
-#define FRANCIS_H
-
-#include <cstdlib>
-#include <string>
-#include <cmath>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <fstream>
-#include <complex>
-#include <algorithm>
-
-//#include <timer.h>
-//#include <lapacke.h>
-//#include <Eigen/Dense>
-
-namespace Grid {
-
-template <class T> int SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
-template <class T> int     Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
-
-/**
-  Find the eigenvalues of an upper hessenberg matrix using the Francis QR algorithm.
-H =
-      x  x  x  x  x  x  x  x  x
-      x  x  x  x  x  x  x  x  x
-      0  x  x  x  x  x  x  x  x
-      0  0  x  x  x  x  x  x  x
-      0  0  0  x  x  x  x  x  x
-      0  0  0  0  x  x  x  x  x
-      0  0  0  0  0  x  x  x  x
-      0  0  0  0  0  0  x  x  x
-      0  0  0  0  0  0  0  x  x
-Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary.
-**/
-template <class T>
-int QReigensystem(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
-{
-  DenseMatrix<T> H = Hin; 
-
-  int N ; SizeSquare(H,N);
-  int M = N;
-
-  Fill(evals,0);
-  Fill(evecs,0);
-
-  T s,t,x=0,y=0,z=0;
-  T u,d;
-  T apd,amd,bc;
-  DenseVector<T> p(N,0);
-  T nrm = Norm(H);    ///DenseMatrix Norm
-  int n, m;
-  int e = 0;
-  int it = 0;
-  int tot_it = 0;
-  int l = 0;
-  int r = 0;
-  DenseMatrix<T> P; Resize(P,N,N); Unity(P);
-  DenseVector<int> trows(N,0);
-
-  /// Check if the matrix is really hessenberg, if not abort
-  RealD sth = 0;
-  for(int j=0;j<N;j++){
-    for(int i=j+2;i<N;i++){
-      sth = abs(H[i][j]);
-      if(sth > small){
-	std::cout << "Non hessenberg H = " << sth << " > " << small << std::endl;
-	exit(1);
-      }
-    }
-  }
-
-  do{
-    std::cout << "Francis QR Step N = " << N << std::endl;
-    /** Check for convergence
-      x  x  x  x  x
-      0  x  x  x  x
-      0  0  x  x  x
-      0  0  x  x  x
-      0  0  0  0  x
-      for this matrix l = 4
-     **/
-    do{
-      l = Chop_subdiag(H,nrm,e,small);
-      r = 0;    ///May have converged on more than one eval
-      ///Single eval
-      if(l == N-1){
-        evals[e] = H[l][l];
-        N--; e++; r++; it = 0;
-      }
-      ///RealD eval
-      if(l == N-2){
-        trows[l+1] = 1;    ///Needed for UTSolve
-        apd = H[l][l] + H[l+1][l+1];
-        amd = H[l][l] - H[l+1][l+1];
-        bc =  (T)4.0*H[l+1][l]*H[l][l+1];
-        evals[e]   = (T)0.5*( apd + sqrt(amd*amd + bc) );
-        evals[e+1] = (T)0.5*( apd - sqrt(amd*amd + bc) );
-        N-=2; e+=2; r++; it = 0;
-      }
-    } while(r>0);
-
-    if(N ==0) break;
-
-    DenseVector<T > ck; Resize(ck,3);
-    DenseVector<T> v;   Resize(v,3);
-
-    for(int m = N-3; m >= l; m--){
-      ///Starting vector essentially random shift.
-      if(it%10 == 0 && N >= 3 && it > 0){
-        s = (T)1.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
-        t = (T)0.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
-        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
-        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
-        z = H[m+1][m]*H[m+2][m+1];
-      }
-      ///Starting vector implicit Q theorem
-      else{
-        s = (H[N-2][N-2] + H[N-1][N-1]);
-        t = (H[N-2][N-2]*H[N-1][N-1] - H[N-2][N-1]*H[N-1][N-2]);
-        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
-        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
-        z = H[m+1][m]*H[m+2][m+1];
-      }
-      ck[0] = x; ck[1] = y; ck[2] = z;
-
-      if(m == l) break;
-
-      /** Some stupid thing from numerical recipies, seems to work**/
-      // PAB.. for heaven's sake quote page, purpose, evidence it works.
-      //       what sort of comment is that!?!?!?
-      u=abs(H[m][m-1])*(abs(y)+abs(z));
-      d=abs(x)*(abs(H[m-1][m-1])+abs(H[m][m])+abs(H[m+1][m+1]));
-      if ((T)abs(u+d) == (T)abs(d) ){
-	l = m; break;
-      }
-
-      //if (u < small){l = m; break;}
-    }
-    if(it > 100000){
-     std::cout << "QReigensystem: bugger it got stuck after 100000 iterations" << std::endl;
-     std::cout << "got " << e << " evals " << l << " " << N << std::endl;
-      exit(1);
-    }
-    normalize(ck);    ///Normalization cancels in PHP anyway
-    T beta;
-    Householder_vector<T >(ck, 0, 2, v, beta);
-    Householder_mult<T >(H,v,beta,0,l,l+2,0);
-    Householder_mult<T >(H,v,beta,0,l,l+2,1);
-    ///Accumulate eigenvector
-    Householder_mult<T >(P,v,beta,0,l,l+2,1);
-    int sw = 0;      ///Are we on the last row?
-    for(int k=l;k<N-2;k++){
-      x = H[k+1][k];
-      y = H[k+2][k];
-      z = (T)0.0;
-      if(k+3 <= N-1){
-	z = H[k+3][k];
-      } else{
-	sw = 1; 
-	v[2] = (T)0.0;
-      }
-      ck[0] = x; ck[1] = y; ck[2] = z;
-      normalize(ck);
-      Householder_vector<T >(ck, 0, 2-sw, v, beta);
-      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,0);
-      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,1);
-      ///Accumulate eigenvector
-      Householder_mult<T >(P,v, beta,0,k+1,k+3-sw,1);
-    }
-    it++;
-    tot_it++;
-  }while(N > 1);
-  N = evals.size();
-  ///Annoying - UT solves in reverse order;
-  DenseVector<T> tmp; Resize(tmp,N);
-  for(int i=0;i<N;i++){
-    tmp[i] = evals[N-i-1];
-  } 
-  evals = tmp;
-  UTeigenvectors(H, trows, evals, evecs);
-  for(int i=0;i<evals.size();i++){evecs[i] = P*evecs[i]; normalize(evecs[i]);}
-  return tot_it;
-}
-
-template <class T>
-int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
-{
-  /**
-  Find the eigenvalues of an upper Hessenberg matrix using the Wilkinson QR algorithm.
-  H =
-  x  x  0  0  0  0
-  x  x  x  0  0  0
-  0  x  x  x  0  0
-  0  0  x  x  x  0
-  0  0  0  x  x  x
-  0  0  0  0  x  x
-  Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary.  **/
-  return my_Wilkinson(Hin, evals, evecs, small, small);
-}
-
-template <class T>
-int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small, RealD tol)
-{
-  int N; SizeSquare(Hin,N);
-  int M = N;
-
-  ///I don't want to modify the input but matricies must be passed by reference
-  //Scale a matrix by its "norm"
-  //RealD Hnorm = abs( Hin.LargestDiag() ); H =  H*(1.0/Hnorm);
-  DenseMatrix<T> H;  H = Hin;
-  
-  RealD Hnorm = abs(Norm(Hin));
-  H = H * (1.0 / Hnorm);
-
-  // TODO use openmp and memset
-  Fill(evals,0);
-  Fill(evecs,0);
-
-  T s, t, x = 0, y = 0, z = 0;
-  T u, d;
-  T apd, amd, bc;
-  DenseVector<T> p; Resize(p,N); Fill(p,0);
-
-  T nrm = Norm(H);    ///DenseMatrix Norm
-  int n, m;
-  int e = 0;
-  int it = 0;
-  int tot_it = 0;
-  int l = 0;
-  int r = 0;
-  DenseMatrix<T> P; Resize(P,N,N);
-  Unity(P);
-  DenseVector<int> trows(N, 0);
-  /// Check if the matrix is really symm tridiag
-  RealD sth = 0;
-  for(int j = 0; j < N; ++j)
-  {
-    for(int i = j + 2; i < N; ++i)
-    {
-      if(abs(H[i][j]) > tol || abs(H[j][i]) > tol)
-      {
-	std::cout << "Non Tridiagonal H(" << i << ","<< j << ") = |" << Real( real( H[j][i] ) ) << "| > " << tol << std::endl;
-	std::cout << "Warning tridiagonalize and call again" << std::endl;
-        // exit(1); // see what is going on
-        //return;
-      }
-    }
-  }
-
-  do{
-    do{
-      //Jasper
-      //Check if the subdiagonal term is small enough (<small)
-      //if true then it is converged.
-      //check start from H.dim - e - 1
-      //How to deal with more than 2 are converged?
-      //What if Chop_symm_subdiag return something int the middle?
-      //--------------
-      l = Chop_symm_subdiag(H,nrm, e, small);
-      r = 0;    ///May have converged on more than one eval
-      //Jasper
-      //In this case
-      // x  x  0  0  0  0
-      // x  x  x  0  0  0
-      // 0  x  x  x  0  0
-      // 0  0  x  x  x  0
-      // 0  0  0  x  x  0
-      // 0  0  0  0  0  x  <- l
-      //--------------
-      ///Single eval
-      if(l == N - 1)
-      {
-        evals[e] = H[l][l];
-        N--;
-        e++;
-        r++;
-        it = 0;
-      }
-      //Jasper
-      // x  x  0  0  0  0
-      // x  x  x  0  0  0
-      // 0  x  x  x  0  0
-      // 0  0  x  x  0  0
-      // 0  0  0  0  x  x  <- l
-      // 0  0  0  0  x  x
-      //--------------
-      ///RealD eval
-      if(l == N - 2)
-      {
-        trows[l + 1] = 1;    ///Needed for UTSolve
-        apd = H[l][l] + H[l + 1][ l + 1];
-        amd = H[l][l] - H[l + 1][l + 1];
-        bc =  (T) 4.0 * H[l + 1][l] * H[l][l + 1];
-        evals[e] = (T) 0.5 * (apd + sqrt(amd * amd + bc));
-        evals[e + 1] = (T) 0.5 * (apd - sqrt(amd * amd + bc));
-        N -= 2;
-        e += 2;
-        r++;
-        it = 0;
-      }
-    }while(r > 0);
-    //Jasper
-    //Already converged
-    //--------------
-    if(N == 0) break;
-
-    DenseVector<T> ck,v; Resize(ck,2); Resize(v,2);
-
-    for(int m = N - 3; m >= l; m--)
-    {
-      ///Starting vector essentially random shift.
-      if(it%10 == 0 && N >= 3 && it > 0)
-      {
-        t = abs(H[N - 1][N - 2]) + abs(H[N - 2][N - 3]);
-        x = H[m][m] - t;
-        z = H[m + 1][m];
-      } else {
-      ///Starting vector implicit Q theorem
-        d = (H[N - 2][N - 2] - H[N - 1][N - 1]) * (T) 0.5;
-        t =  H[N - 1][N - 1] - H[N - 1][N - 2] * H[N - 1][N - 2] 
-	  / (d + sign(d) * sqrt(d * d + H[N - 1][N - 2] * H[N - 1][N - 2]));
-        x = H[m][m] - t;
-        z = H[m + 1][m];
-      }
-      //Jasper
-      //why it is here????
-      //-----------------------
-      if(m == l)
-        break;
-
-      u = abs(H[m][m - 1]) * (abs(y) + abs(z));
-      d = abs(x) * (abs(H[m - 1][m - 1]) + abs(H[m][m]) + abs(H[m + 1][m + 1]));
-      if ((T)abs(u + d) == (T)abs(d))
-      {
-        l = m;
-        break;
-      }
-    }
-    //Jasper
-    if(it > 1000000)
-    {
-      std::cout << "Wilkinson: bugger it got stuck after 100000 iterations" << std::endl;
-      std::cout << "got " << e << " evals " << l << " " << N << std::endl;
-      exit(1);
-    }
-    //
-    T s, c;
-    Givens_calc<T>(x, z, c, s);
-    Givens_mult<T>(H, l, l + 1, c, -s, 0);
-    Givens_mult<T>(H, l, l + 1, c,  s, 1);
-    Givens_mult<T>(P, l, l + 1, c,  s, 1);
-    //
-    for(int k = l; k < N - 2; ++k)
-    {
-      x = H.A[k + 1][k];
-      z = H.A[k + 2][k];
-      Givens_calc<T>(x, z, c, s);
-      Givens_mult<T>(H, k + 1, k + 2, c, -s, 0);
-      Givens_mult<T>(H, k + 1, k + 2, c,  s, 1);
-      Givens_mult<T>(P, k + 1, k + 2, c,  s, 1);
-    }
-    it++;
-    tot_it++;
-  }while(N > 1);
-
-  N = evals.size();
-  ///Annoying - UT solves in reverse order;
-  DenseVector<T> tmp(N);
-  for(int i = 0; i < N; ++i)
-    tmp[i] = evals[N-i-1];
-  evals = tmp;
-  //
-  UTeigenvectors(H, trows, evals, evecs);
-  //UTSymmEigenvectors(H, trows, evals, evecs);
-  for(int i = 0; i < evals.size(); ++i)
-  {
-    evecs[i] = P * evecs[i];
-    normalize(evecs[i]);
-    evals[i] = evals[i] * Hnorm;
-  }
-  // // FIXME this is to test
-  // Hin.write("evecs3", evecs);
-  // Hin.write("evals3", evals);
-  // // check rsd
-  // for(int i = 0; i < M; i++) {
-  //   vector<T> Aevec = Hin * evecs[i];
-  //   RealD norm2(0.);
-  //   for(int j = 0; j < M; j++) {
-  //     norm2 += (Aevec[j] - evals[i] * evecs[i][j]) * (Aevec[j] - evals[i] * evecs[i][j]);
-  //   }
-  // }
-  return tot_it;
-}
-
-template <class T>
-void Hess(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
-
-  /**
-  turn a matrix A =
-  x  x  x  x  x
-  x  x  x  x  x
-  x  x  x  x  x
-  x  x  x  x  x
-  x  x  x  x  x
-  into
-  x  x  x  x  x
-  x  x  x  x  x
-  0  x  x  x  x
-  0  0  x  x  x
-  0  0  0  x  x
-  with householder rotations
-  Slow.
-  */
-  int N ; SizeSquare(A,N);
-  DenseVector<T > p; Resize(p,N); Fill(p,0);
-
-  for(int k=start;k<N-2;k++){
-    //cerr << "hess" << k << std::endl;
-    DenseVector<T > ck,v; Resize(ck,N-k-1); Resize(v,N-k-1);
-    for(int i=k+1;i<N;i++){ck[i-k-1] = A(i,k);}  ///kth column
-    normalize(ck);    ///Normalization cancels in PHP anyway
-    T beta;
-    Householder_vector<T >(ck, 0, ck.size()-1, v, beta);  ///Householder vector
-    Householder_mult<T>(A,v,beta,start,k+1,N-1,0);  ///A -> PA
-    Householder_mult<T >(A,v,beta,start,k+1,N-1,1);  ///PA -> PAP^H
-    ///Accumulate eigenvector
-    Householder_mult<T >(Q,v,beta,start,k+1,N-1,1);  ///Q -> QP^H
-  }
-  /*for(int l=0;l<N-2;l++){
-    for(int k=l+2;k<N;k++){
-    A(0,k,l);
-    }
-    }*/
-}
-
-template <class T>
-void Tri(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
-///Tridiagonalize a matrix
-  int N; SizeSquare(A,N);
-  Hess(A,Q,start);
-  /*for(int l=0;l<N-2;l++){
-    for(int k=l+2;k<N;k++){
-    A(0,l,k);
-    }
-    }*/
-}
-
-template <class T>
-void ForceTridiagonal(DenseMatrix<T> &A){
-///Tridiagonalize a matrix
-  int N ; SizeSquare(A,N);
-  for(int l=0;l<N-2;l++){
-    for(int k=l+2;k<N;k++){
-      A[l][k]=0;
-      A[k][l]=0;
-    }
-  }
-}
-
-template <class T>
-int my_SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
-  ///Solve a symmetric eigensystem, not necessarily in tridiagonal form
-  int N; SizeSquare(Ain,N);
-  DenseMatrix<T > A; A = Ain;
-  DenseMatrix<T > Q; Resize(Q,N,N); Unity(Q);
-  Tri(A,Q,0);
-  int it = my_Wilkinson<T>(A, evals, evecs, small);
-  for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
-  return it;
-}
-
-
-template <class T>
-int Wilkinson(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
-  return my_Wilkinson(Ain, evals, evecs, small);
-}
-
-template <class T>
-int SymmEigensystem(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
-  return my_SymmEigensystem(Ain, evals, evecs, small);
-}
-
-template <class T>
-int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
-///Solve a general eigensystem, not necessarily in tridiagonal form
-  int N = Ain.dim;
-  DenseMatrix<T > A(N); A = Ain;
-  DenseMatrix<T > Q(N);Q.Unity();
-  Hess(A,Q,0);
-  int it = QReigensystem<T>(A, evals, evecs, small);
-  for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
-  return it;
-}
-
-}
-#endif
diff --git a/lib/algorithms/densematrix/Householder.h b/lib/algorithms/densematrix/Householder.h
deleted file mode 100644
index 0c6b7d0b..00000000
--- a/lib/algorithms/densematrix/Householder.h
+++ /dev/null
@@ -1,242 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/Householder.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef HOUSEHOLDER_H
-#define HOUSEHOLDER_H
-
-#define TIMER(A) std::cout << GridLogMessage << __FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
-#define ENTER()  std::cout << GridLogMessage << "ENTRY "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
-#define LEAVE()  std::cout << GridLogMessage << "EXIT  "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
-
-#include <cstdlib>
-#include <string>
-#include <cmath>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <fstream>
-#include <complex>
-#include <algorithm>
-
-namespace Grid {
-/** Comparison function for finding the max element in a vector **/
-template <class T> bool cf(T i, T j) { 
-  return abs(i) < abs(j); 
-}
-
-/** 
-	Calculate a real Givens angle 
- **/
-template <class T> inline void Givens_calc(T y, T z, T &c, T &s){
-
-  RealD mz = (RealD)abs(z);
-  
-  if(mz==0.0){
-    c = 1; s = 0;
-  }
-  if(mz >= (RealD)abs(y)){
-    T t = -y/z;
-    s = (T)1.0 / sqrt ((T)1.0 + t * t);
-    c = s * t;
-  } else {
-    T t = -z/y;
-    c = (T)1.0 / sqrt ((T)1.0 + t * t);
-    s = c * t;
-  }
-}
-
-template <class T> inline void Givens_mult(DenseMatrix<T> &A,  int i, int k, T c, T s, int dir)
-{
-  int q ; SizeSquare(A,q);
-
-  if(dir == 0){
-    for(int j=0;j<q;j++){
-      T nu = A[i][j];
-      T w  = A[k][j];
-      A[i][j] = (c*nu + s*w);
-      A[k][j] = (-s*nu + c*w);
-    }
-  }
-
-  if(dir == 1){
-    for(int j=0;j<q;j++){
-      T nu = A[j][i];
-      T w  = A[j][k];
-      A[j][i] = (c*nu - s*w);
-      A[j][k] = (s*nu + c*w);
-    }
-  }
-}
-
-/**
-	from input = x;
-	Compute the complex Householder vector, v, such that
-	P = (I - b v transpose(v) )
-	b = 2/v.v
-
-	P | x |    | x | k = 0
-	| x |    | 0 | 
-	| x | =  | 0 |
-	| x |    | 0 | j = 3
-	| x |	   | x |
-
-	These are the "Unreduced" Householder vectors.
-
- **/
-template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, DenseVector<T> &v, T &beta)
-{
-  int N ; Size(input,N);
-  T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T> );
-
-  if(abs(m) > 0.0){
-    T alpha = 0;
-
-    for(int i=k; i<j+1; i++){
-      v[i] = input[i]/m;
-      alpha = alpha + v[i]*conj(v[i]);
-    }
-    alpha = sqrt(alpha);
-    beta = (T)1.0/(alpha*(alpha + abs(v[k]) ));
-
-    if(abs(v[k]) > 0.0)  v[k] = v[k] + (v[k]/abs(v[k]))*alpha;
-    else                 v[k] = -alpha;
-  } else{
-    for(int i=k; i<j+1; i++){
-      v[i] = 0.0;
-    } 
-  }
-}
-
-/**
-	from input = x;
-	Compute the complex Householder vector, v, such that
-	P = (I - b v transpose(v) )
-	b = 2/v.v
-
-	Px = alpha*e_dir
-
-	These are the "Unreduced" Householder vectors.
-
- **/
-
-template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, int dir, DenseVector<T> &v, T &beta)
-{
-  int N = input.size();
-  T m = *max_element(input.begin() + k, input.begin() + j + 1, cf);
-  
-  if(abs(m) > 0.0){
-    T alpha = 0;
-
-    for(int i=k; i<j+1; i++){
-      v[i] = input[i]/m;
-      alpha = alpha + v[i]*conj(v[i]);
-    }
-    
-    alpha = sqrt(alpha);
-    beta = 1.0/(alpha*(alpha + abs(v[dir]) ));
-	
-    if(abs(v[dir]) > 0.0) v[dir] = v[dir] + (v[dir]/abs(v[dir]))*alpha;
-    else                  v[dir] = -alpha;
-  }else{
-    for(int i=k; i<j+1; i++){
-      v[i] = 0.0;
-    } 
-  }
-}
-
-/**
-	Compute the product PA if trans = 0
-	AP if trans = 1
-	P = (I - b v transpose(v) )
-	b = 2/v.v
-	start at element l of matrix A
-	v is of length j - k + 1 of v are nonzero
- **/
-
-template <class T> inline void Householder_mult(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int k, int j, int trans)
-{
-  int N ; SizeSquare(A,N);
-
-  if(abs(beta) > 0.0){
-    for(int p=l; p<N; p++){
-      T s = 0;
-      if(trans==0){
-	for(int i=k;i<j+1;i++) s += conj(v[i-k])*A[i][p];
-	s *= beta;
-	for(int i=k;i<j+1;i++){ A[i][p] = A[i][p]-s*conj(v[i-k]);}
-      } else {
-	for(int i=k;i<j+1;i++){ s += conj(v[i-k])*A[p][i];}
-	s *= beta;
-	for(int i=k;i<j+1;i++){ A[p][i]=A[p][i]-s*conj(v[i-k]);}
-      }
-    }
-  }
-}
-
-/**
-	Compute the product PA if trans = 0
-	AP if trans = 1
-	P = (I - b v transpose(v) )
-	b = 2/v.v
-	start at element l of matrix A
-	v is of length j - k + 1 of v are nonzero
-	A is tridiagonal
- **/
-template <class T> inline void Householder_mult_tri(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int M, int k, int j, int trans)
-{
-  if(abs(beta) > 0.0){
-
-    int N ; SizeSquare(A,N);
-
-    DenseMatrix<T> tmp; Resize(tmp,N,N); Fill(tmp,0); 
-
-    T s;
-    for(int p=l; p<M; p++){
-      s = 0;
-      if(trans==0){
-	for(int i=k;i<j+1;i++) s = s + conj(v[i-k])*A[i][p];
-      }else{
-	for(int i=k;i<j+1;i++) s = s + v[i-k]*A[p][i];
-      }
-      s = beta*s;
-      if(trans==0){
-	for(int i=k;i<j+1;i++) tmp[i][p] = tmp(i,p) - s*v[i-k];
-      }else{
-	for(int i=k;i<j+1;i++) tmp[p][i] = tmp[p][i] - s*conj(v[i-k]);
-      }
-    }
-    for(int p=l; p<M; p++){
-      if(trans==0){
-	for(int i=k;i<j+1;i++) A[i][p] = A[i][p] + tmp[i][p];
-      }else{
-	for(int i=k;i<j+1;i++) A[p][i] = A[p][i] + tmp[p][i];
-      }
-    }
-  }
-}
-}
-#endif
diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
index 3aa54360..acd67592 100644
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -39,7 +39,9 @@ void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
                    int *info);
 #endif
 
-#include <Grid/algorithms/densematrix/DenseMatrix.h>
+template<class T> using DenseVector = std::vector<T>;
+
+//#include <Grid/algorithms/densematrix/DenseMatrix.h>
 #include <Grid/algorithms/iterative/EigenSort.h>
 
 namespace Grid {
@@ -47,104 +49,85 @@ namespace Grid {
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
-
-
 template<class Field> 
-    class ImplicitlyRestartedLanczos {
+class ImplicitlyRestartedLanczos {
 
-    const RealD small = 1.0e-16;
 public:       
-    int lock;
-    int get;
-    int Niter;
-    int converged;
+  int Niter;   // Max iterations
+  int Nstop;   // Number of evecs checked for convergence
+  int Nk;      // Number of converged sought
+  int Nm;      // Nm -- total number of vectors
 
-    int Nstop;   // Number of evecs checked for convergence
-    int Nk;      // Number of converged sought
-    int Np;      // Np -- Number of spare vecs in kryloc space
-    int Nm;      // Nm -- total number of vectors
+  RealD eresid;
 
-    RealD eresid;
+  ////////////////////////////////////
+  // Embedded objects
+  ////////////////////////////////////
+           SortEigen<Field> _sort;
+  LinearOperatorBase<Field> &_Linop;
+    OperatorFunction<Field> &_poly;
 
-    SortEigen<Field> _sort;
+  /////////////////////////
+  // Constructor
+  /////////////////////////
+ ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
+			    OperatorFunction<Field> & poly,   // polynmial
+			    int _Nstop, // sought vecs
+			    int _Nk,    // sought vecs
+			    int _Nm,    // total vecs
+			    RealD _eresid, // resid in lmdue deficit 
+			    int _Niter) : // Max iterations
+    _Linop(Linop),    _poly(poly),
+    Nstop(_Nstop), Nk(_Nk), Nm(_Nm),
+    eresid(_eresid),  Niter(_Niter)  { };
 
-//    GridCartesian &_fgrid;
-
-    LinearOperatorBase<Field> &_Linop;
-
-    OperatorFunction<Field>   &_poly;
-
-    /////////////////////////
-    // Constructor
-    /////////////////////////
-    void init(void){};
-    void Abort(int ff, DenseVector<RealD> &evals,  DenseVector<DenseVector<RealD> > &evecs);
-
-    ImplicitlyRestartedLanczos(
-				LinearOperatorBase<Field> &Linop, // op
-			       OperatorFunction<Field> & poly,   // polynmial
-			       int _Nstop, // sought vecs
-			       int _Nk, // sought vecs
-			       int _Nm, // spare vecs
-			       RealD _eresid, // resid in lmdue deficit 
-			       int _Niter) : // Max iterations
-      _Linop(Linop),
-      _poly(poly),
-      Nstop(_Nstop),
-      Nk(_Nk),
-      Nm(_Nm),
-      eresid(_eresid),
-      Niter(_Niter)
-    { 
-      Np = Nm-Nk; assert(Np>0);
-    };
-
-    ImplicitlyRestartedLanczos(
-				LinearOperatorBase<Field> &Linop, // op
+#if 0
+    ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
 			       OperatorFunction<Field> & poly,   // polynmial
 			       int _Nk, // sought vecs
-			       int _Nm, // spare vecs
+			       int _Nm, // total vecs
 			       RealD _eresid, // resid in lmdue deficit 
 			       int _Niter) : // Max iterations
-      _Linop(Linop),
-      _poly(poly),
-      Nstop(_Nk),
-      Nk(_Nk),
-      Nm(_Nm),
-      eresid(_eresid),
-      Niter(_Niter)
-    { 
-      Np = Nm-Nk; assert(Np>0);
-    };
+    _Linop(Linop),      _poly(poly),
+    Nstop(_Nk), Nk(_Nk), Nm(_Nm),      
+    eresid(_eresid),      Niter(_Niter) { };
+#endif
 
-    /////////////////////////
-    // Sanity checked this routine (step) against Saad.
-    /////////////////////////
-    void RitzMatrix(DenseVector<Field>& evec,int k){
+#if 0
+    void calc(DenseVector<RealD>& eval,
+	      DenseVector<Field>& evec,
+	      const Field& src,
+	      int& Nconv);
 
-      if(1) return;
+    void step(DenseVector<RealD>& lmd,
+	      DenseVector<RealD>& lme, 
+	      DenseVector<Field>& evec,
+	      Field& w,int Nm,int k);
 
-      GridBase *grid = evec[0]._grid;
-      Field w(grid);
-      std::cout << "RitzMatrix "<<std::endl;
-      for(int i=0;i<k;i++){
-	_poly(_Linop,evec[i],w);
-	std::cout << "["<<i<<"] ";
-	for(int j=0;j<k;j++){
-	  ComplexD in = innerProduct(evec[j],w);
-	  if ( fabs((double)i-j)>1 ) { 
-	    if (abs(in) >1.0e-9 )  { 
-	      std::cout<<"oops"<<std::endl;
-	      abort();
-	    } else 
-	      std::cout << " 0 ";
-	  } else { 
-	    std::cout << " "<<in<<" ";
-	  }
-	}
-	std::cout << std::endl;
-      }
-    }
+    void setUnit_Qt(int Nm, DenseVector<RealD> &Qt) ;
+
+    static RealD normalise(Field& v) ;
+    void orthogonalize(Field& w, DenseVector<Field>& evec, int k);
+    void diagonalize(DenseVector<RealD>& lmd,
+		     DenseVector<RealD>& lme, 
+		     int N2, int N1,
+		     DenseVector<RealD>& Qt,
+		     GridBase *grid);
+
+    void qr_decomp(DenseVector<RealD>& lmd,
+		   DenseVector<RealD>& lme,
+		   int Nk, int Nm,
+		   DenseVector<RealD>& Qt,
+		   RealD Dsh, int kmin, int kmax);
+
+#ifdef USE_LAPACK
+    void diagonalize_lapack(DenseVector<RealD>& lmd,
+			    DenseVector<RealD>& lme, 
+			    int N1, int N2,
+			    DenseVector<RealD>& Qt,
+			    GridBase *grid);
+#endif
+#endif
 
 /* Saad PP. 195
 1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
@@ -161,12 +144,12 @@ public:
 	      DenseVector<Field>& evec,
 	      Field& w,int Nm,int k)
     {
+      const RealD tiny = 1.0e-20;
       assert( k< Nm );
       
       _poly(_Linop,evec[k],w);      // 3. wk:=Avk−βkv_{k−1}
-      if(k>0){
-	w -= lme[k-1] * evec[k-1];
-      }    
+
+      if(k>0) w -= lme[k-1] * evec[k-1];
 
       ComplexD zalph = innerProduct(evec[k],w); // 4. αk:=(wk,vk)
       RealD     alph = real(zalph);
@@ -176,29 +159,20 @@ public:
       RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
                                  // 7. vk+1 := wk/βk+1
 
-//	std::cout << "alpha = " << zalph << " beta "<<beta<<std::endl;
-      const RealD tiny = 1.0e-20;
-      if ( beta < tiny ) { 
-	std::cout << " beta is tiny "<<beta<<std::endl;
-     }
       lmd[k] = alph;
-      lme[k]  = beta;
+      lme[k] = beta;
 
-      if (k>0) { 
-	orthogonalize(w,evec,k); // orthonormalise
-      }
-      
-      if(k < Nm-1) evec[k+1] = w;
+      if ( k > 0 ) orthogonalize(w,evec,k); // orthonormalise
+      if ( k < Nm-1) evec[k+1] = w;
+
+      if ( beta < tiny ) std::cout << " beta is tiny "<<beta<<std::endl;
     }
-
-    void qr_decomp(DenseVector<RealD>& lmd,
-		   DenseVector<RealD>& lme,
-		   int Nk,
-		   int Nm,
-		   DenseVector<RealD>& Qt,
-		   RealD Dsh, 
-		   int kmin,
-		   int kmax)
+      
+    void qr_decomp(DenseVector<RealD>& lmd,   // Nm 
+		   DenseVector<RealD>& lme,   // Nm 
+		   int Nk, int Nm,
+		   DenseVector<RealD>& Qt,     // Nm x Nm matrix
+		   RealD Dsh, int kmin, int kmax)
     {
       int k = kmin-1;
       RealD x;
@@ -218,7 +192,7 @@ public:
       lme[k+1] = c*lme[k+1];
       
       for(int i=0; i<Nk; ++i){
-	RealD Qtmp1 = Qt[i+Nm*k  ];
+	RealD Qtmp1 = Qt[i+Nm*k    ];
 	RealD Qtmp2 = Qt[i+Nm*(k+1)];
 	Qt[i+Nm*k    ] = c*Qtmp1 - s*Qtmp2;
 	Qt[i+Nm*(k+1)] = s*Qtmp1 + c*Qtmp2; 
@@ -254,92 +228,88 @@ public:
       }
     }
 
+
 #ifdef USE_LAPACK
     void diagonalize_lapack(DenseVector<RealD>& lmd,
-		     DenseVector<RealD>& lme, 
-		     int N1,
-		     int N2,
-		     DenseVector<RealD>& Qt,
-		     GridBase *grid){
-  const int size = Nm;
-//  tevals.resize(size);
-//  tevecs.resize(size);
-  int NN = N1;
-  double evals_tmp[NN];
-  double evec_tmp[NN][NN];
-  memset(evec_tmp[0],0,sizeof(double)*NN*NN);
-//  double AA[NN][NN];
-  double DD[NN];
-  double EE[NN];
-  for (int i = 0; i< NN; i++)
-    for (int j = i - 1; j <= i + 1; j++)
-      if ( j < NN && j >= 0 ) {
-        if (i==j) DD[i] = lmd[i];
-        if (i==j) evals_tmp[i] = lmd[i];
-        if (j==(i-1)) EE[j] = lme[j];
+			    DenseVector<RealD>& lme, 
+			    int N1,
+			    int N2,
+			    DenseVector<RealD>& Qt,
+			    GridBase *grid)
+    {
+      const int size = Nm;
+      int NN = N1;
+      double evals_tmp[NN];
+      double evec_tmp[NN][NN];
+      memset(evec_tmp[0],0,sizeof(double)*NN*NN);
+      double DD[NN];
+      double EE[NN];
+      for (int i = 0; i< NN; i++) {
+	for (int j = i - 1; j <= i + 1; j++) {
+	  if ( j < NN && j >= 0 ) {
+	    if (i==j) DD[i] = lmd[i];
+	    if (i==j) evals_tmp[i] = lmd[i];
+	    if (j==(i-1)) EE[j] = lme[j];
+	  }
+	}
       }
-  int evals_found;
-  int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
-  int liwork =  3+NN*10 ;
-  int iwork[liwork];
-  double work[lwork];
-  int isuppz[2*NN];
-  char jobz = 'V'; // calculate evals & evecs
-  char range = 'I'; // calculate all evals
-  //    char range = 'A'; // calculate all evals
-  char uplo = 'U'; // refer to upper half of original matrix
-  char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
-  int ifail[NN];
-  int info;
-//  int total = QMP_get_number_of_nodes();
-//  int node = QMP_get_node_number();
-//  GridBase *grid = evec[0]._grid;
-  int total = grid->_Nprocessors;
-  int node = grid->_processor;
-  int interval = (NN/total)+1;
-  double vl = 0.0, vu = 0.0;
-  int il = interval*node+1 , iu = interval*(node+1);
-  if (iu > NN)  iu=NN;
-  double tol = 0.0;
-    if (1) {
-      memset(evals_tmp,0,sizeof(double)*NN);
-      if ( il <= NN){
-        printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu);
-        LAPACK_dstegr(&jobz, &range, &NN,
-            (double*)DD, (double*)EE,
-            &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
-            &tol, // tolerance
-            &evals_found, evals_tmp, (double*)evec_tmp, &NN,
-            isuppz,
-            work, &lwork, iwork, &liwork,
-            &info);
-        for (int i = iu-1; i>= il-1; i--){
-          printf("node=%d evals_found=%d evals_tmp[%d] = %g\n",node,evals_found, i - (il-1),evals_tmp[i - (il-1)]);
-          evals_tmp[i] = evals_tmp[i - (il-1)];
-          if (il>1) evals_tmp[i-(il-1)]=0.;
-          for (int j = 0; j< NN; j++){
-            evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
-            if (il>1) evec_tmp[i-(il-1)][j]=0.;
-          }
-        }
+      int evals_found;
+      int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
+      int liwork =  3+NN*10 ;
+      int iwork[liwork];
+      double work[lwork];
+      int isuppz[2*NN];
+      char jobz = 'V'; // calculate evals & evecs
+      char range = 'I'; // calculate all evals
+      //    char range = 'A'; // calculate all evals
+      char uplo = 'U'; // refer to upper half of original matrix
+      char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
+      int ifail[NN];
+      int info;
+      int total = grid->_Nprocessors;
+      int node  = grid->_processor;
+      int interval = (NN/total)+1;
+      double vl = 0.0, vu = 0.0;
+      int il = interval*node+1 , iu = interval*(node+1);
+      if (iu > NN)  iu=NN;
+      double tol = 0.0;
+      if (1) {
+	memset(evals_tmp,0,sizeof(double)*NN);
+	if ( il <= NN){
+	  LAPACK_dstegr(&jobz, &range, &NN,
+			(double*)DD, (double*)EE,
+			&vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
+			&tol, // tolerance
+			&evals_found, evals_tmp, (double*)evec_tmp, &NN,
+			isuppz,
+			work, &lwork, iwork, &liwork,
+			&info);
+	  for (int i = iu-1; i>= il-1; i--){
+	    evals_tmp[i] = evals_tmp[i - (il-1)];
+	    if (il>1) evals_tmp[i-(il-1)]=0.;
+	    for (int j = 0; j< NN; j++){
+	      evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
+	      if (il>1) evec_tmp[i-(il-1)][j]=0.;
+	    }
+	  }
+	}
+	{
+	  grid->GlobalSumVector(evals_tmp,NN);
+	  grid->GlobalSumVector((double*)evec_tmp,NN*NN);
+	}
+      } 
+      // cheating a bit.
+      // It is better to sort instead of just reversing it, 
+      // but the document of the routine says evals are sorted in increasing order. 
+      // qr gives evals in decreasing order.
+      for(int i=0;i<NN;i++){
+	for(int j=0;j<NN;j++)
+	  Qt[(NN-1-i)*N2+j]=evec_tmp[i][j];
+	lmd [NN-1-i]=evals_tmp[i];
       }
-      {
-//        QMP_sum_double_array(evals_tmp,NN);
-//        QMP_sum_double_array((double *)evec_tmp,NN*NN);
-         grid->GlobalSumVector(evals_tmp,NN);
-         grid->GlobalSumVector((double*)evec_tmp,NN*NN);
-      }
-    } 
-// cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
-  for(int i=0;i<NN;i++){
-    for(int j=0;j<NN;j++)
-      Qt[(NN-1-i)*N2+j]=evec_tmp[i][j];
-      lmd [NN-1-i]=evals_tmp[i];
-  }
-}
+    }
 #endif
 
-
     void diagonalize(DenseVector<RealD>& lmd,
 		     DenseVector<RealD>& lme, 
 		     int N2,
@@ -354,24 +324,23 @@ public:
     if(!check_lapack)
 	return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid);
 
-	DenseVector <RealD> lmd2(N1);
-	DenseVector <RealD> lme2(N1);
-	DenseVector<RealD> Qt2(N1*N1);
-         for(int k=0; k<N1; ++k){
-	    lmd2[k] = lmd[k];
-	    lme2[k] = lme[k];
-	  }
-         for(int k=0; k<N1*N1; ++k)
-	Qt2[k] = Qt[k];
-
-//	diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
+    DenseVector <RealD> lmd2(N1);
+    DenseVector <RealD> lme2(N1);
+    DenseVector<RealD> Qt2(N1*N1);
+    for(int k=0; k<N1; ++k){
+      lmd2[k] = lmd[k];
+      lme2[k] = lme[k];
+    }
+    for(int k=0; k<N1*N1; ++k){
+      Qt2[k] = Qt[k];
+    }
 #endif
 
       int Niter = 100*N1;
       int kmin = 1;
       int kmax = N2;
-      // (this should be more sophisticated)
 
+      // (this should be more sophisticated)
       for(int iter=0; iter<Niter; ++iter){
 
 	// determination of 2x2 leading submatrix
@@ -393,21 +362,17 @@ public:
 	}
 	Niter = iter;
 #ifdef USE_LAPACK
-    if(check_lapack){
-	const double SMALL=1e-8;
-	diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid);
-	DenseVector <RealD> lmd3(N2);
-         for(int k=0; k<N2; ++k) lmd3[k]=lmd[k];
-        _sort.push(lmd3,N2);
-        _sort.push(lmd2,N2);
-         for(int k=0; k<N2; ++k){
+	if(check_lapack){
+	  const double SMALL=1e-8;
+	  diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid);
+	  DenseVector <RealD> lmd3(N2);
+	  for(int k=0; k<N2; ++k) lmd3[k]=lmd[k];
+	  _sort.push(lmd3,N2);
+	  _sort.push(lmd2,N2);
+	  for(int k=0; k<N2; ++k){
 	    if (fabs(lmd2[k] - lmd3[k]) >SMALL)  std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl;
-//	    if (fabs(lme2[k] - lme[k]) >SMALL)  std::cout <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl;
 	  }
-         for(int k=0; k<N1*N1; ++k){
-//	    if (fabs(Qt2[k] - Qt[k]) >SMALL)  std::cout <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl;
 	}
-    }
 #endif
 	return;
 
@@ -424,7 +389,6 @@ public:
       abort();
     }
 
-#if 1
     static RealD normalise(Field& v) 
     {
       RealD nn = norm2(v);
@@ -457,6 +421,7 @@ public:
       normalise(w);
     }
 
+
     void setUnit_Qt(int Nm, DenseVector<RealD> &Qt) {
       for(int i=0; i<Qt.size(); ++i) Qt[i] = 0.0;
       for(int k=0; k<Nm; ++k) Qt[k + k*Nm] = 1.0;
@@ -488,10 +453,11 @@ until convergence
 	GridBase *grid = evec[0]._grid;
 	assert(grid == src._grid);
 
-	std::cout << " -- Nk = " << Nk << " Np = "<< Np << std::endl;
-	std::cout << " -- Nm = " << Nm << std::endl;
-	std::cout << " -- size of eval   = " << eval.size() << std::endl;
-	std::cout << " -- size of evec  = " << evec.size() << std::endl;
+	std::cout << " -- seek   Nk    = " << Nk    <<" vectors"<< std::endl;
+	std::cout << " -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
+	std::cout << " -- total  Nm    = " << Nm    <<" vectors"<< std::endl;
+	std::cout << " -- size of eval = " << eval.size() << std::endl;
+	std::cout << " -- size of evec = " << evec.size() << std::endl;
 	
 	assert(Nm == evec.size() && Nm == eval.size());
 	
@@ -514,39 +480,25 @@ until convergence
 	RealD beta_k;
   
 	// Set initial vector
-	// (uniform vector) Why not src??
-	//	evec[0] = 1.0;
 	evec[0] = src;
 	std:: cout <<"norm2(src)= " << norm2(src)<<std::endl;
-// << src._grid  << std::endl;
+
 	normalise(evec[0]);
 	std:: cout <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
-// << evec[0]._grid << std::endl;
 	
 	// Initial Nk steps
 	for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
-//	std:: cout <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl;
-//	std:: cout <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl;
-	RitzMatrix(evec,Nk);
-	for(int k=0; k<Nk; ++k){
-//	std:: cout <<"eval " << k << " " <<eval[k] << std::endl;
-//	std:: cout <<"lme " << k << " " << lme[k] << std::endl;
-	}
 
 	// Restarting loop begins
-	for(int iter = 0; iter<Niter; ++iter){
+	int iter;
+	for(iter = 0; iter<Niter; ++iter){
 
 	  std::cout<<"\n Restart iteration = "<< iter << std::endl;
 
-	  // 
-	  // Rudy does a sort first which looks very different. Getting fed up with sorting out the algo defs.
-	  // We loop over 
-	  //
 	  for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
+
 	  f *= lme[Nm-1];
 
-	  RitzMatrix(evec,k2);
-	  
 	  // getting eigenvalues
 	  for(int k=0; k<Nm; ++k){
 	    eval2[k] = eval[k+k1-1];
@@ -561,10 +513,9 @@ until convergence
 	  // Implicitly shifted QR transformations
 	  setUnit_Qt(Nm,Qt);
 	  for(int ip=k2; ip<Nm; ++ip){ 
-	std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
+	    //	    std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
 	    qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
-		
-	}
+	  }
     
 	  for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
 	  
@@ -599,18 +550,14 @@ until convergence
 	  
 	  for(int j = 0; j<Nk; ++j){
 	    for(int k = 0; k<Nk; ++k){
-	    B[j].checkerboard = evec[k].checkerboard;
+	      B[j].checkerboard = evec[k].checkerboard;
 	      B[j] += Qt[k+j*Nm] * evec[k];
 	    }
-//	    std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl;
 	  }
-//	_sort.push(eval2,B,Nk);
 
 	  Nconv = 0;
-	  //	  std::cout << std::setiosflags(std::ios_base::scientific);
 	  for(int i=0; i<Nk; ++i){
 
-//	    _poly(_Linop,B[i],v);
 	    _Linop.HermOp(B[i],v);
 	    
 	    RealD vnum = real(innerProduct(B[i],v)); // HermOp.
@@ -624,15 +571,13 @@ until convergence
 	    std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
 	    std::cout <<" |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
 	    
-	// change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
+	    // change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
 	    if((vv<eresid*eresid) && (i == Nconv) ){
 	      Iconv[Nconv] = i;
 	      ++Nconv;
 	    }
 
 	  }  // i-loop end
-	  //	  std::cout << std::resetiosflags(std::ios_base::scientific);
-
 
 	  std::cout<<" #modes converged: "<<Nconv<<std::endl;
 
@@ -655,556 +600,10 @@ until convergence
       _sort.push(eval,evec,Nconv);
 
       std::cout << "\n Converged\n Summary :\n";
-      std::cout << " -- Iterations  = "<< Nconv  << "\n";
+      std::cout << " -- Iterations  = "<< iter   << "\n";
       std::cout << " -- beta(k)     = "<< beta_k << "\n";
       std::cout << " -- Nconv       = "<< Nconv  << "\n";
      }
-
-    /////////////////////////////////////////////////
-    // Adapted from Rudy's lanczos factor routine
-    /////////////////////////////////////////////////
-    int Lanczos_Factor(int start, int end,  int cont,
-		       DenseVector<Field> & bq, 
-		       Field &bf,
-		       DenseMatrix<RealD> &H){
-      
-      GridBase *grid = bq[0]._grid;
-
-      RealD beta;  
-      RealD sqbt;  
-      RealD alpha;
-
-      for(int i=start;i<Nm;i++){
-	for(int j=start;j<Nm;j++){
-	  H[i][j]=0.0;
-	}
-      }
-
-      std::cout<<"Lanczos_Factor start/end " <<start <<"/"<<end<<std::endl;
-
-      // Starting from scratch, bq[0] contains a random vector and |bq[0]| = 1
-      int first;
-      if(start == 0){
-
-	std::cout << "start == 0\n"; //TESTING
-
-	_poly(_Linop,bq[0],bf);
-
-	alpha = real(innerProduct(bq[0],bf));//alpha =  bq[0]^dag A bq[0]
-
-	std::cout << "alpha = " << alpha << std::endl;
-	
-	bf = bf - alpha * bq[0];  //bf =  A bq[0] - alpha bq[0]
-
-	H[0][0]=alpha;
-
-	std::cout << "Set H(0,0) to " << H[0][0] << std::endl;
-
-	first = 1;
-
-      } else {
-
-	first = start;
-
-      }
-
-      // I think start==0 and cont==zero are the same. Test this
-      // If so I can drop "cont" parameter?
-      if( cont ) assert(start!=0);
-
-      if( start==0 ) assert(cont!=0);
-
-      if( cont){
-
-	beta = 0;sqbt = 0;
-
-	std::cout << "cont is true so setting beta to zero\n";
-
-      }	else {
-
-	beta = norm2(bf);
-	sqbt = sqrt(beta);
-
-	std::cout << "beta = " << beta << std::endl;
-      }
-
-      for(int j=first;j<end;j++){
-
-	std::cout << "Factor j " << j <<std::endl;
-
-	if(cont){ // switches to factoring; understand start!=0 and initial bf value is right.
-	  bq[j] = bf; cont = false;
-	}else{
-	  bq[j] = (1.0/sqbt)*bf ;
-
-	  H[j][j-1]=H[j-1][j] = sqbt;
-	}
-
-	_poly(_Linop,bq[j],bf);
-
-	bf = bf - (1.0/sqbt)*bq[j-1]; 	       //bf = A bq[j] - beta bq[j-1] // PAB this comment was incorrect in beta term??
-
-	alpha = real(innerProduct(bq[j],bf));  //alpha = bq[j]^dag A bq[j]
-
-	bf = bf - alpha*bq[j];                 //bf = A bq[j] - beta bq[j-1] - alpha bq[j]
-	RealD fnorm = norm2(bf);
-
-	RealD bck = sqrt( real( conjugate(alpha)*alpha ) + beta );
-
-	beta = fnorm;
-	sqbt = sqrt(beta);
-	std::cout << "alpha = " << alpha << " fnorm = " << fnorm << '\n';
-
-	///Iterative refinement of orthogonality V = [ bq[0]  bq[1]  ...  bq[M] ]
-	int re = 0;
-	// FIXME undefined params; how set in Rudy's code
-	int ref =0;
-	Real rho = 1.0e-8;
-
-	while( re == ref || (sqbt < rho * bck && re < 5) ){
-
-	  Field tmp2(grid);
-	  Field tmp1(grid);
-
-	  //bex = V^dag bf
-	  DenseVector<ComplexD> bex(j+1);
-	  for(int k=0;k<j+1;k++){
-	    bex[k] = innerProduct(bq[k],bf);
-	  }
-	  
-	  zero_fermion(tmp2);
-	  //tmp2 = V s
-	  for(int l=0;l<j+1;l++){
-	    RealD nrm = norm2(bq[l]);
-	    axpy(tmp1,0.0,bq[l],bq[l]); scale(tmp1,bex[l]); 	//tmp1 = V[j] bex[j]
-	    axpy(tmp2,1.0,tmp2,tmp1);					//tmp2 += V[j] bex[j]
-	  }
-
-	  //bf = bf - V V^dag bf.   Subtracting off any component in span { V[j] } 
-	  RealD btc = axpy_norm(bf,-1.0,tmp2,bf);
-	  alpha = alpha + real(bex[j]);	      sqbt = sqrt(real(btc));	      
-	  // FIXME is alpha real in RUDY's code?
-	  RealD nmbex = 0;for(int k=0;k<j+1;k++){nmbex = nmbex + real( conjugate(bex[k])*bex[k]  );}
-	  bck = sqrt( nmbex );
-	  re++;
-	}
-	std::cout << "Iteratively refined orthogonality, changes alpha\n";
-	if(re > 1) std::cout << "orthagonality refined " << re << " times" <<std::endl;
-	H[j][j]=alpha;
-      }
-
-      return end;
-    }
-
-    void EigenSort(DenseVector<double> evals,
-		   DenseVector<Field>  evecs){
-      int N= evals.size();
-      _sort.push(evals,evecs, evals.size(),N);
-    }
-
-    void ImplicitRestart(int TM, DenseVector<RealD> &evals,  DenseVector<DenseVector<RealD> > &evecs, DenseVector<Field> &bq, Field &bf, int cont)
-    {
-      std::cout << "ImplicitRestart begin. Eigensort starting\n";
-
-      DenseMatrix<RealD> H; Resize(H,Nm,Nm);
-
-      EigenSort(evals, evecs);
-
-      ///Assign shifts
-      int K=Nk;
-      int M=Nm;
-      int P=Np;
-      int converged=0;
-      if(K - converged < 4) P = (M - K-1); //one
-      //      DenseVector<RealD> shifts(P + shift_extra.size());
-      DenseVector<RealD> shifts(P);
-      for(int k = 0; k < P; ++k)
-	shifts[k] = evals[k]; 
-
-      /// Shift to form a new H and q
-      DenseMatrix<RealD> Q; Resize(Q,TM,TM);
-      Unity(Q);
-      Shift(Q, shifts); // H is implicitly passed in in Rudy's Shift routine
-
-      int ff = K;
-
-      /// Shifted H defines a new K step Arnoldi factorization
-      RealD  beta = H[ff][ff-1]; 
-      RealD  sig  = Q[TM - 1][ff - 1];
-      std::cout << "beta = " << beta << " sig = " << real(sig) <<std::endl;
-
-      std::cout << "TM = " << TM << " ";
-      std::cout << norm2(bq[0]) << " -- before" <<std::endl;
-
-      /// q -> q Q
-      times_real(bq, Q, TM);
-
-      std::cout << norm2(bq[0]) << " -- after " << ff <<std::endl;
-      bf =  beta* bq[ff] + sig* bf;
-
-      /// Do the rest of the factorization
-      ff = Lanczos_Factor(ff, M,cont,bq,bf,H);
-      
-      if(ff < M)
-	Abort(ff, evals, evecs);
-    }
-
-///Run the Eigensolver
-    void Run(int cont, DenseVector<Field> &bq, Field &bf, DenseVector<DenseVector<RealD> > & evecs,DenseVector<RealD> &evals)
-    {
-      init();
-
-      int M=Nm;
-
-      DenseMatrix<RealD> H; Resize(H,Nm,Nm);
-      Resize(evals,Nm);
-      Resize(evecs,Nm);
-
-      int ff = Lanczos_Factor(0, M, cont, bq,bf,H); // 0--M to begin with
-
-      if(ff < M) {
-	std::cout << "Krylov: aborting ff "<<ff <<" "<<M<<std::endl;
-	abort(); // Why would this happen?
-      }
-
-      int itcount = 0;
-      bool stop = false;
-
-      for(int it = 0; it < Niter && (converged < Nk); ++it) {
-
-	std::cout << "Krylov: Iteration --> " << it << std::endl;
-	int lock_num = lock ? converged : 0;
-	DenseVector<RealD> tevals(M - lock_num );
-	DenseMatrix<RealD> tevecs; Resize(tevecs,M - lock_num,M - lock_num);
-	  
-	//check residual of polynominal 
-	TestConv(H,M, tevals, tevecs);
-
-	if(converged >= Nk)
-	    break;
-
-	ImplicitRestart(ff, tevals,tevecs,H);
-      }
-      Wilkinson<RealD>(H, evals, evecs, small); 
-      //      Check();
-
-      std::cout << "Done  "<<std::endl;
-
-    }
-
-   ///H - shift I = QR; H = Q* H Q
-    void Shift(DenseMatrix<RealD> & H,DenseMatrix<RealD> &Q, DenseVector<RealD> shifts) {
-      
-      int P; Size(shifts,P);
-      int M; SizeSquare(Q,M);
-
-      Unity(Q);
-
-      int lock_num = lock ? converged : 0;
-
-      RealD t_Househoulder_vector(0.0);
-      RealD t_Househoulder_mult(0.0);
-
-      for(int i=0;i<P;i++){
-
-	RealD x, y, z;
-	DenseVector<RealD> ck(3), v(3);
-	  
-	x = H[lock_num+0][lock_num+0]-shifts[i];
-	y = H[lock_num+1][lock_num+0];
-	ck[0] = x; ck[1] = y; ck[2] = 0; 
-
-	normalise(ck);	///Normalization cancels in PHP anyway
-	RealD beta;
-
-	Householder_vector<RealD>(ck, 0, 2, v, beta);
-	Householder_mult<RealD>(H,v,beta,0,lock_num+0,lock_num+2,0);
-	Householder_mult<RealD>(H,v,beta,0,lock_num+0,lock_num+2,1);
-	///Accumulate eigenvector
-	Householder_mult<RealD>(Q,v,beta,0,lock_num+0,lock_num+2,1);
-	  
-	int sw = 0;
-	for(int k=lock_num+0;k<M-2;k++){
-
-	  x = H[k+1][k]; 
-	  y = H[k+2][k]; 
-	  z = (RealD)0.0;
-	  if(k+3 <= M-1){
-	    z = H[k+3][k];
-	  }else{
-	    sw = 1; v[2] = 0.0;
-	  }
-
-	  ck[0] = x; ck[1] = y; ck[2] = z;
-
-	  normalise(ck);
-
-	  Householder_vector<RealD>(ck, 0, 2-sw, v, beta);
-	  Householder_mult<RealD>(H,v, beta,0,k+1,k+3-sw,0);
-	  Householder_mult<RealD>(H,v, beta,0,k+1,k+3-sw,1);
-	  ///Accumulate eigenvector
-	  Householder_mult<RealD>(Q,v, beta,0,k+1,k+3-sw,1);
-	}
-      }
-    }
-
-    void TestConv(DenseMatrix<RealD> & H,int SS, 
-		  DenseVector<Field> &bq, Field &bf,
-		  DenseVector<RealD> &tevals, DenseVector<DenseVector<RealD> > &tevecs, 
-		  int lock, int converged)
-    {
-      std::cout << "Converged " << converged << " so far." << std::endl;
-      int lock_num = lock ? converged : 0;
-      int M = Nm;
-
-      ///Active Factorization
-      DenseMatrix<RealD> AH; Resize(AH,SS - lock_num,SS - lock_num );
-
-      AH = GetSubMtx(H,lock_num, SS, lock_num, SS);
-
-      int NN=tevals.size();
-      int AHsize=SS-lock_num;
-
-      RealD small=1.0e-16;
-      Wilkinson<RealD>(AH, tevals, tevecs, small);
-
-      EigenSort(tevals, tevecs);
-
-      RealD resid_nrm=  norm2(bf);
-
-      if(!lock) converged = 0;
-#if 0
-      for(int i = SS - lock_num - 1; i >= SS - Nk && i >= 0; --i){
-
-	RealD diff = 0;
-	diff = abs( tevecs[i][Nm - 1 - lock_num] ) * resid_nrm;
-
-	std::cout << "residual estimate " << SS-1-i << " " << diff << " of (" << tevals[i] << ")" << std::endl;
-
-	if(diff < converged) {
-
-	  if(lock) {
-	    
-	    DenseMatrix<RealD> Q; Resize(Q,M,M);
-	    bool herm = true; 
-
-	    Lock(H, Q, tevals[i], converged, small, SS, herm);
-
-	    times_real(bq, Q, bq.size());
-	    bf = Q[M - 1][M - 1]* bf;
-	    lock_num++;
-	  }
-	  converged++;
-	  std::cout << " converged on eval " << converged << " of " << Nk << std::endl;
-	} else {
-	  break;
-	}
-      }
-#endif
-      std::cout << "Got " << converged << " so far " <<std::endl;	
-    }
-
-    ///Check
-    void Check(DenseVector<RealD> &evals,
-	       DenseVector<DenseVector<RealD> > &evecs) {
-
-      DenseVector<RealD> goodval(this->get);
-
-      EigenSort(evals,evecs);
-
-      int NM = Nm;
-
-      DenseVector< DenseVector<RealD> > V; Size(V,NM);
-      DenseVector<RealD> QZ(NM*NM);
-
-      for(int i = 0; i < NM; i++){
-	for(int j = 0; j < NM; j++){
-	  // evecs[i][j];
-	}
-      }
-    }
-
-
-/**
-   There is some matrix Q such that for any vector y
-   Q.e_1 = y and Q is unitary.
-**/
-  template<class T>
-  static T orthQ(DenseMatrix<T> &Q, DenseVector<T> y){
-    int N = y.size();	//Matrix Size
-    Fill(Q,0.0);
-    T tau;
-    for(int i=0;i<N;i++){
-      Q[i][0]=y[i];
-    }
-    T sig = conj(y[0])*y[0];
-    T tau0 = abs(sqrt(sig));
-    
-    for(int j=1;j<N;j++){
-      sig += conj(y[j])*y[j]; 
-      tau = abs(sqrt(sig) ); 	
-
-      if(abs(tau0) > 0.0){
-	
-	T gam = conj( (y[j]/tau)/tau0 );
-	for(int k=0;k<=j-1;k++){  
-	  Q[k][j]=-gam*y[k];
-	}
-	Q[j][j]=tau0/tau;
-      } else {
-	Q[j-1][j]=1.0;
-      }
-      tau0 = tau;
-    }
-    return tau;
-  }
-
-/**
-	There is some matrix Q such that for any vector y
-	Q.e_k = y and Q is unitary.
-**/
-  template< class T>
-  static T orthU(DenseMatrix<T> &Q, DenseVector<T> y){
-    T tau = orthQ(Q,y);
-    SL(Q);
-    return tau;
-  }
-
-
-/**
-	Wind up with a matrix with the first con rows untouched
-
-say con = 2
-	Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st colum
-	and the matrix is upper hessenberg
-	and with f and Q appropriately modidied with Q is the arnoldi factorization
-
-**/
-
-template<class T>
-static void Lock(DenseMatrix<T> &H, 	// Hess mtx	
-		 DenseMatrix<T> &Q, 	// Lock Transform
-		 T val, 		// value to be locked
-		 int con, 	// number already locked
-		 RealD small,
-		 int dfg,
-		 bool herm)
-{	
-  //ForceTridiagonal(H);
-
-  int M = H.dim;
-  DenseVector<T> vec; Resize(vec,M-con);
-
-  DenseMatrix<T> AH; Resize(AH,M-con,M-con);
-  AH = GetSubMtx(H,con, M, con, M);
-
-  DenseMatrix<T> QQ; Resize(QQ,M-con,M-con);
-
-  Unity(Q);   Unity(QQ);
-  
-  DenseVector<T> evals; Resize(evals,M-con);
-  DenseMatrix<T> evecs; Resize(evecs,M-con,M-con);
-
-  Wilkinson<T>(AH, evals, evecs, small);
-
-  int k=0;
-  RealD cold = abs( val - evals[k]); 
-  for(int i=1;i<M-con;i++){
-    RealD cnew = abs( val - evals[i]);
-    if( cnew < cold ){k = i; cold = cnew;}
-  }
-  vec = evecs[k];
-
-  ComplexD tau;
-  orthQ(QQ,vec);
-  //orthQM(QQ,AH,vec);
-
-  AH = Hermitian(QQ)*AH;
-  AH = AH*QQ;
-
-  for(int i=con;i<M;i++){
-    for(int j=con;j<M;j++){
-      Q[i][j]=QQ[i-con][j-con];
-      H[i][j]=AH[i-con][j-con];
-    }
-  }
-
-  for(int j = M-1; j>con+2; j--){
-
-    DenseMatrix<T> U; Resize(U,j-1-con,j-1-con);
-    DenseVector<T> z; Resize(z,j-1-con); 
-    T nm = norm(z); 
-    for(int k = con+0;k<j-1;k++){
-      z[k-con] = conj( H(j,k+1) );
-    }
-    normalise(z);
-
-    RealD tmp = 0;
-    for(int i=0;i<z.size()-1;i++){tmp = tmp + abs(z[i]);}
-
-    if(tmp < small/( (RealD)z.size()-1.0) ){ continue;}	
-
-    tau = orthU(U,z);
-
-    DenseMatrix<T> Hb; Resize(Hb,j-1-con,M);	
-	
-    for(int a = 0;a<M;a++){
-      for(int b = 0;b<j-1-con;b++){
-	T sum = 0;
-	for(int c = 0;c<j-1-con;c++){
-	  sum += H[a][con+1+c]*U[c][b];
-	}//sum += H(a,con+1+c)*U(c,b);}
-	Hb[b][a] = sum;
-      }
-    }
-	
-    for(int k=con+1;k<j;k++){
-      for(int l=0;l<M;l++){
-	H[l][k] = Hb[k-1-con][l];
-      }
-    }//H(Hb[k-1-con][l] , l,k);}}
-
-    DenseMatrix<T> Qb; Resize(Qb,M,M);	
-	
-    for(int a = 0;a<M;a++){
-      for(int b = 0;b<j-1-con;b++){
-	T sum = 0;
-	for(int c = 0;c<j-1-con;c++){
-	  sum += Q[a][con+1+c]*U[c][b];
-	}//sum += Q(a,con+1+c)*U(c,b);}
-	Qb[b][a] = sum;
-      }
-    }
-	
-    for(int k=con+1;k<j;k++){
-      for(int l=0;l<M;l++){
-	Q[l][k] = Qb[k-1-con][l];
-      }
-    }//Q(Qb[k-1-con][l] , l,k);}}
-
-    DenseMatrix<T> Hc; Resize(Hc,M,M);	
-	
-    for(int a = 0;a<j-1-con;a++){
-      for(int b = 0;b<M;b++){
-	T sum = 0;
-	for(int c = 0;c<j-1-con;c++){
-	  sum += conj( U[c][a] )*H[con+1+c][b];
-	}//sum += conj( U(c,a) )*H(con+1+c,b);}
-	Hc[b][a] = sum;
-      }
-    }
-
-    for(int k=0;k<M;k++){
-      for(int l=con+1;l<j;l++){
-	H[l][k] = Hc[k][l-1-con];
-      }
-    }//H(Hc[k][l-1-con] , l,k);}}
-
-  }
-}
-#endif
-
-
  };
 
 }
diff --git a/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h b/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
index 118a8e25..3bcdc77a 100644
--- a/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
+++ b/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
@@ -102,7 +102,7 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
     FieldMetaData header;
     IldgReader _IldgReader;
     _IldgReader.open(config);
-    _IldgReader.readConfiguration(config,U,header);  // format from the header
+    _IldgReader.readConfiguration(U,header);  // format from the header
     _IldgReader.close();
 
     std::cout << GridLogMessage << "Read ILDG Configuration from " << config
diff --git a/tests/solver/Test_dwf_lanczos.cc b/tests/solver/Test_dwf_lanczos.cc
index bb978186..48cca378 100644
--- a/tests/solver/Test_dwf_lanczos.cc
+++ b/tests/solver/Test_dwf_lanczos.cc
@@ -54,7 +54,7 @@ int main (int argc, char ** argv)
   GridParallelRNG          RNG5rb(FrbGrid);  RNG5.SeedFixedIntegers(seeds5);
 
   LatticeGaugeField Umu(UGrid); 
-  SU3::TepidConfiguration(RNG4, Umu);
+  SU3::HotConfiguration(RNG4, Umu);
 
   std::vector<LatticeColourMatrix> U(4,UGrid);
   for(int mu=0;mu<Nd;mu++){

From 7e3528686080357933ff87400fadb181abfd8f35 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 21 Jun 2017 02:26:03 +0100
Subject: [PATCH 2/5] Simplified lanczos, added Eigen diagonalisation. Curious
 if we can deprecate dependency on BLAS. Will see when we get 48^3 running on
 our BG/Q port

---
 .../iterative/BlockConjugateGradient.h        |    7 +-
 lib/algorithms/iterative/EigenSort.h          |   81 --
 .../iterative/ImplicitlyRestartedLanczos.h    | 1074 +++++++++--------
 tests/solver/Test_dwf_lanczos.cc              |    9 +-
 4 files changed, 547 insertions(+), 624 deletions(-)
 delete mode 100644 lib/algorithms/iterative/EigenSort.h

diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h
index f8b83b1f..9418f63c 100644
--- a/lib/algorithms/iterative/BlockConjugateGradient.h
+++ b/lib/algorithms/iterative/BlockConjugateGradient.h
@@ -56,11 +56,8 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
   
   BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
-    : Tolerance(tol),
-    CGtype(cgtype),
-    blockDim(_Orthog),
-    MaxIterations(maxit),
-    ErrorOnNoConverge(err_on_no_conv){};
+    : Tolerance(tol), CGtype(cgtype),   blockDim(_Orthog),  MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv)
+  {};
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Thin QR factorisation (google it)
diff --git a/lib/algorithms/iterative/EigenSort.h b/lib/algorithms/iterative/EigenSort.h
deleted file mode 100644
index 23621544..00000000
--- a/lib/algorithms/iterative/EigenSort.h
+++ /dev/null
@@ -1,81 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/EigenSort.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_EIGENSORT_H
-#define GRID_EIGENSORT_H
-
-
-namespace Grid {
-    /////////////////////////////////////////////////////////////
-    // Eigen sorter to begin with
-    /////////////////////////////////////////////////////////////
-
-template<class Field>
-class SortEigen {
- private:
-  
-//hacking for testing for now
- private:
-  static bool less_lmd(RealD left,RealD right){
-    return left > right;
-  }  
-  static bool less_pair(std::pair<RealD,Field const*>& left,
-                        std::pair<RealD,Field const*>& right){
-    return left.first > (right.first);
-  }  
-  
-  
- public:
-
-  void push(DenseVector<RealD>& lmd,
-            DenseVector<Field>& evec,int N) {
-    DenseVector<Field> cpy(lmd.size(),evec[0]._grid);
-    for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
-    
-    DenseVector<std::pair<RealD, Field const*> > emod(lmd.size());    
-    for(int i=0;i<lmd.size();++i)
-      emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
-
-    partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
-
-    typename DenseVector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
-    for(int i=0;i<N;++i){
-      lmd[i]=it->first;
-      evec[i]=*(it->second);
-      ++it;
-    }
-  }
-  void push(DenseVector<RealD>& lmd,int N) {
-    std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
-  }
-  bool saturated(RealD lmd, RealD thrs) {
-    return fabs(lmd) > fabs(thrs);
-  }
-};
-
-}
-#endif
diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
index acd67592..571bf1b2 100644
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -7,7 +7,8 @@
     Copyright (C) 2015
 
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Chulwoo Jung
+Author: Guido Cossu
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -31,35 +32,71 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 #include <string.h> //memset
 
-#ifdef USE_LAPACK
-void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
-                   double *vl, double *vu, int *il, int *iu, double *abstol,
-                   int *m, double *w, double *z, int *ldz, int *isuppz,
-                   double *work, int *lwork, int *iwork, int *liwork,
-                   int *info);
-#endif
-
-template<class T> using DenseVector = std::vector<T>;
-
-//#include <Grid/algorithms/densematrix/DenseMatrix.h>
-#include <Grid/algorithms/iterative/EigenSort.h>
-
 namespace Grid {
 
+  enum IRLdiagonalisation { 
+    IRLdiagonaliseWithDSTEGR,
+    IRLdiagonaliseWithQR,
+    IRLdiagonaliseWithEigen
+  };
+  ////////////////////////////////////////////////////////////////////////////////
+  // Helper class for sorting the evalues AND evectors by Field
+  // Use pointer swizzle on vectors
+  ////////////////////////////////////////////////////////////////////////////////
+template<class Field>
+class SortEigen {
+ private:
+  static bool less_lmd(RealD left,RealD right){
+    return left > right;
+  }  
+  static bool less_pair(std::pair<RealD,Field const*>& left,
+                        std::pair<RealD,Field const*>& right){
+    return left.first > (right.first);
+  }  
+  
+ public:
+  void push(std::vector<RealD>& lmd,std::vector<Field>& evec,int N) {
+    
+    ////////////////////////////////////////////////////////////////////////
+    // PAB: FIXME: VERY VERY VERY wasteful: takes a copy of the entire vector set.
+    //    : The vector reorder should be done by pointer swizzle somehow
+    ////////////////////////////////////////////////////////////////////////
+    std::vector<Field> cpy(lmd.size(),evec[0]._grid);
+    for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
+    
+    std::vector<std::pair<RealD, Field const*> > emod(lmd.size());    
+
+    for(int i=0;i<lmd.size();++i)  emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
+
+    partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
+
+    typename std::vector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
+    for(int i=0;i<N;++i){
+      lmd[i]=it->first;
+      evec[i]=*(it->second);
+      ++it;
+    }
+  }
+  void push(std::vector<RealD>& lmd,int N) {
+    std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
+  }
+  bool saturated(RealD lmd, RealD thrs) {
+    return fabs(lmd) > fabs(thrs);
+  }
+};
+
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
 template<class Field> 
 class ImplicitlyRestartedLanczos {
-
-public:       
-  int Niter;   // Max iterations
-  int Nstop;   // Number of evecs checked for convergence
-  int Nk;      // Number of converged sought
-  int Nm;      // Nm -- total number of vectors
-
+private:       
+  int MaxIter;   // Max iterations
+  int Nstop;     // Number of evecs checked for convergence
+  int Nk;        // Number of converged sought
+  int Nm;        // Nm -- total number of vectors
   RealD eresid;
-
+  IRLdiagonalisation diagonalisation;
   ////////////////////////////////////
   // Embedded objects
   ////////////////////////////////////
@@ -70,362 +107,20 @@ public:
   /////////////////////////
   // Constructor
   /////////////////////////
+public:       
  ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
-			    OperatorFunction<Field> & poly,   // polynmial
-			    int _Nstop, // sought vecs
+			    OperatorFunction<Field> & poly,   // polynomial
+			    int _Nstop, // really sought vecs
 			    int _Nk,    // sought vecs
 			    int _Nm,    // total vecs
-			    RealD _eresid, // resid in lmdue deficit 
-			    int _Niter) : // Max iterations
+			    RealD _eresid, // resid in lmd deficit 
+			    int _MaxIter,  // Max iterations
+			    IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen ) :
     _Linop(Linop),    _poly(poly),
-    Nstop(_Nstop), Nk(_Nk), Nm(_Nm),
-    eresid(_eresid),  Niter(_Niter)  { };
-
-#if 0
-    ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
-			       OperatorFunction<Field> & poly,   // polynmial
-			       int _Nk, // sought vecs
-			       int _Nm, // total vecs
-			       RealD _eresid, // resid in lmdue deficit 
-			       int _Niter) : // Max iterations
-    _Linop(Linop),      _poly(poly),
-    Nstop(_Nk), Nk(_Nk), Nm(_Nm),      
-    eresid(_eresid),      Niter(_Niter) { };
-#endif
-
-#if 0
-    void calc(DenseVector<RealD>& eval,
-	      DenseVector<Field>& evec,
-	      const Field& src,
-	      int& Nconv);
-
-    void step(DenseVector<RealD>& lmd,
-	      DenseVector<RealD>& lme, 
-	      DenseVector<Field>& evec,
-	      Field& w,int Nm,int k);
-
-    void setUnit_Qt(int Nm, DenseVector<RealD> &Qt) ;
-
-    static RealD normalise(Field& v) ;
-    void orthogonalize(Field& w, DenseVector<Field>& evec, int k);
-    void diagonalize(DenseVector<RealD>& lmd,
-		     DenseVector<RealD>& lme, 
-		     int N2, int N1,
-		     DenseVector<RealD>& Qt,
-		     GridBase *grid);
-
-    void qr_decomp(DenseVector<RealD>& lmd,
-		   DenseVector<RealD>& lme,
-		   int Nk, int Nm,
-		   DenseVector<RealD>& Qt,
-		   RealD Dsh, int kmin, int kmax);
-
-#ifdef USE_LAPACK
-    void diagonalize_lapack(DenseVector<RealD>& lmd,
-			    DenseVector<RealD>& lme, 
-			    int N1, int N2,
-			    DenseVector<RealD>& Qt,
-			    GridBase *grid);
-#endif
-#endif
-
-/* Saad PP. 195
-1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
-2. For k = 1,2,...,m Do:
-3. wk:=Avk−βkv_{k−1}      
-4. αk:=(wk,vk)       // 
-5. wk:=wk−αkvk       // wk orthog vk 
-6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
-7. vk+1 := wk/βk+1
-8. EndDo
- */
-    void step(DenseVector<RealD>& lmd,
-	      DenseVector<RealD>& lme, 
-	      DenseVector<Field>& evec,
-	      Field& w,int Nm,int k)
-    {
-      const RealD tiny = 1.0e-20;
-      assert( k< Nm );
-      
-      _poly(_Linop,evec[k],w);      // 3. wk:=Avk−βkv_{k−1}
-
-      if(k>0) w -= lme[k-1] * evec[k-1];
-
-      ComplexD zalph = innerProduct(evec[k],w); // 4. αk:=(wk,vk)
-      RealD     alph = real(zalph);
-
-      w = w - alph * evec[k];// 5. wk:=wk−αkvk
-
-      RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
-                                 // 7. vk+1 := wk/βk+1
-
-      lmd[k] = alph;
-      lme[k] = beta;
-
-      if ( k > 0 ) orthogonalize(w,evec,k); // orthonormalise
-      if ( k < Nm-1) evec[k+1] = w;
-
-      if ( beta < tiny ) std::cout << " beta is tiny "<<beta<<std::endl;
-    }
-      
-    void qr_decomp(DenseVector<RealD>& lmd,   // Nm 
-		   DenseVector<RealD>& lme,   // Nm 
-		   int Nk, int Nm,
-		   DenseVector<RealD>& Qt,     // Nm x Nm matrix
-		   RealD Dsh, int kmin, int kmax)
-    {
-      int k = kmin-1;
-      RealD x;
-
-      RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]);
-      RealD c = ( lmd[k] -Dsh) *Fden;
-      RealD s = -lme[k] *Fden;
-      
-      RealD tmpa1 = lmd[k];
-      RealD tmpa2 = lmd[k+1];
-      RealD tmpb  = lme[k];
-
-      lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
-      lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
-      lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
-      x        =-s*lme[k+1];
-      lme[k+1] = c*lme[k+1];
-      
-      for(int i=0; i<Nk; ++i){
-	RealD Qtmp1 = Qt[i+Nm*k    ];
-	RealD Qtmp2 = Qt[i+Nm*(k+1)];
-	Qt[i+Nm*k    ] = c*Qtmp1 - s*Qtmp2;
-	Qt[i+Nm*(k+1)] = s*Qtmp1 + c*Qtmp2; 
-      }
-
-      // Givens transformations
-      for(int k = kmin; k < kmax-1; ++k){
-
-	RealD Fden = 1.0/hypot(x,lme[k-1]);
-	RealD c = lme[k-1]*Fden;
-	RealD s = - x*Fden;
-	
-	RealD tmpa1 = lmd[k];
-	RealD tmpa2 = lmd[k+1];
-	RealD tmpb  = lme[k];
-
-	lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
-	lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
-	lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
-	lme[k-1] = c*lme[k-1] -s*x;
-
-	if(k != kmax-2){
-	  x = -s*lme[k+1];
-	  lme[k+1] = c*lme[k+1];
-	}
-
-	for(int i=0; i<Nk; ++i){
-	  RealD Qtmp1 = Qt[i+Nm*k    ];
-	  RealD Qtmp2 = Qt[i+Nm*(k+1)];
-	  Qt[i+Nm*k    ] = c*Qtmp1 -s*Qtmp2;
-	  Qt[i+Nm*(k+1)] = s*Qtmp1 +c*Qtmp2;
-	}
-      }
-    }
-
-
-#ifdef USE_LAPACK
-    void diagonalize_lapack(DenseVector<RealD>& lmd,
-			    DenseVector<RealD>& lme, 
-			    int N1,
-			    int N2,
-			    DenseVector<RealD>& Qt,
-			    GridBase *grid)
-    {
-      const int size = Nm;
-      int NN = N1;
-      double evals_tmp[NN];
-      double evec_tmp[NN][NN];
-      memset(evec_tmp[0],0,sizeof(double)*NN*NN);
-      double DD[NN];
-      double EE[NN];
-      for (int i = 0; i< NN; i++) {
-	for (int j = i - 1; j <= i + 1; j++) {
-	  if ( j < NN && j >= 0 ) {
-	    if (i==j) DD[i] = lmd[i];
-	    if (i==j) evals_tmp[i] = lmd[i];
-	    if (j==(i-1)) EE[j] = lme[j];
-	  }
-	}
-      }
-      int evals_found;
-      int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
-      int liwork =  3+NN*10 ;
-      int iwork[liwork];
-      double work[lwork];
-      int isuppz[2*NN];
-      char jobz = 'V'; // calculate evals & evecs
-      char range = 'I'; // calculate all evals
-      //    char range = 'A'; // calculate all evals
-      char uplo = 'U'; // refer to upper half of original matrix
-      char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
-      int ifail[NN];
-      int info;
-      int total = grid->_Nprocessors;
-      int node  = grid->_processor;
-      int interval = (NN/total)+1;
-      double vl = 0.0, vu = 0.0;
-      int il = interval*node+1 , iu = interval*(node+1);
-      if (iu > NN)  iu=NN;
-      double tol = 0.0;
-      if (1) {
-	memset(evals_tmp,0,sizeof(double)*NN);
-	if ( il <= NN){
-	  LAPACK_dstegr(&jobz, &range, &NN,
-			(double*)DD, (double*)EE,
-			&vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
-			&tol, // tolerance
-			&evals_found, evals_tmp, (double*)evec_tmp, &NN,
-			isuppz,
-			work, &lwork, iwork, &liwork,
-			&info);
-	  for (int i = iu-1; i>= il-1; i--){
-	    evals_tmp[i] = evals_tmp[i - (il-1)];
-	    if (il>1) evals_tmp[i-(il-1)]=0.;
-	    for (int j = 0; j< NN; j++){
-	      evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
-	      if (il>1) evec_tmp[i-(il-1)][j]=0.;
-	    }
-	  }
-	}
-	{
-	  grid->GlobalSumVector(evals_tmp,NN);
-	  grid->GlobalSumVector((double*)evec_tmp,NN*NN);
-	}
-      } 
-      // cheating a bit.
-      // It is better to sort instead of just reversing it, 
-      // but the document of the routine says evals are sorted in increasing order. 
-      // qr gives evals in decreasing order.
-      for(int i=0;i<NN;i++){
-	for(int j=0;j<NN;j++)
-	  Qt[(NN-1-i)*N2+j]=evec_tmp[i][j];
-	lmd [NN-1-i]=evals_tmp[i];
-      }
-    }
-#endif
-
-    void diagonalize(DenseVector<RealD>& lmd,
-		     DenseVector<RealD>& lme, 
-		     int N2,
-		     int N1,
-		     DenseVector<RealD>& Qt,
-		     GridBase *grid)
-    {
-
-#ifdef USE_LAPACK
-    const int check_lapack=0; // just use lapack if 0, check against lapack if 1
-
-    if(!check_lapack)
-	return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid);
-
-    DenseVector <RealD> lmd2(N1);
-    DenseVector <RealD> lme2(N1);
-    DenseVector<RealD> Qt2(N1*N1);
-    for(int k=0; k<N1; ++k){
-      lmd2[k] = lmd[k];
-      lme2[k] = lme[k];
-    }
-    for(int k=0; k<N1*N1; ++k){
-      Qt2[k] = Qt[k];
-    }
-#endif
-
-      int Niter = 100*N1;
-      int kmin = 1;
-      int kmax = N2;
-
-      // (this should be more sophisticated)
-      for(int iter=0; iter<Niter; ++iter){
-
-	// determination of 2x2 leading submatrix
-	RealD dsub = lmd[kmax-1]-lmd[kmax-2];
-	RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
-	RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
-	// (Dsh: shift)
-	
-	// transformation
-	qr_decomp(lmd,lme,N2,N1,Qt,Dsh,kmin,kmax);
-	
-	// Convergence criterion (redef of kmin and kamx)
-	for(int j=kmax-1; j>= kmin; --j){
-	  RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
-	  if(fabs(lme[j-1])+dds > dds){
-	    kmax = j+1;
-	    goto continued;
-	  }
-	}
-	Niter = iter;
-#ifdef USE_LAPACK
-	if(check_lapack){
-	  const double SMALL=1e-8;
-	  diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid);
-	  DenseVector <RealD> lmd3(N2);
-	  for(int k=0; k<N2; ++k) lmd3[k]=lmd[k];
-	  _sort.push(lmd3,N2);
-	  _sort.push(lmd2,N2);
-	  for(int k=0; k<N2; ++k){
-	    if (fabs(lmd2[k] - lmd3[k]) >SMALL)  std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl;
-	  }
-	}
-#endif
-	return;
-
-      continued:
-	for(int j=0; j<kmax-1; ++j){
-	  RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
-	  if(fabs(lme[j])+dds > dds){
-	    kmin = j+1;
-	    break;
-	  }
-	}
-      }
-      std::cout << "[QL method] Error - Too many iteration: "<<Niter<<"\n";
-      abort();
-    }
-
-    static RealD normalise(Field& v) 
-    {
-      RealD nn = norm2(v);
-      nn = sqrt(nn);
-      v = v * (1.0/nn);
-      return nn;
-    }
-
-    void orthogonalize(Field& w,
-		       DenseVector<Field>& evec,
-		       int k)
-    {
-      typedef typename Field::scalar_type MyComplex;
-      MyComplex ip;
-
-      if ( 0 ) {
-	for(int j=0; j<k; ++j){
-	  normalise(evec[j]);
-	  for(int i=0;i<j;i++){
-	    ip = innerProduct(evec[i],evec[j]); // are the evecs normalised? ; this assumes so.
-	    evec[j] = evec[j] - ip *evec[i];
-	  }
-	}
-      }
-
-      for(int j=0; j<k; ++j){
-	ip = innerProduct(evec[j],w); // are the evecs normalised? ; this assumes so.
-	w = w - ip * evec[j];
-      }
-      normalise(w);
-    }
-
-
-    void setUnit_Qt(int Nm, DenseVector<RealD> &Qt) {
-      for(int i=0; i<Qt.size(); ++i) Qt[i] = 0.0;
-      for(int k=0; k<Nm; ++k) Qt[k + k*Nm] = 1.0;
-    }
+      Nstop(_Nstop), Nk(_Nk), Nm(_Nm),
+      eresid(_eresid),  MaxIter(_MaxIter),
+      diagonalisation(_diagonalisation)
+      { };
 
 /* Rudy Arthur's thesis pp.137
 ------------------------
@@ -443,169 +138,482 @@ repeat
   HK =HM(1:K,1:K)
   →AVK =VKHK +fKe†K † Extend to an M = K + P step factorization AVM = VMHM + fMeM
 until convergence
- */
-    void calc(DenseVector<RealD>& eval,
-	      DenseVector<Field>& evec,
-	      const Field& src,
-	      int& Nconv)
-      {
-
-	GridBase *grid = evec[0]._grid;
-	assert(grid == src._grid);
-
-	std::cout << " -- seek   Nk    = " << Nk    <<" vectors"<< std::endl;
-	std::cout << " -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
-	std::cout << " -- total  Nm    = " << Nm    <<" vectors"<< std::endl;
-	std::cout << " -- size of eval = " << eval.size() << std::endl;
-	std::cout << " -- size of evec = " << evec.size() << std::endl;
-	
-	assert(Nm == evec.size() && Nm == eval.size());
-	
-	DenseVector<RealD> lme(Nm);  
-	DenseVector<RealD> lme2(Nm);
-	DenseVector<RealD> eval2(Nm);
-	DenseVector<RealD> Qt(Nm*Nm);
-	DenseVector<int>   Iconv(Nm);
-
-	DenseVector<Field>  B(Nm,grid); // waste of space replicating
-	
-	Field f(grid);
-	Field v(grid);
-  
-	int k1 = 1;
-	int k2 = Nk;
-
-	Nconv = 0;
-
-	RealD beta_k;
-  
-	// Set initial vector
-	evec[0] = src;
-	std:: cout <<"norm2(src)= " << norm2(src)<<std::endl;
-
-	normalise(evec[0]);
-	std:: cout <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
-	
-	// Initial Nk steps
-	for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
-
-	// Restarting loop begins
-	int iter;
-	for(iter = 0; iter<Niter; ++iter){
-
-	  std::cout<<"\n Restart iteration = "<< iter << std::endl;
-
-	  for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
-
-	  f *= lme[Nm-1];
-
-	  // getting eigenvalues
-	  for(int k=0; k<Nm; ++k){
-	    eval2[k] = eval[k+k1-1];
-	    lme2[k] = lme[k+k1-1];
-	  }
-	  setUnit_Qt(Nm,Qt);
-	  diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
-
-	  // sorting
-	  _sort.push(eval2,Nm);
-	  
-	  // Implicitly shifted QR transformations
-	  setUnit_Qt(Nm,Qt);
-	  for(int ip=k2; ip<Nm; ++ip){ 
-	    //	    std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
-	    qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
-	  }
+*/
+  void calc(std::vector<RealD>& eval,  std::vector<Field>& evec, const Field& src, int& Nconv)
+  {
     
-	  for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
-	  
-	  for(int j=k1-1; j<k2+1; ++j){
-	    for(int k=0; k<Nm; ++k){
-	    B[j].checkerboard = evec[k].checkerboard;
-	      B[j] += Qt[k+Nm*j] * evec[k];
-	    }
-	  }
-	  for(int j=k1-1; j<k2+1; ++j) evec[j] = B[j];
-
-	  // Compressed vector f and beta(k2)
-	  f *= Qt[Nm-1+Nm*(k2-1)];
-	  f += lme[k2-1] * evec[k2];
-	  beta_k = norm2(f);
-	  beta_k = sqrt(beta_k);
-	  std::cout<<" beta(k) = "<<beta_k<<std::endl;
-
-	  RealD betar = 1.0/beta_k;
-	  evec[k2] = betar * f;
-	  lme[k2-1] = beta_k;
-
-	  // Convergence test
-	  for(int k=0; k<Nm; ++k){    
-	    eval2[k] = eval[k];
-	    lme2[k] = lme[k];
-	  }
-	  setUnit_Qt(Nm,Qt);
-	  diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
-	  
-	  for(int k = 0; k<Nk; ++k) B[k]=0.0;
-	  
-	  for(int j = 0; j<Nk; ++j){
-	    for(int k = 0; k<Nk; ++k){
-	      B[j].checkerboard = evec[k].checkerboard;
-	      B[j] += Qt[k+j*Nm] * evec[k];
-	    }
-	  }
-
-	  Nconv = 0;
-	  for(int i=0; i<Nk; ++i){
-
-	    _Linop.HermOp(B[i],v);
-	    
-	    RealD vnum = real(innerProduct(B[i],v)); // HermOp.
-	    RealD vden = norm2(B[i]);
-	    eval2[i] = vnum/vden;
-	    v -= eval2[i]*B[i];
-	    RealD vv = norm2(v);
-	    
-	    std::cout.precision(13);
-	    std::cout << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
-	    std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
-	    std::cout <<" |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
-	    
-	    // change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
-	    if((vv<eresid*eresid) && (i == Nconv) ){
-	      Iconv[Nconv] = i;
-	      ++Nconv;
-	    }
-
-	  }  // i-loop end
-
-	  std::cout<<" #modes converged: "<<Nconv<<std::endl;
-
-	  if( Nconv>=Nstop ){
-	    goto converged;
-	  }
-	} // end of iter loop
+    GridBase *grid = evec[0]._grid;
+    assert(grid == src._grid);
+    
+    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogMessage <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 /  "<< MaxIter<< std::endl;
+    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogMessage <<" -- seek   Nk    = " << Nk    <<" vectors"<< std::endl;
+    std::cout << GridLogMessage <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
+    std::cout << GridLogMessage <<" -- total  Nm    = " << Nm    <<" vectors"<< std::endl;
+    std::cout << GridLogMessage <<" -- size of eval = " << eval.size() << std::endl;
+    std::cout << GridLogMessage <<" -- size of evec = " << evec.size() << std::endl;
+    if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
+      std::cout << GridLogMessage << "Diagonalisation is DSTEGR "<<std::endl;
+    } else if ( diagonalisation == IRLdiagonaliseWithQR ) { 
+      std::cout << GridLogMessage << "Diagonalisation is QR "<<std::endl;
+    }  else if ( diagonalisation == IRLdiagonaliseWithEigen ) { 
+      std::cout << GridLogMessage << "Diagonalisation is Eigen "<<std::endl;
+    }
+    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+    
+    assert(Nm == evec.size() && Nm == eval.size());
 	
-	std::cout<<"\n NOT converged.\n";
-	abort();
+    std::vector<RealD> lme(Nm);  
+    std::vector<RealD> lme2(Nm);
+    std::vector<RealD> eval2(Nm);
+    Eigen::MatrixXd    Qt = Eigen::MatrixXd::Zero(Nm,Nm);
+    std::vector<int>   Iconv(Nm);
+
+    std::vector<Field>  B(Nm,grid); // waste of space replicating
+    
+    Field f(grid);
+    Field v(grid);
+    
+    int k1 = 1;
+    int k2 = Nk;
+    
+    Nconv = 0;
+    
+    RealD beta_k;
+  
+    // Set initial vector
+    evec[0] = src;
+    std::cout << GridLogMessage <<"norm2(src)= " << norm2(src)<<std::endl;
+    
+    normalise(evec[0]);
+    std::cout << GridLogMessage <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
+    
+    // Initial Nk steps
+    for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
+    
+    // Restarting loop begins
+    int iter;
+    for(iter = 0; iter<MaxIter; ++iter){
+      
+      std::cout<< GridLogMessage <<" **********************"<< std::endl;
+      std::cout<< GridLogMessage <<" Restart iteration = "<< iter << std::endl;
+      std::cout<< GridLogMessage <<" **********************"<< std::endl;
+      
+      for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
+      
+      f *= lme[Nm-1];
+      
+      // getting eigenvalues
+      for(int k=0; k<Nm; ++k){
+	eval2[k] = eval[k+k1-1];
+	lme2[k] = lme[k+k1-1];
+      }
+      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
+      diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
+
+      // sorting
+      _sort.push(eval2,Nm);
+      
+      // Implicitly shifted QR transformations
+      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
+      for(int ip=k2; ip<Nm; ++ip){ 
+	qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
+      }
+    
+      for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
+	  
+      for(int j=k1-1; j<k2+1; ++j){
+	for(int k=0; k<Nm; ++k){
+	  B[j].checkerboard = evec[k].checkerboard;
+	  B[j] += Qt(j,k) * evec[k];
+	}
+      }
+      for(int j=k1-1; j<k2+1; ++j) evec[j] = B[j];
+      
+      // Compressed vector f and beta(k2)
+      f *= Qt(k2-1,Nm-1);
+      f += lme[k2-1] * evec[k2];
+      beta_k = norm2(f);
+      beta_k = sqrt(beta_k);
+      std::cout<< GridLogMessage<<" beta(k) = "<<beta_k<<std::endl;
+      
+      RealD betar = 1.0/beta_k;
+      evec[k2] = betar * f;
+      lme[k2-1] = beta_k;
+      
+      // Convergence test
+      for(int k=0; k<Nm; ++k){    
+	eval2[k] = eval[k];
+	lme2[k] = lme[k];
+      }
+      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
+      diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
+      
+      for(int k = 0; k<Nk; ++k) B[k]=0.0;
+      
+      for(int j = 0; j<Nk; ++j){
+	for(int k = 0; k<Nk; ++k){
+	  B[j].checkerboard = evec[k].checkerboard;
+	  B[j] += Qt(j,k) * evec[k];
+	}
+      }
+
+      Nconv = 0;
+      for(int i=0; i<Nk; ++i){
 	
-      converged:
-       // Sorting
-       eval.resize(Nconv);
-       evec.resize(Nconv,grid);
-       for(int i=0; i<Nconv; ++i){
-         eval[i] = eval2[Iconv[i]];
-         evec[i] = B[Iconv[i]];
-       }
-      _sort.push(eval,evec,Nconv);
+	_Linop.HermOp(B[i],v);
+	    
+	RealD vnum = real(innerProduct(B[i],v)); // HermOp.
+	RealD vden = norm2(B[i]);
+	eval2[i] = vnum/vden;
+	v -= eval2[i]*B[i];
+	RealD vv = norm2(v);
+	
+	std::cout.precision(13);
+	std::cout << GridLogMessage << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+	std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
+	std::cout << " |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
+	
+	// change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
+	if((vv<eresid*eresid) && (i == Nconv) ){
+	  Iconv[Nconv] = i;
+	  ++Nconv;
+	}
+	
+      }  // i-loop end
+      
+      std::cout<< GridLogMessage <<" #modes converged: "<<Nconv<<std::endl;
 
-      std::cout << "\n Converged\n Summary :\n";
-      std::cout << " -- Iterations  = "<< iter   << "\n";
-      std::cout << " -- beta(k)     = "<< beta_k << "\n";
-      std::cout << " -- Nconv       = "<< Nconv  << "\n";
-     }
- };
+      if( Nconv>=Nstop ){
+	goto converged;
+      }
+    } // end of iter loop
+    
+    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+    std::cout<< GridLogError    <<" ImplicitlyRestartedLanczos::calc() NOT converged.";
+    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+    abort();
+	
+  converged:
+    // Sorting
+    eval.resize(Nconv);
+    evec.resize(Nconv,grid);
+    for(int i=0; i<Nconv; ++i){
+      eval[i] = eval2[Iconv[i]];
+      evec[i] = B[Iconv[i]];
+    }
+    _sort.push(eval,evec,Nconv);
+    
+    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogMessage << "ImplicitlyRestartedLanczos CONVERGED ; Summary :\n";
+    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+    std::cout << GridLogMessage << " -- Iterations  = "<< iter   << "\n";
+    std::cout << GridLogMessage << " -- beta(k)     = "<< beta_k << "\n";
+    std::cout << GridLogMessage << " -- Nconv       = "<< Nconv  << "\n";
+    std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
+  }
 
-}
+private:
+/* Saad PP. 195 -- step() below performs items 3-7 for a single, 0-based k
+1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
+2. For k = 1,2,...,m Do:
+3. wk:=Avk−βkv_{k−1}      
+4. αk:=(wk,vk)       // only the real part is kept; it is stored in lmd[k]
+5. wk:=wk−αkvk       // wk orthog vk 
+6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop   // stored in lme[k]; step() only logs a tiny value
+7. vk+1 := wk/βk+1
+8. EndDo
+ */
+  void step(std::vector<RealD>& lmd,
+	    std::vector<RealD>& lme, 
+	    std::vector<Field>& evec,
+	    Field& w,int Nm,int k)
+  {
+    const RealD tiny = 1.0e-20;   // threshold used only for the log message at the end
+    assert( k< Nm );
+    
+    _poly(_Linop,evec[k],w);      // 3. wk:=Avk−βkv_{k−1} ; the "A vk" part, obtained through _poly
+    
+    if(k>0) w -= lme[k-1] * evec[k-1]; // 3. the −βk v_{k−1} part; there is no previous vector at k==0
+    
+    ComplexD zalph = innerProduct(evec[k],w); // 4. αk:=(wk,vk)
+    RealD     alph = real(zalph);             //    imaginary part is discarded
+    
+    w = w - alph * evec[k];// 5. wk:=wk−αkvk
+    
+    RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2 ; no "Stop" is implemented here
+    // 7. vk+1 := wk/βk+1 ; normalise() has already rescaled w in place and returned its old norm
+    
+    lmd[k] = alph;
+    lme[k] = beta;
+    // beta was measured before the re-orthogonalisation below
+    if ( k > 0 ) orthogonalize(w,evec,k); // orthonormalise: project out evec[0..k-1], then renormalise w
+    if ( k < Nm-1) evec[k+1] = w;         // the last step (k == Nm-1) has no slot left; w is still updated
+    
+    if ( beta < tiny ) std::cout << GridLogMessage << " beta is tiny "<<beta<<std::endl;
+  }
+      
+  ///////////////////////////////////////////////////////////////////
+  // 
+  //
+  ///////////////////////////////////////////////////////////////////
+  void qr_decomp(std::vector<RealD>& lmd,   // Nm 
+		 std::vector<RealD>& lme,   // Nm 
+		 int Nk, int Nm,            // Nk, Nm
+		 Eigen::MatrixXd& Qt,       // Nm x Nm matrix
+		 RealD Dsh, int kmin, int kmax)
+  { // One sweep of rotations over rows kmin-1 .. kmax-1 of the tridiagonal (lmd,lme), shifted by Dsh, mirrored on Qt
+    int k = kmin-1;   // the first rotation mixes rows kmin-1 and kmin (0-based)
+    RealD x;          // value handed from one rotation to the next: set below, consumed in the loop
+    // First rotation: c,s come from the Dsh-shifted diagonal entry and lme[k].
+    RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]); // NOTE(review): infinite if both arguments are zero
+    RealD c = ( lmd[k] -Dsh) *Fden;
+    RealD s = -lme[k] *Fden;
+    // Save the old entries: each update below needs the values from before the rotation.
+    RealD tmpa1 = lmd[k];
+    RealD tmpa2 = lmd[k+1];
+    RealD tmpb  = lme[k];
+
+    lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
+    lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
+    lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
+    x        =-s*lme[k+1];   // NOTE(review): unconditional here, guarded by k != kmax-2 in the loop
+    lme[k+1] = c*lme[k+1];
+    // Same c,s applied to rows k and k+1 of Qt; only the first Nk columns are touched.
+    for(int i=0; i<Nk; ++i){
+      RealD Qtmp1 = Qt(k,i);
+      RealD Qtmp2 = Qt(k+1,i);
+      Qt(k,i)  = c*Qtmp1 - s*Qtmp2;
+      Qt(k+1,i)= s*Qtmp1 + c*Qtmp2; 
+    }
+
+    // Givens transformations for k = kmin .. kmax-2 (this k, and Fden/c/s/tmp*, shadow the outer ones)
+    for(int k = kmin; k < kmax-1; ++k){
+      // c,s now come from the carried value x and lme[k-1]; Dsh is not used again.
+      RealD Fden = 1.0/hypot(x,lme[k-1]); // NOTE(review): infinite if x and lme[k-1] are both zero
+      RealD c = lme[k-1]*Fden;
+      RealD s = - x*Fden;
+	
+      RealD tmpa1 = lmd[k];
+      RealD tmpa2 = lmd[k+1];
+      RealD tmpb  = lme[k];
+
+      lmd[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
+      lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
+      lme[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
+      lme[k-1] = c*lme[k-1] -s*x;   // x is folded back into the off-diagonal above row k
+
+      if(k != kmax-2){              // on the final pass nothing is carried forward
+	x = -s*lme[k+1];
+	lme[k+1] = c*lme[k+1];
+      }
+
+      for(int i=0; i<Nk; ++i){      // rows k and k+1 of Qt, first Nk columns, as above
+	RealD Qtmp1 = Qt(k,i);
+	RealD Qtmp2 = Qt(k+1,i);
+	Qt(k,i)     = c*Qtmp1 -s*Qtmp2;
+	Qt(k+1,i)   = s*Qtmp1 +c*Qtmp2;
+      }
+    }
+  }
+
+  void diagonalize(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
+		   int Nk, int Nm,   
+		   Eigen::MatrixXd & Qt,
+		   GridBase *grid)
+  {
+    // Reset Qt to the Nm x Nm identity, then hand the same arguments to the selected backend.
+    Qt = Eigen::MatrixXd::Identity(Nm,Nm);
+    switch ( diagonalisation ) {
+    case IRLdiagonaliseWithDSTEGR: diagonalize_lapack(lmd,lme,Nk,Nm,Qt,grid); break;
+    case IRLdiagonaliseWithQR:     diagonalize_QR    (lmd,lme,Nk,Nm,Qt,grid); break;
+    case IRLdiagonaliseWithEigen:  diagonalize_Eigen (lmd,lme,Nk,Nm,Qt,grid); break;
+    default:
+      // Selector matches none of the three backends.
+      assert(0);
+    }
+  }
+
+#ifdef USE_LAPACK
+void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
+                   double *vl, double *vu, int *il, int *iu, double *abstol,
+                   int *m, double *w, double *z, int *ldz, int *isuppz,
+                   double *work, int *lwork, int *iwork, int *liwork,
+                   int *info);
 #endif
 
+void diagonalize_lapack(std::vector<RealD>& lmd,
+			std::vector<RealD>& lme, 
+			int Nk, int Nm,  
+			Eigen::MatrixXd& Qt,
+			GridBase *grid)
+{ // Diagonalise the Nk x Nk tridiagonal (lmd,lme) with LAPACK_dstegr; lmd and rows of Qt are overwritten. Asserts without USE_LAPACK.
+#ifdef USE_LAPACK
+  const int size = Nm;   // not referenced again in this function
+  int NN = Nk;           // problem order passed to LAPACK_dstegr (also used as ldz)
+  double evals_tmp[NN];  // NOTE(review): runtime-sized automatic arrays are a compiler extension in C++
+  double evec_tmp[NN][NN]; // NN*NN doubles of automatic storage
+  memset(evec_tmp[0],0,sizeof(double)*NN*NN);
+  double DD[NN];         // diagonal handed to LAPACK_dstegr
+  double EE[NN];         // off-diagonal handed to LAPACK_dstegr
+  for (int i = 0; i< NN; i++) {
+    for (int j = i - 1; j <= i + 1; j++) {
+      if ( j < NN && j >= 0 ) {
+	if (i==j) DD[i] = lmd[i];
+	if (i==j) evals_tmp[i] = lmd[i];   // overwritten by the memset of evals_tmp further down
+	if (j==(i-1)) EE[j] = lme[j];      // fills EE[0..NN-2]; EE[NN-1] is never written here
+      }
+    }
+  }
+  int evals_found;
+  int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ; // max(18*NN, 1+4*NN+NN*NN)
+  int liwork =  3+NN*10 ;
+  int iwork[liwork];
+  double work[lwork];
+  int isuppz[2*NN];
+  char jobz = 'V'; // calculate evals & evecs
+  char range = 'I'; // calculate only the il-th through iu-th evals (this node's index slice)
+  //    char range = 'A'; // calculate all evals
+  char uplo = 'U'; // refer to upper half of original matrix (not passed to any call below)
+  char compz = 'I'; // Compute eigenvectors of tridiagonal matrix (not passed to any call below)
+  int ifail[NN];   // not referenced again in this function
+  int info;        // NOTE(review): written by LAPACK_dstegr but never inspected afterwards
+  int total = grid->_Nprocessors;
+  int node  = grid->_processor;
+  int interval = (NN/total)+1;   // number of eigenpairs assigned to each node
+  double vl = 0.0, vu = 0.0;
+  int il = interval*node+1 , iu = interval*(node+1);   // index range for this node, counted from 1
+  if (iu > NN)  iu=NN;           // clamp the last slice to the problem size
+  double tol = 0.0;
+  if (1) {
+    memset(evals_tmp,0,sizeof(double)*NN);   // start from zero so that summing the per-node slices merges them
+    if ( il <= NN){              // nodes whose slice starts beyond NN contribute zeros only
+      LAPACK_dstegr(&jobz, &range, &NN,
+		    (double*)DD, (double*)EE,
+		    &vl, &vu, &il, &iu, // these four are ignored if the second parameter is 'A'
+		    &tol, // tolerance
+		    &evals_found, evals_tmp, (double*)evec_tmp, &NN,
+		    isuppz,
+		    work, &lwork, iwork, &liwork,
+		    &info);
+      for (int i = iu-1; i>= il-1; i--){     // move results from slots 0.. up to slots il-1..iu-1, highest first
+	evals_tmp[i] = evals_tmp[i - (il-1)];
+	if (il>1) evals_tmp[i-(il-1)]=0.;    // clear the source slot so it does not pollute the sum
+	for (int j = 0; j< NN; j++){
+	  evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
+	  if (il>1) evec_tmp[i-(il-1)][j]=0.;
+	}
+      }
+    }
+    { // presumably a sum over all nodes, giving every node the full set -- confirm against GridBase
+      grid->GlobalSumVector(evals_tmp,NN);
+      grid->GlobalSumVector((double*)evec_tmp,NN*NN);
+    }
+  } 
+  // Safer to sort instead of just reversing it, 
+  // but the document of the routine says evals are sorted in increasing order. 
+  // qr gives evals in decreasing order.
+  for(int i=0;i<NN;i++){
+    lmd [NN-1-i]=evals_tmp[i];               // entry i goes to slot NN-1-i
+    for(int j=0;j<NN;j++){
+      Qt((NN-1-i),j)=evec_tmp[i][j];         // evec_tmp[i][*] becomes row NN-1-i of Qt
+    }
+  }
+#else 
+  assert(0);
+#endif
+}
+
+  void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
+		      int Nk, int Nm,   
+		      Eigen::MatrixXd & Qt,
+		      GridBase *grid)
+  { // Repeat shifted qr_decomp sweeps on (lmd,lme), accumulating in Qt, until every off-diagonal is negligible
+    int Niter = 100*Nm;   // sweep budget; exceeding it is fatal (see the end of the function)
+    int kmin = 1;         // active window passed to qr_decomp, which starts at row kmin-1 ...
+    int kmax = Nk;        // ... and ends at row kmax-1
+    if ( Nk < 2 ) return; // nothing to rotate; also keeps lmd[kmax-2] below inside the array
+    // (this should be more sophisticated)
+    for(int iter=0; iter<Niter; ++iter){
+      
+      // shift from the trailing 2x2 submatrix of the active window (rows kmax-2, kmax-1)
+      RealD dsub = lmd[kmax-1]-lmd[kmax-2];
+      RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
+      RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +copysign(dd,dsub));
+      // (Dsh: shift) copysign equals dd*(dsub/fabs(dsub)) for dsub != 0 but is not 0/0 = NaN at dsub == 0
+	
+      // transformation
+      qr_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax); // Nk, Nm
+	
+      // Convergence criterion (redef of kmin and kmax): scan upwards from the bottom of the window
+      for(int j=kmax-1; j>= kmin; --j){
+	RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
+	if(fabs(lme[j-1])+dds > dds){   // lme[j-1] still changes the sum, i.e. is not yet negligible
+	  kmax = j+1;
+	  goto continued;
+	}
+      }
+      // no non-negligible off-diagonal is left in the window: converged
+      return;
+
+    continued:
+      for(int j=0; j<kmax-1; ++j){      // raise kmin to the first non-negligible off-diagonal from the top
+	RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
+	if(fabs(lme[j])+dds > dds){
+	  kmin = j+1;
+	  break;
+	}
+      }
+    }
+    std::cout << GridLogError << "[QL method] Error - Too many iteration: "<<Niter<<"\n";
+    abort();
+  }
+
+  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
+			 int Nk, int Nm,  
+			 Eigen::MatrixXd & Qt, // Nm x Nm
+			 GridBase *grid)
+  {
+    Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk);
+
+    for(int i=0;i<Nk;i++)   TriDiag(i,i)   = lmd[i];
+    for(int i=0;i<Nk-1;i++) TriDiag(i,i+1) = lme[i];
+    for(int i=0;i<Nk-1;i++) TriDiag(i+1,i) = lme[i];
+    
+    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eigensolver(TriDiag);
+
+    for (int i = 0; i < Nk; i++) {
+      lmd[Nk-1-i] = eigensolver.eigenvalues()(i);
+    }
+    for (int i = 0; i < Nk; i++) {
+      for (int j = 0; j < Nk; j++) {
+	Qt(Nk-1-i,j) = eigensolver.eigenvectors()(j,i);
+      }
+    }
+  }
+
+
+  static RealD normalise(Field& v) 
+  {
+    RealD nn = norm2(v);
+    nn = sqrt(nn);
+    v = v * (1.0/nn);
+    return nn;
+  }
+  
+  void orthogonalize(Field& w, std::vector<Field>& evec, int k)
+  {
+    typedef typename Field::scalar_type MyComplex;
+    MyComplex ip;
+    
+    for(int j=0; j<k; ++j){
+      ip = innerProduct(evec[j],w); // are the evecs normalised? ; this assumes so.
+      w = w - ip * evec[j];
+    }
+    normalise(w);
+  }
+
+ };
+}
+#endif
diff --git a/tests/solver/Test_dwf_lanczos.cc b/tests/solver/Test_dwf_lanczos.cc
index 48cca378..1dd5dae3 100644
--- a/tests/solver/Test_dwf_lanczos.cc
+++ b/tests/solver/Test_dwf_lanczos.cc
@@ -92,16 +92,15 @@ int main (int argc, char ** argv)
 
   
   std::vector<RealD>          eval(Nm);
-  FermionField    src(FrbGrid); gaussian(RNG5rb,src);
+  FermionField    src(FrbGrid); 
+  gaussian(RNG5rb,src);
   std::vector<FermionField> evec(Nm,FrbGrid);
   for(int i=0;i<1;i++){
-    std::cout << i<<" / "<< Nm<< " grid pointer "<<evec[i]._grid<<std::endl;
+    std::cout << GridLogMessage <<i<<" / "<< Nm<< " grid pointer "<<evec[i]._grid<<std::endl;
   };
 
   int Nconv;
-  IRL.calc(eval,evec,
-	   src,
-	   Nconv);
+  IRL.calc(eval,evec,src,Nconv);
 
 
   Grid_finalize();

From e8b95bd35b00b25384de0019dd454af853883f08 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 21 Jun 2017 02:50:09 +0100
Subject: [PATCH 3/5] Clean up finished. Could shrink Lanczos to around 400
 lines at a push

---
 .../iterative/ImplicitlyRestartedLanczos.h    | 114 +++++++++---------
 tests/debug/Test_synthetic_lanczos.cc         |   4 +-
 2 files changed, 62 insertions(+), 56 deletions(-)

diff --git a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
index 571bf1b2..a8723f32 100644
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -39,10 +39,11 @@ namespace Grid {
     IRLdiagonaliseWithQR,
     IRLdiagonaliseWithEigen
   };
-  ////////////////////////////////////////////////////////////////////////////////
-  // Helper class for sorting the evalues AND evectors by Field
-  // Use pointer swizzle on vectors
-  ////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// Helper class for sorting the evalues AND evectors by Field
+// Use pointer swizzle on vectors
+////////////////////////////////////////////////////////////////////////////////
 template<class Field>
 class SortEigen {
  private:
@@ -90,7 +91,9 @@ class SortEigen {
 /////////////////////////////////////////////////////////////
 template<class Field> 
 class ImplicitlyRestartedLanczos {
+
 private:       
+
   int MaxIter;   // Max iterations
   int Nstop;     // Number of evecs checked for convergence
   int Nk;        // Number of converged sought
@@ -122,6 +125,29 @@ public:
       diagonalisation(_diagonalisation)
       { };
 
+  ////////////////////////////////
+  // Helpers
+  ////////////////////////////////
+  static RealD normalise(Field& v) 
+  { // Rescales v to unit norm in place and returns the norm v had on entry.
+    const RealD normSq = norm2(v);
+    const RealD len    = sqrt(normSq);
+    v = v * (1.0/len);
+    return len;
+  }
+  
+  void orthogonalize(Field& w, std::vector<Field>& evec, int k)
+  {
+    // Subtract from w its component along each of evec[0..k-1] in turn (every
+    // projection sees the w left by the previous one), then renormalise w.
+    typedef typename Field::scalar_type MyComplex;
+    MyComplex overlap;
+    for(int col=0; col<k; ++col){
+      overlap = innerProduct(evec[col],w); 
+      w = w - overlap * evec[col];
+    }
+    normalise(w);
+  }
+
 /* Rudy Arthur's thesis pp.137
 ------------------------
 Require: M > K P = M − K †
@@ -167,9 +193,10 @@ until convergence
     std::vector<RealD> lme(Nm);  
     std::vector<RealD> lme2(Nm);
     std::vector<RealD> eval2(Nm);
-    Eigen::MatrixXd    Qt = Eigen::MatrixXd::Zero(Nm,Nm);
-    std::vector<int>   Iconv(Nm);
 
+    Eigen::MatrixXd    Qt = Eigen::MatrixXd::Zero(Nm,Nm);
+
+    std::vector<int>   Iconv(Nm);
     std::vector<Field>  B(Nm,grid); // waste of space replicating
     
     Field f(grid);
@@ -218,6 +245,7 @@ until convergence
       // Implicitly shifted QR transformations
       Qt = Eigen::MatrixXd::Identity(Nm,Nm);
       for(int ip=k2; ip<Nm; ++ip){ 
+	// Eigen replacement for qr_decomp ???
 	qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
       }
     
@@ -354,10 +382,32 @@ private:
     if ( beta < tiny ) std::cout << GridLogMessage << " beta is tiny "<<beta<<std::endl;
   }
       
-  ///////////////////////////////////////////////////////////////////
-  // 
-  //
-  ///////////////////////////////////////////////////////////////////
+  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
+			 int Nk, int Nm,  
+			 Eigen::MatrixXd & Qt, // Nm x Nm
+			 GridBase *grid)
+  {
+    // Dense Nk x Nk symmetric tridiagonal matrix: lmd[0..Nk-1] on the diagonal,
+    // lme[0..Nk-2] on both neighbouring diagonals.
+    Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk);
+    for(int d=0;d<Nk;d++)   TriDiag(d,d) = lmd[d];
+    for(int o=0;o<Nk-1;o++) TriDiag(o,o+1) = TriDiag(o+1,o) = lme[o];
+
+    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eigensolver(TriDiag);
+    const Eigen::VectorXd & vals = eigensolver.eigenvalues();
+    const Eigen::MatrixXd & vecs = eigensolver.eigenvectors();
+
+    // Solver index n lands in slot Nk-1-n: its eigenvalue goes into lmd, and its
+    // eigenvector (column n of vecs) into the first Nk entries of that row of Qt.
+    for (int n = 0; n < Nk; n++) {
+      lmd[Nk-1-n] = vals(n);
+      for (int c = 0; c < Nk; c++) Qt(Nk-1-n,c) = vecs(c,n);
+    }
+  }
+  ///////////////////////////////////////////////////////////////////////////
+  // File could end here if settle on Eigen ???
+  ///////////////////////////////////////////////////////////////////////////
+
   void qr_decomp(std::vector<RealD>& lmd,   // Nm 
 		 std::vector<RealD>& lme,   // Nm 
 		 int Nk, int Nm,            // Nk, Nm
@@ -570,50 +620,6 @@ void diagonalize_lapack(std::vector<RealD>& lmd,
     abort();
   }
 
-  void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, 
-			 int Nk, int Nm,  
-			 Eigen::MatrixXd & Qt, // Nm x Nm
-			 GridBase *grid)
-  {
-    Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk);
-
-    for(int i=0;i<Nk;i++)   TriDiag(i,i)   = lmd[i];
-    for(int i=0;i<Nk-1;i++) TriDiag(i,i+1) = lme[i];
-    for(int i=0;i<Nk-1;i++) TriDiag(i+1,i) = lme[i];
-    
-    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eigensolver(TriDiag);
-
-    for (int i = 0; i < Nk; i++) {
-      lmd[Nk-1-i] = eigensolver.eigenvalues()(i);
-    }
-    for (int i = 0; i < Nk; i++) {
-      for (int j = 0; j < Nk; j++) {
-	Qt(Nk-1-i,j) = eigensolver.eigenvectors()(j,i);
-      }
-    }
-  }
-
-
-  static RealD normalise(Field& v) 
-  {
-    RealD nn = norm2(v);
-    nn = sqrt(nn);
-    v = v * (1.0/nn);
-    return nn;
-  }
-  
-  void orthogonalize(Field& w, std::vector<Field>& evec, int k)
-  {
-    typedef typename Field::scalar_type MyComplex;
-    MyComplex ip;
-    
-    for(int j=0; j<k; ++j){
-      ip = innerProduct(evec[j],w); // are the evecs normalised? ; this assumes so.
-      w = w - ip * evec[j];
-    }
-    normalise(w);
-  }
-
  };
 }
 #endif
diff --git a/tests/debug/Test_synthetic_lanczos.cc b/tests/debug/Test_synthetic_lanczos.cc
index 8ffbcbe9..32fd6f32 100644
--- a/tests/debug/Test_synthetic_lanczos.cc
+++ b/tests/debug/Test_synthetic_lanczos.cc
@@ -133,8 +133,8 @@ int main (int argc, char ** argv)
   int Nconv;
   RealD eresid = 1.0e-6;
 
-  ImplicitlyRestartedLanczos<LatticeComplex> IRL(HermOp,X,Nk,Nm,eresid,Nit);
-  ImplicitlyRestartedLanczos<LatticeComplex> ChebyIRL(HermOp,Cheby,Nk,Nm,eresid,Nit);
+  ImplicitlyRestartedLanczos<LatticeComplex> IRL(HermOp,X,Nk,Nk,Nm,eresid,Nit);
+  ImplicitlyRestartedLanczos<LatticeComplex> ChebyIRL(HermOp,Cheby,Nk,Nk,Nm,eresid,Nit);
 
   LatticeComplex src(grid); gaussian(RNG,src);
   {

From ef4f2b8c410d449ff0beea1682cfc3de9bda3f79 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 21 Jun 2017 09:22:20 +0100
Subject: [PATCH 4/5] todo update

---
 TODO | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/TODO b/TODO
index eeb7dfa5..8f80903e 100644
--- a/TODO
+++ b/TODO
@@ -2,8 +2,8 @@ TODO:
 ---------------
 
 Large item work list:
-1)- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- 
-2)- MultiRHS with spread out extra dim
+1)- MultiRHS with spread out extra dim
+2)- Christoph's local basis expansion Lanczos
 3)- BG/Q port and check
 4)- Precision conversion and sort out localConvert      <-- partial
   - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
@@ -13,6 +13,7 @@ Large item work list:
 8)- HDCR resume
 
 Recent DONE 
+-- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
 -- GaugeFix into central location                      <-- DONE
 -- Scidac and Ildg metadata handling                   <-- DONE
 -- Binary I/O MPI2 IO                                  <-- DONE

From 9e56c6573007ccc857571aefa2ce3b6851f7b891 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 21 Jun 2017 14:02:58 +0100
Subject: [PATCH 5/5] Updated TODO list

---
 TODO | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/TODO b/TODO
index 8f80903e..001c6c0c 100644
--- a/TODO
+++ b/TODO
@@ -2,7 +2,8 @@ TODO:
 ---------------
 
 Large item work list:
-1)- MultiRHS with spread out extra dim
+1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O
+
 2)- Christoph's local basis expansion Lanczos
 3)- BG/Q port and check
 4)- Precision conversion and sort out localConvert      <-- partial