Improved the lancos

2025-07-11 18:57:06 +01:00 · 2017-06-20 18:46:01 +01:00
parent e9cc21900f
commit 0486ff8e79
7 changed files with 211 additions and 1712 deletions
--- a/28
+++ b/28
@ -1,24 +1,28 @@
 TODO:
 ---------------
-Peter's work list:
+Large item work list:
-1)- Precision conversion and sort out localConvert      <-- 
+1)- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- 
-2)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- 
+2)- MultiRHS with spread out extra dim
-
+3)- BG/Q port and check
-- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet
+4)- Precision conversion and sort out localConvert      <-- partial
-- Physical propagator interface
+  - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
-- Conserved currents
+5)- Physical propagator interface
-- GaugeFix into central location
+6)- Conserved currents
-- Multigrid Wilson and DWF, compare to other Multigrid implementations
+7)- Multigrid Wilson and DWF, compare to other Multigrid implementations
-- HDCR resume
+8)- HDCR resume
 Recent DONE 
 -- GaugeFix into central location                      <-- DONE
 -- Scidac and Ildg metadata handling                   <-- DONE
 -- Binary I/O MPI2 IO                                  <-- DONE
 -- Binary I/O speed up & x-strips                      <-- DONE
 -- Cut down the exterior overhead                      <-- DONE
 -- Interior legs from SHM comms                        <-- DONE
 -- Half-precision comms                                <-- DONE
-- Merge high precision reduction into develop        
+-- Merge high precision reduction into develop         <-- DONE
-- multiRHS DWF; benchmark on Cori/BNL for comms elimination
+-- BlockCG, BCGrQ                                      <-- DONE
 -- multiRHS DWF; benchmark on Cori/BNL for comms elimination <-- DONE
   -- slice* linalg routines for multiRHS, BlockCG    
 -----
--- a/lib/algorithms/densematrix/DenseMatrix.h
+++ b/lib/algorithms/densematrix/DenseMatrix.h
@ -1,137 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/DenseMatrix.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_DENSE_MATRIX_H
 #define GRID_DENSE_MATRIX_H
 namespace Grid {
    /////////////////////////////////////////////////////////////
    // Matrix untils
    /////////////////////////////////////////////////////////////
 template<class T> using DenseVector = std::vector<T>;
 template<class T> using DenseMatrix = DenseVector<DenseVector<T> >;
 template<class T> void Size(DenseVector<T> & vec, int &N) 
 { 
  N= vec.size();
 }
 template<class T> void Size(DenseMatrix<T> & mat, int &N,int &M) 
 { 
  N= mat.size();
  M= mat[0].size();
 }
 template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N) 
 { 
  int M; Size(mat,N,M);
  assert(N==M);
 }
 template<class T> void Resize(DenseVector<T > & mat, int N) { 
  mat.resize(N);
 }
 template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) { 
  mat.resize(N);
  for(int i=0;i<N;i++){
    mat[i].resize(M);
  }
 }
 template<class T> void Fill(DenseMatrix<T> & mat, T&val) { 
  int N,M;
  Size(mat,N,M);
  for(int i=0;i<N;i++){
  for(int j=0;j<M;j++){
    mat[i][j] = val;
  }}
 }
 /** Transpose of a matrix **/
 template<class T> DenseMatrix<T> Transpose(DenseMatrix<T> & mat){
  int N,M;
  Size(mat,N,M);
  DenseMatrix<T> C; Resize(C,M,N);
  for(int i=0;i<M;i++){
  for(int j=0;j<N;j++){
    C[i][j] = mat[j][i];
  }} 
  return C;
 }
 /** Set DenseMatrix to unit matrix **/
 template<class T> void Unity(DenseMatrix<T> &A){
  int N;  SizeSquare(A,N);
  for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
      if ( i==j ) A[i][j] = 1;
      else        A[i][j] = 0;
    } 
  } 
 }
 /** Add C * I to matrix **/
 template<class T>
 void PlusUnit(DenseMatrix<T> & A,T c){
  int dim;  SizeSquare(A,dim);
  for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;} 
 }
 /** return the Hermitian conjugate of matrix **/
 template<class T>
 DenseMatrix<T> HermitianConj(DenseMatrix<T> &mat){
  int dim; SizeSquare(mat,dim);
  DenseMatrix<T> C; Resize(C,dim,dim);
  for(int i=0;i<dim;i++){
    for(int j=0;j<dim;j++){
      C[i][j] = conj(mat[j][i]);
    } 
  } 
  return C;
 }
 /**Get a square submatrix**/
 template <class T>
 DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st, int col_end)
 {
  DenseMatrix<T> H; Resize(H,row_end - row_st,col_end-col_st);
  for(int i = row_st; i<row_end; i++){
  for(int j = col_st; j<col_end; j++){
    H[i-row_st][j-col_st]=A[i][j];
  }}
  return H;
 }
 }
 #include "Householder.h"
 #include "Francis.h"
 #endif
--- a/lib/algorithms/densematrix/Francis.h
+++ b/lib/algorithms/densematrix/Francis.h
@ -1,525 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/Francis.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef FRANCIS_H
 #define FRANCIS_H
 #include <cstdlib>
 #include <string>
 #include <cmath>
 #include <iostream>
 #include <sstream>
 #include <stdexcept>
 #include <fstream>
 #include <complex>
 #include <algorithm>
 //#include <timer.h>
 //#include <lapacke.h>
 //#include <Eigen/Dense>
 namespace Grid {
 template <class T> int SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
 template <class T> int     Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
 /**
  Find the eigenvalues of an upper hessenberg matrix using the Francis QR algorithm.
 H =
      x  x  x  x  x  x  x  x  x
      x  x  x  x  x  x  x  x  x
      0  x  x  x  x  x  x  x  x
      0  0  x  x  x  x  x  x  x
      0  0  0  x  x  x  x  x  x
      0  0  0  0  x  x  x  x  x
      0  0  0  0  0  x  x  x  x
      0  0  0  0  0  0  x  x  x
      0  0  0  0  0  0  0  x  x
 Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary.
 **/
 template <class T>
 int QReigensystem(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
 {
  DenseMatrix<T> H = Hin; 
  int N ; SizeSquare(H,N);
  int M = N;
  Fill(evals,0);
  Fill(evecs,0);
  T s,t,x=0,y=0,z=0;
  T u,d;
  T apd,amd,bc;
  DenseVector<T> p(N,0);
  T nrm = Norm(H);    ///DenseMatrix Norm
  int n, m;
  int e = 0;
  int it = 0;
  int tot_it = 0;
  int l = 0;
  int r = 0;
  DenseMatrix<T> P; Resize(P,N,N); Unity(P);
  DenseVector<int> trows(N,0);
  /// Check if the matrix is really hessenberg, if not abort
  RealD sth = 0;
  for(int j=0;j<N;j++){
    for(int i=j+2;i<N;i++){
      sth = abs(H[i][j]);
      if(sth > small){
 	std::cout << "Non hessenberg H = " << sth << " > " << small << std::endl;
 	exit(1);
      }
    }
  }
  do{
    std::cout << "Francis QR Step N = " << N << std::endl;
    /** Check for convergence
      x  x  x  x  x
      0  x  x  x  x
      0  0  x  x  x
      0  0  x  x  x
      0  0  0  0  x
      for this matrix l = 4
     **/
    do{
      l = Chop_subdiag(H,nrm,e,small);
      r = 0;    ///May have converged on more than one eval
      ///Single eval
      if(l == N-1){
        evals[e] = H[l][l];
        N--; e++; r++; it = 0;
      }
      ///RealD eval
      if(l == N-2){
        trows[l+1] = 1;    ///Needed for UTSolve
        apd = H[l][l] + H[l+1][l+1];
        amd = H[l][l] - H[l+1][l+1];
        bc =  (T)4.0*H[l+1][l]*H[l][l+1];
        evals[e]   = (T)0.5*( apd + sqrt(amd*amd + bc) );
        evals[e+1] = (T)0.5*( apd - sqrt(amd*amd + bc) );
        N-=2; e+=2; r++; it = 0;
      }
    } while(r>0);
    if(N ==0) break;
    DenseVector<T > ck; Resize(ck,3);
    DenseVector<T> v;   Resize(v,3);
    for(int m = N-3; m >= l; m--){
      ///Starting vector essentially random shift.
      if(it%10 == 0 && N >= 3 && it > 0){
        s = (T)1.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
        t = (T)0.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
        z = H[m+1][m]*H[m+2][m+1];
      }
      ///Starting vector implicit Q theorem
      else{
        s = (H[N-2][N-2] + H[N-1][N-1]);
        t = (H[N-2][N-2]*H[N-1][N-1] - H[N-2][N-1]*H[N-1][N-2]);
        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
        z = H[m+1][m]*H[m+2][m+1];
      }
      ck[0] = x; ck[1] = y; ck[2] = z;
      if(m == l) break;
      /** Some stupid thing from numerical recipies, seems to work**/
      // PAB.. for heaven's sake quote page, purpose, evidence it works.
      //       what sort of comment is that!?!?!?
      u=abs(H[m][m-1])*(abs(y)+abs(z));
      d=abs(x)*(abs(H[m-1][m-1])+abs(H[m][m])+abs(H[m+1][m+1]));
      if ((T)abs(u+d) == (T)abs(d) ){
 	l = m; break;
      }
      //if (u < small){l = m; break;}
    }
    if(it > 100000){
     std::cout << "QReigensystem: bugger it got stuck after 100000 iterations" << std::endl;
     std::cout << "got " << e << " evals " << l << " " << N << std::endl;
      exit(1);
    }
    normalize(ck);    ///Normalization cancels in PHP anyway
    T beta;
    Householder_vector<T >(ck, 0, 2, v, beta);
    Householder_mult<T >(H,v,beta,0,l,l+2,0);
    Householder_mult<T >(H,v,beta,0,l,l+2,1);
    ///Accumulate eigenvector
    Householder_mult<T >(P,v,beta,0,l,l+2,1);
    int sw = 0;      ///Are we on the last row?
    for(int k=l;k<N-2;k++){
      x = H[k+1][k];
      y = H[k+2][k];
      z = (T)0.0;
      if(k+3 <= N-1){
 	z = H[k+3][k];
      } else{
 	sw = 1; 
 	v[2] = (T)0.0;
      }
      ck[0] = x; ck[1] = y; ck[2] = z;
      normalize(ck);
      Householder_vector<T >(ck, 0, 2-sw, v, beta);
      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,0);
      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,1);
      ///Accumulate eigenvector
      Householder_mult<T >(P,v, beta,0,k+1,k+3-sw,1);
    }
    it++;
    tot_it++;
  }while(N > 1);
  N = evals.size();
  ///Annoying - UT solves in reverse order;
  DenseVector<T> tmp; Resize(tmp,N);
  for(int i=0;i<N;i++){
    tmp[i] = evals[N-i-1];
  } 
  evals = tmp;
  UTeigenvectors(H, trows, evals, evecs);
  for(int i=0;i<evals.size();i++){evecs[i] = P*evecs[i]; normalize(evecs[i]);}
  return tot_it;
 }
 template <class T>
 int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
 {
  /**
  Find the eigenvalues of an upper Hessenberg matrix using the Wilkinson QR algorithm.
  H =
  x  x  0  0  0  0
  x  x  x  0  0  0
  0  x  x  x  0  0
  0  0  x  x  x  0
  0  0  0  x  x  x
  0  0  0  0  x  x
  Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary.  **/
  return my_Wilkinson(Hin, evals, evecs, small, small);
 }
 template <class T>
 int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small, RealD tol)
 {
  int N; SizeSquare(Hin,N);
  int M = N;
  ///I don't want to modify the input but matricies must be passed by reference
  //Scale a matrix by its "norm"
  //RealD Hnorm = abs( Hin.LargestDiag() ); H =  H*(1.0/Hnorm);
  DenseMatrix<T> H;  H = Hin;
  RealD Hnorm = abs(Norm(Hin));
  H = H * (1.0 / Hnorm);
  // TODO use openmp and memset
  Fill(evals,0);
  Fill(evecs,0);
  T s, t, x = 0, y = 0, z = 0;
  T u, d;
  T apd, amd, bc;
  DenseVector<T> p; Resize(p,N); Fill(p,0);
  T nrm = Norm(H);    ///DenseMatrix Norm
  int n, m;
  int e = 0;
  int it = 0;
  int tot_it = 0;
  int l = 0;
  int r = 0;
  DenseMatrix<T> P; Resize(P,N,N);
  Unity(P);
  DenseVector<int> trows(N, 0);
  /// Check if the matrix is really symm tridiag
  RealD sth = 0;
  for(int j = 0; j < N; ++j)
  {
    for(int i = j + 2; i < N; ++i)
    {
      if(abs(H[i][j]) > tol || abs(H[j][i]) > tol)
      {
 	std::cout << "Non Tridiagonal H(" << i << ","<< j << ") = |" << Real( real( H[j][i] ) ) << "| > " << tol << std::endl;
 	std::cout << "Warning tridiagonalize and call again" << std::endl;
        // exit(1); // see what is going on
        //return;
      }
    }
  }
  do{
    do{
      //Jasper
      //Check if the subdiagonal term is small enough (<small)
      //if true then it is converged.
      //check start from H.dim - e - 1
      //How to deal with more than 2 are converged?
      //What if Chop_symm_subdiag return something int the middle?
      //--------------
      l = Chop_symm_subdiag(H,nrm, e, small);
      r = 0;    ///May have converged on more than one eval
      //Jasper
      //In this case
      // x  x  0  0  0  0
      // x  x  x  0  0  0
      // 0  x  x  x  0  0
      // 0  0  x  x  x  0
      // 0  0  0  x  x  0
      // 0  0  0  0  0  x  <- l
      //--------------
      ///Single eval
      if(l == N - 1)
      {
        evals[e] = H[l][l];
        N--;
        e++;
        r++;
        it = 0;
      }
      //Jasper
      // x  x  0  0  0  0
      // x  x  x  0  0  0
      // 0  x  x  x  0  0
      // 0  0  x  x  0  0
      // 0  0  0  0  x  x  <- l
      // 0  0  0  0  x  x
      //--------------
      ///RealD eval
      if(l == N - 2)
      {
        trows[l + 1] = 1;    ///Needed for UTSolve
        apd = H[l][l] + H[l + 1][ l + 1];
        amd = H[l][l] - H[l + 1][l + 1];
        bc =  (T) 4.0 * H[l + 1][l] * H[l][l + 1];
        evals[e] = (T) 0.5 * (apd + sqrt(amd * amd + bc));
        evals[e + 1] = (T) 0.5 * (apd - sqrt(amd * amd + bc));
        N -= 2;
        e += 2;
        r++;
        it = 0;
      }
    }while(r > 0);
    //Jasper
    //Already converged
    //--------------
    if(N == 0) break;
    DenseVector<T> ck,v; Resize(ck,2); Resize(v,2);
    for(int m = N - 3; m >= l; m--)
    {
      ///Starting vector essentially random shift.
      if(it%10 == 0 && N >= 3 && it > 0)
      {
        t = abs(H[N - 1][N - 2]) + abs(H[N - 2][N - 3]);
        x = H[m][m] - t;
        z = H[m + 1][m];
      } else {
      ///Starting vector implicit Q theorem
        d = (H[N - 2][N - 2] - H[N - 1][N - 1]) * (T) 0.5;
        t =  H[N - 1][N - 1] - H[N - 1][N - 2] * H[N - 1][N - 2] 
 	  / (d + sign(d) * sqrt(d * d + H[N - 1][N - 2] * H[N - 1][N - 2]));
        x = H[m][m] - t;
        z = H[m + 1][m];
      }
      //Jasper
      //why it is here????
      //-----------------------
      if(m == l)
        break;
      u = abs(H[m][m - 1]) * (abs(y) + abs(z));
      d = abs(x) * (abs(H[m - 1][m - 1]) + abs(H[m][m]) + abs(H[m + 1][m + 1]));
      if ((T)abs(u + d) == (T)abs(d))
      {
        l = m;
        break;
      }
    }
    //Jasper
    if(it > 1000000)
    {
      std::cout << "Wilkinson: bugger it got stuck after 100000 iterations" << std::endl;
      std::cout << "got " << e << " evals " << l << " " << N << std::endl;
      exit(1);
    }
    //
    T s, c;
    Givens_calc<T>(x, z, c, s);
    Givens_mult<T>(H, l, l + 1, c, -s, 0);
    Givens_mult<T>(H, l, l + 1, c,  s, 1);
    Givens_mult<T>(P, l, l + 1, c,  s, 1);
    //
    for(int k = l; k < N - 2; ++k)
    {
      x = H.A[k + 1][k];
      z = H.A[k + 2][k];
      Givens_calc<T>(x, z, c, s);
      Givens_mult<T>(H, k + 1, k + 2, c, -s, 0);
      Givens_mult<T>(H, k + 1, k + 2, c,  s, 1);
      Givens_mult<T>(P, k + 1, k + 2, c,  s, 1);
    }
    it++;
    tot_it++;
  }while(N > 1);
  N = evals.size();
  ///Annoying - UT solves in reverse order;
  DenseVector<T> tmp(N);
  for(int i = 0; i < N; ++i)
    tmp[i] = evals[N-i-1];
  evals = tmp;
  //
  UTeigenvectors(H, trows, evals, evecs);
  //UTSymmEigenvectors(H, trows, evals, evecs);
  for(int i = 0; i < evals.size(); ++i)
  {
    evecs[i] = P * evecs[i];
    normalize(evecs[i]);
    evals[i] = evals[i] * Hnorm;
  }
  // // FIXME this is to test
  // Hin.write("evecs3", evecs);
  // Hin.write("evals3", evals);
  // // check rsd
  // for(int i = 0; i < M; i++) {
  //   vector<T> Aevec = Hin * evecs[i];
  //   RealD norm2(0.);
  //   for(int j = 0; j < M; j++) {
  //     norm2 += (Aevec[j] - evals[i] * evecs[i][j]) * (Aevec[j] - evals[i] * evecs[i][j]);
  //   }
  // }
  return tot_it;
 }
 template <class T>
 void Hess(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
  /**
  turn a matrix A =
  x  x  x  x  x
  x  x  x  x  x
  x  x  x  x  x
  x  x  x  x  x
  x  x  x  x  x
  into
  x  x  x  x  x
  x  x  x  x  x
  0  x  x  x  x
  0  0  x  x  x
  0  0  0  x  x
  with householder rotations
  Slow.
  */
  int N ; SizeSquare(A,N);
  DenseVector<T > p; Resize(p,N); Fill(p,0);
  for(int k=start;k<N-2;k++){
    //cerr << "hess" << k << std::endl;
    DenseVector<T > ck,v; Resize(ck,N-k-1); Resize(v,N-k-1);
    for(int i=k+1;i<N;i++){ck[i-k-1] = A(i,k);}  ///kth column
    normalize(ck);    ///Normalization cancels in PHP anyway
    T beta;
    Householder_vector<T >(ck, 0, ck.size()-1, v, beta);  ///Householder vector
    Householder_mult<T>(A,v,beta,start,k+1,N-1,0);  ///A -> PA
    Householder_mult<T >(A,v,beta,start,k+1,N-1,1);  ///PA -> PAP^H
    ///Accumulate eigenvector
    Householder_mult<T >(Q,v,beta,start,k+1,N-1,1);  ///Q -> QP^H
  }
  /*for(int l=0;l<N-2;l++){
    for(int k=l+2;k<N;k++){
    A(0,k,l);
    }
    }*/
 }
 template <class T>
 void Tri(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
 ///Tridiagonalize a matrix
  int N; SizeSquare(A,N);
  Hess(A,Q,start);
  /*for(int l=0;l<N-2;l++){
    for(int k=l+2;k<N;k++){
    A(0,l,k);
    }
    }*/
 }
 template <class T>
 void ForceTridiagonal(DenseMatrix<T> &A){
 ///Tridiagonalize a matrix
  int N ; SizeSquare(A,N);
  for(int l=0;l<N-2;l++){
    for(int k=l+2;k<N;k++){
      A[l][k]=0;
      A[k][l]=0;
    }
  }
 }
 template <class T>
 int my_SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
  ///Solve a symmetric eigensystem, not necessarily in tridiagonal form
  int N; SizeSquare(Ain,N);
  DenseMatrix<T > A; A = Ain;
  DenseMatrix<T > Q; Resize(Q,N,N); Unity(Q);
  Tri(A,Q,0);
  int it = my_Wilkinson<T>(A, evals, evecs, small);
  for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
  return it;
 }
 template <class T>
 int Wilkinson(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
  return my_Wilkinson(Ain, evals, evecs, small);
 }
 template <class T>
 int SymmEigensystem(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
  return my_SymmEigensystem(Ain, evals, evecs, small);
 }
 template <class T>
 int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
 ///Solve a general eigensystem, not necessarily in tridiagonal form
  int N = Ain.dim;
  DenseMatrix<T > A(N); A = Ain;
  DenseMatrix<T > Q(N);Q.Unity();
  Hess(A,Q,0);
  int it = QReigensystem<T>(A, evals, evecs, small);
  for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
  return it;
 }
 }
 #endif
--- a/lib/algorithms/densematrix/Householder.h
+++ b/lib/algorithms/densematrix/Householder.h
@ -1,242 +0,0 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/Householder.h
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef HOUSEHOLDER_H
 #define HOUSEHOLDER_H
 #define TIMER(A) std::cout << GridLogMessage << __FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
 #define ENTER()  std::cout << GridLogMessage << "ENTRY "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
 #define LEAVE()  std::cout << GridLogMessage << "EXIT  "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
 #include <cstdlib>
 #include <string>
 #include <cmath>
 #include <iostream>
 #include <sstream>
 #include <stdexcept>
 #include <fstream>
 #include <complex>
 #include <algorithm>
 namespace Grid {
 /** Comparison function for finding the max element in a vector **/
 template <class T> bool cf(T i, T j) { 
  return abs(i) < abs(j); 
 }
 /** 
 	Calculate a real Givens angle 
 **/
 template <class T> inline void Givens_calc(T y, T z, T &c, T &s){
  RealD mz = (RealD)abs(z);
  if(mz==0.0){
    c = 1; s = 0;
  }
  if(mz >= (RealD)abs(y)){
    T t = -y/z;
    s = (T)1.0 / sqrt ((T)1.0 + t * t);
    c = s * t;
  } else {
    T t = -z/y;
    c = (T)1.0 / sqrt ((T)1.0 + t * t);
    s = c * t;
  }
 }
 template <class T> inline void Givens_mult(DenseMatrix<T> &A,  int i, int k, T c, T s, int dir)
 {
  int q ; SizeSquare(A,q);
  if(dir == 0){
    for(int j=0;j<q;j++){
      T nu = A[i][j];
      T w  = A[k][j];
      A[i][j] = (c*nu + s*w);
      A[k][j] = (-s*nu + c*w);
    }
  }
  if(dir == 1){
    for(int j=0;j<q;j++){
      T nu = A[j][i];
      T w  = A[j][k];
      A[j][i] = (c*nu - s*w);
      A[j][k] = (s*nu + c*w);
    }
  }
 }
 /**
 	from input = x;
 	Compute the complex Householder vector, v, such that
 	P = (I - b v transpose(v) )
 	b = 2/v.v
 	P | x |    | x | k = 0
 	| x |    | 0 | 
 	| x | =  | 0 |
 	| x |    | 0 | j = 3
 	| x |	   | x |
 	These are the "Unreduced" Householder vectors.
 **/
 template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, DenseVector<T> &v, T &beta)
 {
  int N ; Size(input,N);
  T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T> );
  if(abs(m) > 0.0){
    T alpha = 0;
    for(int i=k; i<j+1; i++){
      v[i] = input[i]/m;
      alpha = alpha + v[i]*conj(v[i]);
    }
    alpha = sqrt(alpha);
    beta = (T)1.0/(alpha*(alpha + abs(v[k]) ));
    if(abs(v[k]) > 0.0)  v[k] = v[k] + (v[k]/abs(v[k]))*alpha;
    else                 v[k] = -alpha;
  } else{
    for(int i=k; i<j+1; i++){
      v[i] = 0.0;
    } 
  }
 }
 /**
 	from input = x;
 	Compute the complex Householder vector, v, such that
 	P = (I - b v transpose(v) )
 	b = 2/v.v
 	Px = alpha*e_dir
 	These are the "Unreduced" Householder vectors.
 **/
 template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, int dir, DenseVector<T> &v, T &beta)
 {
  int N = input.size();
  T m = *max_element(input.begin() + k, input.begin() + j + 1, cf);
  if(abs(m) > 0.0){
    T alpha = 0;
    for(int i=k; i<j+1; i++){
      v[i] = input[i]/m;
      alpha = alpha + v[i]*conj(v[i]);
    }
    alpha = sqrt(alpha);
    beta = 1.0/(alpha*(alpha + abs(v[dir]) ));
    if(abs(v[dir]) > 0.0) v[dir] = v[dir] + (v[dir]/abs(v[dir]))*alpha;
    else                  v[dir] = -alpha;
  }else{
    for(int i=k; i<j+1; i++){
      v[i] = 0.0;
    } 
  }
 }
 /**
 	Compute the product PA if trans = 0
 	AP if trans = 1
 	P = (I - b v transpose(v) )
 	b = 2/v.v
 	start at element l of matrix A
 	v is of length j - k + 1 of v are nonzero
 **/
 template <class T> inline void Householder_mult(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int k, int j, int trans)
 {
  int N ; SizeSquare(A,N);
  if(abs(beta) > 0.0){
    for(int p=l; p<N; p++){
      T s = 0;
      if(trans==0){
 	for(int i=k;i<j+1;i++) s += conj(v[i-k])*A[i][p];
 	s *= beta;
 	for(int i=k;i<j+1;i++){ A[i][p] = A[i][p]-s*conj(v[i-k]);}
      } else {
 	for(int i=k;i<j+1;i++){ s += conj(v[i-k])*A[p][i];}
 	s *= beta;
 	for(int i=k;i<j+1;i++){ A[p][i]=A[p][i]-s*conj(v[i-k]);}
      }
    }
  }
 }
 /**
 	Compute the product PA if trans = 0
 	AP if trans = 1
 	P = (I - b v transpose(v) )
 	b = 2/v.v
 	start at element l of matrix A
 	v is of length j - k + 1 of v are nonzero
 	A is tridiagonal
 **/
 template <class T> inline void Householder_mult_tri(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int M, int k, int j, int trans)
 {
  if(abs(beta) > 0.0){
    int N ; SizeSquare(A,N);
    DenseMatrix<T> tmp; Resize(tmp,N,N); Fill(tmp,0); 
    T s;
    for(int p=l; p<M; p++){
      s = 0;
      if(trans==0){
 	for(int i=k;i<j+1;i++) s = s + conj(v[i-k])*A[i][p];
      }else{
 	for(int i=k;i<j+1;i++) s = s + v[i-k]*A[p][i];
      }
      s = beta*s;
      if(trans==0){
 	for(int i=k;i<j+1;i++) tmp[i][p] = tmp(i,p) - s*v[i-k];
      }else{
 	for(int i=k;i<j+1;i++) tmp[p][i] = tmp[p][i] - s*conj(v[i-k]);
      }
    }
    for(int p=l; p<M; p++){
      if(trans==0){
 	for(int i=k;i<j+1;i++) A[i][p] = A[i][p] + tmp[i][p];
      }else{
 	for(int i=k;i<j+1;i++) A[p][i] = A[p][i] + tmp[p][i];
      }
    }
  }
 }
 }
 #endif
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@ -39,7 +39,9 @@ void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
                   int *info);
 #endif
-#include <Grid/algorithms/densematrix/DenseMatrix.h>
+template<class T> using DenseVector = std::vector<T>;
 //#include <Grid/algorithms/densematrix/DenseMatrix.h>
 #include <Grid/algorithms/iterative/EigenSort.h>
 namespace Grid {
@ -47,104 +49,85 @@ namespace Grid {
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
 template<class Field> 
-    class ImplicitlyRestartedLanczos {
+class ImplicitlyRestartedLanczos {
    const RealD small = 1.0e-16;
 public:       
-    int lock;
+  int Niter;   // Max iterations
    int get;
    int Niter;
    int converged;
  int Nstop;   // Number of evecs checked for convergence
  int Nk;      // Number of converged sought
    int Np;      // Np -- Number of spare vecs in kryloc space
  int Nm;      // Nm -- total number of vectors
  RealD eresid;
  ////////////////////////////////////
  // Embedded objects
  ////////////////////////////////////
           SortEigen<Field> _sort;
 //    GridCartesian &_fgrid;
  LinearOperatorBase<Field> &_Linop;
    OperatorFunction<Field> &_poly;
  /////////////////////////
  // Constructor
  /////////////////////////
-    void init(void){};
+ ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
    void Abort(int ff, DenseVector<RealD> &evals,  DenseVector<DenseVector<RealD> > &evecs);
    ImplicitlyRestartedLanczos(
 				LinearOperatorBase<Field> &Linop, // op
 			    OperatorFunction<Field> & poly,   // polynmial
 			    int _Nstop, // sought vecs
 			    int _Nk,    // sought vecs
-			       int _Nm, // spare vecs
+			    int _Nm,    // total vecs
 			    RealD _eresid, // resid in lmdue deficit 
 			    int _Niter) : // Max iterations
-      _Linop(Linop),
+    _Linop(Linop),    _poly(poly),
-      _poly(poly),
+    Nstop(_Nstop), Nk(_Nk), Nm(_Nm),
-      Nstop(_Nstop),
+    eresid(_eresid),  Niter(_Niter)  { };
      Nk(_Nk),
      Nm(_Nm),
      eresid(_eresid),
      Niter(_Niter)
    { 
      Np = Nm-Nk; assert(Np>0);
    };
-    ImplicitlyRestartedLanczos(
+#if 0
-				LinearOperatorBase<Field> &Linop, // op
+    ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
 			       OperatorFunction<Field> & poly,   // polynmial
 			       int _Nk, // sought vecs
-			       int _Nm, // spare vecs
+			       int _Nm, // total vecs
 			       RealD _eresid, // resid in lmdue deficit 
 			       int _Niter) : // Max iterations
-      _Linop(Linop),
+    _Linop(Linop),      _poly(poly),
-      _poly(poly),
+    Nstop(_Nk), Nk(_Nk), Nm(_Nm),      
-      Nstop(_Nk),
+    eresid(_eresid),      Niter(_Niter) { };
-      Nk(_Nk),
+#endif
      Nm(_Nm),
      eresid(_eresid),
      Niter(_Niter)
    { 
      Np = Nm-Nk; assert(Np>0);
    };
-    /////////////////////////
+#if 0
-    // Sanity checked this routine (step) against Saad.
+    void calc(DenseVector<RealD>& eval,
-    /////////////////////////
+	      DenseVector<Field>& evec,
-    void RitzMatrix(DenseVector<Field>& evec,int k){
+	      const Field& src,
 	      int& Nconv);
-      if(1) return;
+    void step(DenseVector<RealD>& lmd,
 	      DenseVector<RealD>& lme, 
 	      DenseVector<Field>& evec,
 	      Field& w,int Nm,int k);
-      GridBase *grid = evec[0]._grid;
+    void setUnit_Qt(int Nm, DenseVector<RealD> &Qt) ;
-      Field w(grid);
+
-      std::cout << "RitzMatrix "<<std::endl;
+    static RealD normalise(Field& v) ;
-      for(int i=0;i<k;i++){
+    void orthogonalize(Field& w, DenseVector<Field>& evec, int k);
-	_poly(_Linop,evec[i],w);
+    void diagonalize(DenseVector<RealD>& lmd,
-	std::cout << "["<<i<<"] ";
+		     DenseVector<RealD>& lme, 
-	for(int j=0;j<k;j++){
+		     int N2, int N1,
-	  ComplexD in = innerProduct(evec[j],w);
+		     DenseVector<RealD>& Qt,
-	  if ( fabs((double)i-j)>1 ) { 
+		     GridBase *grid);
-	    if (abs(in) >1.0e-9 )  { 
+
-	      std::cout<<"oops"<<std::endl;
+    void qr_decomp(DenseVector<RealD>& lmd,
-	      abort();
+		   DenseVector<RealD>& lme,
-	    } else 
+		   int Nk, int Nm,
-	      std::cout << " 0 ";
+		   DenseVector<RealD>& Qt,
-	  } else { 
+		   RealD Dsh, int kmin, int kmax);
-	    std::cout << " "<<in<<" ";
+
-	  }
+#ifdef USE_LAPACK
-	}
+    void diagonalize_lapack(DenseVector<RealD>& lmd,
-	std::cout << std::endl;
+			    DenseVector<RealD>& lme, 
-      }
+			    int N1, int N2,
-    }
+			    DenseVector<RealD>& Qt,
 			    GridBase *grid);
 #endif
 #endif
 /* Saad PP. 195
 1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
@ -161,12 +144,12 @@ public:
 	      DenseVector<Field>& evec,
 	      Field& w,int Nm,int k)
    {
      const RealD tiny = 1.0e-20;
      assert( k< Nm );
      _poly(_Linop,evec[k],w);      // 3. wk:=Avk−βkv_{k−1}
-      if(k>0){
+
-	w -= lme[k-1] * evec[k-1];
+      if(k>0) w -= lme[k-1] * evec[k-1];
      }    
      ComplexD zalph = innerProduct(evec[k],w); // 4. αk:=(wk,vk)
      RealD     alph = real(zalph);
@ -176,29 +159,20 @@ public:
      RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
                                 // 7. vk+1 := wk/βk+1
 //	std::cout << "alpha = " << zalph << " beta "<<beta<<std::endl;
      const RealD tiny = 1.0e-20;
      if ( beta < tiny ) { 
 	std::cout << " beta is tiny "<<beta<<std::endl;
     }
      lmd[k] = alph;
      lme[k] = beta;
-      if (k>0) { 
+      if ( k > 0 ) orthogonalize(w,evec,k); // orthonormalise
-	orthogonalize(w,evec,k); // orthonormalise
+      if ( k < Nm-1) evec[k+1] = w;
      if ( beta < tiny ) std::cout << " beta is tiny "<<beta<<std::endl;
    }
-      if(k < Nm-1) evec[k+1] = w;
+    void qr_decomp(DenseVector<RealD>& lmd,   // Nm 
-    }
+		   DenseVector<RealD>& lme,   // Nm 
-
+		   int Nk, int Nm,
-    void qr_decomp(DenseVector<RealD>& lmd,
+		   DenseVector<RealD>& Qt,     // Nm x Nm matrix
-		   DenseVector<RealD>& lme,
+		   RealD Dsh, int kmin, int kmax)
 		   int Nk,
 		   int Nm,
 		   DenseVector<RealD>& Qt,
 		   RealD Dsh, 
 		   int kmin,
 		   int kmax)
    {
      int k = kmin-1;
      RealD x;
@ -254,30 +228,31 @@ public:
      }
    }
 #ifdef USE_LAPACK
    void diagonalize_lapack(DenseVector<RealD>& lmd,
 			    DenseVector<RealD>& lme, 
 			    int N1,
 			    int N2,
 			    DenseVector<RealD>& Qt,
-		     GridBase *grid){
+			    GridBase *grid)
    {
      const int size = Nm;
 //  tevals.resize(size);
 //  tevecs.resize(size);
      int NN = N1;
      double evals_tmp[NN];
      double evec_tmp[NN][NN];
      memset(evec_tmp[0],0,sizeof(double)*NN*NN);
 //  double AA[NN][NN];
      double DD[NN];
      double EE[NN];
-  for (int i = 0; i< NN; i++)
+      for (int i = 0; i< NN; i++) {
-    for (int j = i - 1; j <= i + 1; j++)
+	for (int j = i - 1; j <= i + 1; j++) {
 	  if ( j < NN && j >= 0 ) {
 	    if (i==j) DD[i] = lmd[i];
 	    if (i==j) evals_tmp[i] = lmd[i];
 	    if (j==(i-1)) EE[j] = lme[j];
 	  }
 	}
      }
      int evals_found;
      int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
      int liwork =  3+NN*10 ;
@ -291,9 +266,6 @@ public:
      char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
      int ifail[NN];
      int info;
 //  int total = QMP_get_number_of_nodes();
 //  int node = QMP_get_node_number();
 //  GridBase *grid = evec[0]._grid;
      int total = grid->_Nprocessors;
      int node  = grid->_processor;
      int interval = (NN/total)+1;
@ -304,7 +276,6 @@ public:
      if (1) {
 	memset(evals_tmp,0,sizeof(double)*NN);
 	if ( il <= NN){
        printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu);
 	  LAPACK_dstegr(&jobz, &range, &NN,
 			(double*)DD, (double*)EE,
 			&vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
@ -314,7 +285,6 @@ public:
 			work, &lwork, iwork, &liwork,
 			&info);
 	  for (int i = iu-1; i>= il-1; i--){
          printf("node=%d evals_found=%d evals_tmp[%d] = %g\n",node,evals_found, i - (il-1),evals_tmp[i - (il-1)]);
 	    evals_tmp[i] = evals_tmp[i - (il-1)];
 	    if (il>1) evals_tmp[i-(il-1)]=0.;
 	    for (int j = 0; j< NN; j++){
@ -324,22 +294,22 @@ public:
 	  }
 	}
 	{
 //        QMP_sum_double_array(evals_tmp,NN);
 //        QMP_sum_double_array((double *)evec_tmp,NN*NN);
 	  grid->GlobalSumVector(evals_tmp,NN);
 	  grid->GlobalSumVector((double*)evec_tmp,NN*NN);
 	}
      } 
-// cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
+      // cheating a bit.
      // It is better to sort instead of just reversing it, 
      // but the document of the routine says evals are sorted in increasing order. 
      // qr gives evals in decreasing order.
      for(int i=0;i<NN;i++){
 	for(int j=0;j<NN;j++)
 	  Qt[(NN-1-i)*N2+j]=evec_tmp[i][j];
 	lmd [NN-1-i]=evals_tmp[i];
      }
-}
+    }
 #endif
    void diagonalize(DenseVector<RealD>& lmd,
 		     DenseVector<RealD>& lme, 
 		     int N2,
@ -361,17 +331,16 @@ public:
      lmd2[k] = lmd[k];
      lme2[k] = lme[k];
    }
-         for(int k=0; k<N1*N1; ++k)
+    for(int k=0; k<N1*N1; ++k){
      Qt2[k] = Qt[k];
-
+    }
 //	diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
 #endif
      int Niter = 100*N1;
      int kmin = 1;
      int kmax = N2;
      // (this should be more sophisticated)
      // (this should be more sophisticated)
      for(int iter=0; iter<Niter; ++iter){
 	// determination of 2x2 leading submatrix
@ -402,10 +371,6 @@ public:
 	  _sort.push(lmd2,N2);
 	  for(int k=0; k<N2; ++k){
 	    if (fabs(lmd2[k] - lmd3[k]) >SMALL)  std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl;
 //	    if (fabs(lme2[k] - lme[k]) >SMALL)  std::cout <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl;
 	  }
         for(int k=0; k<N1*N1; ++k){
 //	    if (fabs(Qt2[k] - Qt[k]) >SMALL)  std::cout <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl;
 	  }
 	}
 #endif
@ -424,7 +389,6 @@ public:
      abort();
    }
 #if 1
    static RealD normalise(Field& v) 
    {
      RealD nn = norm2(v);
@ -457,6 +421,7 @@ public:
      normalise(w);
    }
    void setUnit_Qt(int Nm, DenseVector<RealD> &Qt) {
      for(int i=0; i<Qt.size(); ++i) Qt[i] = 0.0;
      for(int k=0; k<Nm; ++k) Qt[k + k*Nm] = 1.0;
@ -488,8 +453,9 @@ until convergence
 	GridBase *grid = evec[0]._grid;
 	assert(grid == src._grid);
-	std::cout << " -- Nk = " << Nk << " Np = "<< Np << std::endl;
+	std::cout << " -- seek   Nk    = " << Nk    <<" vectors"<< std::endl;
-	std::cout << " -- Nm = " << Nm << std::endl;
+	std::cout << " -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
 	std::cout << " -- total  Nm    = " << Nm    <<" vectors"<< std::endl;
 	std::cout << " -- size of eval = " << eval.size() << std::endl;
 	std::cout << " -- size of evec = " << evec.size() << std::endl;
@ -514,38 +480,24 @@ until convergence
 	RealD beta_k;
 	// Set initial vector
 	// (uniform vector) Why not src??
 	//	evec[0] = 1.0;
 	evec[0] = src;
 	std:: cout <<"norm2(src)= " << norm2(src)<<std::endl;
-// << src._grid  << std::endl;
+
 	normalise(evec[0]);
 	std:: cout <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
 // << evec[0]._grid << std::endl;
 	// Initial Nk steps
 	for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
 //	std:: cout <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl;
 //	std:: cout <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl;
 	RitzMatrix(evec,Nk);
 	for(int k=0; k<Nk; ++k){
 //	std:: cout <<"eval " << k << " " <<eval[k] << std::endl;
 //	std:: cout <<"lme " << k << " " << lme[k] << std::endl;
 	}
 	// Restarting loop begins
-	for(int iter = 0; iter<Niter; ++iter){
+	int iter;
 	for(iter = 0; iter<Niter; ++iter){
 	  std::cout<<"\n Restart iteration = "<< iter << std::endl;
 	  // 
 	  // Rudy does a sort first which looks very different. Getting fed up with sorting out the algo defs.
 	  // We loop over 
 	  //
 	  for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
 	  f *= lme[Nm-1];
-	  RitzMatrix(evec,k2);
+	  f *= lme[Nm-1];
 	  // getting eigenvalues
 	  for(int k=0; k<Nm; ++k){
@ -561,9 +513,8 @@ until convergence
 	  // Implicitly shifted QR transformations
 	  setUnit_Qt(Nm,Qt);
 	  for(int ip=k2; ip<Nm; ++ip){ 
-	std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
+	    //	    std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
 	    qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
 	  }
 	  for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
@ -602,15 +553,11 @@ until convergence
 	      B[j].checkerboard = evec[k].checkerboard;
 	      B[j] += Qt[k+j*Nm] * evec[k];
 	    }
 //	    std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl;
 	  }
 //	_sort.push(eval2,B,Nk);
 	  Nconv = 0;
 	  //	  std::cout << std::setiosflags(std::ios_base::scientific);
 	  for(int i=0; i<Nk; ++i){
 //	    _poly(_Linop,B[i],v);
 	    _Linop.HermOp(B[i],v);
 	    RealD vnum = real(innerProduct(B[i],v)); // HermOp.
@ -631,8 +578,6 @@ until convergence
 	    }
 	  }  // i-loop end
 	  //	  std::cout << std::resetiosflags(std::ios_base::scientific);
 	  std::cout<<" #modes converged: "<<Nconv<<std::endl;
@ -655,556 +600,10 @@ until convergence
      _sort.push(eval,evec,Nconv);
      std::cout << "\n Converged\n Summary :\n";
-      std::cout << " -- Iterations  = "<< Nconv  << "\n";
+      std::cout << " -- Iterations  = "<< iter   << "\n";
      std::cout << " -- beta(k)     = "<< beta_k << "\n";
      std::cout << " -- Nconv       = "<< Nconv  << "\n";
     }
    /////////////////////////////////////////////////
    // Adapted from Rudy's lanczos factor routine
    /////////////////////////////////////////////////
    int Lanczos_Factor(int start, int end,  int cont,
 		       DenseVector<Field> & bq, 
 		       Field &bf,
 		       DenseMatrix<RealD> &H){
      GridBase *grid = bq[0]._grid;
      RealD beta;  
      RealD sqbt;  
      RealD alpha;
      for(int i=start;i<Nm;i++){
 	for(int j=start;j<Nm;j++){
 	  H[i][j]=0.0;
 	}
      }
      std::cout<<"Lanczos_Factor start/end " <<start <<"/"<<end<<std::endl;
      // Starting from scratch, bq[0] contains a random vector and |bq[0]| = 1
      int first;
      if(start == 0){
 	std::cout << "start == 0\n"; //TESTING
 	_poly(_Linop,bq[0],bf);
 	alpha = real(innerProduct(bq[0],bf));//alpha =  bq[0]^dag A bq[0]
 	std::cout << "alpha = " << alpha << std::endl;
 	bf = bf - alpha * bq[0];  //bf =  A bq[0] - alpha bq[0]
 	H[0][0]=alpha;
 	std::cout << "Set H(0,0) to " << H[0][0] << std::endl;
 	first = 1;
      } else {
 	first = start;
      }
      // I think start==0 and cont==zero are the same. Test this
      // If so I can drop "cont" parameter?
      if( cont ) assert(start!=0);
      if( start==0 ) assert(cont!=0);
      if( cont){
 	beta = 0;sqbt = 0;
 	std::cout << "cont is true so setting beta to zero\n";
      }	else {
 	beta = norm2(bf);
 	sqbt = sqrt(beta);
 	std::cout << "beta = " << beta << std::endl;
      }
      for(int j=first;j<end;j++){
 	std::cout << "Factor j " << j <<std::endl;
 	if(cont){ // switches to factoring; understand start!=0 and initial bf value is right.
 	  bq[j] = bf; cont = false;
 	}else{
 	  bq[j] = (1.0/sqbt)*bf ;
 	  H[j][j-1]=H[j-1][j] = sqbt;
 	}
 	_poly(_Linop,bq[j],bf);
 	bf = bf - (1.0/sqbt)*bq[j-1]; 	       //bf = A bq[j] - beta bq[j-1] // PAB this comment was incorrect in beta term??
 	alpha = real(innerProduct(bq[j],bf));  //alpha = bq[j]^dag A bq[j]
 	bf = bf - alpha*bq[j];                 //bf = A bq[j] - beta bq[j-1] - alpha bq[j]
 	RealD fnorm = norm2(bf);
 	RealD bck = sqrt( real( conjugate(alpha)*alpha ) + beta );
 	beta = fnorm;
 	sqbt = sqrt(beta);
 	std::cout << "alpha = " << alpha << " fnorm = " << fnorm << '\n';
 	///Iterative refinement of orthogonality V = [ bq[0]  bq[1]  ...  bq[M] ]
 	int re = 0;
 	// FIXME undefined params; how set in Rudy's code
 	int ref =0;
 	Real rho = 1.0e-8;
 	while( re == ref || (sqbt < rho * bck && re < 5) ){
 	  Field tmp2(grid);
 	  Field tmp1(grid);
 	  //bex = V^dag bf
 	  DenseVector<ComplexD> bex(j+1);
 	  for(int k=0;k<j+1;k++){
 	    bex[k] = innerProduct(bq[k],bf);
 	  }
 	  zero_fermion(tmp2);
 	  //tmp2 = V s
 	  for(int l=0;l<j+1;l++){
 	    RealD nrm = norm2(bq[l]);
 	    axpy(tmp1,0.0,bq[l],bq[l]); scale(tmp1,bex[l]); 	//tmp1 = V[j] bex[j]
 	    axpy(tmp2,1.0,tmp2,tmp1);					//tmp2 += V[j] bex[j]
 	  }
 	  //bf = bf - V V^dag bf.   Subtracting off any component in span { V[j] } 
 	  RealD btc = axpy_norm(bf,-1.0,tmp2,bf);
 	  alpha = alpha + real(bex[j]);	      sqbt = sqrt(real(btc));	      
 	  // FIXME is alpha real in RUDY's code?
 	  RealD nmbex = 0;for(int k=0;k<j+1;k++){nmbex = nmbex + real( conjugate(bex[k])*bex[k]  );}
 	  bck = sqrt( nmbex );
 	  re++;
 	}
 	std::cout << "Iteratively refined orthogonality, changes alpha\n";
 	if(re > 1) std::cout << "orthagonality refined " << re << " times" <<std::endl;
 	H[j][j]=alpha;
      }
      return end;
    }
    void EigenSort(DenseVector<double> evals,
 		   DenseVector<Field>  evecs){
      int N= evals.size();
      _sort.push(evals,evecs, evals.size(),N);
    }
    void ImplicitRestart(int TM, DenseVector<RealD> &evals,  DenseVector<DenseVector<RealD> > &evecs, DenseVector<Field> &bq, Field &bf, int cont)
    {
      std::cout << "ImplicitRestart begin. Eigensort starting\n";
      DenseMatrix<RealD> H; Resize(H,Nm,Nm);
      EigenSort(evals, evecs);
      ///Assign shifts
      int K=Nk;
      int M=Nm;
      int P=Np;
      int converged=0;
      if(K - converged < 4) P = (M - K-1); //one
      //      DenseVector<RealD> shifts(P + shift_extra.size());
      DenseVector<RealD> shifts(P);
      for(int k = 0; k < P; ++k)
 	shifts[k] = evals[k]; 
      /// Shift to form a new H and q
      DenseMatrix<RealD> Q; Resize(Q,TM,TM);
      Unity(Q);
      Shift(Q, shifts); // H is implicitly passed in in Rudy's Shift routine
      int ff = K;
      /// Shifted H defines a new K step Arnoldi factorization
      RealD  beta = H[ff][ff-1]; 
      RealD  sig  = Q[TM - 1][ff - 1];
      std::cout << "beta = " << beta << " sig = " << real(sig) <<std::endl;
      std::cout << "TM = " << TM << " ";
      std::cout << norm2(bq[0]) << " -- before" <<std::endl;
      /// q -> q Q
      times_real(bq, Q, TM);
      std::cout << norm2(bq[0]) << " -- after " << ff <<std::endl;
      bf =  beta* bq[ff] + sig* bf;
      /// Do the rest of the factorization
      ff = Lanczos_Factor(ff, M,cont,bq,bf,H);
      if(ff < M)
 	Abort(ff, evals, evecs);
    }
 ///Run the Eigensolver
    void Run(int cont, DenseVector<Field> &bq, Field &bf, DenseVector<DenseVector<RealD> > & evecs,DenseVector<RealD> &evals)
    {
      init();
      int M=Nm;
      DenseMatrix<RealD> H; Resize(H,Nm,Nm);
      Resize(evals,Nm);
      Resize(evecs,Nm);
      int ff = Lanczos_Factor(0, M, cont, bq,bf,H); // 0--M to begin with
      if(ff < M) {
 	std::cout << "Krylov: aborting ff "<<ff <<" "<<M<<std::endl;
 	abort(); // Why would this happen?
      }
      int itcount = 0;
      bool stop = false;
      for(int it = 0; it < Niter && (converged < Nk); ++it) {
 	std::cout << "Krylov: Iteration --> " << it << std::endl;
 	int lock_num = lock ? converged : 0;
 	DenseVector<RealD> tevals(M - lock_num );
 	DenseMatrix<RealD> tevecs; Resize(tevecs,M - lock_num,M - lock_num);
 	//check residual of polynominal 
 	TestConv(H,M, tevals, tevecs);
 	if(converged >= Nk)
 	    break;
 	ImplicitRestart(ff, tevals,tevecs,H);
      }
      Wilkinson<RealD>(H, evals, evecs, small); 
      //      Check();
      std::cout << "Done  "<<std::endl;
    }
   ///H - shift I = QR; H = Q* H Q
    void Shift(DenseMatrix<RealD> & H,DenseMatrix<RealD> &Q, DenseVector<RealD> shifts) {
      int P; Size(shifts,P);
      int M; SizeSquare(Q,M);
      Unity(Q);
      int lock_num = lock ? converged : 0;
      RealD t_Househoulder_vector(0.0);
      RealD t_Househoulder_mult(0.0);
      for(int i=0;i<P;i++){
 	RealD x, y, z;
 	DenseVector<RealD> ck(3), v(3);
 	x = H[lock_num+0][lock_num+0]-shifts[i];
 	y = H[lock_num+1][lock_num+0];
 	ck[0] = x; ck[1] = y; ck[2] = 0; 
 	normalise(ck);	///Normalization cancels in PHP anyway
 	RealD beta;
 	Householder_vector<RealD>(ck, 0, 2, v, beta);
 	Householder_mult<RealD>(H,v,beta,0,lock_num+0,lock_num+2,0);
 	Householder_mult<RealD>(H,v,beta,0,lock_num+0,lock_num+2,1);
 	///Accumulate eigenvector
 	Householder_mult<RealD>(Q,v,beta,0,lock_num+0,lock_num+2,1);
 	int sw = 0;
 	for(int k=lock_num+0;k<M-2;k++){
 	  x = H[k+1][k]; 
 	  y = H[k+2][k]; 
 	  z = (RealD)0.0;
 	  if(k+3 <= M-1){
 	    z = H[k+3][k];
 	  }else{
 	    sw = 1; v[2] = 0.0;
 	  }
 	  ck[0] = x; ck[1] = y; ck[2] = z;
 	  normalise(ck);
 	  Householder_vector<RealD>(ck, 0, 2-sw, v, beta);
 	  Householder_mult<RealD>(H,v, beta,0,k+1,k+3-sw,0);
 	  Householder_mult<RealD>(H,v, beta,0,k+1,k+3-sw,1);
 	  ///Accumulate eigenvector
 	  Householder_mult<RealD>(Q,v, beta,0,k+1,k+3-sw,1);
 	}
      }
    }
    void TestConv(DenseMatrix<RealD> & H,int SS, 
 		  DenseVector<Field> &bq, Field &bf,
 		  DenseVector<RealD> &tevals, DenseVector<DenseVector<RealD> > &tevecs, 
 		  int lock, int converged)
    {
      std::cout << "Converged " << converged << " so far." << std::endl;
      int lock_num = lock ? converged : 0;
      int M = Nm;
      ///Active Factorization
      DenseMatrix<RealD> AH; Resize(AH,SS - lock_num,SS - lock_num );
      AH = GetSubMtx(H,lock_num, SS, lock_num, SS);
      int NN=tevals.size();
      int AHsize=SS-lock_num;
      RealD small=1.0e-16;
      Wilkinson<RealD>(AH, tevals, tevecs, small);
      EigenSort(tevals, tevecs);
      RealD resid_nrm=  norm2(bf);
      if(!lock) converged = 0;
 #if 0
      for(int i = SS - lock_num - 1; i >= SS - Nk && i >= 0; --i){
 	RealD diff = 0;
 	diff = abs( tevecs[i][Nm - 1 - lock_num] ) * resid_nrm;
 	std::cout << "residual estimate " << SS-1-i << " " << diff << " of (" << tevals[i] << ")" << std::endl;
 	if(diff < converged) {
 	  if(lock) {
 	    DenseMatrix<RealD> Q; Resize(Q,M,M);
 	    bool herm = true; 
 	    Lock(H, Q, tevals[i], converged, small, SS, herm);
 	    times_real(bq, Q, bq.size());
 	    bf = Q[M - 1][M - 1]* bf;
 	    lock_num++;
 	  }
 	  converged++;
 	  std::cout << " converged on eval " << converged << " of " << Nk << std::endl;
 	} else {
 	  break;
 	}
      }
 #endif
      std::cout << "Got " << converged << " so far " <<std::endl;	
    }
    ///Check
    void Check(DenseVector<RealD> &evals,
 	       DenseVector<DenseVector<RealD> > &evecs) {
      DenseVector<RealD> goodval(this->get);
      EigenSort(evals,evecs);
      int NM = Nm;
      DenseVector< DenseVector<RealD> > V; Size(V,NM);
      DenseVector<RealD> QZ(NM*NM);
      for(int i = 0; i < NM; i++){
 	for(int j = 0; j < NM; j++){
 	  // evecs[i][j];
 	}
      }
    }
 /**
   There is some matrix Q such that for any vector y
   Q.e_1 = y and Q is unitary.
 **/
  template<class T>
  static T orthQ(DenseMatrix<T> &Q, DenseVector<T> y){
    int N = y.size();	//Matrix Size
    Fill(Q,0.0);
    T tau;
    for(int i=0;i<N;i++){
      Q[i][0]=y[i];
    }
    T sig = conj(y[0])*y[0];
    T tau0 = abs(sqrt(sig));
    for(int j=1;j<N;j++){
      sig += conj(y[j])*y[j]; 
      tau = abs(sqrt(sig) ); 	
      if(abs(tau0) > 0.0){
 	T gam = conj( (y[j]/tau)/tau0 );
 	for(int k=0;k<=j-1;k++){  
 	  Q[k][j]=-gam*y[k];
 	}
 	Q[j][j]=tau0/tau;
      } else {
 	Q[j-1][j]=1.0;
      }
      tau0 = tau;
    }
    return tau;
  }
 /**
 	There is some matrix Q such that for any vector y
 	Q.e_k = y and Q is unitary.
 **/
  template< class T>
  static T orthU(DenseMatrix<T> &Q, DenseVector<T> y){
    T tau = orthQ(Q,y);
    SL(Q);
    return tau;
  }
 /**
 	Wind up with a matrix with the first con rows untouched
 say con = 2
 	Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st colum
 	and the matrix is upper hessenberg
 	and with f and Q appropriately modidied with Q is the arnoldi factorization
 **/
 template<class T>
 static void Lock(DenseMatrix<T> &H, 	// Hess mtx	
 		 DenseMatrix<T> &Q, 	// Lock Transform
 		 T val, 		// value to be locked
 		 int con, 	// number already locked
 		 RealD small,
 		 int dfg,
 		 bool herm)
 {	
  //ForceTridiagonal(H);
  int M = H.dim;
  DenseVector<T> vec; Resize(vec,M-con);
  DenseMatrix<T> AH; Resize(AH,M-con,M-con);
  AH = GetSubMtx(H,con, M, con, M);
  DenseMatrix<T> QQ; Resize(QQ,M-con,M-con);
  Unity(Q);   Unity(QQ);
  DenseVector<T> evals; Resize(evals,M-con);
  DenseMatrix<T> evecs; Resize(evecs,M-con,M-con);
  Wilkinson<T>(AH, evals, evecs, small);
  int k=0;
  RealD cold = abs( val - evals[k]); 
  for(int i=1;i<M-con;i++){
    RealD cnew = abs( val - evals[i]);
    if( cnew < cold ){k = i; cold = cnew;}
  }
  vec = evecs[k];
  ComplexD tau;
  orthQ(QQ,vec);
  //orthQM(QQ,AH,vec);
  AH = Hermitian(QQ)*AH;
  AH = AH*QQ;
  for(int i=con;i<M;i++){
    for(int j=con;j<M;j++){
      Q[i][j]=QQ[i-con][j-con];
      H[i][j]=AH[i-con][j-con];
    }
  }
  for(int j = M-1; j>con+2; j--){
    DenseMatrix<T> U; Resize(U,j-1-con,j-1-con);
    DenseVector<T> z; Resize(z,j-1-con); 
    T nm = norm(z); 
    for(int k = con+0;k<j-1;k++){
      z[k-con] = conj( H(j,k+1) );
    }
    normalise(z);
    RealD tmp = 0;
    for(int i=0;i<z.size()-1;i++){tmp = tmp + abs(z[i]);}
    if(tmp < small/( (RealD)z.size()-1.0) ){ continue;}	
    tau = orthU(U,z);
    DenseMatrix<T> Hb; Resize(Hb,j-1-con,M);	
    for(int a = 0;a<M;a++){
      for(int b = 0;b<j-1-con;b++){
 	T sum = 0;
 	for(int c = 0;c<j-1-con;c++){
 	  sum += H[a][con+1+c]*U[c][b];
 	}//sum += H(a,con+1+c)*U(c,b);}
 	Hb[b][a] = sum;
      }
    }
    for(int k=con+1;k<j;k++){
      for(int l=0;l<M;l++){
 	H[l][k] = Hb[k-1-con][l];
      }
    }//H(Hb[k-1-con][l] , l,k);}}
    DenseMatrix<T> Qb; Resize(Qb,M,M);	
    for(int a = 0;a<M;a++){
      for(int b = 0;b<j-1-con;b++){
 	T sum = 0;
 	for(int c = 0;c<j-1-con;c++){
 	  sum += Q[a][con+1+c]*U[c][b];
 	}//sum += Q(a,con+1+c)*U(c,b);}
 	Qb[b][a] = sum;
      }
    }
    for(int k=con+1;k<j;k++){
      for(int l=0;l<M;l++){
 	Q[l][k] = Qb[k-1-con][l];
      }
    }//Q(Qb[k-1-con][l] , l,k);}}
    DenseMatrix<T> Hc; Resize(Hc,M,M);	
    for(int a = 0;a<j-1-con;a++){
      for(int b = 0;b<M;b++){
 	T sum = 0;
 	for(int c = 0;c<j-1-con;c++){
 	  sum += conj( U[c][a] )*H[con+1+c][b];
 	}//sum += conj( U(c,a) )*H(con+1+c,b);}
 	Hc[b][a] = sum;
      }
    }
    for(int k=0;k<M;k++){
      for(int l=con+1;l<j;l++){
 	H[l][k] = Hc[k][l-1-con];
      }
    }//H(Hc[k][l-1-con] , l,k);}}
  }
 }
 #endif
 };
 }
--- a/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
+++ b/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
@ -102,7 +102,7 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
    FieldMetaData header;
    IldgReader _IldgReader;
    _IldgReader.open(config);
-    _IldgReader.readConfiguration(config,U,header);  // format from the header
+    _IldgReader.readConfiguration(U,header);  // format from the header
    _IldgReader.close();
    std::cout << GridLogMessage << "Read ILDG Configuration from " << config
--- a/tests/solver/Test_dwf_lanczos.cc
+++ b/tests/solver/Test_dwf_lanczos.cc
@ -54,7 +54,7 @@ int main (int argc, char ** argv)
  GridParallelRNG          RNG5rb(FrbGrid);  RNG5.SeedFixedIntegers(seeds5);
  LatticeGaugeField Umu(UGrid); 
-  SU3::TepidConfiguration(RNG4, Umu);
+  SU3::HotConfiguration(RNG4, Umu);
  std::vector<LatticeColourMatrix> U(4,UGrid);
  for(int mu=0;mu<Nd;mu++){