Improved the Lanczos implementation

2025-10-18 23:14:44 +01:00 · 2017-06-20 18:46:01 +01:00
parent e9cc21900f
commit 0486ff8e79
7 changed files with 211 additions and 1712 deletions
--- a/28
+++ b/28
@@ -1,24 +1,28 @@
 TODO:
 ---------------

-Peter's work list:
-1)- Precision conversion and sort out localConvert      <-- 
-2)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- 
-
-- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet
-- Physical propagator interface
-- Conserved currents
-- GaugeFix into central location
-- Multigrid Wilson and DWF, compare to other Multigrid implementations
-- HDCR resume
+Large item work list:
+1)- Lanczos: remove DenseVector, DenseMatrix; use Eigen instead. <-- 
+2)- MultiRHS with spread out extra dim
+3)- BG/Q port and check
+4)- Precision conversion and sort out localConvert      <-- partial
+  - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
+5)- Physical propagator interface
+6)- Conserved currents
+7)- Multigrid Wilson and DWF, compare to other Multigrid implementations
+8)- HDCR resume

 Recent DONE 
+-- GaugeFix into central location                      <-- DONE
+-- SciDAC and ILDG metadata handling                   <-- DONE
+-- Binary I/O MPI2 IO                                  <-- DONE
 -- Binary I/O speed up & x-strips                      <-- DONE
 -- Cut down the exterior overhead                      <-- DONE
 -- Interior legs from SHM comms                        <-- DONE
 -- Half-precision comms                                <-- DONE
-- Merge high precision reduction into develop        
-- multiRHS DWF; benchmark on Cori/BNL for comms elimination
+-- Merge high precision reduction into develop         <-- DONE
+-- BlockCG, BCGrQ                                      <-- DONE
+-- multiRHS DWF; benchmark on Cori/BNL for comms elimination <-- DONE
   -- slice* linalg routines for multiRHS, BlockCG    

 -----
--- a/lib/algorithms/densematrix/DenseMatrix.h
+++ b/lib/algorithms/densematrix/DenseMatrix.h
@@ -1,137 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/DenseMatrix.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_DENSE_MATRIX_H
-#define GRID_DENSE_MATRIX_H
-
-namespace Grid {
-    /////////////////////////////////////////////////////////////
-    // Matrix untils
-    /////////////////////////////////////////////////////////////
-
-template<class T> using DenseVector = std::vector<T>;
-template<class T> using DenseMatrix = DenseVector<DenseVector<T> >;
-
-template<class T> void Size(DenseVector<T> & vec, int &N) 
-{ 
-  N= vec.size();
-}
-template<class T> void Size(DenseMatrix<T> & mat, int &N,int &M) 
-{ 
-  N= mat.size();
-  M= mat[0].size();
-}
-
-template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N) 
-{ 
-  int M; Size(mat,N,M);
-  assert(N==M);
-}
-
-template<class T> void Resize(DenseVector<T > & mat, int N) { 
-  mat.resize(N);
-}
-template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) { 
-  mat.resize(N);
-  for(int i=0;i<N;i++){
-    mat[i].resize(M);
-  }
-}
-template<class T> void Fill(DenseMatrix<T> & mat, T&val) { 
-  int N,M;
-  Size(mat,N,M);
-  for(int i=0;i<N;i++){
-  for(int j=0;j<M;j++){
-    mat[i][j] = val;
-  }}
-}
-
-/** Transpose of a matrix **/
-template<class T> DenseMatrix<T> Transpose(DenseMatrix<T> & mat){
-  int N,M;
-  Size(mat,N,M);
-  DenseMatrix<T> C; Resize(C,M,N);
-  for(int i=0;i<M;i++){
-  for(int j=0;j<N;j++){
-    C[i][j] = mat[j][i];
-  }} 
-  return C;
-}
-/** Set DenseMatrix to unit matrix **/
-template<class T> void Unity(DenseMatrix<T> &A){
-  int N;  SizeSquare(A,N);
-  for(int i=0;i<N;i++){
-    for(int j=0;j<N;j++){
-      if ( i==j ) A[i][j] = 1;
-      else        A[i][j] = 0;
-    } 
-  } 
-}
-
-/** Add C * I to matrix **/
-template<class T>
-void PlusUnit(DenseMatrix<T> & A,T c){
-  int dim;  SizeSquare(A,dim);
-  for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;} 
-}
-
-/** return the Hermitian conjugate of matrix **/
-template<class T>
-DenseMatrix<T> HermitianConj(DenseMatrix<T> &mat){
-
-  int dim; SizeSquare(mat,dim);
-
-  DenseMatrix<T> C; Resize(C,dim,dim);
-
-  for(int i=0;i<dim;i++){
-    for(int j=0;j<dim;j++){
-      C[i][j] = conj(mat[j][i]);
-    } 
-  } 
-  return C;
-}
-/**Get a square submatrix**/
-template <class T>
-DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st, int col_end)
-{
-  DenseMatrix<T> H; Resize(H,row_end - row_st,col_end-col_st);
-
-  for(int i = row_st; i<row_end; i++){
-  for(int j = col_st; j<col_end; j++){
-    H[i-row_st][j-col_st]=A[i][j];
-  }}
-  return H;
-}
-
-}
-
-#include "Householder.h"
-#include "Francis.h"
-
-#endif
-
--- a/lib/algorithms/densematrix/Francis.h
+++ b/lib/algorithms/densematrix/Francis.h
@@ -1,525 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/Francis.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef FRANCIS_H
-#define FRANCIS_H
-
-#include <cstdlib>
-#include <string>
-#include <cmath>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <fstream>
-#include <complex>
-#include <algorithm>
-
-//#include <timer.h>
-//#include <lapacke.h>
-//#include <Eigen/Dense>
-
-namespace Grid {
-
-template <class T> int SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
-template <class T> int     Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
-
-/**
-  Find the eigenvalues of an upper hessenberg matrix using the Francis QR algorithm.
-H =
-      x  x  x  x  x  x  x  x  x
-      x  x  x  x  x  x  x  x  x
-      0  x  x  x  x  x  x  x  x
-      0  0  x  x  x  x  x  x  x
-      0  0  0  x  x  x  x  x  x
-      0  0  0  0  x  x  x  x  x
-      0  0  0  0  0  x  x  x  x
-      0  0  0  0  0  0  x  x  x
-      0  0  0  0  0  0  0  x  x
-Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary.
-**/
-template <class T>
-int QReigensystem(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
-{
-  DenseMatrix<T> H = Hin; 
-
-  int N ; SizeSquare(H,N);
-  int M = N;
-
-  Fill(evals,0);
-  Fill(evecs,0);
-
-  T s,t,x=0,y=0,z=0;
-  T u,d;
-  T apd,amd,bc;
-  DenseVector<T> p(N,0);
-  T nrm = Norm(H);    ///DenseMatrix Norm
-  int n, m;
-  int e = 0;
-  int it = 0;
-  int tot_it = 0;
-  int l = 0;
-  int r = 0;
-  DenseMatrix<T> P; Resize(P,N,N); Unity(P);
-  DenseVector<int> trows(N,0);
-
-  /// Check if the matrix is really hessenberg, if not abort
-  RealD sth = 0;
-  for(int j=0;j<N;j++){
-    for(int i=j+2;i<N;i++){
-      sth = abs(H[i][j]);
-      if(sth > small){
-	std::cout << "Non hessenberg H = " << sth << " > " << small << std::endl;
-	exit(1);
-      }
-    }
-  }
-
-  do{
-    std::cout << "Francis QR Step N = " << N << std::endl;
-    /** Check for convergence
-      x  x  x  x  x
-      0  x  x  x  x
-      0  0  x  x  x
-      0  0  x  x  x
-      0  0  0  0  x
-      for this matrix l = 4
-     **/
-    do{
-      l = Chop_subdiag(H,nrm,e,small);
-      r = 0;    ///May have converged on more than one eval
-      ///Single eval
-      if(l == N-1){
-        evals[e] = H[l][l];
-        N--; e++; r++; it = 0;
-      }
-      ///RealD eval
-      if(l == N-2){
-        trows[l+1] = 1;    ///Needed for UTSolve
-        apd = H[l][l] + H[l+1][l+1];
-        amd = H[l][l] - H[l+1][l+1];
-        bc =  (T)4.0*H[l+1][l]*H[l][l+1];
-        evals[e]   = (T)0.5*( apd + sqrt(amd*amd + bc) );
-        evals[e+1] = (T)0.5*( apd - sqrt(amd*amd + bc) );
-        N-=2; e+=2; r++; it = 0;
-      }
-    } while(r>0);
-
-    if(N ==0) break;
-
-    DenseVector<T > ck; Resize(ck,3);
-    DenseVector<T> v;   Resize(v,3);
-
-    for(int m = N-3; m >= l; m--){
-      ///Starting vector essentially random shift.
-      if(it%10 == 0 && N >= 3 && it > 0){
-        s = (T)1.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
-        t = (T)0.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
-        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
-        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
-        z = H[m+1][m]*H[m+2][m+1];
-      }
-      ///Starting vector implicit Q theorem
-      else{
-        s = (H[N-2][N-2] + H[N-1][N-1]);
-        t = (H[N-2][N-2]*H[N-1][N-1] - H[N-2][N-1]*H[N-1][N-2]);
-        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
-        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
-        z = H[m+1][m]*H[m+2][m+1];
-      }
-      ck[0] = x; ck[1] = y; ck[2] = z;
-
-      if(m == l) break;
-
-      /** Some stupid thing from numerical recipies, seems to work**/
-      // PAB.. for heaven's sake quote page, purpose, evidence it works.
-      //       what sort of comment is that!?!?!?
-      u=abs(H[m][m-1])*(abs(y)+abs(z));
-      d=abs(x)*(abs(H[m-1][m-1])+abs(H[m][m])+abs(H[m+1][m+1]));
-      if ((T)abs(u+d) == (T)abs(d) ){
-	l = m; break;
-      }
-
-      //if (u < small){l = m; break;}
-    }
-    if(it > 100000){
-     std::cout << "QReigensystem: bugger it got stuck after 100000 iterations" << std::endl;
-     std::cout << "got " << e << " evals " << l << " " << N << std::endl;
-      exit(1);
-    }
-    normalize(ck);    ///Normalization cancels in PHP anyway
-    T beta;
-    Householder_vector<T >(ck, 0, 2, v, beta);
-    Householder_mult<T >(H,v,beta,0,l,l+2,0);
-    Householder_mult<T >(H,v,beta,0,l,l+2,1);
-    ///Accumulate eigenvector
-    Householder_mult<T >(P,v,beta,0,l,l+2,1);
-    int sw = 0;      ///Are we on the last row?
-    for(int k=l;k<N-2;k++){
-      x = H[k+1][k];
-      y = H[k+2][k];
-      z = (T)0.0;
-      if(k+3 <= N-1){
-	z = H[k+3][k];
-      } else{
-	sw = 1; 
-	v[2] = (T)0.0;
-      }
-      ck[0] = x; ck[1] = y; ck[2] = z;
-      normalize(ck);
-      Householder_vector<T >(ck, 0, 2-sw, v, beta);
-      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,0);
-      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,1);
-      ///Accumulate eigenvector
-      Householder_mult<T >(P,v, beta,0,k+1,k+3-sw,1);
-    }
-    it++;
-    tot_it++;
-  }while(N > 1);
-  N = evals.size();
-  ///Annoying - UT solves in reverse order;
-  DenseVector<T> tmp; Resize(tmp,N);
-  for(int i=0;i<N;i++){
-    tmp[i] = evals[N-i-1];
-  } 
-  evals = tmp;
-  UTeigenvectors(H, trows, evals, evecs);
-  for(int i=0;i<evals.size();i++){evecs[i] = P*evecs[i]; normalize(evecs[i]);}
-  return tot_it;
-}
-
-template <class T>
-int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
-{
-  /**
-  Find the eigenvalues of an upper Hessenberg matrix using the Wilkinson QR algorithm.
-  H =
-  x  x  0  0  0  0
-  x  x  x  0  0  0
-  0  x  x  x  0  0
-  0  0  x  x  x  0
-  0  0  0  x  x  x
-  0  0  0  0  x  x
-  Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary.  **/
-  return my_Wilkinson(Hin, evals, evecs, small, small);
-}
-
-template <class T>
-int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small, RealD tol)
-{
-  int N; SizeSquare(Hin,N);
-  int M = N;
-
-  ///I don't want to modify the input but matricies must be passed by reference
-  //Scale a matrix by its "norm"
-  //RealD Hnorm = abs( Hin.LargestDiag() ); H =  H*(1.0/Hnorm);
-  DenseMatrix<T> H;  H = Hin;
-  
-  RealD Hnorm = abs(Norm(Hin));
-  H = H * (1.0 / Hnorm);
-
-  // TODO use openmp and memset
-  Fill(evals,0);
-  Fill(evecs,0);
-
-  T s, t, x = 0, y = 0, z = 0;
-  T u, d;
-  T apd, amd, bc;
-  DenseVector<T> p; Resize(p,N); Fill(p,0);
-
-  T nrm = Norm(H);    ///DenseMatrix Norm
-  int n, m;
-  int e = 0;
-  int it = 0;
-  int tot_it = 0;
-  int l = 0;
-  int r = 0;
-  DenseMatrix<T> P; Resize(P,N,N);
-  Unity(P);
-  DenseVector<int> trows(N, 0);
-  /// Check if the matrix is really symm tridiag
-  RealD sth = 0;
-  for(int j = 0; j < N; ++j)
-  {
-    for(int i = j + 2; i < N; ++i)
-    {
-      if(abs(H[i][j]) > tol || abs(H[j][i]) > tol)
-      {
-	std::cout << "Non Tridiagonal H(" << i << ","<< j << ") = |" << Real( real( H[j][i] ) ) << "| > " << tol << std::endl;
-	std::cout << "Warning tridiagonalize and call again" << std::endl;
-        // exit(1); // see what is going on
-        //return;
-      }
-    }
-  }
-
-  do{
-    do{
-      //Jasper
-      //Check if the subdiagonal term is small enough (<small)
-      //if true then it is converged.
-      //check start from H.dim - e - 1
-      //How to deal with more than 2 are converged?
-      //What if Chop_symm_subdiag return something int the middle?
-      //--------------
-      l = Chop_symm_subdiag(H,nrm, e, small);
-      r = 0;    ///May have converged on more than one eval
-      //Jasper
-      //In this case
-      // x  x  0  0  0  0
-      // x  x  x  0  0  0
-      // 0  x  x  x  0  0
-      // 0  0  x  x  x  0
-      // 0  0  0  x  x  0
-      // 0  0  0  0  0  x  <- l
-      //--------------
-      ///Single eval
-      if(l == N - 1)
-      {
-        evals[e] = H[l][l];
-        N--;
-        e++;
-        r++;
-        it = 0;
-      }
-      //Jasper
-      // x  x  0  0  0  0
-      // x  x  x  0  0  0
-      // 0  x  x  x  0  0
-      // 0  0  x  x  0  0
-      // 0  0  0  0  x  x  <- l
-      // 0  0  0  0  x  x
-      //--------------
-      ///RealD eval
-      if(l == N - 2)
-      {
-        trows[l + 1] = 1;    ///Needed for UTSolve
-        apd = H[l][l] + H[l + 1][ l + 1];
-        amd = H[l][l] - H[l + 1][l + 1];
-        bc =  (T) 4.0 * H[l + 1][l] * H[l][l + 1];
-        evals[e] = (T) 0.5 * (apd + sqrt(amd * amd + bc));
-        evals[e + 1] = (T) 0.5 * (apd - sqrt(amd * amd + bc));
-        N -= 2;
-        e += 2;
-        r++;
-        it = 0;
-      }
-    }while(r > 0);
-    //Jasper
-    //Already converged
-    //--------------
-    if(N == 0) break;
-
-    DenseVector<T> ck,v; Resize(ck,2); Resize(v,2);
-
-    for(int m = N - 3; m >= l; m--)
-    {
-      ///Starting vector essentially random shift.
-      if(it%10 == 0 && N >= 3 && it > 0)
-      {
-        t = abs(H[N - 1][N - 2]) + abs(H[N - 2][N - 3]);
-        x = H[m][m] - t;
-        z = H[m + 1][m];
-      } else {
-      ///Starting vector implicit Q theorem
-        d = (H[N - 2][N - 2] - H[N - 1][N - 1]) * (T) 0.5;
-        t =  H[N - 1][N - 1] - H[N - 1][N - 2] * H[N - 1][N - 2] 
-	  / (d + sign(d) * sqrt(d * d + H[N - 1][N - 2] * H[N - 1][N - 2]));
-        x = H[m][m] - t;
-        z = H[m + 1][m];
-      }
-      //Jasper
-      //why it is here????
-      //-----------------------
-      if(m == l)
-        break;
-
-      u = abs(H[m][m - 1]) * (abs(y) + abs(z));
-      d = abs(x) * (abs(H[m - 1][m - 1]) + abs(H[m][m]) + abs(H[m + 1][m + 1]));
-      if ((T)abs(u + d) == (T)abs(d))
-      {
-        l = m;
-        break;
-      }
-    }
-    //Jasper
-    if(it > 1000000)
-    {
-      std::cout << "Wilkinson: bugger it got stuck after 100000 iterations" << std::endl;
-      std::cout << "got " << e << " evals " << l << " " << N << std::endl;
-      exit(1);
-    }
-    //
-    T s, c;
-    Givens_calc<T>(x, z, c, s);
-    Givens_mult<T>(H, l, l + 1, c, -s, 0);
-    Givens_mult<T>(H, l, l + 1, c,  s, 1);
-    Givens_mult<T>(P, l, l + 1, c,  s, 1);
-    //
-    for(int k = l; k < N - 2; ++k)
-    {
-      x = H.A[k + 1][k];
-      z = H.A[k + 2][k];
-      Givens_calc<T>(x, z, c, s);
-      Givens_mult<T>(H, k + 1, k + 2, c, -s, 0);
-      Givens_mult<T>(H, k + 1, k + 2, c,  s, 1);
-      Givens_mult<T>(P, k + 1, k + 2, c,  s, 1);
-    }
-    it++;
-    tot_it++;
-  }while(N > 1);
-
-  N = evals.size();
-  ///Annoying - UT solves in reverse order;
-  DenseVector<T> tmp(N);
-  for(int i = 0; i < N; ++i)
-    tmp[i] = evals[N-i-1];
-  evals = tmp;
-  //
-  UTeigenvectors(H, trows, evals, evecs);
-  //UTSymmEigenvectors(H, trows, evals, evecs);
-  for(int i = 0; i < evals.size(); ++i)
-  {
-    evecs[i] = P * evecs[i];
-    normalize(evecs[i]);
-    evals[i] = evals[i] * Hnorm;
-  }
-  // // FIXME this is to test
-  // Hin.write("evecs3", evecs);
-  // Hin.write("evals3", evals);
-  // // check rsd
-  // for(int i = 0; i < M; i++) {
-  //   vector<T> Aevec = Hin * evecs[i];
-  //   RealD norm2(0.);
-  //   for(int j = 0; j < M; j++) {
-  //     norm2 += (Aevec[j] - evals[i] * evecs[i][j]) * (Aevec[j] - evals[i] * evecs[i][j]);
-  //   }
-  // }
-  return tot_it;
-}
-
-template <class T>
-void Hess(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
-
-  /**
-  turn a matrix A =
-  x  x  x  x  x
-  x  x  x  x  x
-  x  x  x  x  x
-  x  x  x  x  x
-  x  x  x  x  x
-  into
-  x  x  x  x  x
-  x  x  x  x  x
-  0  x  x  x  x
-  0  0  x  x  x
-  0  0  0  x  x
-  with householder rotations
-  Slow.
-  */
-  int N ; SizeSquare(A,N);
-  DenseVector<T > p; Resize(p,N); Fill(p,0);
-
-  for(int k=start;k<N-2;k++){
-    //cerr << "hess" << k << std::endl;
-    DenseVector<T > ck,v; Resize(ck,N-k-1); Resize(v,N-k-1);
-    for(int i=k+1;i<N;i++){ck[i-k-1] = A(i,k);}  ///kth column
-    normalize(ck);    ///Normalization cancels in PHP anyway
-    T beta;
-    Householder_vector<T >(ck, 0, ck.size()-1, v, beta);  ///Householder vector
-    Householder_mult<T>(A,v,beta,start,k+1,N-1,0);  ///A -> PA
-    Householder_mult<T >(A,v,beta,start,k+1,N-1,1);  ///PA -> PAP^H
-    ///Accumulate eigenvector
-    Householder_mult<T >(Q,v,beta,start,k+1,N-1,1);  ///Q -> QP^H
-  }
-  /*for(int l=0;l<N-2;l++){
-    for(int k=l+2;k<N;k++){
-    A(0,k,l);
-    }
-    }*/
-}
-
-template <class T>
-void Tri(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
-///Tridiagonalize a matrix
-  int N; SizeSquare(A,N);
-  Hess(A,Q,start);
-  /*for(int l=0;l<N-2;l++){
-    for(int k=l+2;k<N;k++){
-    A(0,l,k);
-    }
-    }*/
-}
-
-template <class T>
-void ForceTridiagonal(DenseMatrix<T> &A){
-///Tridiagonalize a matrix
-  int N ; SizeSquare(A,N);
-  for(int l=0;l<N-2;l++){
-    for(int k=l+2;k<N;k++){
-      A[l][k]=0;
-      A[k][l]=0;
-    }
-  }
-}
-
-template <class T>
-int my_SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
-  ///Solve a symmetric eigensystem, not necessarily in tridiagonal form
-  int N; SizeSquare(Ain,N);
-  DenseMatrix<T > A; A = Ain;
-  DenseMatrix<T > Q; Resize(Q,N,N); Unity(Q);
-  Tri(A,Q,0);
-  int it = my_Wilkinson<T>(A, evals, evecs, small);
-  for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
-  return it;
-}
-
-
-template <class T>
-int Wilkinson(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
-  return my_Wilkinson(Ain, evals, evecs, small);
-}
-
-template <class T>
-int SymmEigensystem(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
-  return my_SymmEigensystem(Ain, evals, evecs, small);
-}
-
-template <class T>
-int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
-///Solve a general eigensystem, not necessarily in tridiagonal form
-  int N = Ain.dim;
-  DenseMatrix<T > A(N); A = Ain;
-  DenseMatrix<T > Q(N);Q.Unity();
-  Hess(A,Q,0);
-  int it = QReigensystem<T>(A, evals, evecs, small);
-  for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
-  return it;
-}
-
-}
-#endif
--- a/lib/algorithms/densematrix/Householder.h
+++ b/lib/algorithms/densematrix/Householder.h
@@ -1,242 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/Householder.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef HOUSEHOLDER_H
-#define HOUSEHOLDER_H
-
-#define TIMER(A) std::cout << GridLogMessage << __FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
-#define ENTER()  std::cout << GridLogMessage << "ENTRY "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
-#define LEAVE()  std::cout << GridLogMessage << "EXIT  "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
-
-#include <cstdlib>
-#include <string>
-#include <cmath>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <fstream>
-#include <complex>
-#include <algorithm>
-
-namespace Grid {
-/** Comparison function for finding the max element in a vector **/
-template <class T> bool cf(T i, T j) { 
-  return abs(i) < abs(j); 
-}
-
-/** 
-	Calculate a real Givens angle 
- **/
-template <class T> inline void Givens_calc(T y, T z, T &c, T &s){
-
-  RealD mz = (RealD)abs(z);
-  
-  if(mz==0.0){
-    c = 1; s = 0;
-  }
-  if(mz >= (RealD)abs(y)){
-    T t = -y/z;
-    s = (T)1.0 / sqrt ((T)1.0 + t * t);
-    c = s * t;
-  } else {
-    T t = -z/y;
-    c = (T)1.0 / sqrt ((T)1.0 + t * t);
-    s = c * t;
-  }
-}
-
-template <class T> inline void Givens_mult(DenseMatrix<T> &A,  int i, int k, T c, T s, int dir)
-{
-  int q ; SizeSquare(A,q);
-
-  if(dir == 0){
-    for(int j=0;j<q;j++){
-      T nu = A[i][j];
-      T w  = A[k][j];
-      A[i][j] = (c*nu + s*w);
-      A[k][j] = (-s*nu + c*w);
-    }
-  }
-
-  if(dir == 1){
-    for(int j=0;j<q;j++){
-      T nu = A[j][i];
-      T w  = A[j][k];
-      A[j][i] = (c*nu - s*w);
-      A[j][k] = (s*nu + c*w);
-    }
-  }
-}
-
-/**
-	from input = x;
-	Compute the complex Householder vector, v, such that
-	P = (I - b v transpose(v) )
-	b = 2/v.v
-
-	P | x |    | x | k = 0
-	| x |    | 0 | 
-	| x | =  | 0 |
-	| x |    | 0 | j = 3
-	| x |	   | x |
-
-	These are the "Unreduced" Householder vectors.
-
- **/
-template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, DenseVector<T> &v, T &beta)
-{
-  int N ; Size(input,N);
-  T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T> );
-
-  if(abs(m) > 0.0){
-    T alpha = 0;
-
-    for(int i=k; i<j+1; i++){
-      v[i] = input[i]/m;
-      alpha = alpha + v[i]*conj(v[i]);
-    }
-    alpha = sqrt(alpha);
-    beta = (T)1.0/(alpha*(alpha + abs(v[k]) ));
-
-    if(abs(v[k]) > 0.0)  v[k] = v[k] + (v[k]/abs(v[k]))*alpha;
-    else                 v[k] = -alpha;
-  } else{
-    for(int i=k; i<j+1; i++){
-      v[i] = 0.0;
-    } 
-  }
-}
-
-/**
-	from input = x;
-	Compute the complex Householder vector, v, such that
-	P = (I - b v transpose(v) )
-	b = 2/v.v
-
-	Px = alpha*e_dir
-
-	These are the "Unreduced" Householder vectors.
-
- **/
-
-template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, int dir, DenseVector<T> &v, T &beta)
-{
-  int N = input.size();
-  T m = *max_element(input.begin() + k, input.begin() + j + 1, cf);
-  
-  if(abs(m) > 0.0){
-    T alpha = 0;
-
-    for(int i=k; i<j+1; i++){
-      v[i] = input[i]/m;
-      alpha = alpha + v[i]*conj(v[i]);
-    }
-    
-    alpha = sqrt(alpha);
-    beta = 1.0/(alpha*(alpha + abs(v[dir]) ));
-	
-    if(abs(v[dir]) > 0.0) v[dir] = v[dir] + (v[dir]/abs(v[dir]))*alpha;
-    else                  v[dir] = -alpha;
-  }else{
-    for(int i=k; i<j+1; i++){
-      v[i] = 0.0;
-    } 
-  }
-}
-
-/**
-	Compute the product PA if trans = 0
-	AP if trans = 1
-	P = (I - b v transpose(v) )
-	b = 2/v.v
-	start at element l of matrix A
-	v is of length j - k + 1 of v are nonzero
- **/
-
-template <class T> inline void Householder_mult(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int k, int j, int trans)
-{
-  int N ; SizeSquare(A,N);
-
-  if(abs(beta) > 0.0){
-    for(int p=l; p<N; p++){
-      T s = 0;
-      if(trans==0){
-	for(int i=k;i<j+1;i++) s += conj(v[i-k])*A[i][p];
-	s *= beta;
-	for(int i=k;i<j+1;i++){ A[i][p] = A[i][p]-s*conj(v[i-k]);}
-      } else {
-	for(int i=k;i<j+1;i++){ s += conj(v[i-k])*A[p][i];}
-	s *= beta;
-	for(int i=k;i<j+1;i++){ A[p][i]=A[p][i]-s*conj(v[i-k]);}
-      }
-    }
-  }
-}
-
-/**
-	Compute the product PA if trans = 0
-	AP if trans = 1
-	P = (I - b v transpose(v) )
-	b = 2/v.v
-	start at element l of matrix A
-	v is of length j - k + 1 of v are nonzero
-	A is tridiagonal
- **/
-template <class T> inline void Householder_mult_tri(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int M, int k, int j, int trans)
-{
-  if(abs(beta) > 0.0){
-
-    int N ; SizeSquare(A,N);
-
-    DenseMatrix<T> tmp; Resize(tmp,N,N); Fill(tmp,0); 
-
-    T s;
-    for(int p=l; p<M; p++){
-      s = 0;
-      if(trans==0){
-	for(int i=k;i<j+1;i++) s = s + conj(v[i-k])*A[i][p];
-      }else{
-	for(int i=k;i<j+1;i++) s = s + v[i-k]*A[p][i];
-      }
-      s = beta*s;
-      if(trans==0){
-	for(int i=k;i<j+1;i++) tmp[i][p] = tmp(i,p) - s*v[i-k];
-      }else{
-	for(int i=k;i<j+1;i++) tmp[p][i] = tmp[p][i] - s*conj(v[i-k]);
-      }
-    }
-    for(int p=l; p<M; p++){
-      if(trans==0){
-	for(int i=k;i<j+1;i++) A[i][p] = A[i][p] + tmp[i][p];
-      }else{
-	for(int i=k;i<j+1;i++) A[p][i] = A[p][i] + tmp[p][i];
-      }
-    }
-  }
-}
-}
-#endif
--- a/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -39,7 +39,9 @@ void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
                   int *info);
 #endif

-#include <Grid/algorithms/densematrix/DenseMatrix.h>
+template<class T> using DenseVector = std::vector<T>;
+
+//#include <Grid/algorithms/densematrix/DenseMatrix.h>
 #include <Grid/algorithms/iterative/EigenSort.h>

 namespace Grid {
@@ -47,104 +49,85 @@ namespace Grid {
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
-
-
 template<class Field> 
 class ImplicitlyRestartedLanczos {

-    const RealD small = 1.0e-16;
 public:       
-    int lock;
-    int get;
-    int Niter;
-    int converged;
-
+  int Niter;   // Max iterations
  int Nstop;   // Number of evecs checked for convergence
  int Nk;      // Number of converged sought
-    int Np;      // Np -- Number of spare vecs in kryloc space
  int Nm;      // Nm -- total number of vectors

  RealD eresid;

+  ////////////////////////////////////
+  // Embedded objects
+  ////////////////////////////////////
           SortEigen<Field> _sort;
-
-//    GridCartesian &_fgrid;
-
  LinearOperatorBase<Field> &_Linop;
-
    OperatorFunction<Field> &_poly;

  /////////////////////////
  // Constructor
  /////////////////////////
-    void init(void){};
-    void Abort(int ff, DenseVector<RealD> &evals,  DenseVector<DenseVector<RealD> > &evecs);
-
-    ImplicitlyRestartedLanczos(
-				LinearOperatorBase<Field> &Linop, // op
+ ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
 			    OperatorFunction<Field> & poly,   // polynmial
 			    int _Nstop, // sought vecs
 			    int _Nk,    // sought vecs
-			       int _Nm, // spare vecs
+			    int _Nm,    // total vecs
 			    RealD _eresid, // resid in lmdue deficit 
 			    int _Niter) : // Max iterations
-      _Linop(Linop),
-      _poly(poly),
-      Nstop(_Nstop),
-      Nk(_Nk),
-      Nm(_Nm),
-      eresid(_eresid),
-      Niter(_Niter)
-    { 
-      Np = Nm-Nk; assert(Np>0);
-    };
+    _Linop(Linop),    _poly(poly),
+    Nstop(_Nstop), Nk(_Nk), Nm(_Nm),
+    eresid(_eresid),  Niter(_Niter)  { };

-    ImplicitlyRestartedLanczos(
-				LinearOperatorBase<Field> &Linop, // op
+#if 0
+    ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
 			       OperatorFunction<Field> & poly,   // polynmial
 			       int _Nk, // sought vecs
-			       int _Nm, // spare vecs
+			       int _Nm, // total vecs
 			       RealD _eresid, // resid in lmdue deficit 
 			       int _Niter) : // Max iterations
-      _Linop(Linop),
-      _poly(poly),
-      Nstop(_Nk),
-      Nk(_Nk),
-      Nm(_Nm),
-      eresid(_eresid),
-      Niter(_Niter)
-    { 
-      Np = Nm-Nk; assert(Np>0);
-    };
+    _Linop(Linop),      _poly(poly),
+    Nstop(_Nk), Nk(_Nk), Nm(_Nm),      
+    eresid(_eresid),      Niter(_Niter) { };
+#endif

-    /////////////////////////
-    // Sanity checked this routine (step) against Saad.
-    /////////////////////////
-    void RitzMatrix(DenseVector<Field>& evec,int k){
+#if 0
+    void calc(DenseVector<RealD>& eval,
+	      DenseVector<Field>& evec,
+	      const Field& src,
+	      int& Nconv);

-      if(1) return;
+    void step(DenseVector<RealD>& lmd,
+	      DenseVector<RealD>& lme, 
+	      DenseVector<Field>& evec,
+	      Field& w,int Nm,int k);

-      GridBase *grid = evec[0]._grid;
-      Field w(grid);
-      std::cout << "RitzMatrix "<<std::endl;
-      for(int i=0;i<k;i++){
-	_poly(_Linop,evec[i],w);
-	std::cout << "["<<i<<"] ";
-	for(int j=0;j<k;j++){
-	  ComplexD in = innerProduct(evec[j],w);
-	  if ( fabs((double)i-j)>1 ) { 
-	    if (abs(in) >1.0e-9 )  { 
-	      std::cout<<"oops"<<std::endl;
-	      abort();
-	    } else 
-	      std::cout << " 0 ";
-	  } else { 
-	    std::cout << " "<<in<<" ";
-	  }
-	}
-	std::cout << std::endl;
-      }
-    }
+    void setUnit_Qt(int Nm, DenseVector<RealD> &Qt) ;
+
+    static RealD normalise(Field& v) ;
+    void orthogonalize(Field& w, DenseVector<Field>& evec, int k);
+    void diagonalize(DenseVector<RealD>& lmd,
+		     DenseVector<RealD>& lme, 
+		     int N2, int N1,
+		     DenseVector<RealD>& Qt,
+		     GridBase *grid);
+
+    void qr_decomp(DenseVector<RealD>& lmd,
+		   DenseVector<RealD>& lme,
+		   int Nk, int Nm,
+		   DenseVector<RealD>& Qt,
+		   RealD Dsh, int kmin, int kmax);
+
+#ifdef USE_LAPACK
+    void diagonalize_lapack(DenseVector<RealD>& lmd,
+			    DenseVector<RealD>& lme, 
+			    int N1, int N2,
+			    DenseVector<RealD>& Qt,
+			    GridBase *grid);
+#endif
+#endif

 /* Saad PP. 195
 1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
@@ -161,12 +144,12 @@ public:
 	      DenseVector<Field>& evec,
 	      Field& w,int Nm,int k)
    {
+      const RealD tiny = 1.0e-20;
      assert( k< Nm );
      
      _poly(_Linop,evec[k],w);      // 3. wk:=Avk−βkv_{k−1}
-      if(k>0){
-	w -= lme[k-1] * evec[k-1];
-      }    
+
+      if(k>0) w -= lme[k-1] * evec[k-1];

      ComplexD zalph = innerProduct(evec[k],w); // 4. αk:=(wk,vk)
      RealD     alph = real(zalph);
@@ -176,29 +159,20 @@ public:
      RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
                                 // 7. vk+1 := wk/βk+1

-//	std::cout << "alpha = " << zalph << " beta "<<beta<<std::endl;
-      const RealD tiny = 1.0e-20;
-      if ( beta < tiny ) { 
-	std::cout << " beta is tiny "<<beta<<std::endl;
-     }
      lmd[k] = alph;
      lme[k] = beta;

-      if (k>0) { 
-	orthogonalize(w,evec,k); // orthonormalise
-      }
-      
+      if ( k > 0 ) orthogonalize(w,evec,k); // orthonormalise
      if ( k < Nm-1) evec[k+1] = w;
+
+      if ( beta < tiny ) std::cout << " beta is tiny "<<beta<<std::endl;
    }
      
-    void qr_decomp(DenseVector<RealD>& lmd,
-		   DenseVector<RealD>& lme,
-		   int Nk,
-		   int Nm,
-		   DenseVector<RealD>& Qt,
-		   RealD Dsh, 
-		   int kmin,
-		   int kmax)
+    void qr_decomp(DenseVector<RealD>& lmd,   // Nm 
+		   DenseVector<RealD>& lme,   // Nm 
+		   int Nk, int Nm,
+		   DenseVector<RealD>& Qt,     // Nm x Nm matrix
+		   RealD Dsh, int kmin, int kmax)
    {
      int k = kmin-1;
      RealD x;
@@ -254,30 +228,31 @@ public:
      }
    }

+
 #ifdef USE_LAPACK
    void diagonalize_lapack(DenseVector<RealD>& lmd,
 			    DenseVector<RealD>& lme, 
 			    int N1,
 			    int N2,
 			    DenseVector<RealD>& Qt,
-		     GridBase *grid){
+			    GridBase *grid)
+    {
      const int size = Nm;
-//  tevals.resize(size);
-//  tevecs.resize(size);
      int NN = N1;
      double evals_tmp[NN];
      double evec_tmp[NN][NN];
      memset(evec_tmp[0],0,sizeof(double)*NN*NN);
-//  double AA[NN][NN];
      double DD[NN];
      double EE[NN];
-  for (int i = 0; i< NN; i++)
-    for (int j = i - 1; j <= i + 1; j++)
+      for (int i = 0; i< NN; i++) {
+	for (int j = i - 1; j <= i + 1; j++) {
 	  if ( j < NN && j >= 0 ) {
 	    if (i==j) DD[i] = lmd[i];
 	    if (i==j) evals_tmp[i] = lmd[i];
 	    if (j==(i-1)) EE[j] = lme[j];
 	  }
+	}
+      }
      int evals_found;
      int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
      int liwork =  3+NN*10 ;
@@ -291,9 +266,6 @@ public:
      char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
      int ifail[NN];
      int info;
-//  int total = QMP_get_number_of_nodes();
-//  int node = QMP_get_node_number();
-//  GridBase *grid = evec[0]._grid;
      int total = grid->_Nprocessors;
      int node  = grid->_processor;
      int interval = (NN/total)+1;
@@ -304,7 +276,6 @@ public:
      if (1) {
 	memset(evals_tmp,0,sizeof(double)*NN);
 	if ( il <= NN){
-        printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu);
 	  LAPACK_dstegr(&jobz, &range, &NN,
 			(double*)DD, (double*)EE,
 			&vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
@@ -314,7 +285,6 @@ public:
 			work, &lwork, iwork, &liwork,
 			&info);
 	  for (int i = iu-1; i>= il-1; i--){
-          printf("node=%d evals_found=%d evals_tmp[%d] = %g\n",node,evals_found, i - (il-1),evals_tmp[i - (il-1)]);
 	    evals_tmp[i] = evals_tmp[i - (il-1)];
 	    if (il>1) evals_tmp[i-(il-1)]=0.;
 	    for (int j = 0; j< NN; j++){
@@ -324,13 +294,14 @@ public:
 	  }
 	}
 	{
-//        QMP_sum_double_array(evals_tmp,NN);
-//        QMP_sum_double_array((double *)evec_tmp,NN*NN);
 	  grid->GlobalSumVector(evals_tmp,NN);
 	  grid->GlobalSumVector((double*)evec_tmp,NN*NN);
 	}
      } 
-// cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
+      // Cheating a bit: it would be better to sort than to simply reverse the order,
+      // but the documentation of the LAPACK routine says it returns the eigenvalues
+      // in increasing order, whereas qr gives them in decreasing order,
+      // so the rows are just written back in reverse.
      for(int i=0;i<NN;i++){
 	for(int j=0;j<NN;j++)
 	  Qt[(NN-1-i)*N2+j]=evec_tmp[i][j];
@@ -339,7 +310,6 @@ public:
    }
 #endif

-
    void diagonalize(DenseVector<RealD>& lmd,
 		     DenseVector<RealD>& lme, 
 		     int N2,
@@ -361,17 +331,16 @@ public:
      lmd2[k] = lmd[k];
      lme2[k] = lme[k];
    }
-         for(int k=0; k<N1*N1; ++k)
+    for(int k=0; k<N1*N1; ++k){
      Qt2[k] = Qt[k];
-
-//	diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
+    }
 #endif

      int Niter = 100*N1;
      int kmin = 1;
      int kmax = N2;
-      // (this should be more sophisticated)

+      // (this should be more sophisticated)
      for(int iter=0; iter<Niter; ++iter){

 	// determination of 2x2 leading submatrix
@@ -402,10 +371,6 @@ public:
 	  _sort.push(lmd2,N2);
 	  for(int k=0; k<N2; ++k){
 	    if (fabs(lmd2[k] - lmd3[k]) >SMALL)  std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl;
-//	    if (fabs(lme2[k] - lme[k]) >SMALL)  std::cout <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl;
-	  }
-         for(int k=0; k<N1*N1; ++k){
-//	    if (fabs(Qt2[k] - Qt[k]) >SMALL)  std::cout <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl;
 	  }
 	}
 #endif
@@ -424,7 +389,6 @@ public:
      abort();
    }

-#if 1
    static RealD normalise(Field& v) 
    {
      RealD nn = norm2(v);
@@ -457,6 +421,7 @@ public:
      normalise(w);
    }

+
    void setUnit_Qt(int Nm, DenseVector<RealD> &Qt) {
      for(int i=0; i<Qt.size(); ++i) Qt[i] = 0.0;
      for(int k=0; k<Nm; ++k) Qt[k + k*Nm] = 1.0;
@@ -488,8 +453,9 @@ until convergence
 	GridBase *grid = evec[0]._grid;
 	assert(grid == src._grid);

-	std::cout << " -- Nk = " << Nk << " Np = "<< Np << std::endl;
-	std::cout << " -- Nm = " << Nm << std::endl;
+	std::cout << " -- seek   Nk    = " << Nk    <<" vectors"<< std::endl;
+	std::cout << " -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
+	std::cout << " -- total  Nm    = " << Nm    <<" vectors"<< std::endl;
 	std::cout << " -- size of eval = " << eval.size() << std::endl;
 	std::cout << " -- size of evec = " << evec.size() << std::endl;
 	
@@ -514,38 +480,24 @@ until convergence
 	RealD beta_k;
  
 	// Set initial vector
-	// (uniform vector) Why not src??
-	//	evec[0] = 1.0;
 	evec[0] = src;
 	std:: cout <<"norm2(src)= " << norm2(src)<<std::endl;
-// << src._grid  << std::endl;
+
 	normalise(evec[0]);
 	std:: cout <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
-// << evec[0]._grid << std::endl;
 	
 	// Initial Nk steps
 	for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
-//	std:: cout <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl;
-//	std:: cout <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl;
-	RitzMatrix(evec,Nk);
-	for(int k=0; k<Nk; ++k){
-//	std:: cout <<"eval " << k << " " <<eval[k] << std::endl;
-//	std:: cout <<"lme " << k << " " << lme[k] << std::endl;
-	}

 	// Restarting loop begins
-	for(int iter = 0; iter<Niter; ++iter){
+	int iter;
+	for(iter = 0; iter<Niter; ++iter){

 	  std::cout<<"\n Restart iteration = "<< iter << std::endl;

-	  // 
-	  // Rudy does a sort first which looks very different. Getting fed up with sorting out the algo defs.
-	  // We loop over 
-	  //
 	  for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
-	  f *= lme[Nm-1];

-	  RitzMatrix(evec,k2);
+	  f *= lme[Nm-1];

 	  // getting eigenvalues
 	  for(int k=0; k<Nm; ++k){
@@ -561,9 +513,8 @@ until convergence
 	  // Implicitly shifted QR transformations
 	  setUnit_Qt(Nm,Qt);
 	  for(int ip=k2; ip<Nm; ++ip){ 
-	std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
+	    //	    std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
 	    qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
-		
 	  }
    
 	  for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
@@ -602,15 +553,11 @@ until convergence
 	      B[j].checkerboard = evec[k].checkerboard;
 	      B[j] += Qt[k+j*Nm] * evec[k];
 	    }
-//	    std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl;
 	  }
-//	_sort.push(eval2,B,Nk);

 	  Nconv = 0;
-	  //	  std::cout << std::setiosflags(std::ios_base::scientific);
 	  for(int i=0; i<Nk; ++i){

-//	    _poly(_Linop,B[i],v);
 	    _Linop.HermOp(B[i],v);
 	    
 	    RealD vnum = real(innerProduct(B[i],v)); // HermOp.
@@ -631,8 +578,6 @@ until convergence
 	    }

 	  }  // i-loop end
-	  //	  std::cout << std::resetiosflags(std::ios_base::scientific);
-

 	  std::cout<<" #modes converged: "<<Nconv<<std::endl;

@@ -655,556 +600,10 @@ until convergence
      _sort.push(eval,evec,Nconv);

      std::cout << "\n Converged\n Summary :\n";
-      std::cout << " -- Iterations  = "<< Nconv  << "\n";
+      std::cout << " -- Iterations  = "<< iter   << "\n";
      std::cout << " -- beta(k)     = "<< beta_k << "\n";
      std::cout << " -- Nconv       = "<< Nconv  << "\n";
     }
-
-    /////////////////////////////////////////////////
-    // Adapted from Rudy's lanczos factor routine
-    /////////////////////////////////////////////////
-    int Lanczos_Factor(int start, int end,  int cont,
-		       DenseVector<Field> & bq, 
-		       Field &bf,
-		       DenseMatrix<RealD> &H){
-      
-      GridBase *grid = bq[0]._grid;
-
-      RealD beta;  
-      RealD sqbt;  
-      RealD alpha;
-
-      for(int i=start;i<Nm;i++){
-	for(int j=start;j<Nm;j++){
-	  H[i][j]=0.0;
-	}
-      }
-
-      std::cout<<"Lanczos_Factor start/end " <<start <<"/"<<end<<std::endl;
-
-      // Starting from scratch, bq[0] contains a random vector and |bq[0]| = 1
-      int first;
-      if(start == 0){
-
-	std::cout << "start == 0\n"; //TESTING
-
-	_poly(_Linop,bq[0],bf);
-
-	alpha = real(innerProduct(bq[0],bf));//alpha =  bq[0]^dag A bq[0]
-
-	std::cout << "alpha = " << alpha << std::endl;
-	
-	bf = bf - alpha * bq[0];  //bf =  A bq[0] - alpha bq[0]
-
-	H[0][0]=alpha;
-
-	std::cout << "Set H(0,0) to " << H[0][0] << std::endl;
-
-	first = 1;
-
-      } else {
-
-	first = start;
-
-      }
-
-      // I think start==0 and cont==zero are the same. Test this
-      // If so I can drop "cont" parameter?
-      if( cont ) assert(start!=0);
-
-      if( start==0 ) assert(cont!=0);
-
-      if( cont){
-
-	beta = 0;sqbt = 0;
-
-	std::cout << "cont is true so setting beta to zero\n";
-
-      }	else {
-
-	beta = norm2(bf);
-	sqbt = sqrt(beta);
-
-	std::cout << "beta = " << beta << std::endl;
-      }
-
-      for(int j=first;j<end;j++){
-
-	std::cout << "Factor j " << j <<std::endl;
-
-	if(cont){ // switches to factoring; understand start!=0 and initial bf value is right.
-	  bq[j] = bf; cont = false;
-	}else{
-	  bq[j] = (1.0/sqbt)*bf ;
-
-	  H[j][j-1]=H[j-1][j] = sqbt;
-	}
-
-	_poly(_Linop,bq[j],bf);
-
-	bf = bf - (1.0/sqbt)*bq[j-1]; 	       //bf = A bq[j] - beta bq[j-1] // PAB this comment was incorrect in beta term??
-
-	alpha = real(innerProduct(bq[j],bf));  //alpha = bq[j]^dag A bq[j]
-
-	bf = bf - alpha*bq[j];                 //bf = A bq[j] - beta bq[j-1] - alpha bq[j]
-	RealD fnorm = norm2(bf);
-
-	RealD bck = sqrt( real( conjugate(alpha)*alpha ) + beta );
-
-	beta = fnorm;
-	sqbt = sqrt(beta);
-	std::cout << "alpha = " << alpha << " fnorm = " << fnorm << '\n';
-
-	///Iterative refinement of orthogonality V = [ bq[0]  bq[1]  ...  bq[M] ]
-	int re = 0;
-	// FIXME undefined params; how set in Rudy's code
-	int ref =0;
-	Real rho = 1.0e-8;
-
-	while( re == ref || (sqbt < rho * bck && re < 5) ){
-
-	  Field tmp2(grid);
-	  Field tmp1(grid);
-
-	  //bex = V^dag bf
-	  DenseVector<ComplexD> bex(j+1);
-	  for(int k=0;k<j+1;k++){
-	    bex[k] = innerProduct(bq[k],bf);
-	  }
-	  
-	  zero_fermion(tmp2);
-	  //tmp2 = V s
-	  for(int l=0;l<j+1;l++){
-	    RealD nrm = norm2(bq[l]);
-	    axpy(tmp1,0.0,bq[l],bq[l]); scale(tmp1,bex[l]); 	//tmp1 = V[j] bex[j]
-	    axpy(tmp2,1.0,tmp2,tmp1);					//tmp2 += V[j] bex[j]
-	  }
-
-	  //bf = bf - V V^dag bf.   Subtracting off any component in span { V[j] } 
-	  RealD btc = axpy_norm(bf,-1.0,tmp2,bf);
-	  alpha = alpha + real(bex[j]);	      sqbt = sqrt(real(btc));	      
-	  // FIXME is alpha real in RUDY's code?
-	  RealD nmbex = 0;for(int k=0;k<j+1;k++){nmbex = nmbex + real( conjugate(bex[k])*bex[k]  );}
-	  bck = sqrt( nmbex );
-	  re++;
-	}
-	std::cout << "Iteratively refined orthogonality, changes alpha\n";
-	if(re > 1) std::cout << "orthagonality refined " << re << " times" <<std::endl;
-	H[j][j]=alpha;
-      }
-
-      return end;
-    }
-
-    void EigenSort(DenseVector<double> evals,
-		   DenseVector<Field>  evecs){
-      int N= evals.size();
-      _sort.push(evals,evecs, evals.size(),N);
-    }
-
-    void ImplicitRestart(int TM, DenseVector<RealD> &evals,  DenseVector<DenseVector<RealD> > &evecs, DenseVector<Field> &bq, Field &bf, int cont)
-    {
-      std::cout << "ImplicitRestart begin. Eigensort starting\n";
-
-      DenseMatrix<RealD> H; Resize(H,Nm,Nm);
-
-      EigenSort(evals, evecs);
-
-      ///Assign shifts
-      int K=Nk;
-      int M=Nm;
-      int P=Np;
-      int converged=0;
-      if(K - converged < 4) P = (M - K-1); //one
-      //      DenseVector<RealD> shifts(P + shift_extra.size());
-      DenseVector<RealD> shifts(P);
-      for(int k = 0; k < P; ++k)
-	shifts[k] = evals[k]; 
-
-      /// Shift to form a new H and q
-      DenseMatrix<RealD> Q; Resize(Q,TM,TM);
-      Unity(Q);
-      Shift(Q, shifts); // H is implicitly passed in in Rudy's Shift routine
-
-      int ff = K;
-
-      /// Shifted H defines a new K step Arnoldi factorization
-      RealD  beta = H[ff][ff-1]; 
-      RealD  sig  = Q[TM - 1][ff - 1];
-      std::cout << "beta = " << beta << " sig = " << real(sig) <<std::endl;
-
-      std::cout << "TM = " << TM << " ";
-      std::cout << norm2(bq[0]) << " -- before" <<std::endl;
-
-      /// q -> q Q
-      times_real(bq, Q, TM);
-
-      std::cout << norm2(bq[0]) << " -- after " << ff <<std::endl;
-      bf =  beta* bq[ff] + sig* bf;
-
-      /// Do the rest of the factorization
-      ff = Lanczos_Factor(ff, M,cont,bq,bf,H);
-      
-      if(ff < M)
-	Abort(ff, evals, evecs);
-    }
-
-///Run the Eigensolver
-    void Run(int cont, DenseVector<Field> &bq, Field &bf, DenseVector<DenseVector<RealD> > & evecs,DenseVector<RealD> &evals)
-    {
-      init();
-
-      int M=Nm;
-
-      DenseMatrix<RealD> H; Resize(H,Nm,Nm);
-      Resize(evals,Nm);
-      Resize(evecs,Nm);
-
-      int ff = Lanczos_Factor(0, M, cont, bq,bf,H); // 0--M to begin with
-
-      if(ff < M) {
-	std::cout << "Krylov: aborting ff "<<ff <<" "<<M<<std::endl;
-	abort(); // Why would this happen?
-      }
-
-      int itcount = 0;
-      bool stop = false;
-
-      for(int it = 0; it < Niter && (converged < Nk); ++it) {
-
-	std::cout << "Krylov: Iteration --> " << it << std::endl;
-	int lock_num = lock ? converged : 0;
-	DenseVector<RealD> tevals(M - lock_num );
-	DenseMatrix<RealD> tevecs; Resize(tevecs,M - lock_num,M - lock_num);
-	  
-	//check residual of polynominal 
-	TestConv(H,M, tevals, tevecs);
-
-	if(converged >= Nk)
-	    break;
-
-	ImplicitRestart(ff, tevals,tevecs,H);
-      }
-      Wilkinson<RealD>(H, evals, evecs, small); 
-      //      Check();
-
-      std::cout << "Done  "<<std::endl;
-
-    }
-
-   ///H - shift I = QR; H = Q* H Q
-    void Shift(DenseMatrix<RealD> & H,DenseMatrix<RealD> &Q, DenseVector<RealD> shifts) {
-      
-      int P; Size(shifts,P);
-      int M; SizeSquare(Q,M);
-
-      Unity(Q);
-
-      int lock_num = lock ? converged : 0;
-
-      RealD t_Househoulder_vector(0.0);
-      RealD t_Househoulder_mult(0.0);
-
-      for(int i=0;i<P;i++){
-
-	RealD x, y, z;
-	DenseVector<RealD> ck(3), v(3);
-	  
-	x = H[lock_num+0][lock_num+0]-shifts[i];
-	y = H[lock_num+1][lock_num+0];
-	ck[0] = x; ck[1] = y; ck[2] = 0; 
-
-	normalise(ck);	///Normalization cancels in PHP anyway
-	RealD beta;
-
-	Householder_vector<RealD>(ck, 0, 2, v, beta);
-	Householder_mult<RealD>(H,v,beta,0,lock_num+0,lock_num+2,0);
-	Householder_mult<RealD>(H,v,beta,0,lock_num+0,lock_num+2,1);
-	///Accumulate eigenvector
-	Householder_mult<RealD>(Q,v,beta,0,lock_num+0,lock_num+2,1);
-	  
-	int sw = 0;
-	for(int k=lock_num+0;k<M-2;k++){
-
-	  x = H[k+1][k]; 
-	  y = H[k+2][k]; 
-	  z = (RealD)0.0;
-	  if(k+3 <= M-1){
-	    z = H[k+3][k];
-	  }else{
-	    sw = 1; v[2] = 0.0;
-	  }
-
-	  ck[0] = x; ck[1] = y; ck[2] = z;
-
-	  normalise(ck);
-
-	  Householder_vector<RealD>(ck, 0, 2-sw, v, beta);
-	  Householder_mult<RealD>(H,v, beta,0,k+1,k+3-sw,0);
-	  Householder_mult<RealD>(H,v, beta,0,k+1,k+3-sw,1);
-	  ///Accumulate eigenvector
-	  Householder_mult<RealD>(Q,v, beta,0,k+1,k+3-sw,1);
-	}
-      }
-    }
-
-    void TestConv(DenseMatrix<RealD> & H,int SS, 
-		  DenseVector<Field> &bq, Field &bf,
-		  DenseVector<RealD> &tevals, DenseVector<DenseVector<RealD> > &tevecs, 
-		  int lock, int converged)
-    {
-      std::cout << "Converged " << converged << " so far." << std::endl;
-      int lock_num = lock ? converged : 0;
-      int M = Nm;
-
-      ///Active Factorization
-      DenseMatrix<RealD> AH; Resize(AH,SS - lock_num,SS - lock_num );
-
-      AH = GetSubMtx(H,lock_num, SS, lock_num, SS);
-
-      int NN=tevals.size();
-      int AHsize=SS-lock_num;
-
-      RealD small=1.0e-16;
-      Wilkinson<RealD>(AH, tevals, tevecs, small);
-
-      EigenSort(tevals, tevecs);
-
-      RealD resid_nrm=  norm2(bf);
-
-      if(!lock) converged = 0;
-#if 0
-      for(int i = SS - lock_num - 1; i >= SS - Nk && i >= 0; --i){
-
-	RealD diff = 0;
-	diff = abs( tevecs[i][Nm - 1 - lock_num] ) * resid_nrm;
-
-	std::cout << "residual estimate " << SS-1-i << " " << diff << " of (" << tevals[i] << ")" << std::endl;
-
-	if(diff < converged) {
-
-	  if(lock) {
-	    
-	    DenseMatrix<RealD> Q; Resize(Q,M,M);
-	    bool herm = true; 
-
-	    Lock(H, Q, tevals[i], converged, small, SS, herm);
-
-	    times_real(bq, Q, bq.size());
-	    bf = Q[M - 1][M - 1]* bf;
-	    lock_num++;
-	  }
-	  converged++;
-	  std::cout << " converged on eval " << converged << " of " << Nk << std::endl;
-	} else {
-	  break;
-	}
-      }
-#endif
-      std::cout << "Got " << converged << " so far " <<std::endl;	
-    }
-
-    ///Check
-    void Check(DenseVector<RealD> &evals,
-	       DenseVector<DenseVector<RealD> > &evecs) {
-
-      DenseVector<RealD> goodval(this->get);
-
-      EigenSort(evals,evecs);
-
-      int NM = Nm;
-
-      DenseVector< DenseVector<RealD> > V; Size(V,NM);
-      DenseVector<RealD> QZ(NM*NM);
-
-      for(int i = 0; i < NM; i++){
-	for(int j = 0; j < NM; j++){
-	  // evecs[i][j];
-	}
-      }
-    }
-
-
-/**
-   There is some matrix Q such that for any vector y
-   Q.e_1 = y and Q is unitary.
-**/
-  template<class T>
-  static T orthQ(DenseMatrix<T> &Q, DenseVector<T> y){
-    int N = y.size();	//Matrix Size
-    Fill(Q,0.0);
-    T tau;
-    for(int i=0;i<N;i++){
-      Q[i][0]=y[i];
-    }
-    T sig = conj(y[0])*y[0];
-    T tau0 = abs(sqrt(sig));
-    
-    for(int j=1;j<N;j++){
-      sig += conj(y[j])*y[j]; 
-      tau = abs(sqrt(sig) ); 	
-
-      if(abs(tau0) > 0.0){
-	
-	T gam = conj( (y[j]/tau)/tau0 );
-	for(int k=0;k<=j-1;k++){  
-	  Q[k][j]=-gam*y[k];
-	}
-	Q[j][j]=tau0/tau;
-      } else {
-	Q[j-1][j]=1.0;
-      }
-      tau0 = tau;
-    }
-    return tau;
-  }
-
-/**
-	There is some matrix Q such that for any vector y
-	Q.e_k = y and Q is unitary.
-**/
-  template< class T>
-  static T orthU(DenseMatrix<T> &Q, DenseVector<T> y){
-    T tau = orthQ(Q,y);
-    SL(Q);
-    return tau;
-  }
-
-
-/**
-	Wind up with a matrix with the first con rows untouched
-
-say con = 2
-	Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st colum
-	and the matrix is upper hessenberg
-	and with f and Q appropriately modidied with Q is the arnoldi factorization
-
-**/
-
-template<class T>
-static void Lock(DenseMatrix<T> &H, 	// Hess mtx	
-		 DenseMatrix<T> &Q, 	// Lock Transform
-		 T val, 		// value to be locked
-		 int con, 	// number already locked
-		 RealD small,
-		 int dfg,
-		 bool herm)
-{	
-  //ForceTridiagonal(H);
-
-  int M = H.dim;
-  DenseVector<T> vec; Resize(vec,M-con);
-
-  DenseMatrix<T> AH; Resize(AH,M-con,M-con);
-  AH = GetSubMtx(H,con, M, con, M);
-
-  DenseMatrix<T> QQ; Resize(QQ,M-con,M-con);
-
-  Unity(Q);   Unity(QQ);
-  
-  DenseVector<T> evals; Resize(evals,M-con);
-  DenseMatrix<T> evecs; Resize(evecs,M-con,M-con);
-
-  Wilkinson<T>(AH, evals, evecs, small);
-
-  int k=0;
-  RealD cold = abs( val - evals[k]); 
-  for(int i=1;i<M-con;i++){
-    RealD cnew = abs( val - evals[i]);
-    if( cnew < cold ){k = i; cold = cnew;}
-  }
-  vec = evecs[k];
-
-  ComplexD tau;
-  orthQ(QQ,vec);
-  //orthQM(QQ,AH,vec);
-
-  AH = Hermitian(QQ)*AH;
-  AH = AH*QQ;
-
-  for(int i=con;i<M;i++){
-    for(int j=con;j<M;j++){
-      Q[i][j]=QQ[i-con][j-con];
-      H[i][j]=AH[i-con][j-con];
-    }
-  }
-
-  for(int j = M-1; j>con+2; j--){
-
-    DenseMatrix<T> U; Resize(U,j-1-con,j-1-con);
-    DenseVector<T> z; Resize(z,j-1-con); 
-    T nm = norm(z); 
-    for(int k = con+0;k<j-1;k++){
-      z[k-con] = conj( H(j,k+1) );
-    }
-    normalise(z);
-
-    RealD tmp = 0;
-    for(int i=0;i<z.size()-1;i++){tmp = tmp + abs(z[i]);}
-
-    if(tmp < small/( (RealD)z.size()-1.0) ){ continue;}	
-
-    tau = orthU(U,z);
-
-    DenseMatrix<T> Hb; Resize(Hb,j-1-con,M);	
-	
-    for(int a = 0;a<M;a++){
-      for(int b = 0;b<j-1-con;b++){
-	T sum = 0;
-	for(int c = 0;c<j-1-con;c++){
-	  sum += H[a][con+1+c]*U[c][b];
-	}//sum += H(a,con+1+c)*U(c,b);}
-	Hb[b][a] = sum;
-      }
-    }
-	
-    for(int k=con+1;k<j;k++){
-      for(int l=0;l<M;l++){
-	H[l][k] = Hb[k-1-con][l];
-      }
-    }//H(Hb[k-1-con][l] , l,k);}}
-
-    DenseMatrix<T> Qb; Resize(Qb,M,M);	
-	
-    for(int a = 0;a<M;a++){
-      for(int b = 0;b<j-1-con;b++){
-	T sum = 0;
-	for(int c = 0;c<j-1-con;c++){
-	  sum += Q[a][con+1+c]*U[c][b];
-	}//sum += Q(a,con+1+c)*U(c,b);}
-	Qb[b][a] = sum;
-      }
-    }
-	
-    for(int k=con+1;k<j;k++){
-      for(int l=0;l<M;l++){
-	Q[l][k] = Qb[k-1-con][l];
-      }
-    }//Q(Qb[k-1-con][l] , l,k);}}
-
-    DenseMatrix<T> Hc; Resize(Hc,M,M);	
-	
-    for(int a = 0;a<j-1-con;a++){
-      for(int b = 0;b<M;b++){
-	T sum = 0;
-	for(int c = 0;c<j-1-con;c++){
-	  sum += conj( U[c][a] )*H[con+1+c][b];
-	}//sum += conj( U(c,a) )*H(con+1+c,b);}
-	Hc[b][a] = sum;
-      }
-    }
-
-    for(int k=0;k<M;k++){
-      for(int l=con+1;l<j;l++){
-	H[l][k] = Hc[k][l-1-con];
-      }
-    }//H(Hc[k][l-1-con] , l,k);}}
-
-  }
-}
-#endif
-
-
 };

 }
--- a/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
+++ b/lib/qcd/hmc/checkpointers/ILDGCheckpointer.h
@@ -102,7 +102,7 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
    FieldMetaData header;
    IldgReader _IldgReader;
    _IldgReader.open(config);
-    _IldgReader.readConfiguration(config,U,header);  // format from the header
+    _IldgReader.readConfiguration(U,header);  // format from the header
    _IldgReader.close();

    std::cout << GridLogMessage << "Read ILDG Configuration from " << config
--- a/tests/solver/Test_dwf_lanczos.cc
+++ b/tests/solver/Test_dwf_lanczos.cc
@@ -54,7 +54,7 @@ int main (int argc, char ** argv)
  GridParallelRNG          RNG5rb(FrbGrid);  RNG5.SeedFixedIntegers(seeds5);

  LatticeGaugeField Umu(UGrid); 
-  SU3::TepidConfiguration(RNG4, Umu);
+  SU3::HotConfiguration(RNG4, Umu);

  std::vector<LatticeColourMatrix> U(4,UGrid);
  for(int mu=0;mu<Nd;mu++){