From 19d0590579daa2b458bda98559c2ff3fb3ce97dd Mon Sep 17 00:00:00 2001
From: Chulwoo Jung <chulwoo@ic2submit01.sdcc.bnl.gov>
Date: Fri, 25 Apr 2025 10:48:22 -0400
Subject: [PATCH 1/7] Checking in for merging

---
 benchmarks/Benchmark_usqcd.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/Benchmark_usqcd.cc b/benchmarks/Benchmark_usqcd.cc
index e400138b..da78b91e 100644
--- a/benchmarks/Benchmark_usqcd.cc
+++ b/benchmarks/Benchmark_usqcd.cc
@@ -868,7 +868,7 @@ int main (int argc, char ** argv)
   int do_su4=0;
   int do_memory=1;
   int do_comms =1;
-  int do_blas  =1;
+  int do_blas  =0;
   int do_dslash=1;
 
   int sel=4;

From 8419cc5c64c2fc25a7702ce77b84967f608894f9 Mon Sep 17 00:00:00 2001
From: Chulwoo Jung <chulwoo@ic2submit02.sdcc.bnl.gov>
Date: Fri, 11 Jul 2025 15:57:23 -0400
Subject: [PATCH 2/7] specflow evec I/O added,

---
 Grid/threads/ThreadReduction.h         |  5 +++++
 tests/lanczos/Test_dwf_G5R5.cc         | 11 +++++++---
 tests/lanczos/Test_wilson_DWFKernel.cc | 10 +++++++--
 tests/lanczos/Test_wilson_specflow.cc  | 28 ++++++++++++++++++++++++++
 4 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/Grid/threads/ThreadReduction.h b/Grid/threads/ThreadReduction.h
index f0d24d50..2cb3e90d 100644
--- a/Grid/threads/ThreadReduction.h
+++ b/Grid/threads/ThreadReduction.h
@@ -28,6 +28,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 #pragma once 
 
+#ifndef MIN
+#define MIN(x,y) ((x)>(y)?(y):(x))
+#endif
+
+
 // Introduce a class to gain deterministic bit reproducible reduction.
 // make static; perhaps just a namespace is required.
 NAMESPACE_BEGIN(Grid);
diff --git a/tests/lanczos/Test_dwf_G5R5.cc b/tests/lanczos/Test_dwf_G5R5.cc
index 0fd1bc35..1027e4af 100644
--- a/tests/lanczos/Test_dwf_G5R5.cc
+++ b/tests/lanczos/Test_dwf_G5R5.cc
@@ -33,9 +33,13 @@ using namespace std;
 using namespace Grid;
  ;
 
-//typedef WilsonFermionD FermionOp;
+#if 0
 typedef DomainWallFermionD FermionOp;
 typedef typename DomainWallFermionD::FermionField FermionField;
+#else
+typedef MobiusFermionD FermionOp;
+typedef typename MobiusFermionD::FermionField FermionField;
+#endif
 
 
 RealD AllZero(RealD x) { return 0.; }
@@ -170,10 +174,11 @@ int main(int argc, char** argv) {
   int Nm = Nk + Np;
   int MaxIt = 10000;
   RealD resid = 1.0e-5;
-
+  RealD mob_b=1.5;
 
 //while ( mass > - 5.0){
-  FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+//  FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+  FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.);
   MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <-----
 //  Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
   Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf);
diff --git a/tests/lanczos/Test_wilson_DWFKernel.cc b/tests/lanczos/Test_wilson_DWFKernel.cc
index 30cbd1b3..ab60d780 100644
--- a/tests/lanczos/Test_wilson_DWFKernel.cc
+++ b/tests/lanczos/Test_wilson_DWFKernel.cc
@@ -113,6 +113,9 @@ struct LanczosParameters: Serializable {
   GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 		  		RealD, mass , 
 				RealD, resid,
+				Integer, Nstop,
+                                Integer, Nk,
+                                Integer, Np,
 	  			RealD, ChebyLow,
 	  			RealD, ChebyHigh,
 	  			Integer, ChebyOrder)
@@ -204,7 +207,6 @@ int main(int argc, char** argv) {
   int Nstop = 5;
   int Nk = 10;
   int Np = 90;
-  int Nm = Nk + Np;
   int MaxIt = 10000;
   RealD resid = 1.0e-5;
 
@@ -226,10 +228,14 @@ int main(int argc, char** argv) {
     XmlWriter HMCwr("LanParams.xml.out");
     write(HMCwr,"LanczosParameters",LanParams);
   }
-
+  Nstop=LanParams.Nstop;
+  Nk=LanParams.Nk;
+  Np=LanParams.Np;
   mass=LanParams.mass;
   resid=LanParams.resid;
 
+  int Nm = Nk + Np;
+
 
 while ( mass > - 5.0){
   FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass);
diff --git a/tests/lanczos/Test_wilson_specflow.cc b/tests/lanczos/Test_wilson_specflow.cc
index e9bd04df..a23afa94 100644
--- a/tests/lanczos/Test_wilson_specflow.cc
+++ b/tests/lanczos/Test_wilson_specflow.cc
@@ -27,6 +27,7 @@ directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
+#include <Grid/parallelIO/IldgIOtypes.h>
 
 using namespace std;
 using namespace Grid;
@@ -38,11 +39,28 @@ typedef typename WilsonFermionD::FermionField FermionField;
 
 RealD AllZero(RealD x) { return 0.; }
 
+template <class T> void writeFile(T& in, std::string const fname){
+#if 1
+  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
+  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
+  Grid::emptyUserRecord record;
+  Grid::ScidacWriter WR(in.Grid()->IsBoss());
+  WR.open(fname);
+  WR.writeScidacFieldRecord(in,record,0);
+  WR.close();
+#endif
+  // What is the appropriate way to throw error?
+}
+
+
 namespace Grid {
 
 struct LanczosParameters: Serializable {
   GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 		  		RealD, mass , 
+				Integer, Nstop,
+                                Integer, Nk,
+                                Integer, Np,
 	  			RealD, ChebyLow,
 	  			RealD, ChebyHigh,
 	  			Integer, ChebyOrder)
@@ -158,6 +176,10 @@ int main(int argc, char** argv) {
   }
 
   mass=LanParams.mass;
+  Nstop=LanParams.Nstop;
+  Nk=LanParams.Nk;
+  Np=LanParams.Np;
+  Nm = Nk + Np;
 
 
 while ( mass > - 5.0){
@@ -202,6 +224,12 @@ while ( mass > - 5.0){
     tmp = g5*evec[i];
     dot = innerProduct(tmp,evec[i]);
     std::cout << mass << " : " << eval[i]  << " " << real(dot) << " " << imag(dot)  << std::endl ;
+//    if ( i<1)
+    {
+	std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
+        auto evdensity = localInnerProduct(evec[i],evec[i] );
+	writeFile(evdensity,evfile);
+    }
   }
   src  = evec[0]+evec[1]+evec[2];
   mass += -0.1;

From c606f5dca063cdc5eaccedc49b0446a53fa66315 Mon Sep 17 00:00:00 2001
From: Chulwoo Jung <chulwoo@aurora-uan-0009.hostmgmt.cm.aurora.alcf.anl.gov>
Date: Wed, 6 Aug 2025 16:51:14 +0000
Subject: [PATCH 3/7] Move out src initialization for re-use / Adding
 antiperiodic BC

---
 tests/lanczos/Test_wilson_specflow.cc | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/tests/lanczos/Test_wilson_specflow.cc b/tests/lanczos/Test_wilson_specflow.cc
index a23afa94..4e60a81d 100644
--- a/tests/lanczos/Test_wilson_specflow.cc
+++ b/tests/lanczos/Test_wilson_specflow.cc
@@ -58,6 +58,7 @@ namespace Grid {
 struct LanczosParameters: Serializable {
   GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 		  		RealD, mass , 
+		  		RealD, mstep , 
 				Integer, Nstop,
                                 Integer, Nk,
                                 Integer, Np,
@@ -181,9 +182,14 @@ int main(int argc, char** argv) {
   Np=LanParams.Np;
   Nm = Nk + Np;
 
+  FermionField src(FGrid);
+  gaussian(RNG5, src);
+  std::vector<Complex> boundary = {1,1,1,-1};
+  FermionOp::ImplParams Params(boundary);
 
-while ( mass > - 5.0){
-  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
+
+while ( mass > - 2.5){
+  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass,Params);
   MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
   //SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
   Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
@@ -204,8 +210,6 @@ while ( mass > - 5.0){
   ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
 
   std::vector<RealD> eval(Nm);
-  FermionField src(FGrid);
-  gaussian(RNG5, src);
   std::vector<FermionField> evec(Nm, FGrid);
   for (int i = 0; i < 1; i++) {
     std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
@@ -232,7 +236,9 @@ while ( mass > - 5.0){
     }
   }
   src  = evec[0]+evec[1]+evec[2];
-  mass += -0.1;
+  src  += evec[3]+evec[4]+evec[5];
+  src  += evec[6]+evec[7]+evec[8];
+  mass += LanParams.mstep;
 }
 
   Grid_finalize();

From 2bf9179d2c5d47def054876323fdb3816c7227a1 Mon Sep 17 00:00:00 2001
From: Chulwoo Jung <chulwoo@aurora-uan-0009.hostmgmt.cm.aurora.alcf.anl.gov>
Date: Wed, 6 Aug 2025 16:52:51 +0000
Subject: [PATCH 4/7] Adding mass step

---
 tests/lanczos/LanParams.xml | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tests/lanczos/LanParams.xml b/tests/lanczos/LanParams.xml
index 5328b46b..1f967894 100644
--- a/tests/lanczos/LanParams.xml
+++ b/tests/lanczos/LanParams.xml
@@ -1,14 +1,15 @@
 <?xml version="1.0"?>
 <grid>
   <LanczosParameters>
-    <mass>0.00107</mass>
+    <mass>-1.025</mass>
+    <mstep>-0.025</mstep>
     <M5>1.8</M5>
     <Ls>48</Ls>
     <Nstop>10</Nstop>
-    <Nk>15</Nk>
-    <Np>85</Np>
-    <ChebyLow>0.003</ChebyLow>
-    <ChebyHigh>60</ChebyHigh>
-    <ChebyOrder>201</ChebyOrder>
+    <Nk>12</Nk>
+    <Np>30</Np>
+    <ChebyLow>0.1</ChebyLow>
+    <ChebyHigh>50</ChebyHigh>
+    <ChebyOrder>51</ChebyOrder>
   </LanczosParameters>
 </grid>

From 7780d88d26bd25ad69efa1e9687113bcca0c6d6e Mon Sep 17 00:00:00 2001
From: Chulwoo Jung <chulwoo@aurora-uan-0009.hostmgmt.cm.aurora.alcf.anl.gov>
Date: Wed, 6 Aug 2025 23:41:53 +0000
Subject: [PATCH 5/7] Adding simple lanczos, boundary to specflow(!)

---
 Grid/algorithms/Algorithms.h              |   1 +
 Grid/algorithms/iterative/SimpleLanczos.h | 931 ++++++++++++++++++++++
 tests/lanczos/Test_wilson_lanczos.cc      |  19 +-
 tests/lanczos/Test_wilson_specflow.cc     |   4 +
 4 files changed, 950 insertions(+), 5 deletions(-)
 create mode 100644 Grid/algorithms/iterative/SimpleLanczos.h

diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h
index 38f6587c..6f22fea4 100644
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -73,6 +73,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
+#include <Grid/algorithms/iterative/SimpleLanczos.h>
 #include <Grid/algorithms/iterative/PowerMethod.h>
 #include <Grid/algorithms/iterative/AdefGeneric.h>
 #include <Grid/algorithms/iterative/AdefMrhs.h>
diff --git a/Grid/algorithms/iterative/SimpleLanczos.h b/Grid/algorithms/iterative/SimpleLanczos.h
new file mode 100644
index 00000000..0aedc006
--- /dev/null
+++ b/Grid/algorithms/iterative/SimpleLanczos.h
@@ -0,0 +1,931 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
+
+    Copyright (C) 2015
+
+Author: Chulwoo Jung <chulwoo@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LANC_H
+#define GRID_LANC_H
+
+#include <string.h>		//memset
+
+#ifdef USE_LAPACK
+#ifdef USE_MKL
+#include<mkl_lapack.h>
+#else
+void LAPACK_dstegr (char *jobz, char *range, int *n, double *d, double *e,
+		    double *vl, double *vu, int *il, int *iu, double *abstol,
+		    int *m, double *w, double *z, int *ldz, int *isuppz,
+		    double *work, int *lwork, int *iwork, int *liwork,
+		    int *info);
+//#include <lapacke/lapacke.h>
+#endif
+#endif
+
+//#include <Grid/algorithms/densematrix/DenseMatrix.h>
+
+// eliminate temorary vector in calc()
+#define MEM_SAVE
+
+namespace Grid
+{
+
+  struct Bisection
+  {
+
+#if 0
+    static void get_eig2 (int row_num, std::vector < RealD > &ALPHA,
+			  std::vector < RealD > &BETA,
+			  std::vector < RealD > &eig)
+    {
+      int i, j;
+        std::vector < RealD > evec1 (row_num + 3);
+        std::vector < RealD > evec2 (row_num + 3);
+      RealD eps2;
+        ALPHA[1] = 0.;
+        BETHA[1] = 0.;
+      for (i = 0; i < row_num - 1; i++)
+	{
+	  ALPHA[i + 1] = A[i * (row_num + 1)].real ();
+	  BETHA[i + 2] = A[i * (row_num + 1) + 1].real ();
+	}
+      ALPHA[row_num] = A[(row_num - 1) * (row_num + 1)].real ();
+        bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-10, 1e-10, evec1, eps2);
+        bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-16, 1e-16, evec2, eps2);
+
+      // Do we really need to sort here?
+      int begin = 1;
+      int end = row_num;
+      int swapped = 1;
+      while (swapped)
+	{
+	  swapped = 0;
+	  for (i = begin; i < end; i++)
+	    {
+	      if (mag (evec2[i]) > mag (evec2[i + 1]))
+		{
+		  swap (evec2 + i, evec2 + i + 1);
+		  swapped = 1;
+		}
+	    }
+	  end--;
+	  for (i = end - 1; i >= begin; i--)
+	    {
+	      if (mag (evec2[i]) > mag (evec2[i + 1]))
+		{
+		  swap (evec2 + i, evec2 + i + 1);
+		  swapped = 1;
+		}
+	    }
+	  begin++;
+	}
+
+      for (i = 0; i < row_num; i++)
+	{
+	  for (j = 0; j < row_num; j++)
+	    {
+	      if (i == j)
+		H[i * row_num + j] = evec2[i + 1];
+	      else
+		H[i * row_num + j] = 0.;
+	    }
+	}
+    }
+#endif
+
+    static void bisec (std::vector < RealD > &c,
+		       std::vector < RealD > &b,
+		       int n,
+		       int m1,
+		       int m2,
+		       RealD eps1,
+		       RealD relfeh, std::vector < RealD > &x, RealD & eps2)
+    {
+      std::vector < RealD > wu (n + 2);
+
+      RealD h, q, x1, xu, x0, xmin, xmax;
+      int i, a, k;
+
+      b[1] = 0.0;
+      xmin = c[n] - fabs (b[n]);
+      xmax = c[n] + fabs (b[n]);
+      for (i = 1; i < n; i++)
+	{
+	  h = fabs (b[i]) + fabs (b[i + 1]);
+	  if (c[i] + h > xmax)
+	    xmax = c[i] + h;
+	  if (c[i] - h < xmin)
+	    xmin = c[i] - h;
+	}
+      xmax *= 2.;
+
+      eps2 = relfeh * ((xmin + xmax) > 0.0 ? xmax : -xmin);
+      if (eps1 <= 0.0)
+	eps1 = eps2;
+      eps2 = 0.5 * eps1 + 7.0 * (eps2);
+      x0 = xmax;
+      for (i = m1; i <= m2; i++)
+	{
+	  x[i] = xmax;
+	  wu[i] = xmin;
+	}
+
+      for (k = m2; k >= m1; k--)
+	{
+	  xu = xmin;
+	  i = k;
+	  do
+	    {
+	      if (xu < wu[i])
+		{
+		  xu = wu[i];
+		  i = m1 - 1;
+		}
+	      i--;
+	    }
+	  while (i >= m1);
+	  if (x0 > x[k])
+	    x0 = x[k];
+	  while ((x0 - xu) > 2 * relfeh * (fabs (xu) + fabs (x0)) + eps1)
+	    {
+	      x1 = (xu + x0) / 2;
+
+	      a = 0;
+	      q = 1.0;
+	      for (i = 1; i <= n; i++)
+		{
+		  q =
+		    c[i] - x1 -
+		    ((q != 0.0) ? b[i] * b[i] / q : fabs (b[i]) / relfeh);
+		  if (q < 0)
+		    a++;
+		}
+//      printf("x1=%0.14e a=%d\n",x1,a);
+	      if (a < k)
+		{
+		  if (a < m1)
+		    {
+		      xu = x1;
+		      wu[m1] = x1;
+		    }
+		  else
+		    {
+		      xu = x1;
+		      wu[a + 1] = x1;
+		      if (x[a] > x1)
+			x[a] = x1;
+		    }
+		}
+	      else
+		x0 = x1;
+	    }
+	  printf ("x0=%0.14e xu=%0.14e k=%d\n", x0, xu, k);
+	  x[k] = (x0 + xu) / 2;
+	}
+    }
+  };
+
+/////////////////////////////////////////////////////////////
+// Implicitly restarted lanczos
+/////////////////////////////////////////////////////////////
+
+
+  template < class Field > class SimpleLanczos
+  {
+
+    const RealD small = 1.0e-16;
+  public:
+    int lock;
+    int get;
+    int Niter;
+    int converged;
+
+    int Nstop;			// Number of evecs checked for convergence
+    int Nk;			// Number of converged sought
+    int Np;			// Np -- Number of spare vecs in kryloc space
+    int Nm;			// Nm -- total number of vectors
+
+
+    RealD OrthoTime;
+
+    RealD eresid;
+
+//    SortEigen < Field > _sort;
+
+    LinearFunction < Field > &_Linop;
+
+//    OperatorFunction < Field > &_poly;
+
+    /////////////////////////
+    // Constructor
+    /////////////////////////
+    void init (void)
+    {
+    };
+//    void Abort (int ff, std::vector < RealD > &evals, DenseVector < Denstd::vector  < RealD > >&evecs);
+
+    SimpleLanczos (LinearFunction < Field > &Linop,	// op
+//		   OperatorFunction < Field > &poly,	// polynmial
+		   int _Nstop,	// sought vecs
+		   int _Nk,	// sought vecs
+		   int _Nm,	// spare vecs
+		   RealD _eresid,	// resid in lmdue deficit 
+		   int _Niter):	// Max iterations
+     
+      _Linop (Linop),
+ //     _poly (poly),
+      Nstop (_Nstop), Nk (_Nk), Nm (_Nm), eresid (_eresid), Niter (_Niter)
+    {
+      Np = Nm - Nk;
+      assert (Np > 0);
+    };
+
+    /////////////////////////
+    // Sanity checked this routine (step) against Saad.
+    /////////////////////////
+    void RitzMatrix (std::vector < Field > &evec, int k)
+    {
+
+      if (1)
+	return;
+
+      GridBase *grid = evec[0].Grid();
+      Field w (grid);
+      std::cout << GridLogMessage << "RitzMatrix " << std::endl;
+      for (int i = 0; i < k; i++)
+	{
+	  _Linop(evec[i], w);
+//      _poly(_Linop,evec[i],w);
+	  std::cout << GridLogMessage << "[" << i << "] ";
+	  for (int j = 0; j < k; j++)
+	    {
+	      ComplexD in = innerProduct (evec[j], w);
+	      if (fabs ((double) i - j) > 1)
+		{
+		  if (abs (in) > 1.0e-9)
+		    {
+		      std::cout << GridLogMessage << "oops" << std::endl;
+		      abort ();
+		    }
+		  else
+		    std::cout << GridLogMessage << " 0 ";
+		}
+	      else
+		{
+		  std::cout << GridLogMessage << " " << in << " ";
+		}
+	    }
+	  std::cout << GridLogMessage << std::endl;
+	}
+    }
+
+    void step (std::vector < RealD > &lmd,
+	       std::vector < RealD > &lme,
+	       Field & last, Field & current, Field & next, uint64_t k)
+    {
+      if (lmd.size () <= k)
+	lmd.resize (k + Nm);
+      if (lme.size () <= k)
+	lme.resize (k + Nm);
+
+
+//      _poly(_Linop,current,next );   // 3. wk:=Avk−βkv_{k−1}
+      _Linop(current, next);	// 3. wk:=Avk−βkv_{k−1}
+      if (k > 0)
+	{
+	  next -= lme[k - 1] * last;
+	}
+//      std::cout<<GridLogMessage << "<last|next>" << innerProduct(last,next) <<std::endl;
+
+      ComplexD zalph = innerProduct (current, next);	// 4. αk:=(wk,vk)
+      RealD alph = real (zalph);
+
+      next = next - alph * current;	// 5. wk:=wk−αkvk
+//      std::cout<<GridLogMessage << "<current|next>" << innerProduct(current,next) <<std::endl;
+
+      RealD beta = normalise (next);	// 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
+      // 7. vk+1 := wk/βk+1
+//       norm=beta;
+
+      int interval = Nm / 100 + 1;
+      if ((k % interval) == 0)
+	std::
+	  cout << GridLogMessage << k << " : alpha = " << zalph << " beta " <<
+	  beta << std::endl;
+      const RealD tiny = 1.0e-20;
+      if (beta < tiny)
+	{
+	  std::cout << GridLogMessage << " beta is tiny " << beta << std::
+	    endl;
+	}
+      lmd[k] = alph;
+      lme[k] = beta;
+
+    }
+
+    void qr_decomp (std::vector < RealD > &lmd,
+		    std::vector  < RealD > &lme,
+		    int Nk,
+		    int Nm,
+		    std::vector  < RealD > &Qt, RealD Dsh, int kmin, int kmax)
+    {
+      int k = kmin - 1;
+      RealD x;
+
+      RealD Fden = 1.0 / hypot (lmd[k] - Dsh, lme[k]);
+      RealD c = (lmd[k] - Dsh) * Fden;
+      RealD s = -lme[k] * Fden;
+
+      RealD tmpa1 = lmd[k];
+      RealD tmpa2 = lmd[k + 1];
+      RealD tmpb = lme[k];
+
+      lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
+      lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
+      lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
+      x = -s * lme[k + 1];
+      lme[k + 1] = c * lme[k + 1];
+
+      for (int i = 0; i < Nk; ++i)
+	{
+	  RealD Qtmp1 = Qt[i + Nm * k];
+	  RealD Qtmp2 = Qt[i + Nm * (k + 1)];
+	  Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
+	  Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
+	}
+
+      // Givens transformations
+      for (int k = kmin; k < kmax - 1; ++k)
+	{
+
+	  RealD Fden = 1.0 / hypot (x, lme[k - 1]);
+	  RealD c = lme[k - 1] * Fden;
+	  RealD s = -x * Fden;
+
+	  RealD tmpa1 = lmd[k];
+	  RealD tmpa2 = lmd[k + 1];
+	  RealD tmpb = lme[k];
+
+	  lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
+	  lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
+	  lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
+	  lme[k - 1] = c * lme[k - 1] - s * x;
+
+	  if (k != kmax - 2)
+	    {
+	      x = -s * lme[k + 1];
+	      lme[k + 1] = c * lme[k + 1];
+	    }
+
+	  for (int i = 0; i < Nk; ++i)
+	    {
+	      RealD Qtmp1 = Qt[i + Nm * k];
+	      RealD Qtmp2 = Qt[i + Nm * (k + 1)];
+	      Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
+	      Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
+	    }
+	}
+    }
+
+#if 0
+#ifdef USE_LAPACK
+#ifdef USE_MKL
+#define LAPACK_INT MKL_INT
+#else
+#define LAPACK_INT long long
+#endif
+    void diagonalize_lapack (std::vector  < RealD > &lmd, std::vector  < RealD > &lme, int N1,	// all
+			     int N2,	// get
+			     GridBase * grid)
+    {
+      const int size = Nm;
+      LAPACK_INT NN = N1;
+      double evals_tmp[NN];
+      double DD[NN];
+      double EE[NN];
+      for (int i = 0; i < NN; i++)
+	for (int j = i - 1; j <= i + 1; j++)
+	  if (j < NN && j >= 0)
+	    {
+	      if (i == j)
+		DD[i] = lmd[i];
+	      if (i == j)
+		evals_tmp[i] = lmd[i];
+	      if (j == (i - 1))
+		EE[j] = lme[j];
+	    }
+      LAPACK_INT evals_found;
+      LAPACK_INT lwork =
+	((18 * NN) >
+	 (1 + 4 * NN + NN * NN) ? (18 * NN) : (1 + 4 * NN + NN * NN));
+      LAPACK_INT liwork = 3 + NN * 10;
+      LAPACK_INT iwork[liwork];
+      double work[lwork];
+      LAPACK_INT isuppz[2 * NN];
+      char jobz = 'N';		// calculate evals only
+      char range = 'I';		// calculate il-th to iu-th evals
+      //    char range = 'A'; // calculate all evals
+      char uplo = 'U';		// refer to upper half of original matrix
+      char compz = 'I';		// Compute eigenvectors of tridiagonal matrix
+      int ifail[NN];
+      LAPACK_INT info;
+//  int total = QMP_get_number_of_nodes();
+//  int node = QMP_get_node_number();
+//  GridBase *grid = evec[0]._grid;
+      int total = grid->_Nprocessors;
+      int node = grid->_processor;
+      int interval = (NN / total) + 1;
+      double vl = 0.0, vu = 0.0;
+      LAPACK_INT il = interval * node + 1, iu = interval * (node + 1);
+      if (iu > NN)
+	iu = NN;
+      double tol = 0.0;
+      if (1)
+	{
+	  memset (evals_tmp, 0, sizeof (double) * NN);
+	  if (il <= NN)
+	    {
+	      printf ("total=%d node=%d il=%d iu=%d\n", total, node, il, iu);
+#ifdef USE_MKL
+	      dstegr (&jobz, &range, &NN,
+#else
+	      LAPACK_dstegr (&jobz, &range, &NN,
+#endif
+			     (double *) DD, (double *) EE, &vl, &vu, &il, &iu,	// these four are ignored if second parameteris 'A'
+			     &tol,	// tolerance
+			     &evals_found, evals_tmp, (double *) NULL, &NN,
+			     isuppz, work, &lwork, iwork, &liwork, &info);
+	      for (int i = iu - 1; i >= il - 1; i--)
+		{
+		  printf ("node=%d evals_found=%d evals_tmp[%d] = %g\n", node,
+			  evals_found, i - (il - 1), evals_tmp[i - (il - 1)]);
+		  evals_tmp[i] = evals_tmp[i - (il - 1)];
+		  if (il > 1)
+		    evals_tmp[i - (il - 1)] = 0.;
+		}
+	    }
+	  {
+	    grid->GlobalSumVector (evals_tmp, NN);
+	  }
+	}
+// cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
+    }
+#undef LAPACK_INT
+#endif
+
+
+    void diagonalize (std::vector  < RealD > &lmd,
+		      std::vector  < RealD > &lme,
+		      int N2, int N1, GridBase * grid)
+    {
+
+#ifdef USE_LAPACK
+      const int check_lapack = 0;	// just use lapack if 0, check against lapack if 1
+
+      if (!check_lapack)
+	return diagonalize_lapack (lmd, lme, N2, N1, grid);
+
+//      diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
+#endif
+    }
+#endif
+
+    static RealD normalise (Field & v)
+    {
+      RealD nn = norm2 (v);
+      nn = sqrt (nn);
+      v = v * (1.0 / nn);
+      return nn;
+    }
+
+    void orthogonalize (Field & w, std::vector < Field > &evec, int k)
+    {
+      double t0 = -usecond () / 1e6;
+      typedef typename Field::scalar_type MyComplex;
+      MyComplex ip;
+
+      if (0)
+	{
+	  for (int j = 0; j < k; ++j)
+	    {
+	      normalise (evec[j]);
+	      for (int i = 0; i < j; i++)
+		{
+		  ip = innerProduct (evec[i], evec[j]);	// are the evecs normalised? ; this assumes so.
+		  evec[j] = evec[j] - ip * evec[i];
+		}
+	    }
+	}
+
+      for (int j = 0; j < k; ++j)
+	{
+	  ip = innerProduct (evec[j], w);	// are the evecs normalised? ; this assumes so.
+	  w = w - ip * evec[j];
+	}
+      normalise (w);
+      t0 += usecond () / 1e6;
+      OrthoTime += t0;
+    }
+
+    void setUnit_Qt (int Nm, std::vector < RealD > &Qt)
+    {
+      for (int i = 0; i < Qt.size (); ++i)
+	Qt[i] = 0.0;
+      for (int k = 0; k < Nm; ++k)
+	Qt[k + k * Nm] = 1.0;
+    }
+
+
+    void calc (std::vector < RealD > &eval, const Field & src, int &Nconv)
+    {
+
+      GridBase *grid = src.Grid();
+//      assert(grid == src._grid);
+
+      std::
+	cout << GridLogMessage << " -- Nk = " << Nk << " Np = " << Np << std::
+	endl;
+      std::cout << GridLogMessage << " -- Nm = " << Nm << std::endl;
+      std::cout << GridLogMessage << " -- size of eval   = " << eval.
+	size () << std::endl;
+
+//      assert(c.size() && Nm == eval.size());
+
+      std::vector < RealD > lme (Nm);
+      std::vector < RealD > lmd (Nm);
+
+
+      Field current (grid);
+      Field last (grid);
+      Field next (grid);
+
+      Nconv = 0;
+
+      RealD beta_k;
+
+      // Set initial vector
+      // (uniform vector) Why not src??
+      //      evec[0] = 1.0;
+      current = src;
+      std::cout << GridLogMessage << "norm2(src)= " << norm2 (src) << std::
+	endl;
+      normalise (current);
+      std::
+	cout << GridLogMessage << "norm2(evec[0])= " << norm2 (current) <<
+	std::endl;
+
+      // Initial Nk steps
+      OrthoTime = 0.;
+      double t0 = usecond () / 1e6;
+      RealD norm;		// sqrt norm of last vector
+
+      uint64_t iter = 0;
+
+      bool initted = false;
+      std::vector < RealD > low (Nstop * 10);
+      std::vector < RealD > high (Nstop * 10);
+      RealD cont = 0.;
+      while (1) {
+	  cont = 0.;
+	  std::vector < RealD > lme2 (Nm);
+	  std::vector < RealD > lmd2 (Nm);
+	  for (uint64_t k = 0; k < Nm; ++k, iter++) {
+	      step (lmd, lme, last, current, next, iter);
+	      last = current;
+	      current = next;
+	    }
+	  double t1 = usecond () / 1e6;
+	  std::cout << GridLogMessage << "IRL::Initial steps: " << t1 -
+	    t0 << "seconds" << std::endl;
+	  t0 = t1;
+	  std::
+	    cout << GridLogMessage << "IRL::Initial steps:OrthoTime " <<
+	    OrthoTime << "seconds" << std::endl;
+
+	  // getting eigenvalues
+	  lmd2.resize (iter + 2);
+	  lme2.resize (iter + 2);
+	  for (uint64_t k = 0; k < iter; ++k) {
+	      lmd2[k + 1] = lmd[k];
+	      lme2[k + 2] = lme[k];
+	    }
+	  t1 = usecond () / 1e6;
+	  std::cout << GridLogMessage << "IRL:: copy: " << t1 -
+	    t0 << "seconds" << std::endl;
+	  t0 = t1;
+	  {
+	    int total = grid->_Nprocessors;
+	    int node = grid->_processor;
+	    int interval = (Nstop / total) + 1;
+	    int iu = (iter + 1) - (interval * node + 1);
+	    int il = (iter + 1) - (interval * (node + 1));
+	    std::vector < RealD > eval2 (iter + 3);
+	    RealD eps2;
+	    Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
+			      eps2);
+//        diagonalize(eval2,lme2,iter,Nk,grid);
+	    RealD diff = 0.;
+	    for (int i = il; i <= iu; i++) {
+		if (initted)
+		  diff =
+		    fabs (eval2[i] - high[iu-i]) / (fabs (eval2[i]) +
+						      fabs (high[iu-i]));
+		if (initted && (diff > eresid))
+		  cont = 1.;
+		if (initted)
+		  printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
+			  high[iu-i], diff);
+		high[iu-i] = eval2[i];
+	      }
+	    il = (interval * node + 1);
+	    iu = (interval * (node + 1));
+	    Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
+			      eps2);
+	    for (int i = il; i <= iu; i++) {
+		if (initted)
+		  diff =
+		    fabs (eval2[i] - low[i]) / (fabs (eval2[i]) +
+						fabs (low[i]));
+		if (initted && (diff > eresid))
+		  cont = 1.;
+		if (initted)
+		  printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
+			  low[i], diff);
+		low[i] = eval2[i];
+	      }
+	    t1 = usecond () / 1e6;
+	    std::cout << GridLogMessage << "IRL:: diagonalize: " << t1 -
+	      t0 << "seconds" << std::endl;
+	    t0 = t1;
+	  }
+
+	  for (uint64_t k = 0; k < Nk; ++k) {
+//          eval[k] = eval2[k];
+	    }
+	  if (initted)
+	    {
+	      grid->GlobalSumVector (&cont, 1);
+	      if (cont < 1.) return;
+	    }
+	  initted = true;
+	}
+
+    }
+
+
+
+
+
+#if 0
+
+/**
+   There is some matrix Q such that for any vector y
+   Q.e_1 = y and Q is unitary.
+**/
+    template < class T >
+      static T orthQ (DenseMatrix < T > &Q, std::vector < T > y)
+    {
+      int N = y.size ();	//Matrix Size
+      Fill (Q, 0.0);
+      T tau;
+      for (int i = 0; i < N; i++)
+	{
+	  Q[i][0] = y[i];
+	}
+      T sig = conj (y[0]) * y[0];
+      T tau0 = fabs (sqrt (sig));
+
+      for (int j = 1; j < N; j++)
+	{
+	  sig += conj (y[j]) * y[j];
+	  tau = abs (sqrt (sig));
+
+	  if (abs (tau0) > 0.0)
+	    {
+
+	      T gam = conj ((y[j] / tau) / tau0);
+	      for (int k = 0; k <= j - 1; k++)
+		{
+		  Q[k][j] = -gam * y[k];
+		}
+	      Q[j][j] = tau0 / tau;
+	    }
+	  else
+	    {
+	      Q[j - 1][j] = 1.0;
+	    }
+	  tau0 = tau;
+	}
+      return tau;
+    }
+
+/**
+	There is some matrix Q such that for any vector y
+	Q.e_k = y and Q is unitary.
+**/
+    template < class T >
+      static T orthU (DenseMatrix < T > &Q, std::vector < T > y)
+    {
+      T tau = orthQ (Q, y);
+      SL (Q);
+      return tau;
+    }
+
+
+/**
+	Wind up with a matrix with the first con rows untouched
+
+say con = 2
+	Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st colum
+	and the matrix is upper hessenberg
+	and with f and Q appropriately modidied with Q is the arnoldi factorization
+
+**/
+
+    template < class T > static void Lock (DenseMatrix < T > &H,	///Hess mtx     
+					   DenseMatrix < T > &Q,	///Lock Transform
+					   T val,	///value to be locked
+					   int con,	///number already locked
+					   RealD small, int dfg, bool herm)
+    {
+      //ForceTridiagonal(H);
+
+      int M = H.dim;
+      DenseVector < T > vec;
+      Resize (vec, M - con);
+
+      DenseMatrix < T > AH;
+      Resize (AH, M - con, M - con);
+      AH = GetSubMtx (H, con, M, con, M);
+
+      DenseMatrix < T > QQ;
+      Resize (QQ, M - con, M - con);
+
+      Unity (Q);
+      Unity (QQ);
+
+      DenseVector < T > evals;
+      Resize (evals, M - con);
+      DenseMatrix < T > evecs;
+      Resize (evecs, M - con, M - con);
+
+      Wilkinson < T > (AH, evals, evecs, small);
+
+      int k = 0;
+      RealD cold = abs (val - evals[k]);
+      for (int i = 1; i < M - con; i++)
+	{
+	  RealD cnew = abs (val - evals[i]);
+	  if (cnew < cold)
+	    {
+	      k = i;
+	      cold = cnew;
+	    }
+	}
+      vec = evecs[k];
+
+      ComplexD tau;
+      orthQ (QQ, vec);
+      //orthQM(QQ,AH,vec);
+
+      AH = Hermitian (QQ) * AH;
+      AH = AH * QQ;
+
+      for (int i = con; i < M; i++)
+	{
+	  for (int j = con; j < M; j++)
+	    {
+	      Q[i][j] = QQ[i - con][j - con];
+	      H[i][j] = AH[i - con][j - con];
+	    }
+	}
+
+      for (int j = M - 1; j > con + 2; j--)
+	{
+
+	  DenseMatrix < T > U;
+	  Resize (U, j - 1 - con, j - 1 - con);
+	  DenseVector < T > z;
+	  Resize (z, j - 1 - con);
+	  T nm = norm (z);
+	  for (int k = con + 0; k < j - 1; k++)
+	    {
+	      z[k - con] = conj (H (j, k + 1));
+	    }
+	  normalise (z);
+
+	  RealD tmp = 0;
+	  for (int i = 0; i < z.size () - 1; i++)
+	    {
+	      tmp = tmp + abs (z[i]);
+	    }
+
+	  if (tmp < small / ((RealD) z.size () - 1.0))
+	    {
+	      continue;
+	    }
+
+	  tau = orthU (U, z);
+
+	  DenseMatrix < T > Hb;
+	  Resize (Hb, j - 1 - con, M);
+
+	  for (int a = 0; a < M; a++)
+	    {
+	      for (int b = 0; b < j - 1 - con; b++)
+		{
+		  T sum = 0;
+		  for (int c = 0; c < j - 1 - con; c++)
+		    {
+		      sum += H[a][con + 1 + c] * U[c][b];
+		    }		//sum += H(a,con+1+c)*U(c,b);}
+		  Hb[b][a] = sum;
+		}
+	    }
+
+	  for (int k = con + 1; k < j; k++)
+	    {
+	      for (int l = 0; l < M; l++)
+		{
+		  H[l][k] = Hb[k - 1 - con][l];
+		}
+	    }			//H(Hb[k-1-con][l] , l,k);}}
+
+	  DenseMatrix < T > Qb;
+	  Resize (Qb, M, M);
+
+	  for (int a = 0; a < M; a++)
+	    {
+	      for (int b = 0; b < j - 1 - con; b++)
+		{
+		  T sum = 0;
+		  for (int c = 0; c < j - 1 - con; c++)
+		    {
+		      sum += Q[a][con + 1 + c] * U[c][b];
+		    }		//sum += Q(a,con+1+c)*U(c,b);}
+		  Qb[b][a] = sum;
+		}
+	    }
+
+	  for (int k = con + 1; k < j; k++)
+	    {
+	      for (int l = 0; l < M; l++)
+		{
+		  Q[l][k] = Qb[k - 1 - con][l];
+		}
+	    }			//Q(Qb[k-1-con][l] , l,k);}}
+
+	  DenseMatrix < T > Hc;
+	  Resize (Hc, M, M);
+
+	  for (int a = 0; a < j - 1 - con; a++)
+	    {
+	      for (int b = 0; b < M; b++)
+		{
+		  T sum = 0;
+		  for (int c = 0; c < j - 1 - con; c++)
+		    {
+		      sum += conj (U[c][a]) * H[con + 1 + c][b];
+		    }		//sum += conj( U(c,a) )*H(con+1+c,b);}
+		  Hc[b][a] = sum;
+		}
+	    }
+
+	  for (int k = 0; k < M; k++)
+	    {
+	      for (int l = con + 1; l < j; l++)
+		{
+		  H[l][k] = Hc[k][l - 1 - con];
+		}
+	    }			//H(Hc[k][l-1-con] , l,k);}}
+
+	}
+    }
+#endif
+
+
+  };
+
+}
+#endif
diff --git a/tests/lanczos/Test_wilson_lanczos.cc b/tests/lanczos/Test_wilson_lanczos.cc
index 4814e8c6..ca30baa3 100644
--- a/tests/lanczos/Test_wilson_lanczos.cc
+++ b/tests/lanczos/Test_wilson_lanczos.cc
@@ -61,7 +61,8 @@ int main(int argc, char** argv) {
   RNG5.SeedFixedIntegers(seeds5);
 
   LatticeGaugeField Umu(UGrid);
-  SU<Nc>::HotConfiguration(RNG4, Umu);
+//  SU<Nc>::HotConfiguration(RNG4, Umu);
+  SU<Nc>::ColdConfiguration(Umu);
 
 /*
   std::vector<LatticeColourMatrix> U(4, UGrid);
@@ -69,9 +70,15 @@ int main(int argc, char** argv) {
     U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
   }
 */
+//  std::vector<Complex> boundary = {1,1,1,-1};
+  std::vector<Complex> boundary = {1,1,1,1};
+  FermionOp::ImplParams Params(boundary);
 
-  RealD mass = -0.1;
-  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
+
+
+  RealD mass = 0.0;
+//  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
+  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass,Params);
   MdagMLinearOperator<FermionOp,LatticeFermion> HermOp(WilsonOperator); /// <-----
   //SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
 
@@ -89,7 +96,8 @@ int main(int argc, char** argv) {
   FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
      PlainHermOp<FermionField> Op     (HermOp);
 
-  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
+//  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
+  SimpleLanczos<FermionField> IRL(Op,Nstop, Nk, Nm, resid, MaxIt);
 
   std::vector<RealD> eval(Nm);
   FermionField src(FGrid);
@@ -101,7 +109,8 @@ int main(int argc, char** argv) {
   };
 
   int Nconv;
-  IRL.calc(eval, evec, src, Nconv);
+//  IRL.calc(eval, evec, src, Nconv);
+  IRL.calc(eval, src, Nconv);
 
   std::cout << eval << std::endl;
 
diff --git a/tests/lanczos/Test_wilson_specflow.cc b/tests/lanczos/Test_wilson_specflow.cc
index 4e60a81d..8d5ba5c8 100644
--- a/tests/lanczos/Test_wilson_specflow.cc
+++ b/tests/lanczos/Test_wilson_specflow.cc
@@ -134,6 +134,7 @@ int main(int argc, char** argv) {
 
   LatticeGaugeField Umu(UGrid);
 //  SU<Nc>::HotConfiguration(RNG4, Umu);
+//  SU<Nc>::ColdConfiguration(Umu);
 
   FieldMetaData header;
   std::string file("./config");
@@ -185,6 +186,7 @@ int main(int argc, char** argv) {
   FermionField src(FGrid);
   gaussian(RNG5, src);
   std::vector<Complex> boundary = {1,1,1,-1};
+//  std::vector<Complex> boundary = {1,1,1,1};
   FermionOp::ImplParams Params(boundary);
 
 
@@ -208,6 +210,7 @@ while ( mass > - 2.5){
      PlainHermOp<FermionField> Op2     (HermOp2);
 
   ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
+//  SimpleLanczos<FermionField> IRL(Op,Nstop, Nk, Nm, resid, MaxIt);
 
   std::vector<RealD> eval(Nm);
   std::vector<FermionField> evec(Nm, FGrid);
@@ -218,6 +221,7 @@ while ( mass > - 2.5){
 
   int Nconv;
   IRL.calc(eval, evec, src, Nconv);
+//  IRL.calc(eval,  src, Nconv);
 
   std::cout << mass <<" : " << eval << std::endl;
 

From c1e5ef9476a415351e4be0dc9c323a311d4630e3 Mon Sep 17 00:00:00 2001
From: Chulwoo Jung <chulwoo@aurora-uan-0009.hostmgmt.cm.aurora.alcf.anl.gov>
Date: Fri, 15 Aug 2025 20:52:36 +0000
Subject: [PATCH 6/7] Adding config input

---
 tests/lanczos/Test_wilson_lanczos.cc | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/lanczos/Test_wilson_lanczos.cc b/tests/lanczos/Test_wilson_lanczos.cc
index ca30baa3..3e7618af 100644
--- a/tests/lanczos/Test_wilson_lanczos.cc
+++ b/tests/lanczos/Test_wilson_lanczos.cc
@@ -64,14 +64,19 @@ int main(int argc, char** argv) {
 //  SU<Nc>::HotConfiguration(RNG4, Umu);
   SU<Nc>::ColdConfiguration(Umu);
 
+  FieldMetaData header;
+  std::string file("./config");
+  NerscIO::readConfiguration(Umu,header,file);
+
+
 /*
   std::vector<LatticeColourMatrix> U(4, UGrid);
   for (int mu = 0; mu < Nd; mu++) {
     U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
   }
 */
-//  std::vector<Complex> boundary = {1,1,1,-1};
-  std::vector<Complex> boundary = {1,1,1,1};
+  std::vector<Complex> boundary = {1,1,1,-1};
+//  std::vector<Complex> boundary = {1,1,1,1};
   FermionOp::ImplParams Params(boundary);
 
 

From 786496f22e91cf9299b67c1566d3344618a1e1bc Mon Sep 17 00:00:00 2001
From: Chulwoo Jung <chulwoo@bnl.gov>
Date: Mon, 3 Nov 2025 21:18:56 +0000
Subject: [PATCH 7/7] Checking in before pulling KrylovSchur

---
 .../iterative/ImplicitlyRestartedLanczos.h    |  27 +-
 tests/lanczos/Test_wilson_lanczos.cc          | 242 ++++++++++++++++--
 tests/lanczos/Test_wilson_specflow.cc         |  37 ++-
 3 files changed, 275 insertions(+), 31 deletions(-)

diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
index eb0d0761..df2007d2 100644
--- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
+++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h
@@ -53,6 +53,18 @@ enum IRLdiagonalisation {
   IRLdiagonaliseWithEigen
 };
 
+enum IRLeigsort { 
+  IRLeigsortMax,
+  IRLeigsortSqMin
+};
+
+#if 0
+bool square_comp(RealD a, RealD b){
+	if (a*a<b*b) return true;
+	return false;
+}
+#endif
+
 template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public ImplicitlyRestartedLanczosTester<Field>
 {
  public:
@@ -119,9 +131,10 @@ class ImplicitlyRestartedLanczos {
   /////////////////////////
   // Constructor
   /////////////////////////
-  
-public:       
+ public:
+  IRLeigsort EigSort;
 
+  
   //////////////////////////////////////////////////////////////////
   // PAB:
   //////////////////////////////////////////////////////////////////
@@ -154,6 +167,7 @@ public:
     Nstop(_Nstop)  ,      Nk(_Nk),      Nm(_Nm),
     eresid(_eresid),      betastp(_betastp),
     MaxIter(_MaxIter)  ,      MinRestart(_MinRestart),
+    EigSort(IRLeigsortMax), 
     orth_period(_orth_period), diagonalisation(_diagonalisation)  { };
 
     ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
@@ -170,6 +184,7 @@ public:
     Nstop(_Nstop)  ,      Nk(_Nk),      Nm(_Nm),
     eresid(_eresid),      betastp(_betastp),
     MaxIter(_MaxIter)  ,      MinRestart(_MinRestart),
+    EigSort(IRLeigsortMax), 
     orth_period(_orth_period), diagonalisation(_diagonalisation)  { };
 
   ////////////////////////////////
@@ -316,8 +331,12 @@ until convergence
       // sorting
       //////////////////////////////////
       eval2_copy = eval2;
+//      if (EigSort==IRLeigsortMax)
+//      std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),square_comp);
+//      else
       std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater<RealD>());
       std::cout<<GridLogIRL <<" evals sorted "<<std::endl;
+//      eval2_copy = eval2;
       const int chunk=8;
       for(int io=0; io<k2;io+=chunk){
 	std::cout<<GridLogIRL << "eval "<< std::setw(3) << io ;
@@ -333,6 +352,7 @@ until convergence
       //////////////////////////////////
       Qt = Eigen::MatrixXd::Identity(Nm,Nm);
       for(int ip=k2; ip<Nm; ++ip){ 
+//        std::cout<<GridLogIRL <<"QR decompose "<<eval2[ip]<<std::endl;
 	QR_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
       }
       std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
@@ -375,7 +395,8 @@ until convergence
 
 	//  power of two search pattern;  not every evalue in eval2 is assessed.
 	int allconv =1;
-	for(int jj = 1; jj<=Nstop; jj*=2){
+//	for(int jj = 1; jj<=Nstop; jj*=2){
+	for(int jj = 1; jj<=Nstop; jj++){
 	  int j = Nstop-jj;
 	  RealD e = eval2_copy[j]; // Discard the evalue
 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
diff --git a/tests/lanczos/Test_wilson_lanczos.cc b/tests/lanczos/Test_wilson_lanczos.cc
index 3e7618af..0f9a8b8d 100644
--- a/tests/lanczos/Test_wilson_lanczos.cc
+++ b/tests/lanczos/Test_wilson_lanczos.cc
@@ -6,7 +6,7 @@ Source file: ./tests/Test_dwf_lanczos.cc
 
 Copyright (C) 2015
 
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Chulwoo Jung <chulwoo@bnl.gov>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -27,6 +27,9 @@ directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
+#include <Grid/parallelIO/IldgIOtypes.h>
+#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
+
 
 using namespace std;
 using namespace Grid;
@@ -38,18 +41,111 @@ typedef typename WilsonFermionD::FermionField FermionField;
 
 RealD AllZero(RealD x) { return 0.; }
 
+template <class T> void writeFile(T& in, std::string const fname){
+#if 1
+  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
+  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
+  Grid::emptyUserRecord record;
+  Grid::ScidacWriter WR(in.Grid()->IsBoss());
+  WR.open(fname);
+  WR.writeScidacFieldRecord(in,record,0);
+  WR.close();
+#endif
+  // What is the appropriate way to throw error?
+}
+
+
+namespace Grid {
+
+struct LanczosParameters: Serializable {
+  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
+		  		RealD, mass , 
+		  		RealD, mstep , 
+				Integer, Nstop,
+                                Integer, Nk,
+                                Integer, Np,
+                                Integer, ReadEvec,
+	  			RealD, resid,
+	  			RealD, ChebyLow,
+	  			RealD, ChebyHigh,
+	  			Integer, ChebyOrder)
+//                                  Integer, StartTrajectory,
+//                                  Integer, Trajectories, /* @brief Number of sweeps in this run */
+//                                  bool, MetropolisTest,
+//                                  Integer, NoMetropolisUntil,
+//                                  std::string, StartingType,
+//                                  Integer, SW,
+//				  RealD, Kappa,
+//                                  IntegratorParameters, MD)
+
+  LanczosParameters() {
+    ////////////////////////////// Default values
+      mass = 0;
+//    MetropolisTest    = true;
+//    NoMetropolisUntil = 10;
+//    StartTrajectory   = 0;
+//    SW                = 2;
+//    Trajectories      = 10;
+//    StartingType      = "HotStart";
+    /////////////////////////////////
+  }
+
+  template <class ReaderClass >
+  LanczosParameters(Reader<ReaderClass> & TheReader){
+    initialize(TheReader);
+  }
+
+  template < class ReaderClass > 
+  void initialize(Reader<ReaderClass> &TheReader){
+//    std::cout << GridLogMessage << "Reading HMC\n";
+    read(TheReader, "HMC", *this);
+  }
+
+
+  void print_parameters() const {
+//    std::cout << GridLogMessage << "[HMC parameters] Trajectories            : " << Trajectories << "\n";
+//    std::cout << GridLogMessage << "[HMC parameters] Start trajectory        : " << StartTrajectory << "\n";
+//    std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
+//    std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs    : " << NoMetropolisUntil << "\n";
+//    std::cout << GridLogMessage << "[HMC parameters] Starting type           : " << StartingType << "\n";
+//    MD.print_parameters();
+  }
+  
+};
+
+}
+
 int main(int argc, char** argv) {
   Grid_init(&argc, &argv);
 
+  int Ndir=4;
+  auto mpi_layout  = GridDefaultMpi();
+  std::vector<int> nblock(4,1);
+  std::vector<int> mpi_split(4,1);
+//Interested in avoiding degeneracy only for now
+  nblock[3]=2;
+
+  int mrhs=1;
+  for(int i =0;i<Ndir;i++){
+      mpi_split[i] = mpi_layout[i] / nblock[i];
+      mrhs *= nblock[i];
+  }
+
+
   GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
       GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
       GridDefaultMpi());
-  GridRedBlackCartesian* UrbGrid =
-      SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
+                                                    GridDefaultSimd(Nd,vComplex::Nsimd()),
+                                                    mpi_split,
+                                                    *UGrid);
+
+  GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
   GridCartesian* FGrid = UGrid;
   GridRedBlackCartesian* FrbGrid = UrbGrid;
-  printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid,
-         FrbGrid);
+//  printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
 
   std::vector<int> seeds4({1, 2, 3, 4});
   std::vector<int> seeds5({5, 6, 7, 8});
@@ -62,12 +158,15 @@ int main(int argc, char** argv) {
 
   LatticeGaugeField Umu(UGrid);
 //  SU<Nc>::HotConfiguration(RNG4, Umu);
-  SU<Nc>::ColdConfiguration(Umu);
+//  SU<Nc>::ColdConfiguration(Umu);
 
   FieldMetaData header;
   std::string file("./config");
-  NerscIO::readConfiguration(Umu,header,file);
 
+//  int precision32 = 0;
+//  int tworow      = 0;
+//  NerscIO::writeConfiguration(Umu,file,tworow,precision32);
+  NerscIO::readConfiguration(Umu,header,file);
 
 /*
   std::vector<LatticeColourMatrix> U(4, UGrid);
@@ -75,38 +174,101 @@ int main(int argc, char** argv) {
     U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
   }
 */
+
+  int Nstop = 10;
+  int Nu = 1;
+  int Nk = 20;
+  int Np = 80;
+  int Nm = Nk + Np;
+  int MaxIt = 10000;
+  RealD resid = 1.0e-5;
+
+  RealD mass = -1.0;
+
+  LanczosParameters LanParams;
+#if 1
+  {
+    XmlReader  HMCrd("LanParams.xml");
+    read(HMCrd,"LanczosParameters",LanParams);
+  }
+#else
+  {
+    LanParams.mass = mass;
+  }
+#endif
+  std::cout << GridLogMessage<< LanParams <<std::endl;
+  { 
+    XmlWriter HMCwr("LanParams.xml.out");
+    write(HMCwr,"LanczosParameters",LanParams);
+  }
+
+  mass=LanParams.mass;
+  resid=LanParams.resid;
+  Nstop=LanParams.Nstop;
+  Nu = mrhs;
+  Nk=LanParams.Nk;
+  Np=LanParams.Np;
+  Nm = Nk + Np;
+
+//  FermionField src(FGrid);
+  std::vector<FermionField> src(Nu,FGrid);
+  for(int i =0;i<Nu;i++) gaussian(RNG5, src[i]);
+
+  if(LanParams.ReadEvec) {
+    std::string evecs_file="evec_in";
+    std::cout << GridLogIRL<< "Reading evecs from "<<evecs_file<<std::endl;
+    emptyUserRecord record;
+    Grid::ScidacReader RD;
+    RD.open(evecs_file);
+    RD.readScidacFieldRecord(src[0],record);
+    RD.close();
+  }
+
   std::vector<Complex> boundary = {1,1,1,-1};
 //  std::vector<Complex> boundary = {1,1,1,1};
   FermionOp::ImplParams Params(boundary);
 
+  GridCartesian         * SFGrid   = SGrid;
+  GridRedBlackCartesian * SFrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(SFGrid);
+//  GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,SGrid);
+
+  LatticeGaugeField s_Umu(SGrid);
+  Grid_split  (Umu,s_Umu);
 
 
-  RealD mass = 0.0;
-//  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
+
+while ( mass > - 2.0){
   FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass,Params);
-  MdagMLinearOperator<FermionOp,LatticeFermion> HermOp(WilsonOperator); /// <-----
+  MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
+  FermionOp WilsonSplit(s_Umu,*SFGrid,*SFrbGrid,mass,Params);
+  MdagMLinearOperator<FermionOp,FermionField> SHermOp(WilsonSplit); /// <-----
   //SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
-
-  const int Nstop = 20;
-  const int Nk = 60;
-  const int Np = 60;
-  const int Nm = Nk + Np;
-  const int MaxIt = 10000;
-  RealD resid = 1.0e-6;
+  Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
 
   std::vector<double> Coeffs{0, 1.};
   Polynomial<FermionField> PolyX(Coeffs);
-  Chebyshev<FermionField> Cheby(0.0, 10., 12);
+//  Chebyshev<FermionField> Cheby(0.5, 60., 31);
+//                                  RealD, ChebyLow,
+//                                RealD, ChebyHigh,
+//                                Integer, ChebyOrder)
+
+  Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
 
   FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
      PlainHermOp<FermionField> Op     (HermOp);
+     PlainHermOp<FermionField> Op2     (HermOp2);
 
-//  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
-  SimpleLanczos<FermionField> IRL(Op,Nstop, Nk, Nm, resid, MaxIt);
-
+//  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
+//  SimpleLanczos<FermionField> IRL(Op,Nstop, Nk, Nm, resid, MaxIt);
+    ImplicitlyRestartedBlockLanczos<FermionField> IRBL(HermOp, SHermOp,
+                                                     FrbGrid,SFrbGrid,mrhs,
+                                                     Cheby,
+                                                     Nstop, Nstop*2,
+                                                     Nu, Nk, Nm,
+                                                     resid, MaxIt,
+                                                     IRBLdiagonaliseWithEigen);
+  IRBL.split_test=1;
   std::vector<RealD> eval(Nm);
-  FermionField src(FGrid);
-  gaussian(RNG5, src);
   std::vector<FermionField> evec(Nm, FGrid);
   for (int i = 0; i < 1; i++) {
     std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
@@ -115,9 +277,39 @@ int main(int argc, char** argv) {
 
   int Nconv;
 //  IRL.calc(eval, evec, src, Nconv);
-  IRL.calc(eval, src, Nconv);
+  IRBL.calc(eval, evec, src, Nconv,LanczosType::irbl);
 
-  std::cout << eval << std::endl;
+  std::cout << mass <<" : " << eval << std::endl;
+
+  Gamma g5(Gamma::Algebra::Gamma5) ;
+  ComplexD dot;
+  FermionField tmp(FGrid);
+  FermionField sav(FGrid);
+  sav=evec[0];
+  for (int i = 0; i < Nstop ; i++) {
+    tmp = g5*evec[i];
+    dot = innerProduct(tmp,evec[i]);
+    std::cout << mass << " : " << eval[i]  << " " << real(dot) << " " << imag(dot)  << std::endl ;
+//    if ( i<1)
+    {
+	std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
+        auto evdensity = localInnerProduct(evec[i],evec[i] );
+	writeFile(evdensity,evfile);
+    }
+    if (i>0) sav += evec[i];
+  }
+  {
+	std::string evfile ("./evec_"+std::to_string(mass)+"_sum");
+//        auto evdensity = localInnerProduct(evec[i],evec[i] );
+	writeFile(sav,evfile);
+  }
+  for(int i =0;i<Nu;i++) src[i]=evec[i];
+  for(int i=Nu;i<Nstop;i++) src[i%Nu] +=evec[i];
+//  src  = evec[0]+evec[1]+evec[2];
+//  src  += evec[3]+evec[4]+evec[5];
+//  src  += evec[6]+evec[7]+evec[8];
+  mass += LanParams.mstep;
+}
 
   Grid_finalize();
 }
diff --git a/tests/lanczos/Test_wilson_specflow.cc b/tests/lanczos/Test_wilson_specflow.cc
index 8d5ba5c8..6c9cbcac 100644
--- a/tests/lanczos/Test_wilson_specflow.cc
+++ b/tests/lanczos/Test_wilson_specflow.cc
@@ -62,6 +62,8 @@ struct LanczosParameters: Serializable {
 				Integer, Nstop,
                                 Integer, Nk,
                                 Integer, Np,
+                                Integer, ReadEvec,
+	  			RealD, resid,
 	  			RealD, ChebyLow,
 	  			RealD, ChebyHigh,
 	  			Integer, ChebyOrder)
@@ -139,8 +141,8 @@ int main(int argc, char** argv) {
   FieldMetaData header;
   std::string file("./config");
 
-  int precision32 = 0;
-  int tworow      = 0;
+//  int precision32 = 0;
+//  int tworow      = 0;
 //  NerscIO::writeConfiguration(Umu,file,tworow,precision32);
   NerscIO::readConfiguration(Umu,header,file);
 
@@ -178,6 +180,7 @@ int main(int argc, char** argv) {
   }
 
   mass=LanParams.mass;
+  resid=LanParams.resid;
   Nstop=LanParams.Nstop;
   Nk=LanParams.Nk;
   Np=LanParams.Np;
@@ -185,12 +188,22 @@ int main(int argc, char** argv) {
 
   FermionField src(FGrid);
   gaussian(RNG5, src);
+  if(LanParams.ReadEvec) {
+    std::string evecs_file="evec_in";
+    std::cout << GridLogIRL<< "Reading evecs from "<<evecs_file<<std::endl;
+    emptyUserRecord record;
+    Grid::ScidacReader RD;
+    RD.open(evecs_file);
+    RD.readScidacFieldRecord(src,record);
+    RD.close();
+  }
+
   std::vector<Complex> boundary = {1,1,1,-1};
 //  std::vector<Complex> boundary = {1,1,1,1};
   FermionOp::ImplParams Params(boundary);
 
 
-while ( mass > - 2.5){
+while ( mass > - 2.0){
   FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass,Params);
   MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
   //SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
@@ -228,6 +241,8 @@ while ( mass > - 2.5){
   Gamma g5(Gamma::Algebra::Gamma5) ;
   ComplexD dot;
   FermionField tmp(FGrid);
+  FermionField sav(FGrid);
+  sav=evec[0];
   for (int i = 0; i < Nstop ; i++) {
     tmp = g5*evec[i];
     dot = innerProduct(tmp,evec[i]);
@@ -237,7 +252,23 @@ while ( mass > - 2.5){
 	std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
         auto evdensity = localInnerProduct(evec[i],evec[i] );
 	writeFile(evdensity,evfile);
+//  if(LanParams.ReadEvec) {
+//    std::string evecs_file="evec_in";
+  {
+    std::cout << GridLogIRL<< "Reading evecs from "<<evfile<<std::endl;
+    emptyUserRecord record;
+    Grid::ScidacReader RD;
+    RD.open(evfile);
+    RD.readScidacFieldRecord(evdensity,record);
+    RD.close();
+  }
     }
+    if (i>0) sav += evec[i];
+  }
+  {
+	std::string evfile ("./evec_"+std::to_string(mass)+"_sum");
+//        auto evdensity = localInnerProduct(evec[i],evec[i] );
+	writeFile(sav,evfile);
   }
   src  = evec[0]+evec[1]+evec[2];
   src  += evec[3]+evec[4]+evec[5];