1
0
mirror of https://github.com/paboyle/Grid.git synced 2026-03-24 04:56:10 +00:00

Compare commits

...

85 Commits

Author SHA1 Message Date
Chulwoo Jung
09aa843984 Changed batchedInnerProduct for portability 2026-03-17 18:54:18 -04:00
Chulwoo Jung
24752002fa Verbosity reduction batched inner product for reorthogonalization 2026-03-17 13:02:16 -04:00
Chulwoo Jung
f3223021fd RestartedLanczosBidiagonalization seems to have been fixed 2026-03-16 14:34:56 -04:00
Chulwoo Jung
4e1d95d3bb Claude implementation of Thick Restarted Lanczos Bidiagonalization 2026-03-13 19:12:54 -04:00
Chulwoo Jung
2ed38f63ca Merge branch 'develop' of https://github.com/paboyle/Grid into KS_shifted 2026-03-12 10:49:21 -04:00
Chulwoo Jung
80d2a8d88d wqMerge branch 'develop' of https://github.com/paboyle/Grid into KS_shifted 2026-03-11 21:49:26 -04:00
Peter Boyle
595ceaac37 Include grid header and make the ENABLE correct 2026-03-11 17:24:44 -04:00
Peter Boyle
daf5834e8e Fixing incorrect PR about disable fermion instantiations 2026-03-11 17:05:46 -04:00
Chulwoo Jung
2ac5431401 Turning of NERSC header checking 2026-03-06 14:16:20 -05:00
Peter Boyle
0d8658a039 Optimised 2026-03-05 06:06:32 -05:00
Peter Boyle
095e004d01 Setup change GCR 2026-03-05 06:06:32 -05:00
Peter Boyle
0acabee7f6 Modest change 2026-03-05 06:06:32 -05:00
Peter Boyle
76fbcffb60 Improvement to 16^3 hdcg 2026-03-05 06:06:32 -05:00
Peter Boyle
a0a62d7ead Merge pull request #478 from vataspro/PolyakovUpstream
Spatial Polyakov Loop implementation
2026-02-24 20:45:42 -05:00
Peter Boyle
c5038ea6a5 Merge pull request #483 from cmcknigh/bugfix/rocm7-rocblas-type-refactor
Adding a version check to handle rocBlas type refactor
2026-02-24 20:45:03 -05:00
Peter Boyle
a5120903eb Merge pull request #486 from RChrHill/fix/sp4-fp32
Define Sp4 ProjectOnGeneralGroup for generic vtype
2026-02-24 20:44:08 -05:00
Peter Boyle
00b286a08a Merge pull request #488 from RChrHill/feature/additional-ET-traces
Add ET support for Lattice spin- and colour-traces
2026-02-24 20:43:45 -05:00
Peter Boyle
24a9759353 Merge pull request #485 from edbennett/skip-fermion-instantiations
Be able to skip compiling fermion instantiations altogether
2026-02-24 20:43:20 -05:00
1b56f6f46d be able to skip compiling fermion instantiations altogether 2026-02-24 23:52:18 +00:00
Peter Boyle
2a8084d569 Subspace setup 2026-02-13 17:26:11 -05:00
Peter Boyle
6ff29f9d4f Alternate multigrids 2026-02-13 17:25:45 -05:00
c4d3e79193 Add ET support for Lattice spin- and colour-traces 2026-01-29 14:46:52 +00:00
Chulwoo Jung
3e71cac7ae Merge branch 'specflow2' of github.com:chulwoo1/Grid into KS_shifted 2026-01-15 03:40:52 +00:00
Chulwoo Jung
e8e7ef08fc KrylovSchur and spectral flow updates 2026-01-15 03:20:01 +00:00
Chulwoo Jung
5c00fe6bef Merge branch 'develop' of github.com:poare/Grid into KS_shifted 2026-01-12 06:26:15 +00:00
Chulwoo Jung
3175788f97 Added explicit shift before pulling 2026-01-12 06:25:09 +00:00
Patrick Oare
6f1788bb38 modified Givens rotation to implement a sparse multiplication 2026-01-06 16:19:48 -05:00
Peter Boyle
7cd3f21e6b preserving a bunch of experiments on setup and g5 subspace doubling 2026-01-06 05:57:39 -05:00
Chulwoo Jung
dcda74f924 Timing info for schurReorder,etc 2025-12-18 18:23:50 +00:00
Chulwoo Jung
df4c2a082b Reducing comments 2025-12-09 14:23:22 +00:00
Chulwoo Jung
88611659a3 Appear to be working 2025-12-08 21:08:14 -05:00
Chulwoo Jung
504b85dfc0 Restarting and adding codes back in 2025-12-08 13:27:06 -05:00
Chulwoo Jung
43ea83e5e1 Checking in to move back to genoa 2025-12-05 23:56:40 +00:00
Chulwoo Jung
376150c3df Adding 2025-12-04 21:29:31 -05:00
Chulwoo Jung
842e0391e7 Checking in to move back to aurora 2025-12-04 20:13:44 -05:00
Chulwoo Jung
17e3799bcc Necessary code for Harmonic KS added 2025-12-03 19:38:45 -05:00
Chulwoo Jung
985ab70f85 Checking in without adjusting Nk 2025-12-03 14:46:34 -05:00
Chulwoo Jung
1e85081986 Adding shift and debugging 2025-12-03 00:16:51 -05:00
Chulwoo Jung
3876fe5a29 Merge branch 'KrylovSchur' of github.com:chulwoo1/Grid into KS_shifted 2025-12-02 17:46:40 -05:00
Chulwoo Jung
6692425aa2 Checking in before pulling 2025-11-26 17:17:22 -05:00
Chulwoo Jung
d5ac4fc67f Starting to modified KS 2025-11-26 22:13:27 +00:00
Chulwoo Jung
3538faf449 Starting Harmonic (shift and inverse) 2025-11-24 17:05:35 -05:00
b650b89682 Define Sp4 ProjectOnGeneralGroup for generic vtype 2025-11-19 13:26:52 +00:00
Patrick Oare
0b457b9d52 fixed ritz estimate bug 2025-11-07 18:56:08 +00:00
Chulwoo Jung
fe0ab5f1a9 Merge branch 'develop' of github.com:poare/Grid into develop 2025-11-07 15:50:22 +00:00
Chulwoo Jung
caa66418bd Checking in before pulling 2025-11-06 22:44:05 +00:00
Allen McKnight
4304245c1b Merge branch 'develop' into bugfix/rocm7-rocblas-type-refactor 2025-11-04 08:50:11 -06:00
Chulwoo Jung
786496f22e Checking in before pulling KrylovSchur 2025-11-03 21:18:56 +00:00
Patrick R Oare
68af1bba67 commented some slow code out 2025-10-31 11:47:29 -04:00
Patrick Oare
bf2a715ef7 bug in wilson eigenvectors: ritz estimates not equalling deviation from being an evec 2025-10-31 15:31:46 +00:00
Patrick Oare
4042ebf1bf added ImNorm to sort 2025-10-20 19:01:53 +00:00
Your Name
1d1fd3bcaf adding a version check to handle rocblas type change 2025-10-02 15:24:24 -05:00
Patrick R Oare
82f35001ff small bug fix for wilson spectrum since we're actually running DWF 2025-09-25 15:36:42 -04:00
Patrick Oare
fa30c791aa updated wilson spec 2025-09-23 15:24:50 +00:00
Patrick Oare
612049f1dd commented out evec writer because it was taking up all the space on SDCC 2025-09-18 15:09:31 -04:00
Patrick Oare
0b92ef990c found bug in unprec DWF: was using |\cdot| in comparison for the eigenvalue sorting 2025-09-12 13:31:39 -04:00
Patrick Oare
82d411ca7b added inline to rf functions 2025-09-10 17:16:48 -04:00
Patrick Oare
597086a031 added wilson spectrum example 2025-09-10 15:41:00 -04:00
Patrick Oare
b210ddf9a7 added commented out line to run un-preconditioned DWF 2025-09-09 15:14:11 -04:00
Patrick Oare
c5d02e5799 updated RitzFilter enum and the input to run krylov schur 2025-09-09 13:02:11 -04:00
Patrick Oare
9dcd7ca761 added IO for evecs / evals 2025-09-08 12:59:48 -04:00
Chulwoo Jung
c1e5ef9476 Adding config input 2025-08-15 20:52:36 +00:00
Patrick Oare
6fd71aea9d may have found bug 2025-08-15 12:13:01 -04:00
Patrick Oare
a18b0d496c added more debug output 2025-08-15 11:51:15 -04:00
Patrick Oare
19f0737b98 trying one more thing 2025-08-14 14:47:38 -04:00
Patrick Oare
16d3c9cf75 added another debug feature 2025-08-14 14:37:49 -04:00
Patrick Oare
3b9fc72451 modified debug output slightly 2025-08-14 14:06:32 -04:00
Patrick Oare
99644f5d0a commented out arg assert in kryschur spec example 2025-08-13 14:57:57 -04:00
Chulwoo Jung
7780d88d26 Adding simple lanczos, boundary to specflow(!) 2025-08-06 23:41:53 +00:00
Chulwoo Jung
2bf9179d2c Adding mass step 2025-08-06 16:52:51 +00:00
Chulwoo Jung
c606f5dca0 Move out src initialization for re-use / Adding antiperiodic BC 2025-08-06 16:51:14 +00:00
Patrick Oare
632f5916c7 small log change for KS 2025-08-04 15:43:23 -04:00
Patrick Oare
9057694895 added double orthog to KS 2025-08-04 15:30:18 -04:00
Patrick Oare
5e85aef19d added updates to GCR polynomial code 2025-07-31 16:42:35 -04:00
Patrick Oare
2b6d40c7e1 added example files 2025-07-31 16:41:28 -04:00
Patrick Oare
33b80c4e8e added eigensolver code for arnoldi and krylov schur 2025-07-31 16:40:24 -04:00
Chulwoo Jung
8419cc5c64 specflow evec I/O added, 2025-07-11 15:57:23 -04:00
Alexis Provatas
c646d91527 Fix names, protect against bad index values, clean docstrings 2025-05-01 10:52:00 +01:00
Alexis Provatas
a2b98d82e1 remove obsolete spatial polyakov observable file 2025-05-01 10:52:00 +01:00
Alexis Provatas
7b9415c088 Move observable logger to Polyakov Loop file and fix docstring 2025-05-01 10:52:00 +01:00
Alexis Provatas
cb7110f492 Add Spatial Polyakov Loop observable 2025-05-01 10:52:00 +01:00
Alexis Provatas
0c7af66490 Create Spatial Polyakov Observable Module 2025-05-01 10:52:00 +01:00
Alexis Provatas
496d1b914a Generalise Polyakov loop and overload for temporal direction 2025-05-01 10:52:00 +01:00
Chulwoo Jung
2cc6deb8e0 Merge branch 'develop' of https://github.com/paboyle/Grid into ic2 2025-04-25 10:48:41 -04:00
Chulwoo Jung
19d0590579 Checking in for merging 2025-04-25 10:48:22 -04:00
90 changed files with 10441 additions and 201 deletions

View File

@@ -54,22 +54,24 @@ Version.h: version-cache
include Make.inc
include Eigen.inc
extra_sources+=$(WILS_FERMION_FILES)
extra_sources+=$(STAG_FERMION_FILES)
if BUILD_FERMION_INSTANTIATIONS
extra_sources+=$(WILS_FERMION_FILES)
extra_sources+=$(STAG_FERMION_FILES)
if BUILD_ZMOBIUS
extra_sources+=$(ZWILS_FERMION_FILES)
extra_sources+=$(ZWILS_FERMION_FILES)
endif
if BUILD_GPARITY
extra_sources+=$(GP_FERMION_FILES)
extra_sources+=$(GP_FERMION_FILES)
endif
if BUILD_FERMION_REPS
extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES)
extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES)
endif
if BUILD_SP
extra_sources+=$(SP_FERMION_FILES)
if BUILD_FERMION_REPS
extra_sources+=$(SP_TWOIND_FERMION_FILES)
extra_sources+=$(SP_TWOIND_FERMION_FILES)
endif
endif
endif

View File

@@ -75,6 +75,7 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/SimpleLanczos.h>
#include <Grid/algorithms/iterative/PowerMethod.h>
#include <Grid/algorithms/iterative/AdefGeneric.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>
@@ -83,4 +84,9 @@ NAMESPACE_CHECK(PowerMethod);
NAMESPACE_CHECK(multigrid);
#include <Grid/algorithms/FFT.h>
#include <Grid/algorithms/iterative/KrylovSchur.h>
#include <Grid/algorithms/iterative/Arnoldi.h>
#include <Grid/algorithms/iterative/LanczosBidiagonalization.h>
#include <Grid/algorithms/iterative/RestartedLanczosBidiagonalization.h>
#endif

View File

@@ -28,6 +28,7 @@ Author: Peter Boyle <pboyle@bnl.gov>
#pragma once
#ifdef GRID_HIP
#include <hip/hip_version.h>
#include <hipblas/hipblas.h>
#endif
#ifdef GRID_CUDA
@@ -255,16 +256,29 @@ public:
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
auto err = hipblasZgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex **)&Amk[0], lda,
(hipblasDoubleComplex **)&Bkn[0], ldb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex **)&Cmn[0], ldc,
(hipDoubleComplex *) &alpha_p[0],
(hipDoubleComplex **)&Amk[0], lda,
(hipDoubleComplex **)&Bkn[0], ldb,
(hipDoubleComplex *) &beta_p[0],
(hipDoubleComplex **)&Cmn[0], ldc,
batchCount);
#else
auto err = hipblasZgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex **)&Amk[0], lda,
(hipblasDoubleComplex **)&Bkn[0], ldb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex **)&Cmn[0], ldc,
batchCount);
#endif
// std::cout << " hipblas return code " <<(int)err<<std::endl;
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
@@ -503,17 +517,30 @@ public:
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
auto err = hipblasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasComplex *) &alpha_p[0],
(hipblasComplex **)&Amk[0], lda,
(hipblasComplex **)&Bkn[0], ldb,
(hipblasComplex *) &beta_p[0],
(hipblasComplex **)&Cmn[0], ldc,
(hipComplex *) &alpha_p[0],
(hipComplex **)&Amk[0], lda,
(hipComplex **)&Bkn[0], ldb,
(hipComplex *) &beta_p[0],
(hipComplex **)&Cmn[0], ldc,
batchCount);
#else
auto err = hipblasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasComplex *) &alpha_p[0],
(hipblasComplex **)&Amk[0], lda,
(hipblasComplex **)&Bkn[0], ldb,
(hipblasComplex *) &beta_p[0],
(hipblasComplex **)&Cmn[0], ldc,
batchCount);
#endif
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
@@ -1094,11 +1121,19 @@ public:
GRID_ASSERT(info.size()==batchCount);
#ifdef GRID_HIP
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
(hipblasDoubleComplex **)&Ann[0], (int)n,
(hipDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
#else
auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
(hipblasDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
#endif
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
@@ -1124,11 +1159,20 @@ public:
GRID_ASSERT(info.size()==batchCount);
#ifdef GRID_HIP
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
(hipblasComplex **)&Ann[0], (int)n,
(hipComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
#else
auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
(hipblasComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
#endif
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
@@ -1201,12 +1245,22 @@ public:
GRID_ASSERT(Cnn.size()==batchCount);
#ifdef GRID_HIP
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
(hipblasDoubleComplex **)&Ann[0], (int)n,
(hipDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(hipblasDoubleComplex **)&Cnn[0], (int)n,
(hipDoubleComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
#else
auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
(hipblasDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(hipblasDoubleComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
#endif
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
@@ -1235,12 +1289,21 @@ public:
GRID_ASSERT(Cnn.size()==batchCount);
#ifdef GRID_HIP
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
(hipblasComplex **)&Ann[0], (int)n,
(hipComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(hipblasComplex **)&Cnn[0], (int)n,
(hipComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
#else
auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
(hipblasComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(hipblasComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
#endif
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA

View File

@@ -92,8 +92,8 @@ class TwoLevelCGmrhs
// Vector case
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
// SolveSingleSystem(src,x);
SolvePrecBlockCG(src,x);
SolveSingleSystem(src,x);
// SolvePrecBlockCG(src,x);
}
////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@@ -0,0 +1,433 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/Arnoldi.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Patrick Oare <poare@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ARNOLDI_H
#define GRID_ARNOLDI_H
NAMESPACE_BEGIN(Grid);
// Moved to KrylovSchur — kept here, preprocessor-disabled, for reference only.
// Cleaned up: removed committed merge-conflict markers (<<<<<<< / ======= / >>>>>>>)
// that had been left inside this block.
#if 0
/**
 * Options for which Ritz values to keep in implicit restart.
 */
enum RitzFilter {
  EvalNormSmall, // Keep evals with smallest norm
  EvalNormLarge, // Keep evals with largest norm
  EvalReSmall,   // Keep evals with smallest real part
  EvalReLarge    // Keep evals with largest real part
};

// Select comparison function from RitzFilter.
// Returns true when z1 orders strictly before z2 under the chosen filter.
struct ComplexComparator
{
  RitzFilter f;
  ComplexComparator (RitzFilter _f) : f(_f) {}
  bool operator()(std::complex<double> z1, std::complex<double> z2) {
    // Hoisted out of the switch body: statements placed between
    // "switch (f) {" and the first case label are never executed, so the
    // original code used tmp1/tmp2 uninitialized in the EvalReSmall and
    // EvalReLarge branches.
    // NOTE(review): despite the enum names, these compare |Im z|, not the
    // real part — preserved as-is from the original; confirm intent.
    RealD tmp1 = std::abs(std::imag(z1));
    RealD tmp2 = std::abs(std::imag(z2));
    switch (f) {
    case EvalNormSmall:
      return std::abs(z1) < std::abs(z2);
    case EvalNormLarge:
      return std::abs(z1) > std::abs(z2);
    case EvalReSmall:
      return tmp1 < tmp2;
    case EvalReLarge:
      return tmp1 > tmp2;
    default:
      assert(0);
    }
  }
};
#endif
/**
 * Implementation of the Arnoldi algorithm.
 *
 * Builds an orthonormal Krylov basis for a general (non-Hermitian) linear
 * operator, forms the Hessenberg Rayleigh quotient, and applies implicit
 * (shifted-QR) restarting to converge the Ritz pairs selected by a
 * RitzFilter.  RitzFilter / ComplexComparator are declared elsewhere
 * (moved to KrylovSchur.h).
 */
template<class Field>
class Arnoldi {
private:
  std::string cname = std::string("Arnoldi");
  int MaxIter;                  // Max iterations
  RealD Tolerance;              // Relative convergence tolerance on Ritz estimates
  RealD ssq;                    // norm2 of the starting vector (set in operator())
  RealD rtol;                   // Absolute tolerance: Tolerance * approx largest eigenvalue
  int Nm;                       // Number of basis vectors to track (equals MaxIter if no restart)
  int Nk;                       // Number of basis vectors to keep every restart (equals -1 if no restart)
  int Nstop;                    // Stop after converging Nstop eigenvectors.
  LinearOperatorBase<Field> &Linop;
  GridBase *Grid;
  RealD approxLambdaMax;        // NOTE(review): never written — operator() shadows it with a local (see below)
  RealD beta_k;                 // ||f|| after the latest Arnoldi sweep; drives the Ritz convergence estimates
  Field f;                      // Residual Arnoldi vector (normal to the basis)
  std::vector<Field> basis;     // orthonormal Arnoldi basis
  Eigen::MatrixXcd Hess;        // Hessenberg matrix of size Nbasis (after construction)
  Eigen::MatrixXcd Qt;          // Transpose of basis rotation which projects out high modes.
  Eigen::VectorXcd evals;       // evals of Hess
  Eigen::MatrixXcd littleEvecs; // Nm x Nm evecs matrix
  std::vector<Field> evecs;     // Vector of evec fields
  RitzFilter ritzFilter;        // how to sort evals
public:
  /**
   * Construct an Arnoldi solver.  Sizing parameters (MaxIter, Nm, Nk, Nstop)
   * are deferred to operator(); they are sentinel-initialized to -1 here.
   *
   * Parameters
   * ----------
   * _Linop     : operator whose eigenpairs are sought (applied via Linop.Op)
   * _Grid      : grid on which work fields are allocated
   * _Tolerance : relative tolerance for Ritz-estimate convergence
   * filter     : which Ritz values to keep on restart (default EvalReSmall)
   */
  Arnoldi(LinearOperatorBase<Field> &_Linop, GridBase *_Grid, RealD _Tolerance, RitzFilter filter = EvalReSmall)
    : Linop(_Linop), Grid(_Grid), Tolerance(_Tolerance), ritzFilter(filter), f(_Grid), MaxIter(-1), Nm(-1), Nk(-1),
      Nstop (-1), evals (0), evecs (), ssq (0.0), rtol (0.0), beta_k (0.0), approxLambdaMax (0.0)
  {
    f = Zero();
  };
  /**
   * Runs the Arnoldi loop with(out) implicit restarting. For each iteration:
   * - Runs an Arnoldi step.
   * - Computes the eigensystem of the Hessenberg matrix.
   * - Performs implicit restarting.
   *
   * Parameters
   * ----------
   * v0           : starting vector (must have non-zero norm)
   * _maxIter     : maximum number of restart cycles
   * _Nm          : full basis size per cycle
   * _Nk          : basis vectors kept at each restart (Nk <= Nm)
   * _Nstop       : stop once this many Ritz pairs have converged
   * doubleOrthog : re-orthogonalize twice per step (currently unimplemented)
   */
  void operator()(const Field& v0, int _maxIter, int _Nm, int _Nk, int _Nstop, bool doubleOrthog = false) {
    MaxIter = _maxIter;
    Nm = _Nm; Nk = _Nk;
    Nstop = _Nstop;
    ssq = norm2(v0);
    // NOTE(review): this local shadows the member approxLambdaMax, which
    // therefore stays 0.0 — confirm whether the member was meant to be set.
    RealD approxLambdaMax = approxMaxEval(v0);
    rtol = Tolerance * approxLambdaMax;
    ComplexComparator compareComplex (ritzFilter);
    std::cout << GridLogMessage << "Comparing Ritz values with: " << ritzFilter << std::endl;
    int start = 1;
    Field startVec = v0;
    littleEvecs = Eigen::MatrixXcd::Zero(Nm, Nm);
    for (int i = 0; i < MaxIter; i++) {
      std::cout << GridLogMessage << "Restart Iteration " << i << std::endl;
      // Perform Arnoldi steps to compute Krylov basis and Rayleigh quotient (Hess)
      arnoldiIteration(startVec, Nm, start, doubleOrthog);
      startVec = f;
      // compute eigensystem and sort evals
      // compute_eigensystem();
      compute_eigensystem(Hess);
      std::cout << GridLogMessage << "Eigenvalues after Arnoldi step: " << std::endl << evals << std::endl;
      // NOTE(review): sorting evals alone reorders them relative to the
      // columns of littleEvecs/evecs filled by compute_eigensystem;
      // presumably only the sorted shifts matter for implicitRestart —
      // confirm converged() is not affected by the desynchronization.
      std::sort(evals.begin(), evals.end(), compareComplex);
      std::cout << GridLogMessage << "Ritz values after sorting (first Nk preserved): " << std::endl << evals << std::endl;
      // SU(N)::tepidConfiguration
      // Implicit restart to de-weight unwanted eigenvalues
      implicitRestart(_Nm, _Nk); // probably can delete _Nm and _Nk from function args
      start = Nk;
      // check convergence and return if needed.
      int Nconv = converged();
      std::cout << GridLogMessage << "Number of evecs converged: " << Nconv << std::endl;
      if (Nconv >= Nstop || i == MaxIter - 1) {
        std::cout << GridLogMessage << "Converged with " << Nconv << " / " << Nstop << " eigenvectors on iteration "
                  << i << "." << std::endl;
        basisRotate(evecs, Qt, 0, Nk, 0, Nk, Nm);
        std::cout << GridLogMessage << "Eigenvalues [first " << Nconv << " converged]: " << std::endl << evals << std::endl;
        return;
      }
    }
  }
  /**
   * Approximates the maximum eigenvalue of Linop.Op to normalize the residual and test for convergence.
   *
   * Uses plain power iteration: lam ~ |A^{n+1} v0| / |A^n v0|.
   *
   * Parameters
   * ----------
   * Field& v0
   *     Source field to start with. Must have non-zero norm.
   * int MAX_ITER (default = 50)
   *     Maximum number of iterations for power approximation.
   *
   * Returns
   * -------
   * RealD lamApprox
   *     Approximation of largest eigenvalue.
   */
  RealD approxMaxEval(const Field& v0, int MAX_ITER = 50) {
    assert (norm2(v0) > 1e-8); // must have relatively large source norm to start
    RealD lamApprox = 0.0;
    RealD denom = 1.0; RealD num = 1.0;
    Field v0cp (Grid); Field tmp (Grid);
    v0cp = v0;
    denom = std::sqrt(norm2(v0cp));
    for (int i = 0; i < MAX_ITER; i++) {
      Linop.Op(v0cp, tmp); // CAREFUL: do not do Op(tmp, tmp)
      v0cp = tmp;
      num = std::sqrt(norm2(v0cp)); // num = |A^{n+1} v0|
      lamApprox = num / denom;      // lam = |A^{n+1} v0| / |A^n v0|
      std::cout << GridLogDebug << "Approx for max eval: " << lamApprox << std::endl;
      denom = num;                  // denom = |A^{n} v0|
    }
    return lamApprox;
  }
  /**
   * Constructs the Arnoldi basis for the Krylov space K_n(D, src). (TODO make private)
   *
   * Parameters
   * ----------
   * v0 : Field&
   *     Source to generate Krylov basis.
   * Nm : int
   *     Final size of the basis desired. If the basis becomes complete before a basis of size Nm is constructed
   *     (determined by relative tolerance Tolerance), stops iteration there.
   *     NOTE(review): this parameter shadows the member Nm; callers pass the member value.
   * doubleOrthog : bool (default = false)
   *     Whether to double orthogonalize the basis (for numerical cancellations) or not.
   * start : int (default = 0)
   *     If non-zero, assumes part of the Arnoldi basis has already been constructed.
   */
  void arnoldiIteration(const Field& v0, int Nm, int start = 1, bool doubleOrthog = false)
  {
    ComplexD coeff;
    Field w (Grid); // A acting on last Krylov vector.
    if (start == 1) { // initialize everything that we need.
      RealD v0Norm = 1 / std::sqrt(ssq);
      basis.push_back(v0Norm * v0); // normalized source
      Hess = Eigen::MatrixXcd::Zero(Nm, Nm);
      f = Zero();
    } else {
      // Restart case: top-left Nk x Nk corner of the old Hessenberg is kept.
      assert( start == basis.size() ); // should be starting at the end of basis (start = Nk)
      Eigen::MatrixXcd HessCp = Hess;
      Hess = Eigen::MatrixXcd::Zero(Nm, Nm);
      Hess(Eigen::seqN(0, Nk), Eigen::seqN(0, Nk)) = HessCp;
    }
    // Construct next Arnoldi vector by normalizing w_i = Dv_i - \sum_j v_j h_{ji}
    for (int i = start - 1; i < Nm; i++) {
      Linop.Op(basis.back(), w);
      // Classical Gram-Schmidt against the existing basis; coefficients fill column i of Hess.
      for (int j = 0; j < basis.size(); j++) {
        coeff = innerProduct(basis[j], w); // coeff = h_{ij}. Note that since {vi} is ONB it's OK to subtract it off after.
        Hess(j, i) = coeff;
        w -= coeff * basis[j];
      }
      if (doubleOrthog) {
        // TODO implement
      }
      // add w_i to the pile
      if (i < Nm - 1) {
        coeff = std::sqrt(norm2(w));
        Hess(i+1, i) = coeff;
        basis.push_back(
          (1.0/coeff) * w
        );
      }
      // after iterations, update f and beta_k = ||f||
      f = w;                        // make sure f is not normalized
      beta_k = std::sqrt(norm2(f)); // beta_k = ||f_k|| determines convergence.
    }
    std::cout << GridLogMessage << "|f|^2 after Arnoldi step = " << norm2(f) << std::endl;
    std::cout << GridLogDebug << "Computed Hessenberg matrix = " << std::endl << Hess << std::endl;
    return;
  }
  /**
   * Approximates the eigensystem of the linear operator by computing the eigensystem of
   * the Hessenberg matrix. Assumes that the Hessenberg matrix has already been constructed (by
   * calling the operator() function).
   *
   * Side effects: overwrites evals and littleEvecs, and rebuilds evecs as
   * lattice fields (linear combinations of the current basis).
   *
   * TODO implement in parent class eventually.
   *
   * Parameters
   * ----------
   * Eigen::MatrixXcd& S
   *     Schur matrix (upper triangular) similar to original Rayleigh quotient.
   */
  void compute_eigensystem(Eigen::MatrixXcd& S)
  {
    std::cout << GridLogMessage << "Computing eigenvalues." << std::endl;
    evecs.clear();
    Eigen::ComplexEigenSolver<Eigen::MatrixXcd> es;
    es.compute(S);
    evals = es.eigenvalues();
    littleEvecs = es.eigenvectors();
    // Convert evecs to lattice fields: evec_k = sum_j littleEvecs(j,k) * basis_j
    for (int k = 0; k < evals.size(); k++) {
      Eigen::VectorXcd vec = littleEvecs.col(k);
      Field tmp (basis[0].Grid());
      tmp = Zero();
      for (int j = 0; j < basis.size(); j++) {
        tmp = tmp + vec[j] * basis[j];
      }
      evecs.push_back(tmp);
    }
    std::cout << GridLogMessage << "Eigenvalues: " << std::endl << evals << std::endl;
  }
  /**
   * Verifies the factorization DV = V^\dag H + f e^\dag with the last-computed
   * V, H, f.  (Unfinished; left disabled.)
   */
  // RealD verifyFactorization() {
  //   int k = basis.size(); // number of basis vectors, also the size of H.
  //   std::vector<Field> factorized (k, Zero());
  //   Field tmp (FGrid); tmp = Zero();
  //   for (int i = 0; i < basis.size(); i++) {
  //     Linop.Op(basis[i], tmp);
  //   }
  //   // basisRotate(basis, Q, 0, Nk, 0, Nk, Nm);
  //   // Linop.Op(, )
  // }
  /* Getters */
  Eigen::MatrixXcd getHessenbergMat() { return Hess; }
  Field getF() { return f; }
  std::vector<Field> getBasis() { return basis; }
  Eigen::VectorXcd getEvals() { return evals; }
  std::vector<Field> getEvecs() { return evecs; }
  /**
   * Implements implicit restarting for Arnoldi. Assumes eigenvalues are sorted.
   *
   * Applies Np = Nm - Nk shifted-QR sweeps using the unwanted Ritz values as
   * shifts, then compresses the basis and Hessenberg to size Nk.
   *
   * Parameters
   * ----------
   * int _Nm
   *     Size of basis to keep (Hessenberg is MxM).
   * int _Nk
   *     Number of basis vectors to keep at each restart.
   */
  void implicitRestart(int _Nm, int _Nk) {
    assert ( _Nk <= _Nm );
    Nm = _Nm; Nk = _Nk;
    int Np = Nm - Nk; // keep Nk smallest (or largest, depends on sort function) evecs
    std::cout << GridLogMessage << "Computing QR Factorizations." << std::endl;
    Eigen::MatrixXcd Q = Eigen::MatrixXcd::Identity(Nm, Nm);
    Eigen::MatrixXcd Qi (Nm, Nm);
    Eigen::MatrixXcd R (Nm, Nm);
    for (int i = Nk; i < Nm; i++) { // keep the first Nk eigenvalues and iterate through the last Np. Should loop Np times
      // Useful debugging output
      std::cout << GridLogDebug << "Computing QR factorization for i = " << i << std::endl;
      std::cout << GridLogDebug << "Eval shift = " << evals[i] << std::endl;
      std::cout << GridLogDebug << "Hess before rotation: " << Hess << std::endl;
      // QR factorize the shifted Hessenberg and apply the similarity transform.
      Eigen::HouseholderQR<Eigen::MatrixXcd> QR (Hess - evals[i] * Eigen::MatrixXcd::Identity(Nm, Nm));
      Qi = QR.householderQ();
      Q = Q * Qi;
      Hess = Qi.adjoint() * Hess * Qi;
      std::cout << GridLogDebug << "Qt up to i = " << Q.transpose() << std::endl;
    }
    std::cout << GridLogDebug << "Hess after all rotations: " << std::endl << Hess << std::endl;
    // form Arnoldi vector f: f is normal to the basis vectors and its norm \beta is used to determine the Ritz estimate.
    std::complex<double> beta = Hess(Nk, Nk-1);
    std::complex<double> sigma = Q(Nm-1, Nk-1);
    f = basis[Nk] * beta + f * sigma;
    // NOTE(review): betak is computed but never used — was this meant to
    // update the member beta_k used by converged()?  Confirm.
    RealD betak = std::sqrt(norm2(f));
    std::cout << GridLogMessage << "|f|^2 after implicit restart = " << norm2(f) << std::endl;
    // Rotate basis by Qt
    Qt = Q.transpose();
    basisRotate(basis, Qt, 0, Nk + 1, 0, Nm, Nm);
    // rotate
    basisRotate(evecs, Qt, 0, Nk + 1, 0, Nm, Nm);
    // Truncate the basis and restart
    basis = std::vector<Field> (basis.begin(), basis.begin() + Nk);
    // evecs = std::vector<Field> (evecs.begin(), evecs.begin() + Nk);
    // NOTE(review): self-assignment from an indexed view of Hess — confirm
    // Eigen aliasing is safe here (a .eval() on the RHS would be defensive).
    Hess = Hess(Eigen::seqN(0, Nk), Eigen::seqN(0, Nk));
    std::cout << "evecs size: " << evecs.size() << std::endl;
  }
  /**
   * Computes the number of Arnoldi eigenvectors that have converged. An eigenvector s is considered converged
   * for a tolerance epsilon if
   *     r(s) := |\beta e_m^T s| < epsilon
   * where beta is the norm of f_{m+1}.
   *
   * Parameters
   * ----------
   * (none — uses the member state beta_k, littleEvecs, rtol)
   *
   * Returns
   * -------
   * int : Number of converged eigenvectors.
   */
  int converged() {
    int Nconv = 0;
    for (int k = 0; k < evecs.size(); k++) {
      RealD emTs = std::abs(littleEvecs(Nm - 1, k)); // e_m^T s
      RealD ritzEstimate = beta_k * emTs;
      // TODO should be ritzEstimate < Tolerance * lambda_max
      std::cout << GridLogMessage << "Ritz estimate for evec " << k << " = " << ritzEstimate << std::endl;
      if (ritzEstimate < rtol) {
        Nconv++;
      }
    }
    return Nconv;
  }
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,277 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientTimeslice.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_TIMESLICE_H
#define GRID_CONJUGATE_GRADIENT_TIMESLICE_H
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
/**
* Simple modification of conjugate gradient that outputs the residual as a function
* of time, in order to study the large wavelength behavior of the solver.
*/
template <class Field>
class ConjugateGradientTimeslice : public OperatorFunction<Field> {
public:
  using OperatorFunction<Field>::operator();

  bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
                          // Defaults true.
  RealD Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; // Number of iterations the CG took to finish. Filled in upon completion
  RealD TrueResidual;           // Recomputed ||b - A psi|| / ||b|| on successful exit

  ConjugateGradientTimeslice(RealD tol, Integer maxit, bool err_on_no_conv = true)
      : Tolerance(tol),
        MaxIterations(maxit),
        ErrorOnNoConverge(err_on_no_conv)
  {};

  // Per-iteration hook: subclasses may record (k, alpha, beta) for analysis.
  virtual void LogIteration(int k, RealD a, RealD b) {
    //    std::cout << "ConjugateGradient::LogIteration() " << std::endl;
  };
  // Start-of-solve hook: subclasses may reset any recorded state here.
  // (typo fixed: was "ConjugageGradient")
  virtual void LogBegin(void) {
    std::cout << "ConjugateGradient::LogBegin() " << std::endl;
  };

  // Solve Linop psi = src by standard CG. Additionally computes the
  // per-timeslice source norm (intended for studying the large-wavelength
  // behaviour of the solver; the per-slice norms are not yet used below).
  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {

    this->LogBegin();

    GRID_TRACE("ConjugateGradientTimeslice");
    GridStopWatch PreambleTimer;
    GridStopWatch ConstructTimer;
    GridStopWatch NormTimer;
    GridStopWatch AssignTimer;
    PreambleTimer.Start();
    psi.Checkerboard() = src.Checkerboard();

    conformable(psi, src);

    RealD cp, c, a, d, b, ssq, qq;
    //RealD b_pred;

    // Workspace fields: search direction p, operator product mmp, residual r.
    ConstructTimer.Start();
    Field p(src.Grid());
    Field mmp(src.Grid());
    Field r(src.Grid());
    ConstructTimer.Stop();

    // Initial residual computation & set up
    NormTimer.Start();
    ssq = norm2(src);             // Norm of source vector ||b||^2
    // Site-local |b(x, t)|^2 as a lattice field.
    // FIX: ssqtx was previously used without being declared (compile error).
    auto ssqtx = localNorm2(src);
    std::vector<RealD> ssqt;      // Norm of source not summed over time slices, ssq(t) = \sum_x |b(x, t)|^2
    sliceSum(ssqtx, ssqt, Tdir);  // TODO make sure Tdir is globally defined
    RealD guess = norm2(psi);     // Norm of initial guess ||psi||^2
    NormTimer.Stop();
    assert(std::isnan(guess) == 0);
    AssignTimer.Start();
    if (guess == 0.0) {
      r = src;
      p = r;
      a = ssq;
    } else {
      Linop.HermOpAndNorm(psi, mmp, d, b);
      r = src - mmp;              // Initial residual r0 = b - A guess
      p = r;                      // initial conj vector p0 = r0
      a = norm2(p);
    }
    cp = a;
    AssignTimer.Stop();

    // Handle trivial case of zero src
    if (ssq == 0.) {
      psi = Zero();
      IterationsToComplete = 1;
      TrueResidual = 0.;
      return;
    }

    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: src " << ssq << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mp " << d << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mmp " << b << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: cp,r " << cp << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: p " << a << std::endl;

    RealD rsq = Tolerance * Tolerance * ssq; // stopping threshold on ||r||^2

    // Check if guess is really REALLY good :)
    if (cp <= rsq) {
      TrueResidual = std::sqrt(a / ssq);
      std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
      IterationsToComplete = 0;
      return;
    }

    std::cout << GridLogIterative << std::setprecision(8)
              << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;

    PreambleTimer.Stop();
    GridStopWatch LinalgTimer;
    GridStopWatch InnerTimer;
    GridStopWatch AxpyNormTimer;
    GridStopWatch LinearCombTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;
    RealD usecs = -usecond();
    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {
      GridStopWatch IterationTimer;
      IterationTimer.Start();
      c = cp;                             // c = ||r_k||^2

      MatrixTimer.Start();
      Linop.HermOp(p, mmp);               // Computes mmp = Ap
      MatrixTimer.Stop();

      LinalgTimer.Start();
      InnerTimer.Start();
      ComplexD dc = innerProduct(p, mmp); // p^\dagger A p
      InnerTimer.Stop();
      d = dc.real();
      a = c / d;                          // alpha_k = ||r_k||^2 / (p^dag A p)

      AxpyNormTimer.Start();
      // axpy_norm: r <- r - a*mmp on the accelerator, returning ||r_{k+1}||^2
      // (see Lattice_reduction.h), i.e. r_k --> r_{k+1} = r_k - alpha_k A p_k.
      cp = axpy_norm(r, -a, mmp, r);
      AxpyNormTimer.Stop();
      b = cp / c;                         // beta_k = ||r_{k+1}||^2 / ||r_k||^2

      LinearCombTimer.Start();
      {
        // Fused update psi += a*p ; p = b*p + r, coalesced on the accelerator.
        autoView( psi_v , psi, AcceleratorWrite);
        autoView( p_v   , p,   AcceleratorWrite);
        autoView( r_v   , r,   AcceleratorWrite);
        accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(), {
          coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
          coalescedWrite(p_v[ss],   b * p_v(ss) + r_v(ss));
        });
      }
      LinearCombTimer.Stop();
      LinalgTimer.Stop();
      LogIteration(k, a, b);

      IterationTimer.Stop();
      if ((k % 500) == 0) {
        std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k
                  << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
      } else {
        std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
                  << " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl;
      }

      // Stopping condition
      if (cp <= rsq) {
        usecs += usecond();
        SolverTimer.Stop();
        // Recompute the true residual A psi - b (p is reused as scratch).
        Linop.HermOpAndNorm(psi, mmp, d, qq);
        p = mmp - src;

        GridBase *grid = src.Grid();
        RealD DwfFlops = (1452. )*grid->gSites()*4*k
                       + (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra
        RealD srcnorm = std::sqrt(norm2(src));
        RealD resnorm = std::sqrt(norm2(p));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k
                  << "\tComputed residual " << std::sqrt(cp / ssq)
                  << "\tTrue residual " << true_residual
                  << "\tTarget " << Tolerance << std::endl;

        //	std::cout << GridLogMessage << "\tPreamble   " << PreambleTimer.Elapsed() <<std::endl;
        std::cout << GridLogMessage << "\tSolver Elapsed " << SolverTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "Time breakdown "<<std::endl;
        std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\t\tInner " << InnerTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
        std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;

        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

        IterationsToComplete = k;
        TrueResidual = true_residual;
        return;
      }
    }

    // Failed. Calculate true residual before giving up
    //    Linop.HermOpAndNorm(psi, mmp, d, qq);
    //    p = mmp - src;
    //TrueResidual = sqrt(norm2(p)/ssq);
    //    TrueResidual = 1;

    std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations
              <<" residual "<< std::sqrt(cp / ssq)<< std::endl;
    SolverTimer.Stop();
    std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tConstruct " << ConstructTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tNorm " << NormTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tAssign " << AssignTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
    std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage<< "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
    std::cout << GridLogPerformance << "\t\tInner " << InnerTimer.Elapsed() <<std::endl;
    std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
    std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;

    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;
  }
};
NAMESPACE_END(Grid);
#endif

View File

@@ -53,6 +53,18 @@ enum IRLdiagonalisation {
IRLdiagonaliseWithEigen
};
enum IRLeigsort {
IRLeigsortMax,
IRLeigsortSqMin
};
#if 0
bool square_comp(RealD a, RealD b){
if (a*a<b*b) return true;
return false;
}
#endif
template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester<Field>
{
public:
@@ -119,9 +131,10 @@ class ImplicitlyRestartedLanczos {
/////////////////////////
// Constructor
/////////////////////////
public:
public:
IRLeigsort EigSort;
//////////////////////////////////////////////////////////////////
// PAB:
//////////////////////////////////////////////////////////////////
@@ -154,6 +167,7 @@ public:
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
eresid(_eresid), betastp(_betastp),
MaxIter(_MaxIter) , MinRestart(_MinRestart),
EigSort(IRLeigsortMax),
orth_period(_orth_period), diagonalisation(_diagonalisation) { };
ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
@@ -170,6 +184,7 @@ public:
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
eresid(_eresid), betastp(_betastp),
MaxIter(_MaxIter) , MinRestart(_MinRestart),
EigSort(IRLeigsortMax),
orth_period(_orth_period), diagonalisation(_diagonalisation) { };
////////////////////////////////
@@ -316,8 +331,12 @@ until convergence
// sorting
//////////////////////////////////
eval2_copy = eval2;
// if (EigSort==IRLeigsortMax)
// std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),square_comp);
// else
std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater<RealD>());
std::cout<<GridLogIRL <<" evals sorted "<<std::endl;
// eval2_copy = eval2;
const int chunk=8;
for(int io=0; io<k2;io+=chunk){
std::cout<<GridLogIRL << "eval "<< std::setw(3) << io ;
@@ -333,6 +352,7 @@ until convergence
//////////////////////////////////
Qt = Eigen::MatrixXd::Identity(Nm,Nm);
for(int ip=k2; ip<Nm; ++ip){
// std::cout<<GridLogIRL <<"QR decompose "<<eval2[ip]<<std::endl;
QR_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
}
std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
@@ -375,7 +395,8 @@ until convergence
// power of two search pattern; not every evalue in eval2 is assessed.
int allconv =1;
for(int jj = 1; jj<=Nstop; jj*=2){
// for(int jj = 1; jj<=Nstop; jj*=2){
for(int jj = 1; jj<=Nstop; jj++){
int j = Nstop-jj;
RealD e = eval2_copy[j]; // Discard the evalue
basisRotateJ(B,evec,Qt,j,0,Nk,Nm);

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,276 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./Grid/algorithms/iterative/LanczosBidiagonalization.h
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LANCZOS_BIDIAGONALIZATION_H
#define GRID_LANCZOS_BIDIAGONALIZATION_H
NAMESPACE_BEGIN(Grid);
/**
* Lanczos Bidiagonalization (Golub-Kahan)
*
* For a linear operator A with adjoint A^dag, constructs the bidiagonal
* decomposition:
*
* A V_m = U_m B_m
* A^dag U_m = V_m B_m^T + beta_{m+1} v_{m+1} e_m^T
*
* where:
* V_m = [v_1, ..., v_m] right Lanczos vectors (orthonormal)
* U_m = [u_1, ..., u_m] left Lanczos vectors (orthonormal)
* B_m is upper bidiagonal with diag(alpha_1,...,alpha_m) and
* superdiag(beta_2,...,beta_m)
*
* The singular values of A are approximated by those of B_m.
* The singular values of B_m are the square roots of the eigenvalues of
* the symmetric tridiagonal matrix B_m^T B_m.
*
* Usage:
* LanczosBidiagonalization<Field> lb(Linop, grid);
* lb.run(src, Nm, tol);
* // Access results via getters.
*/
template <class Field>
class LanczosBidiagonalization {
public:
  LinearOperatorBase<Field> &Linop;
  GridBase *Grid;
  int Nm;                    // number of Lanczos steps taken
  RealD Tolerance;           // convergence threshold on beta_{k+1} / alpha_k

  std::vector<Field> V;      // right Lanczos vectors v_1 ... v_m
  std::vector<Field> U;      // left  Lanczos vectors u_1 ... u_m
  std::vector<RealD> alpha;  // diagonal of bidiagonal matrix
  std::vector<RealD> beta;   // super-diagonal (beta[k] couples u_k and v_{k+1})

  // SVD of the bidiagonal matrix (filled after computeSVD())
  Eigen::VectorXd singularValues;
  Eigen::MatrixXd leftSVecs;  // columns are left singular vectors of B
  Eigen::MatrixXd rightSVecs; // columns are right singular vectors of B

public:
  // FIX: initializer list reordered to match the member declaration order
  // (Nm is declared before Tolerance); members are always initialised in
  // declaration order, so this silences -Wreorder without changing behaviour.
  LanczosBidiagonalization(LinearOperatorBase<Field> &_Linop, GridBase *_Grid,
                           RealD _tol = 1.0e-8)
    : Linop(_Linop), Grid(_Grid), Nm(0), Tolerance(_tol)
  {}

  /**
   * Run the Golub-Kahan Lanczos bidiagonalization.
   *
   * Parameters
   * ----------
   * src    : starting vector (need not be normalised)
   * Nmax   : maximum number of Lanczos steps
   * reorth : if true, full reorthogonalisation of both V and U bases
   */
  void run(const Field &src, int Nmax, bool reorth = true)
  {
    assert(norm2(src) > 0.0);

    V.clear(); U.clear();
    alpha.clear(); beta.clear();
    Nm = 0;

    Field p(Grid), r(Grid);

    // --- initialise: v_1 = src / ||src|| ---
    Field v(Grid);
    v = src;
    RealD nrm = std::sqrt(norm2(v));
    v = (1.0 / nrm) * v;
    V.push_back(v);

    for (int k = 0; k < Nmax; ++k) {
      // p = A v_k
      Linop.Op(V[k], p);

      // p = p - beta_k * u_{k-1}   (remove previous left vector)
      if (k > 0) {
        p = p - beta[k-1] * U[k-1];
      }

      // alpha_k = ||p||
      RealD ak = std::sqrt(norm2(p));
      if (ak < 1.0e-14) {
        std::cout << GridLogMessage
                  << "LanczosBidiagonalization: lucky breakdown at step "
                  << k << " (alpha = " << ak << ")" << std::endl;
        break;
      }
      alpha.push_back(ak);

      // u_k = p / alpha_k
      Field u(Grid);
      u = (1.0 / ak) * p;

      // full reorthogonalisation of u against previous U.
      // NOTE(review): the renormalisation below perturbs u without updating
      // alpha_k, so A v_k = alpha_k u_k holds only to reorthogonalisation
      // accuracy; verify() reports the resulting residual.
      if (reorth) {
        for (int j = 0; j < (int)U.size(); ++j) {
          ComplexD ip = innerProduct(U[j], u);
          u = u - ip * U[j];
        }
        RealD unrm = std::sqrt(norm2(u));
        if (unrm > 1.0e-14) u = (1.0 / unrm) * u;
      }
      U.push_back(u);

      // r = A^dag u_k - alpha_k * v_k
      Linop.AdjOp(U[k], r);
      r = r - ak * V[k];

      // full reorthogonalisation of r against previous V
      if (reorth) {
        for (int j = 0; j < (int)V.size(); ++j) {
          ComplexD ip = innerProduct(V[j], r);
          r = r - ip * V[j];
        }
      }

      // beta_{k+1} = ||r||
      RealD bk = std::sqrt(norm2(r));
      beta.push_back(bk);

      Nm = k + 1;

      std::cout << GridLogMessage
                << "LanczosBidiagonalization step " << k
                << " alpha = " << ak
                << " beta = " << bk << std::endl;

      // convergence: residual beta / alpha small enough
      if (bk / ak < Tolerance) {
        std::cout << GridLogMessage
                  << "LanczosBidiagonalization converged at step " << k
                  << " (beta/alpha = " << bk / ak << ")" << std::endl;
        break;
      }

      if (k == Nmax - 1) break; // no v_{k+2} needed after last step

      // v_{k+1} = r / beta_{k+1}
      Field vnext(Grid);
      vnext = (1.0 / bk) * r;
      V.push_back(vnext);
    }
  }

  /**
   * Compute the SVD of the bidiagonal matrix B using Eigen.
   * Singular values are stored in descending order.
   */
  void computeSVD()
  {
    int m = Nm;
    Eigen::MatrixXd B = Eigen::MatrixXd::Zero(m, m);
    for (int k = 0; k < m; ++k) {
      B(k, k) = alpha[k];
      if (k + 1 < m && k < (int)beta.size())
        B(k, k+1) = beta[k];
    }
    Eigen::JacobiSVD<Eigen::MatrixXd> svd(B,
                                          Eigen::ComputeThinU | Eigen::ComputeThinV);
    singularValues = svd.singularValues(); // already sorted descending
    leftSVecs  = svd.matrixU();
    rightSVecs = svd.matrixV();

    std::cout << GridLogMessage
              << "LanczosBidiagonalization: singular values of B_" << m
              << std::endl;
    for (int k = 0; k < m; ++k)
      std::cout << GridLogMessage << "  sigma[" << k << "] = "
                << singularValues(k) << std::endl;
  }

  /**
   * Return the k-th approximate left singular vector of A in the full
   * lattice space. computeSVD() must have been called first.
   */
  Field leftSingularVector(int k)
  {
    assert(k < (int)leftSVecs.cols());
    Field svec(Grid);
    svec = Zero();
    for (int j = 0; j < Nm; ++j)
      svec = svec + leftSVecs(j, k) * U[j];
    return svec;
  }

  /**
   * Return the k-th approximate right singular vector of A in the full
   * lattice space. computeSVD() must have been called first.
   */
  Field rightSingularVector(int k)
  {
    assert(k < (int)rightSVecs.cols());
    Field svec(Grid);
    svec = Zero();
    for (int j = 0; j < Nm; ++j)
      svec = svec + rightSVecs(j, k) * V[j];
    return svec;
  }

  /**
   * Verify the bidiagonalization: returns max residual
   *   max_k || A v_k - alpha_k u_k - beta_{k-1} u_{k-1} ||
   */
  RealD verify()
  {
    Field tmp(Grid);
    RealD maxres = 0.0;
    for (int k = 0; k < Nm; ++k) {
      Linop.Op(V[k], tmp);
      tmp = tmp - alpha[k] * U[k];
      if (k > 0 && k-1 < (int)beta.size())
        tmp = tmp - beta[k-1] * U[k-1];
      RealD res = std::sqrt(norm2(tmp));
      if (res > maxres) maxres = res;
      std::cout << GridLogMessage
                << "LanczosBidiagonalization verify step " << k
                << " ||A v_k - alpha_k u_k - beta_{k-1} u_{k-1}|| = "
                << res << std::endl;
    }
    return maxres;
  }

  /* Getters */
  int getNm() const { return Nm; }
  const std::vector<Field>& getV() const { return V; }
  const std::vector<Field>& getU() const { return U; }
  const std::vector<RealD>& getAlpha() const { return alpha; }
  const std::vector<RealD>& getBeta() const { return beta; }
  Eigen::VectorXd getSingularValues() const { return singularValues; }
};
NAMESPACE_END(Grid);
#endif

View File

@@ -60,18 +60,29 @@ public:
void Level(int lv) { level=lv; };
PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax, int _nstep) :
Tolerance(tol),
MaxIterations(maxit),
Linop(_Linop),
Preconditioner(Prec),
mmax(_mmax),
nstep(_nstep)
nstep(_nstep) // what is nstep vs mmax? one is the number of inner iterations
{
level=1;
verbose=1;
};
// virtual method stubs for updating GCR polynomial
virtual void LogBegin(void){
std::cout << "GCR::LogBegin() "<<std::endl;
};
virtual void LogIteration(int k, ComplexD a, std::vector<ComplexD> betas){
std::cout << "GCR::LogIteration() "<<std::endl;
};
virtual void LogComplete(std::vector<ComplexD>& alphas, std::vector<std::vector<ComplexD>>& betas) {
std::cout << "GCR::LogComplete() "<<std::endl;
};
void operator() (const Field &src, Field &psi){
// psi=Zero();
@@ -96,19 +107,18 @@ public:
GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
if(cp<rsq) {
SolverTimer.Stop();
SolverTimer.Stop();
Linop.Op(psi,r);
axpy(r,-1.0,src,r);
RealD tr = norm2(r);
GCRLogLevel<<"PGCR: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl;
Linop.Op(psi,r);
axpy(r,-1.0,src,r);
RealD tr = norm2(r);
GCRLogLevel<<"PGCR: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
return;
GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
return;
}
}
@@ -135,9 +145,9 @@ public:
////////////////////////////////
// history for flexible orthog
////////////////////////////////
std::vector<Field> q(mmax,grid);
std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax);
std::vector<Field> q(mmax,grid); // q = Ap
std::vector<Field> p(mmax,grid); // store mmax conjugate momenta
std::vector<RealD> qq(mmax); // qq = (Ap)^2 = <p|A^\dagger A |p> (denom of \alpha)
GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
@@ -155,7 +165,9 @@ public:
LinalgTimer.Start();
r=src-Az;
LinalgTimer.Stop();
GCRLogLevel<< "PGCR true residual r = src - A psi "<<norm2(r) <<std::endl;
GCRLogLevel<< "PGCR true residual r = src - A psi "<< norm2(r) <<std::endl;
this->LogBegin(); // initialize polynomial GCR if needed (TODO think about placement of this)
/////////////////////
// p = Prec(r)
@@ -178,32 +190,45 @@ public:
p[0]= z;
q[0]= Az;
qq[0]= zAAz;
std::cout << "||init p - src||: " << norm2(p[0] - src) << std::endl; // for debugging
cp =norm2(r);
LinalgTimer.Stop();
std::vector<ComplexD> all_alphas;
std::vector<std::vector<ComplexD>> all_betas;
for(int k=0;k<nstep;k++){
steps++;
int kp = k+1;
int peri_k = k %mmax;
int peri_k = k %mmax; // only store mmax vectors; just roll around if needed
int peri_kp= kp%mmax;
// std::cout << "peri_kp = " << peri_kp << std::endl;
LinalgTimer.Start();
rq= innerProduct(q[peri_k],r); // what if rAr not real?
a = rq/qq[peri_k];
a = rq/qq[peri_k]; // compute alpha_j
axpy(psi,a,p[peri_k],psi);
all_alphas.push_back(a);
cp = axpy_norm(r,-a,q[peri_k],r);
axpy(psi,a,p[peri_k],psi); // update psi --> psi + \alpha p
cp = axpy_norm(r,-a,q[peri_k],r); // update r --> r - \alpha D p. Note q = Dp
LinalgTimer.Stop();
GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
// LogIterationA(k + 1, a);
if((k==nstep-1)||(cp<rsq)){
return cp;
}
GCRLogLevel<< "GCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
// moving this to end of loop so that it doesn't exit beforehand
// TODO if I want to uncomment this, I have to split the LogIteration again and put LogIterationA() beforehand
// if((k==nstep-1)||(cp<rsq)){
// return cp;
// }
PrecTimer.Start();
@@ -221,22 +246,205 @@ public:
q[peri_kp]=Az;
p[peri_kp]=z;
// Field Dsrc (grid);
// Linop.Op(src, Dsrc);
// std::cout << "||q[peri_kp] - D(src)||: " << norm2(q[peri_kp] - Dsrc) << std::endl; // for debugging
// // delete after testing
// std::cout << "Testing Dsq on one for GCR: " << std::endl;
// Field myField (grid);
// myField = 1.0;
// Field out1 (grid); Field out2 (grid);
// Linop.HermOp(myField, out1);
// Linop.Op(myField, out2);
// std::cout << "Dsq.Hermop(ones) has norm " << norm2(out1) << std::endl;
// std::cout << "Dsq.Op(ones) has norm " << norm2(out2) << std::endl;
// basically northog = k+1 if mmax is large
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
// std::cout << "northog: " << northog << std::endl;
std::vector<ComplexD> betas (northog);
// std::cout << "peri_kp: " << peri_kp << std::endl;
// we iterate backwards counting down from the current k+1 index (peri_kp) because we
for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; GRID_ASSERT((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back];
q[peri_kp]=q[peri_kp]+b*q[peri_back];
// b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
b=-(innerProduct(q[peri_back],Az))/qq[peri_back]; // TODO try complex beta
p[peri_kp]=p[peri_kp]+b*p[peri_back];
q[peri_kp]=q[peri_kp]+b*q[peri_back];
// LogIterationB(peri_back, b);
// betas[back] = b; // may need to change the indexing if I ever do it with restarts
// std::cout << "[DEBUG] pushing beta for back = " << back << ", peri_back = " << peri_back << std::endl;
betas[peri_back] = b; // may need to change the indexing if I ever do it with restarts
}
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop();
// log iteration and update GCR polynomial if necessary.
all_betas.push_back(betas);
LogIteration(k + 1, a, betas);
// finish if necessary
if((k==nstep-1)||(cp<rsq)){
std::cout << "All alphas: " << std::endl << all_alphas << std::endl;
std::cout << "All betas: " << std::endl << all_betas << std::endl;
LogComplete(all_alphas, all_betas);
std::cout << "Exiting GCR." << std::endl;
return cp;
}
}
GRID_ASSERT(0); // never reached
return cp;
}
};
// Serializable archive for GCR polynomial recording:
//   data   : cumulative solution-polynomial coefficients, one snapshot per iteration
//   betas  : per-iteration orthogonalisation coefficients beta_{ik}
//   alphas : per-iteration step sizes alpha_k
// (Filled by a recording solver; the GRID_SERIALIZABLE_CLASS_MEMBERS macro
//  generates members plus the read/write machinery.)
class PolynomialFile: Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(PolynomialFile,
std::vector<std::vector<std::complex<double>>>, data,
std::vector<std::vector<std::complex<double>>>, betas,
std::vector<std::complex<double>>, alphas
);
};
// Optionally record the GCR polynomial. [PO]: TODO
template <class Field>
class PGCRPolynomial : public PrecGeneralisedConjugateResidualNonHermitian<Field> {
public:
  std::vector<ComplexD> ak;                  // step sizes alpha_k, one per GCR iteration
  std::vector<std::vector<ComplexD>> bk;     // orthogonalisation coefficients beta_{ik}, one vector per iteration
  //  std::vector<ComplexD> poly_p;
  std::vector<std::vector<ComplexD>> poly_p; // polynomial coefficients of each direction p_k
  std::vector<ComplexD> poly_Ap;             // polynomial in Ap_j (only store it for last p)
  std::vector<ComplexD> poly_r;              // polynomial coefficients of the residual
  std::vector<ComplexD> polynomial;          // cumulative solution polynomial in A
  PolynomialFile& PF;                        // archive the coefficients are written to

public:
  PGCRPolynomial(RealD tol, Integer maxit, LinearOperatorBase<Field> &_Linop, LinearFunction<Field> &Prec, int _mmax, int _nstep, PolynomialFile& _PF)
    : PrecGeneralisedConjugateResidualNonHermitian<Field>(tol, maxit, _Linop, Prec, _mmax, _nstep), PF(_PF)
  {};

  // Apply the recorded polynomial in A = Linop to src:
  //   psi = sum_n polynomial[n] * A^n src
  void PolyOp(const Field &src, Field &psi)
  {
    Field tmp(src.Grid());
    Field AtoN(src.Grid());
    AtoN = src;
    psi = AtoN * polynomial[0];
    for (int n = 1; n < (int)polynomial.size(); n++) {
      tmp = AtoN;
      this->Linop.Op(tmp, AtoN);        // iterate A^n
      psi = psi + polynomial[n] * AtoN; // psi += poly_n A^n src
    }
  }

  // Replay the recorded GCR recursion on src to reconstruct the solution x:
  //   x_{k+1} = x_k + alpha_k p_k
  //   r_{k+1} = r_k - alpha_k A p_k
  //   p_{k+1} = r_{k+1} + sum_i beta_{ik} p_i
  void PGCRsequence(const Field &src, Field &x)
  {
    Field Ap(src.Grid());
    Field r(src.Grid());

    std::vector<Field> p;
    p.push_back(src);
    r = src;
    x = Zero();
    x.Checkerboard() = src.Checkerboard();

    for (int k = 0; k < (int)ak.size(); k++) {
      x = x + ak[k] * p[k];
      this->Linop.Op(p[k], Ap);
      r = r - ak[k] * Ap;
      p.push_back(r);
      // FIX: was `bk[i, k+1]` — the comma operator makes that expression
      // evaluate to bk[k+1] (a std::vector, not a scalar), which neither
      // compiles nor matches how LogIteration records the coefficients.
      // bk[k][i] is the coefficient of p_i used when building p_{k+1}.
      for (int i = 0; i < (int)bk[k].size(); i++) {
        p[k+1] = p[k+1] + bk[k][i] * p[i];
      }
    }
  }

  // Convenience wrapper: zero initial guess, then run the base GCR solve
  // (which drives the Log* hooks below to record the polynomial).
  void Solve(const Field &src, Field &psi)
  {
    psi = Zero();
    this->operator()(src, psi);
  }

  // Reset all recorded state at the start of a solve. The seed polynomials
  // are p_0 = 1 and r_0 = 1 (in powers of A applied to the source).
  virtual void LogBegin(void)
  {
    std::cout << "PGCR::LogBegin() "<<std::endl;
    ak.resize(0);
    bk.resize(0);
    polynomial.resize(0);
    poly_Ap.push_back(0.0); // start with (0.0); during first iteration should change to (0.0, 1.0)
    std::vector<ComplexD> p0_tmp;
    p0_tmp.push_back(1.0);
    poly_p.push_back(p0_tmp);
    poly_r.push_back(1.0);
  };

  // Fold the iteration-k coefficients (alpha a, betas) into the recorded
  // polynomials: updates the solution polynomial, the residual polynomial,
  // and initialises the polynomial for p_{k+1}.
  virtual void LogIteration(int k, ComplexD a, std::vector<ComplexD> betas){
    std::cout << "PGCR::LogIteration(k = " << k << ")" << std::endl;
    ak.push_back(a);
    bk.push_back(betas);

    // update Ap by pushing p[k] to the right
    poly_Ap.push_back(0.0); // need to pad the end with an element
    poly_Ap[0] = 0.0;       // technically this should be unnecessary, as the first component is never set
    for (int i = 0; i < k; i++) {
      poly_Ap[i+1] = poly_p[k-1][i]; // A\vec{p} = (0, \vec{p}) bc A shifts components of p to the right
    }

    // update psi_{k+1} --> psi_k + a_k p_k
    polynomial.push_back(0.0);
    for (int i = 0; i < k; i++) {
      polynomial[i] += a * poly_p[k-1][i];
    }
    {
      // archive the current cumulative polynomial
      std::vector<std::complex<double>> poly_stdcmplx(polynomial.begin(), polynomial.end());
      PF.data.push_back(poly_stdcmplx);
    }

    // r_{k+1} --> r_k - a_k A p_k
    // p_{k+1} --> r_k + \sum_{i=0}^k \beta_{ik} p_i, input betas = (\beta_{ik})_i
    poly_r.push_back(0.0); // should be of size k+1 if we start with k = 1
    std::vector<ComplexD> p_next(k + 1, ComplexD(0.0)); // p_{k+1} = same size as r_{k+1}
    for (int i = 0; i < k + 1; i++) {
      poly_r[i] = poly_r[i] - a * poly_Ap[i]; // update r_{k+1} --> r_k - \alpha_k A p_k
      p_next[i] = poly_r[i];                  // init new vector as r_{k+1}
    }

    // p_{k+1} --> p_{k+1} + \sum_i \beta_{ij} p_i
    int nbeta = betas.size();
    std::cout << "Betas: " << betas << std::endl;
    for (int j = 0; j < nbeta; j++) {
      for (int i = 0; i < j+1; i++) {
        p_next[i] += betas[j] * poly_p[j][i];
      }
    }
    poly_p.push_back(p_next); // add p_{k+1} to the list of p's
  }

  // Archive every recorded alpha and beta vector into the PolynomialFile.
  virtual void LogComplete(std::vector<ComplexD>& alphas, std::vector<std::vector<ComplexD>>& betas) {
    std::cout << "PGCR::LogComplete() "<<std::endl;
    for (int i = 0; i < (int)alphas.size(); i++) {
      PF.alphas.push_back(std::complex<double>(alphas[i].real(), alphas[i].imag()));
      std::vector<std::complex<double>> beta_stdcmplx(betas[i].begin(), betas[i].end());
      PF.betas.push_back(beta_stdcmplx);
    }
  };
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,753 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./Grid/algorithms/iterative/RestartedLanczosBidiagonalization.h
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_RESTARTED_LANCZOS_BIDIAGONALIZATION_H
#define GRID_RESTARTED_LANCZOS_BIDIAGONALIZATION_H
NAMESPACE_BEGIN(Grid);
/**
* Implicitly Restarted Lanczos Bidiagonalization (IRLBA)
*
* Computes the p largest (or p smallest) singular triplets of a linear
* operator A using the Golub-Kahan-Lanczos bidiagonalization with implicit
* restart via thick-restart / QR shifts.
*
* Algorithm (Baglama & Reichel, SIAM J. Sci. Comput. 27(1):19-42, 2005):
*
* Outer loop:
* 1. Extend the p-step (or seed) bidiagonalization to k steps:
* A V_k = U_k B_k
* A^dag U_k = V_k B_k^T + beta_{k+1} v_{k+1} e_k^T
* 2. Compute SVD: B_k = X Sigma Y^T
* 3. Check convergence of the p desired singular values via
* |beta_{k+1} * y_{k,i}| < tol * sigma_i
* where y_{k,i} is the last component of the i-th right singular vector.
* 4. Apply k-p implicit QR shifts to implicitly compress the basis
* to p steps (Sorensen-Lehoucq thick restart):
* B_p^+ = X_p^T B_k Y_p (upper bidiagonal, p x p)
* and update the lattice vectors:
* V_p^+ = V_k Y_p
* U_p^+ = U_k X_p
* The new residual coupling is
* beta_p^+ v_{p+1}^+ = beta_{k+1} v_{k+1} * (e_k^T Y_p)_p
* + B_k(p,p+1) * (orthogonal tail from QR)
* 5. Go to step 1.
*
* Template parameter
* ------------------
* Field : lattice field type (must support Grid algebra operations)
*
* Usage
* -----
* RestartedLanczosBidiagonalization<Field> irlba(Linop, grid, p, k, tol, maxIter);
* irlba.run(src);
* // Results available via getters.
*/
template <class Field>
class RestartedLanczosBidiagonalization {
public:
LinearOperatorBase<Field> &Linop;
GridBase *Grid;
int Nk; // number of desired singular triplets
int Nm; // Lanczos basis size (Nm > Nk)
RealD Tolerance;
int MaxIter;
bool largest; // if true, target largest singular values; otherwise smallest
// Converged singular triplets (filled after run())
std::vector<RealD> singularValues; // sigma_0 >= sigma_1 >= ...
std::vector<Field> leftVectors; // approximate left singular vectors
std::vector<Field> rightVectors; // approximate right singular vectors
private:
// Working bases (size up to Nm+1)
std::vector<Field> V; // right Lanczos vectors
std::vector<Field> U; // left Lanczos vectors
std::vector<RealD> alpha;
std::vector<RealD> beta;
// After a thick restart, the column at index restart_col of U^dag A V
// has extra non-zero entries (rows 0..restart_col-2) beyond what the
// upper bidiagonal captures. fvec[j] = <U[j] | A V[restart_col]> for
// j = 0..restart_col-1. (fvec[restart_col-1] == beta[restart_col-1].)
// restart_col == -1 means no restart has occurred yet (pure bidiagonal).
std::vector<RealD> fvec;
int restart_col;
public:
RestartedLanczosBidiagonalization(LinearOperatorBase<Field> &_Linop,
GridBase *_Grid,
int _Nk, int _Nm,
RealD _tol = 1.0e-8,
int _maxIt = 300,
bool _largest = true)
: Linop(_Linop), Grid(_Grid),
Nk(_Nk), Nm(_Nm),
Tolerance(_tol), MaxIter(_maxIt),
largest(_largest)
{
assert(Nm > Nk);
}
/**
 * Run the IRLBA (implicitly restarted Lanczos bidiagonalization) driver
 * starting from src.
 * On exit, singularValues, leftVectors, rightVectors are filled with
 * the Nk converged singular triplets (or, if MaxIter is exhausted, the
 * best available approximations — see the fall-through at the bottom).
 */
void run(const Field &src)
{
// A zero seed cannot be normalised into v_0.
assert(norm2(src) > 0.0);
singularValues.clear();
leftVectors.clear();
rightVectors.clear();
// Allocate working bases
V.clear(); U.clear();
alpha.clear(); beta.clear();
fvec.clear(); restart_col = -1;
V.reserve(Nm + 1);
U.reserve(Nm);
// Seed: v_0 = src / ||src||
Field vtmp(Grid);
vtmp = src;
RealD nrm = std::sqrt(norm2(vtmp));
vtmp = (1.0 / nrm) * vtmp;
V.push_back(vtmp);
int pStart = 0; // current basis size at start of extension
RealD betaRestart = 0.0; // coupling from previous restart
for (int iter = 0; iter < MaxIter; ++iter) {
// ----------------------------------------------------------------
// Step 1: extend from pStart steps to Nm steps
// ----------------------------------------------------------------
extendBasis(pStart, Nm, betaRestart);
// verify();
// ----------------------------------------------------------------
// Step 2: SVD of the Nm x Nm B matrix.
// iter=0 (pStart==0): B is exactly bidiagonal — use buildBidiagonal.
// iter>0 (pStart==Nk): after a thick restart, column restart_col of
// U^dag A V has extra off-diagonal entries captured by fvec; use
// buildFullB so the Ritz values and restart vectors are computed from
// the exact projected matrix A V = U B_full.
// ----------------------------------------------------------------
Eigen::MatrixXd B = (pStart == 0) ? buildBidiagonal(Nm) : buildFullB(Nm);
Eigen::JacobiSVD<Eigen::MatrixXd> svd(B,
Eigen::ComputeThinU | Eigen::ComputeThinV);
Eigen::VectorXd sigma = svd.singularValues(); // descending
Eigen::MatrixXd X = svd.matrixU(); // Nm x Nm left SVecs of B
Eigen::MatrixXd Y = svd.matrixV(); // Nm x Nm right SVecs of B
// If targeting smallest, reorder so desired ones come first
Eigen::VectorXi order = sortOrder(sigma);
// ----------------------------------------------------------------
// Step 3: check convergence of the Nk desired singular values
// ----------------------------------------------------------------
RealD betaK = beta.back(); // beta_{k+1}
// In our convention A V = U B (exact), the residual is in the A^dag
// direction: A^dag u_j - sigma_j v_j = betaK * X[Nm-1,j] * V[Nm].
// Convergence criterion: |betaK * X[Nm-1, idx]| < tol * sigma_idx.
int nconv = 0;
for (int i = 0; i < Nk; ++i) {
int idx = order(i);
RealD res = std::abs(betaK * X(Nm - 1, idx));
// Floor the reference value so sigma ~ 0 does not demand res == 0 exactly.
RealD thr = Tolerance * std::max(sigma(idx), 1.0e-14);
std::cout << GridLogMessage
<< "IRLBA iter " << iter
<< " sigma[" << i << "] = " << sigma(idx)
<< " res = " << res
<< " thr = " << thr << std::endl;
if (res < thr) ++nconv;
else break; // residuals not strictly ordered but break is conservative
}
if (nconv >= Nk) {
std::cout << GridLogMessage
<< "IRLBA converged: " << nconv << " singular values after "
<< iter + 1 << " iterations." << std::endl;
// Collect converged triplets
extractTriplets(Nm, sigma, X, Y, order, Nk);
return;
}
// ----------------------------------------------------------------
// Step 4: implicit restart — compress to Nk steps
// ----------------------------------------------------------------
implicitRestart(Nm, Nk, sigma, X, Y, order, betaK, betaRestart);
// verify();
// Lucky breakdown: exact invariant subspace found; convergence is exact.
// B_p^+ = diag(alpha[0..Nk-1]); extract directly from restart basis.
if (betaRestart < 1.0e-14) {
std::cout << GridLogMessage
<< "IRLBA: lucky breakdown after restart (betaRestart = 0)."
<< " Extracting " << Nk << " exact Ritz triplets." << std::endl;
// Re-run SVD on the p-step diagonal B^+ to get sorted Ritz triplets.
Eigen::MatrixXd Bp = buildBidiagonal(Nk);
Eigen::JacobiSVD<Eigen::MatrixXd> svdp(Bp,
Eigen::ComputeThinU | Eigen::ComputeThinV);
Eigen::VectorXi ordp = sortOrder(svdp.singularValues());
extractTriplets(Nk, svdp.singularValues(), svdp.matrixU(),
svdp.matrixV(), ordp, Nk);
return;
}
pStart = Nk;
}
std::cout << GridLogMessage
<< "IRLBA: did not converge in " << MaxIter
<< " iterations. Returning best approximations." << std::endl;
// Return best available approximations
Eigen::MatrixXd B = buildFullB((int)alpha.size());
Eigen::JacobiSVD<Eigen::MatrixXd> svd(B,
Eigen::ComputeThinU | Eigen::ComputeThinV);
Eigen::VectorXd sigma = svd.singularValues();
Eigen::MatrixXd X = svd.matrixU();
Eigen::MatrixXd Y = svd.matrixV();
Eigen::VectorXi order = sortOrder(sigma);
int nout = std::min(Nk, (int)alpha.size());
extractTriplets((int)alpha.size(), sigma, X, Y, order, nout);
}
/* Getters */
// Number of singular triplets currently stored; may be smaller than the
// requested Nk if run() exhausted MaxIter before full convergence.
int getNk() const { return (int)singularValues.size(); }
// Converged (or best-approximation) singular values, desired-order first.
const std::vector<RealD>& getSingularValues() const { return singularValues; }
// Left singular vectors of A, one Field per stored singular value.
const std::vector<Field>& getLeftVectors() const { return leftVectors; }
// Right singular vectors of A, one Field per stored singular value.
const std::vector<Field>& getRightVectors() const { return rightVectors; }
/**
* Print B_k and U^dag A V to verify the bidiagonalization relation
* A V_m = U_m B_m (exact in our GK convention)
* On the first call (pStart=0), max|B - U^dag A V| should be ~machine epsilon.
* After a restart and extension, the column p of U^dag A V deviates from B
* by O(betaK): this is expected because the thick restart breaks the Krylov
* structure at column p, introducing off-diagonal terms proportional to betaK.
* These terms vanish as betaK -> 0 (convergence), so the algorithm is correct.
*/
// Diagnostic only: rebuilds the projected matrix U^dag A V by brute force
// (one Linop application per V column) and prints it next to the analytic
// reference Bref; intended for debugging, not for production runs.
void verify()
{
int m = (int)alpha.size();
int nU = (int)U.size();
int nV = (int)V.size();
if (m == 0) { std::cout << GridLogMessage << "IRLBA verify: empty basis" << std::endl; return; }
// Build reference matrix Bref (nU x nV):
// Columns 0..m-1 : buildFullB(m) (bidiagonal + fvec column at restart_col)
// Column m : residual column, two cases:
// (a) restart_col == m (right after implicitRestart, before extendBasis):
// V[m] = sgn*V_old[Nm], so <U[i]|A|V[m]> = fvec[i] for all i
// (b) otherwise (pure GK or after extendBasis):
// only entry (m-1, m) = beta[m-1] (GK recurrence residual)
Eigen::MatrixXd Bref = Eigen::MatrixXd::Zero(nU, nV);
{
Eigen::MatrixXd Bfull = buildFullB(m);
int cols = std::min(m, nV);
Bref.block(0, 0, m, cols) = Bfull.block(0, 0, m, cols);
}
if (nV > m && m > 0) {
if (restart_col == m && (int)fvec.size() == m) {
// Case (a): right after implicitRestart
for (int i = 0; i < m; ++i) Bref(i, m) = fvec[i];
} else if ((int)beta.size() >= m) {
// Case (b): standard GK residual column
Bref(m - 1, m) = beta[m - 1];
}
}
// Compute M[i,j] = <U[i] | A | V[j]>
// Note: nV Linop applications and nU*nV inner products — expensive.
Eigen::MatrixXd M = Eigen::MatrixXd::Zero(nU, nV);
Field Avj(Grid);
for (int j = 0; j < nV; ++j) {
Linop.Op(V[j], Avj);
for (int i = 0; i < nU; ++i) {
ComplexD ip = innerProduct(U[i], Avj);
// Only the real part is compared; B is real by construction.
M(i, j) = ip.real();
}
}
// Print Bref
std::cout << GridLogMessage
<< "IRLBA verify: Bref (" << nU << "x" << nV << "):" << std::endl;
for (int i = 0; i < nU; ++i) {
std::cout << GridLogMessage << " row " << i << ": ";
for (int j = 0; j < nV; ++j) std::cout << Bref(i,j) << " ";
std::cout << std::endl;
}
// Print U^dag A V
std::cout << GridLogMessage
<< "IRLBA verify: U^dag A V (" << nU << "x" << nV << "):" << std::endl;
for (int i = 0; i < nU; ++i) {
std::cout << GridLogMessage << " row " << i << ": ";
for (int j = 0; j < nV; ++j) std::cout << M(i,j) << " ";
std::cout << std::endl;
}
// Max deviation over the full nU x nV matrix
RealD maxdev = (Bref - M).cwiseAbs().maxCoeff();
std::cout << GridLogMessage
<< "IRLBA verify: max|Bref - U^dag A V| = " << maxdev << std::endl;
// Beta
std::cout << GridLogMessage << "IRLBA verify: beta[0.." << (int)beta.size()-1 << "] = ";
for (auto b : beta) std::cout << b << " ";
std::cout << std::endl;
}
private:
// ------------------------------------------------------------------
// Build the m x m upper-bidiagonal matrix from alpha[0..m-1], beta[0..m-2]
// ------------------------------------------------------------------
// Assemble the leading m x m upper-bidiagonal projection matrix:
// diagonal from alpha[0..m-1], superdiagonal from beta[0..m-2]
// (clipped to however many betas are actually stored).
Eigen::MatrixXd buildBidiagonal(int m) const
{
  Eigen::MatrixXd B = Eigen::MatrixXd::Zero(m, m);
  const int nb = (int)beta.size();
  for (int i = 0; i < m; ++i)
    B(i, i) = alpha[i];
  for (int i = 0; i + 1 < m; ++i)
    if (i < nb)
      B(i, i + 1) = beta[i];
  return B;
}
// ------------------------------------------------------------------
// Build the full m x m B matrix, including the non-bidiagonal column
// at restart_col that arises after a thick restart.
//
// After restart, A V[restart_col] has projections onto all U[0..restart_col-1]
// (not just U[restart_col-1]). These are stored in fvec[0..restart_col-1]
// and make column restart_col of U^dag A V non-bidiagonal.
// ------------------------------------------------------------------
// Build the full m x m projected matrix B = U^dag A V, i.e. the bidiagonal
// part plus the extra off-diagonal entries in column restart_col that a
// thick restart introduces (stored in fvec[0..restart_col-1]).
//
// Fix: the debug trace previously streamed restart_col and the matrix
// entry back-to-back with no separator ("B 0 51.23e-3"), making the
// output unparseable; the entry is now printed as "B(j,col) = value".
Eigen::MatrixXd buildFullB(int m) const
{
  Eigen::MatrixXd B = buildBidiagonal(m);
  if (restart_col >= 0 && restart_col < m && (int)fvec.size() > 0) {
    for (int j = 0; j < restart_col && j < (int)fvec.size(); ++j){
      B(j, restart_col) = fvec[j];
      std::cout << GridLogDebug << "buildFullB: B(" << j << "," << restart_col
                << ") = " << B(j, restart_col) << std::endl;
    }
  }
  return B;
}
// ------------------------------------------------------------------
// Return a permutation vector that puts the desired Nk singular values
// first (largest first if largest==true, smallest first otherwise).
// Eigen's JacobiSVD already returns sigma in descending order, so for
// largest we just return 0,1,...,m-1; for smallest we reverse.
// ------------------------------------------------------------------
// Permutation placing the desired singular values first. JacobiSVD
// returns sigma already descending, so "largest" is the identity
// permutation and "smallest" is the reversal.
Eigen::VectorXi sortOrder(const Eigen::VectorXd &sigma) const
{
  const int m = (int)sigma.size();
  Eigen::VectorXi ord(m);
  for (int i = 0; i < m; ++i)
    ord(i) = largest ? i : (m - 1 - i);
  return ord;
}
// ------------------------------------------------------------------
// Extend the Lanczos bidiagonalization from pStart to kEnd steps.
// On first call pStart==0 (V[0] already set).
// On restart calls V[0..pStart], U[0..pStart-1], alpha[0..pStart-1],
// beta[0..pStart-1] are already set; betaRestart is the coupling
// beta_{pStart} that drives the first new U step.
// ------------------------------------------------------------------
// Golub-Kahan extension of the bidiagonalization from pStart to kEnd steps.
// Preconditions: V[0..pStart], U[0..pStart-1], alpha/beta[0..pStart-1] set.
// betaRestart is accepted for interface symmetry with run(); the restart
// coupling itself is already folded into beta[pStart-1] and V[pStart]
// by implicitRestart (see the in-loop comment below).
void extendBasis(int pStart, int kEnd, RealD betaRestart)
{
// Truncate containers to pStart (Lattice has no default constructor)
if ((int)V.size() > pStart + 1) V.erase(V.begin() + pStart + 1, V.end());
if ((int)U.size() > pStart) U.erase(U.begin() + pStart, U.end());
alpha.resize(pStart);
beta.resize(pStart);
Field p(Grid), r(Grid);
for (int k = pStart; k < kEnd; ++k) {
// p = A v_k
Linop.Op(V[k], p);
// Remove previous left vector coupling
if (k > 0) {
p = p - beta[k - 1] * U[k - 1];
}
// On the first step after a restart, beta[pStart-1] was already set;
// but V[pStart] was already constructed including the beta correction,
// so no extra subtraction needed here beyond the standard recurrence.
// Reorthogonalize p against U, then alpha_k = ||p||, u_k = p/alpha_k
reorthogonalize(p, U);
RealD ak = std::sqrt(norm2(p));
if (ak < 1.0e-14) {
std::cout << GridLogMessage
<< "IRLBA extendBasis: lucky breakdown at step " << k
<< " (alpha = " << ak << ")" << std::endl;
// Pad with zero vectors so alpha/beta/U/V stay index-consistent.
alpha.push_back(ak);
Field zero(Grid); zero = Zero();
U.push_back(zero);
beta.push_back(0.0);
V.push_back(zero);
break;
}
alpha.push_back(ak);
Field u(Grid);
u = (1.0 / ak) * p;
U.push_back(u);
// r = A^dag u_k - alpha_k v_k, reorthogonalize, then beta_{k+1} = ||r||
Linop.AdjOp(U[k], r);
r = r - ak * V[k];
reorthogonalize(r, V);
RealD bk = std::sqrt(norm2(r));
beta.push_back(bk);
std::cout << GridLogMessage
<< "IRLBA extend step " << k
<< " alpha = " << ak
<< " beta = " << bk << std::endl;
// Always push v_{k+1} (needed as residual direction for restart)
if (bk < 1.0e-14) {
std::cout << GridLogMessage
<< "IRLBA extendBasis: lucky breakdown (beta = 0) at step "
<< k << std::endl;
Field zero(Grid); zero = Zero();
V.push_back(zero);
break;
}
Field vnext(Grid);
vnext = (1.0 / bk) * r;
V.push_back(vnext);
if (k == kEnd - 1) break; // v_{k+1} pushed above; stop here
}
}
public:
// ------------------------------------------------------------------
// Block reorthogonalization helpers.
// Declared public because CUDA extended lambdas cannot live inside
// private/protected member functions.
//
// batchInnerProducts: computes c[j] = <basis[j], vec> for all j
// in a single GPU pass (one accelerator_barrier instead of n).
// Queues n pairs of (per-site kernel, reduceKernel) to computeStream
// without intermediate CPU syncs, then syncs once at the end.
//
// batchUpdate: computes vec -= sum_j c[j]*basis[j] in one GPU kernel.
//
// reorthogonalize: two-pass Classical Gram-Schmidt (CGS2) using the
// two helpers above. Each pass costs 2 GPU syncs (1 IP + 1 update)
// instead of 2n syncs per pass in the old sequential MGS.
// ------------------------------------------------------------------
// Batched inner products: fills c[j] = <basis[j], vec> for all j.
// Launches one per-site kernel per basis vector, synchronises once
// (single accelerator_barrier), then performs the site/SIMD reduction on
// the host and a single global sum across ranks.
// NOTE(review): despite the class-level comment mentioning a reduceKernel,
// the reduction here is done host-side after one bulk device->host copy.
void batchInnerProducts(const Field &vec,
const std::vector<Field> &basis,
std::vector<ComplexD> &c)
{
int n = (int)basis.size();
c.resize(n);
if (n == 0) return;
typedef typename Field::vector_object vobj;
typedef decltype(innerProduct(vobj(), vobj())) inner_t;
typedef decltype(basis[0].View(AcceleratorRead)) View;
GridBase *grid = vec.Grid();
uint64_t oSites = grid->oSites();
uint64_t nsimd = grid->Nsimd();
// all_ip[j * oSites + ss] = per-site inner product of basis[j] and vec at site ss.
// Layout: n contiguous blocks of oSites each.
deviceVector<inner_t> all_ip((uint64_t)n * oSites);
inner_t *all_ip_p = &all_ip[0];
// Views must be opened on the host and their handles copied to the device
// so the kernels can index basis_vp[jj]; closed again at the end.
hostVector<View> h_basis_v(n);
deviceVector<View> d_basis_v(n);
for (int j = 0; j < n; ++j) {
h_basis_v[j] = basis[j].View(AcceleratorRead);
acceleratorPut(d_basis_v[j], h_basis_v[j]);
}
View *basis_vp = &d_basis_v[0];
// Queue n per-site kernels to the accelerator stream — no intermediate barriers.
autoView(vec_v, vec, AcceleratorRead);
for (int j = 0; j < n; ++j) {
// Plain copies for device-lambda capture (avoid capturing loop state by ref).
int jj = j;
uint64_t oSites_ = oSites;
accelerator_for(ss, oSites, nsimd, {
auto x = coalescedRead(basis_vp[jj][ss]);
auto y = coalescedRead(vec_v[ss]);
coalescedWrite(all_ip_p[jj * oSites_ + ss], innerProduct(x, y));
});
}
// ONE sync after all n kernels
accelerator_barrier();
// Copy all per-site results to host
hostVector<inner_t> all_ip_h((uint64_t)n * oSites);
acceleratorCopyFromDevice(all_ip_p, &all_ip_h[0], (uint64_t)n * oSites * sizeof(inner_t));
// Reduce on host: sum over oSites, then collapse SIMD lanes via Reduce(TensorRemove(...))
// TensorRemove strips the iSinglet tensor wrapper to expose the SIMD scalar type.
// Reduce sums all nsimd lanes and returns a plain scalar (RealD or ComplexD).
std::vector<ComplexD> raw(n);
for (int j = 0; j < n; ++j) {
inner_t sum = Zero();
for (uint64_t ss = 0; ss < oSites; ++ss)
sum += all_ip_h[(uint64_t)j * oSites + ss];
raw[j] = ComplexD(Reduce(TensorRemove(sum)));
}
// One MPI reduction for all n coefficients.
grid->GlobalSumVector(&raw[0], n);
for (int j = 0; j < n; ++j) c[j] = raw[j];
for (int j = 0; j < n; ++j) h_basis_v[j].ViewClose();
}
// Batched Gram-Schmidt update: vec -= sum_k c[k] * basis[k], computed in a
// single device kernel so only one launch/sync is paid for all n vectors.
void batchUpdate(Field &vec,
const std::vector<Field> &basis,
const std::vector<ComplexD> &c)
{
int n = (int)basis.size();
if (n == 0) return;
typedef typename Field::vector_object vobj;
typedef decltype(basis[0].View(AcceleratorRead)) View;
GridBase *grid = vec.Grid();
uint64_t oSites = grid->oSites();
uint64_t nsimd = grid->Nsimd();
// Split complex coefficients into real/imag double arrays on device.
// Using doubles avoids potential ComplexD-device-code compatibility issues.
hostVector<double> h_re(n), h_im(n);
deviceVector<double> d_re(n), d_im(n);
for (int k = 0; k < n; ++k) {
h_re[k] = c[k].real();
h_im[k] = c[k].imag();
}
acceleratorCopyToDevice(&h_re[0], &d_re[0], n * sizeof(double));
acceleratorCopyToDevice(&h_im[0], &d_im[0], n * sizeof(double));
double *re_p = &d_re[0];
double *im_p = &d_im[0];
// Basis views
hostVector<View> h_basis_v(n);
deviceVector<View> d_basis_v(n);
for (int k = 0; k < n; ++k) {
h_basis_v[k] = basis[k].View(AcceleratorRead);
acceleratorPut(d_basis_v[k], h_basis_v[k]);
}
View *basis_vp = &d_basis_v[0];
// Single kernel: vec[ss] -= sum_k (re[k] + i*im[k]) * basis[k][ss]
// (the complex multiply is expanded as re*b + i*(im*b)).
autoView(vec_v, vec, AcceleratorWrite);
accelerator_for(ss, oSites, nsimd, {
auto v = coalescedRead(vec_v[ss]);
for (int k = 0; k < n; ++k) {
auto b = coalescedRead(basis_vp[k][ss]);
v = v - re_p[k] * b - timesI(im_p[k] * b);
}
coalescedWrite(vec_v[ss], v);
});
for (int k = 0; k < n; ++k) h_basis_v[k].ViewClose();
}
// ------------------------------------------------------------------
// Full reorthogonalization using two-pass Classical Gram-Schmidt (CGS2).
// Each pass calls batchInnerProducts (1 GPU sync) + batchUpdate (1 sync),
// replacing the old 2n GPU syncs per pass from sequential MGS.
// ------------------------------------------------------------------
// Full reorthogonalization of vec against basis via CGS2: two identical
// classical Gram-Schmidt sweeps, each one batched inner-product pass
// followed by one batched subtraction pass.
void reorthogonalize(Field &vec, const std::vector<Field> &basis)
{
  if (basis.empty()) return;
  std::vector<ComplexD> proj;
  for (int sweep = 0; sweep < 2; ++sweep) {
    batchInnerProducts(vec, basis, proj);
    batchUpdate(vec, basis, proj);
  }
}
// ------------------------------------------------------------------
// Implicit restart: given the Nm-step bidiagonalization and its SVD,
// compress to Nk steps via implicit QR shifts applied to B_k.
//
// The "shifts" are the Nm - Nk singular values we want to deflate
// (those NOT in the desired set). We apply them as implicit QR steps
// to the bidiagonal matrix, then update the lattice bases accordingly.
//
// After this call:
// V[0..Nk], U[0..Nk-1], alpha[0..Nk-1], beta[0..Nk-1] are updated.
// betaRestart ← new beta_Nk coupling for the next extension.
// ------------------------------------------------------------------
// Thick restart: compress the k-step factorization to p Ritz vectors.
// k = current basis size (Nm), p = retained size (Nk); sigma/X/Y are the
// SVD of the projected matrix, order selects the desired Ritz values,
// betaK is the trailing GK coupling, betaRestart returns the new coupling.
void implicitRestart(int k, int p,
const Eigen::VectorXd &sigma,
const Eigen::MatrixXd &X,
const Eigen::MatrixXd &Y,
const Eigen::VectorXi &order,
RealD betaK,
RealD &betaRestart)
{
// Thick restart (Baglama & Reichel, Sec. 2.2):
//
// Given B_k = X Sigma Y^T, define the new p-step basis by:
// V^+_i = V_k * y_{order(i)} (right sing. vec. of B_k)
// U^+_i = U_k * x_{order(i)} (left sing. vec. of B_k)
//
// Then A V^+_i = A V_k y_{order(i)} = U_k B_k y_{order(i)}
// = sigma_{order(i)} U_k x_{order(i)} = sigma_{order(i)} U^+_i
//
// So B_p^+ = diag(sigma_{order(0)}, ..., sigma_{order(p-1)}) — DIAGONAL,
// all internal betas are zero.
//
// The residual coupling comes from A^dag U_k = V_k B_k^T + betaK V[k] e_{k-1}^T:
// A^dag U^+_{p-1} - sigma_{order(p-1)} V^+_{p-1}
// = V_k (B_k^T x_{order(p-1)} - sigma_{order(p-1)} y_{order(p-1)})
// + betaK * X(k-1, order(p-1)) * V[k]
// = betaK * X(k-1, order(p-1)) * V[k] (since B_k^T x_j = sigma_j y_j)
//
// Therefore: betaRestart = |betaK * X(k-1, order(p-1))|
// V[p] = sign(X(k-1, order(p-1))) * V[k]
// ---- Build new lattice vectors ----
// NOTE(review): p*k axpy sweeps; a batched basis rotation would reduce
// kernel launches if this becomes a bottleneck — confirm before changing.
std::vector<Field> Vnew, Unew;
Vnew.reserve(p + 1);
Unew.reserve(p);
for (int i = 0; i < p; ++i) {
int idx = order(i);
Field vi(Grid); vi = Zero();
for (int j = 0; j < k; ++j)
vi = vi + Y(j, idx) * V[j];
Vnew.push_back(vi);
}
for (int i = 0; i < p; ++i) {
int idx = order(i);
Field ui(Grid); ui = Zero();
for (int j = 0; j < k; ++j)
ui = ui + X(j, idx) * U[j];
Unew.push_back(ui);
}
// New v_{p} (0-indexed: V[p]) = sign * V[k]
// From A^dag U_k = V_k B_k^T + betaK V[k] e_{k-1}^T:
// A^dag U^+_j - sigma_j V^+_j = betaK * X(k-1, order(j)) * V[k]
// The last Ritz pair (j=p-1) defines betaRestart and the sign of V[p].
// All p couplings (j=0..p-1) are stored in fvec so that buildFullB can
// reconstruct the exact column p of U^dag A V after the next extension.
RealD coeff = betaK * X(k - 1, order(p - 1));
betaRestart = std::abs(coeff);
RealD sgn = (coeff >= 0.0) ? 1.0 : -1.0;
fvec.resize(p);
for (int j = 0; j < p; ++j)
fvec[j] = betaK * X(k - 1, order(j)) * sgn;
// fvec[p-1] == betaRestart by construction
restart_col = p;
Field vp(Grid);
if (betaRestart > 1.0e-14) {
vp = sgn * V[k];
} else {
// Lucky breakdown: no residual direction; caller checks betaRestart.
betaRestart = 0.0;
vp = Zero();
}
Vnew.push_back(vp); // V[p]
// ---- New alpha, beta ----
// B_p^+ is diagonal: alpha^+_i = sigma_{order(i)}, all internal beta = 0
std::vector<RealD> alpha_new(p), beta_new(p);
for (int i = 0; i < p; ++i) alpha_new[i] = sigma(order(i));
for (int i = 0; i < p - 1; ++i) beta_new[i] = 0.0;
beta_new[p - 1] = betaRestart;
// ---- Commit new state ----
V = Vnew;
U = Unew;
alpha = alpha_new;
beta = beta_new;
std::cout << GridLogMessage
<< "IRLBA restart: compressed to " << p << " steps,"
<< " new beta_p = " << betaRestart << std::endl;
}
// ------------------------------------------------------------------
// Extract the desired singular triplets into the public output vectors.
// ------------------------------------------------------------------
// Copy the leading nout Ritz triplets into the public output arrays.
// The lattice singular vectors are linear combinations of the current
// Krylov bases: left = U_m * x_i, right = V_m * y_i, with columns of
// X / Y selected through the 'order' permutation.
void extractTriplets(int m,
                     const Eigen::VectorXd &sigma,
                     const Eigen::MatrixXd &X,
                     const Eigen::MatrixXd &Y,
                     const Eigen::VectorXi &order,
                     int nout)
{
  singularValues.resize(nout);
  leftVectors.clear();
  rightVectors.clear();
  leftVectors.reserve(nout);
  rightVectors.reserve(nout);
  const int nU = (int)U.size();
  const int nV = (int)V.size();
  for (int t = 0; t < nout; ++t) {
    const int col = order(t);
    singularValues[t] = sigma(col);
    // Left singular vector: accumulate U basis weighted by column col of X.
    Field lvec(Grid);
    lvec = Zero();
    const int jmaxU = (m < nU) ? m : nU;
    for (int j = 0; j < jmaxU; ++j)
      lvec = lvec + X(j, col) * U[j];
    leftVectors.push_back(lvec);
    // Right singular vector: accumulate V basis weighted by column col of Y.
    Field rvec(Grid);
    rvec = Zero();
    const int jmaxV = (m < nV) ? m : nV;
    for (int j = 0; j < jmaxV; ++j)
      rvec = rvec + Y(j, col) * V[j];
    rightVectors.push_back(rvec);
  }
}
};
NAMESPACE_END(Grid);
#endif

// View File
// @@ -0,0 +1,931 @@  (compare-view artifact: a second new file begins below)
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LANC_H
#define GRID_LANC_H
#include <string.h> //memset
#ifdef USE_LAPACK
#ifdef USE_MKL
#include<mkl_lapack.h>
#else
void LAPACK_dstegr (char *jobz, char *range, int *n, double *d, double *e,
double *vl, double *vu, int *il, int *iu, double *abstol,
int *m, double *w, double *z, int *ldz, int *isuppz,
double *work, int *lwork, int *iwork, int *liwork,
int *info);
//#include <lapacke/lapacke.h>
#endif
#endif
//#include <Grid/algorithms/densematrix/DenseMatrix.h>
// eliminate temorary vector in calc()
#define MEM_SAVE
namespace Grid
{
// Bisection eigenvalue solver for a symmetric tridiagonal matrix
// (Barth/Martin/Wilkinson-style algorithm). Arrays follow the original
// FORTRAN 1-based indexing convention: element 0 is unused.
struct Bisection
{
#if 0
// NOTE(review): compiled out. This routine references identifiers that are
// never declared in this file (BETHA, A, H, mag, swap) and would not
// compile if re-enabled; it is kept only as a historical sketch.
static void get_eig2 (int row_num, std::vector < RealD > &ALPHA,
std::vector < RealD > &BETA,
std::vector < RealD > &eig)
{
int i, j;
std::vector < RealD > evec1 (row_num + 3);
std::vector < RealD > evec2 (row_num + 3);
RealD eps2;
ALPHA[1] = 0.;
BETHA[1] = 0.;
for (i = 0; i < row_num - 1; i++)
{
ALPHA[i + 1] = A[i * (row_num + 1)].real ();
BETHA[i + 2] = A[i * (row_num + 1) + 1].real ();
}
ALPHA[row_num] = A[(row_num - 1) * (row_num + 1)].real ();
bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-10, 1e-10, evec1, eps2);
bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-16, 1e-16, evec2, eps2);
// Do we really need to sort here?
int begin = 1;
int end = row_num;
int swapped = 1;
while (swapped)
{
swapped = 0;
for (i = begin; i < end; i++)
{
if (mag (evec2[i]) > mag (evec2[i + 1]))
{
swap (evec2 + i, evec2 + i + 1);
swapped = 1;
}
}
end--;
for (i = end - 1; i >= begin; i--)
{
if (mag (evec2[i]) > mag (evec2[i + 1]))
{
swap (evec2 + i, evec2 + i + 1);
swapped = 1;
}
}
begin++;
}
for (i = 0; i < row_num; i++)
{
for (j = 0; j < row_num; j++)
{
if (i == j)
H[i * row_num + j] = evec2[i + 1];
else
H[i * row_num + j] = 0.;
}
}
}
#endif
// Compute eigenvalues m1..m2 (1-based, ascending) of the symmetric
// tridiagonal matrix with diagonal c[1..n] and off-diagonal b[2..n]
// by bisection with Sturm-sequence counting.
// c : diagonal entries (1-based; c[0] unused)
// b : off-diagonal entries (1-based; b[1] is CLOBBERED to 0 on entry)
// n : matrix dimension
// m1, m2 : index range of eigenvalues to compute (inclusive)
// eps1 : absolute tolerance (<= 0 means use machine-derived eps2)
// relfeh : relative machine precision used for the Sturm count guard
// x : output; x[m1..m2] receive the eigenvalues
// eps2 : output; the derived accuracy estimate
// NOTE(review): prints one line per eigenvalue via printf — debug output
// left enabled; confirm before using in production.
static void bisec (std::vector < RealD > &c,
std::vector < RealD > &b,
int n,
int m1,
int m2,
RealD eps1,
RealD relfeh, std::vector < RealD > &x, RealD & eps2)
{
std::vector < RealD > wu (n + 2);
RealD h, q, x1, xu, x0, xmin, xmax;
int i, a, k;
b[1] = 0.0;
// Gershgorin bounds for the spectrum: [xmin, xmax]
xmin = c[n] - fabs (b[n]);
xmax = c[n] + fabs (b[n]);
for (i = 1; i < n; i++)
{
h = fabs (b[i]) + fabs (b[i + 1]);
if (c[i] + h > xmax)
xmax = c[i] + h;
if (c[i] - h < xmin)
xmin = c[i] - h;
}
xmax *= 2.;
eps2 = relfeh * ((xmin + xmax) > 0.0 ? xmax : -xmin);
if (eps1 <= 0.0)
eps1 = eps2;
eps2 = 0.5 * eps1 + 7.0 * (eps2);
x0 = xmax;
// Initialise per-eigenvalue upper (x) and lower (wu) bounds.
for (i = m1; i <= m2; i++)
{
x[i] = xmax;
wu[i] = xmin;
}
for (k = m2; k >= m1; k--)
{
// Tightest known lower bound for eigenvalue k.
xu = xmin;
i = k;
do
{
if (xu < wu[i])
{
xu = wu[i];
i = m1 - 1;
}
i--;
}
while (i >= m1);
if (x0 > x[k])
x0 = x[k];
// Bisect until the bracket [xu, x0] is within tolerance.
while ((x0 - xu) > 2 * relfeh * (fabs (xu) + fabs (x0)) + eps1)
{
x1 = (xu + x0) / 2;
// Sturm sequence: a = number of eigenvalues below x1.
a = 0;
q = 1.0;
for (i = 1; i <= n; i++)
{
q =
c[i] - x1 -
((q != 0.0) ? b[i] * b[i] / q : fabs (b[i]) / relfeh);
if (q < 0)
a++;
}
// printf("x1=%0.14e a=%d\n",x1,a);
if (a < k)
{
if (a < m1)
{
xu = x1;
wu[m1] = x1;
}
else
{
// Also tighten the bounds of neighbouring eigenvalues.
xu = x1;
wu[a + 1] = x1;
if (x[a] > x1)
x[a] = x1;
}
}
else
x0 = x1;
}
printf ("x0=%0.14e xu=%0.14e k=%d\n", x0, xu, k);
x[k] = (x0 + xu) / 2;
}
}
};
/////////////////////////////////////////////////////////////
// Implicitly restarted lanczos
/////////////////////////////////////////////////////////////
template < class Field > class SimpleLanczos
{
const RealD small = 1.0e-16;
public:
int lock;
int get;
int Niter;
int converged;
int Nstop; // Number of evecs checked for convergence
int Nk; // Number of converged sought
int Np; // Np -- Number of spare vecs in kryloc space
int Nm; // Nm -- total number of vectors
RealD OrthoTime;
RealD eresid;
// SortEigen < Field > _sort;
LinearFunction < Field > &_Linop;
// OperatorFunction < Field > &_poly;
/////////////////////////
// Constructor
/////////////////////////
// No-op placeholder; all state is established in the constructor.
void init (void)
{
};
// void Abort (int ff, std::vector < RealD > &evals, DenseVector < Denstd::vector < RealD > >&evecs);
// Construct a simple (non-restarted bookkeeping) Lanczos driver.
// Requires Nm > Nk so there is at least one spare Krylov vector.
SimpleLanczos (LinearFunction < Field > &Linop, // linear operator A
// OperatorFunction < Field > &poly, // polynomial (disabled)
int _Nstop, // number of evecs checked for convergence
int _Nk, // number of eigenvalues sought
int _Nm, // total Krylov space size (Nk + spares)
RealD _eresid, // relative residual tolerance on eigenvalues
int _Niter): // maximum iterations
_Linop (Linop),
// _poly (poly),
Nstop (_Nstop), Nk (_Nk), Nm (_Nm), eresid (_eresid), Niter (_Niter)
{
Np = Nm - Nk;
assert (Np > 0);
};
/////////////////////////
// Sanity checked this routine (step) against Saad.
/////////////////////////
// Debug check of the Ritz matrix <evec[j] | A | evec[i]>: entries more than
// one off the diagonal should vanish for a valid Lanczos basis.
// NOTE: currently DISABLED — the unconditional "if (1) return;" below makes
// this routine a no-op; remove it to re-enable the check.
void RitzMatrix (std::vector < Field > &evec, int k)
{
if (1)
return;
GridBase *grid = evec[0].Grid();
Field w (grid);
std::cout << GridLogMessage << "RitzMatrix " << std::endl;
for (int i = 0; i < k; i++)
{
_Linop(evec[i], w);
// _poly(_Linop,evec[i],w);
std::cout << GridLogMessage << "[" << i << "] ";
for (int j = 0; j < k; j++)
{
ComplexD in = innerProduct (evec[j], w);
if (fabs ((double) i - j) > 1)
{
// Off-tridiagonal entry: abort if it is not numerically zero.
if (abs (in) > 1.0e-9)
{
std::cout << GridLogMessage << "oops" << std::endl;
abort ();
}
else
std::cout << GridLogMessage << " 0 ";
}
else
{
std::cout << GridLogMessage << " " << in << " ";
}
}
std::cout << GridLogMessage << std::endl;
}
}
// One Lanczos three-term recurrence step (Saad's numbering in comments):
// appends alpha_k to lmd and beta_{k+1} to lme, and overwrites 'next'
// with the new (normalised) Krylov vector v_{k+1}.
void step (std::vector < RealD > &lmd,
std::vector < RealD > &lme,
Field & last, Field & current, Field & next, uint64_t k)
{
// Grow the tridiagonal storage on demand.
if (lmd.size () <= k)
lmd.resize (k + Nm);
if (lme.size () <= k)
lme.resize (k + Nm);
// _poly(_Linop,current,next ); // 3. w_k := A v_k - beta_k v_{k-1}
_Linop(current, next); // 3. w_k := A v_k - beta_k v_{k-1}
if (k > 0)
{
next -= lme[k - 1] * last;
}
// std::cout<<GridLogMessage << "<last|next>" << innerProduct(last,next) <<std::endl;
ComplexD zalph = innerProduct (current, next); // 4. alpha_k := (w_k, v_k)
RealD alph = real (zalph);
next = next - alph * current; // 5. w_k := w_k - alpha_k v_k
// std::cout<<GridLogMessage << "<current|next>" << innerProduct(current,next) <<std::endl;
RealD beta = normalise (next); // 6. beta_{k+1} := ||w_k||_2. If beta_{k+1} = 0 then Stop
// 7. v_{k+1} := w_k / beta_{k+1}
// norm=beta;
// Progress output roughly every 1% of Nm steps.
int interval = Nm / 100 + 1;
if ((k % interval) == 0)
std::
cout << GridLogMessage << k << " : alpha = " << zalph << " beta " <<
beta << std::endl;
const RealD tiny = 1.0e-20;
if (beta < tiny)
{
std::cout << GridLogMessage << " beta is tiny " << beta << std::
endl;
}
lmd[k] = alph;
lme[k] = beta;
}
// One implicit shifted-QR sweep on the tridiagonal matrix (diagonal lmd,
// off-diagonal lme) with shift Dsh, restricted to rows kmin..kmax, using
// Givens rotations. The accumulated rotations are applied to the Nk x Nm
// row-major transform matrix Qt.
void qr_decomp (std::vector < RealD > &lmd,
std::vector < RealD > &lme,
int Nk,
int Nm,
std::vector < RealD > &Qt, RealD Dsh, int kmin, int kmax)
{
// Initial rotation chosen from the shifted leading 2x2 block.
int k = kmin - 1;
RealD x;
RealD Fden = 1.0 / hypot (lmd[k] - Dsh, lme[k]);
RealD c = (lmd[k] - Dsh) * Fden;
RealD s = -lme[k] * Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k + 1];
RealD tmpb = lme[k];
lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
// x is the "bulge" element created outside the tridiagonal band.
x = -s * lme[k + 1];
lme[k + 1] = c * lme[k + 1];
for (int i = 0; i < Nk; ++i)
{
RealD Qtmp1 = Qt[i + Nm * k];
RealD Qtmp2 = Qt[i + Nm * (k + 1)];
Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
}
// Givens transformations: chase the bulge x down the band.
for (int k = kmin; k < kmax - 1; ++k)
{
RealD Fden = 1.0 / hypot (x, lme[k - 1]);
RealD c = lme[k - 1] * Fden;
RealD s = -x * Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k + 1];
RealD tmpb = lme[k];
lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
lme[k - 1] = c * lme[k - 1] - s * x;
if (k != kmax - 2)
{
x = -s * lme[k + 1];
lme[k + 1] = c * lme[k + 1];
}
for (int i = 0; i < Nk; ++i)
{
RealD Qtmp1 = Qt[i + Nm * k];
RealD Qtmp2 = Qt[i + Nm * (k + 1)];
Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
}
}
}
// NOTE(review): the whole LAPACK diagonalisation path below is compiled out
// by "#if 0"; it is retained verbatim as a reference implementation using
// dstegr with the eigenvalue range split across MPI ranks.
#if 0
#ifdef USE_LAPACK
#ifdef USE_MKL
#define LAPACK_INT MKL_INT
#else
#define LAPACK_INT long long
#endif
void diagonalize_lapack (std::vector < RealD > &lmd, std::vector < RealD > &lme, int N1, // all
int N2, // get
GridBase * grid)
{
const int size = Nm;
LAPACK_INT NN = N1;
double evals_tmp[NN];
double DD[NN];
double EE[NN];
for (int i = 0; i < NN; i++)
for (int j = i - 1; j <= i + 1; j++)
if (j < NN && j >= 0)
{
if (i == j)
DD[i] = lmd[i];
if (i == j)
evals_tmp[i] = lmd[i];
if (j == (i - 1))
EE[j] = lme[j];
}
LAPACK_INT evals_found;
LAPACK_INT lwork =
((18 * NN) >
(1 + 4 * NN + NN * NN) ? (18 * NN) : (1 + 4 * NN + NN * NN));
LAPACK_INT liwork = 3 + NN * 10;
LAPACK_INT iwork[liwork];
double work[lwork];
LAPACK_INT isuppz[2 * NN];
char jobz = 'N'; // calculate evals only
char range = 'I'; // calculate il-th to iu-th evals
// char range = 'A'; // calculate all evals
char uplo = 'U'; // refer to upper half of original matrix
char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
int ifail[NN];
LAPACK_INT info;
// int total = QMP_get_number_of_nodes();
// int node = QMP_get_node_number();
// GridBase *grid = evec[0]._grid;
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (NN / total) + 1;
double vl = 0.0, vu = 0.0;
LAPACK_INT il = interval * node + 1, iu = interval * (node + 1);
if (iu > NN)
iu = NN;
double tol = 0.0;
if (1)
{
memset (evals_tmp, 0, sizeof (double) * NN);
if (il <= NN)
{
printf ("total=%d node=%d il=%d iu=%d\n", total, node, il, iu);
#ifdef USE_MKL
dstegr (&jobz, &range, &NN,
#else
LAPACK_dstegr (&jobz, &range, &NN,
#endif
(double *) DD, (double *) EE, &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
&tol, // tolerance
&evals_found, evals_tmp, (double *) NULL, &NN,
isuppz, work, &lwork, iwork, &liwork, &info);
for (int i = iu - 1; i >= il - 1; i--)
{
printf ("node=%d evals_found=%d evals_tmp[%d] = %g\n", node,
evals_found, i - (il - 1), evals_tmp[i - (il - 1)]);
evals_tmp[i] = evals_tmp[i - (il - 1)];
if (il > 1)
evals_tmp[i - (il - 1)] = 0.;
}
}
{
grid->GlobalSumVector (evals_tmp, NN);
}
}
// cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
}
#undef LAPACK_INT
#endif
void diagonalize (std::vector < RealD > &lmd,
std::vector < RealD > &lme,
int N2, int N1, GridBase * grid)
{
#ifdef USE_LAPACK
const int check_lapack = 0; // just use lapack if 0, check against lapack if 1
if (!check_lapack)
return diagonalize_lapack (lmd, lme, N2, N1, grid);
// diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
#endif
}
#endif
// Scale v to unit 2-norm in place; returns the norm v had on entry.
static RealD normalise (Field & v)
{
  const RealD len = sqrt (norm2 (v));
  v = v * (1.0 / len);
  return len;
}
// Orthogonalize w against the first k vectors of evec via one classical
// Gram-Schmidt sweep, then normalise w. Accumulates elapsed seconds into
// OrthoTime. Assumes evec[0..k-1] are already orthonormal.
void orthogonalize (Field & w, std::vector < Field > &evec, int k)
{
double t0 = -usecond () / 1e6;
typedef typename Field::scalar_type MyComplex;
MyComplex ip;
// DISABLED: full mutual re-orthogonalisation of the basis itself.
if (0)
{
for (int j = 0; j < k; ++j)
{
normalise (evec[j]);
for (int i = 0; i < j; i++)
{
ip = innerProduct (evec[i], evec[j]); // are the evecs normalised? ; this assumes so.
evec[j] = evec[j] - ip * evec[i];
}
}
}
for (int j = 0; j < k; ++j)
{
ip = innerProduct (evec[j], w); // are the evecs normalised? ; this assumes so.
w = w - ip * evec[j];
}
normalise (w);
t0 += usecond () / 1e6;
OrthoTime += t0;
}
void setUnit_Qt (int Nm, std::vector < RealD > &Qt)
{
for (int i = 0; i < Qt.size (); ++i)
Qt[i] = 0.0;
for (int k = 0; k < Nm; ++k)
Qt[k + k * Nm] = 1.0;
}
// Restarted Lanczos driver: repeatedly extends the tridiagonal matrix
// (diagonal lmd, sub-diagonal lme) with batches of Nm step() iterations,
// bisects the growing tridiagonal for its extremal eigenvalues, and
// declares convergence when every monitored eigenvalue changes by less
// than eresid (relative) between consecutive passes.
//   eval  : eigenvalue output buffer (NOTE(review): never written here --
//           the copy loop near the end is commented out; confirm intent)
//   src   : starting vector for the Lanczos three-term recurrence
//   Nconv : zeroed on entry and never incremented in this version
void calc (std::vector < RealD > &eval, const Field & src, int &Nconv)
{
GridBase *grid = src.Grid();
// assert(grid == src._grid);
std::
cout << GridLogMessage << " -- Nk = " << Nk << " Np = " << Np << std::
endl;
std::cout << GridLogMessage << " -- Nm = " << Nm << std::endl;
std::cout << GridLogMessage << " -- size of eval = " << eval.
size () << std::endl;
// assert(c.size() && Nm == eval.size());
// NOTE(review): lmd/lme are sized Nm here but are read up to index
// iter-1 below, and iter exceeds Nm after the first pass -- verify
// that step() grows them (not visible from this file section).
std::vector < RealD > lme (Nm);
std::vector < RealD > lmd (Nm);
// Three-term recurrence working vectors: v_{k-1}, v_k, v_{k+1}.
Field current (grid);
Field last (grid);
Field next (grid);
Nconv = 0;
RealD beta_k;
// Set initial vector
// (uniform vector) Why not src??
// evec[0] = 1.0;
current = src;
std::cout << GridLogMessage << "norm2(src)= " << norm2 (src) << std::
endl;
normalise (current);
std::
cout << GridLogMessage << "norm2(evec[0])= " << norm2 (current) <<
std::endl;
// Initial Nk steps
OrthoTime = 0.;
double t0 = usecond () / 1e6;
RealD norm; // sqrt norm of last vector
uint64_t iter = 0;
bool initted = false;
// Previous-pass eigenvalue estimates at the low/high ends of the
// spectrum, kept across restarts to measure convergence.
std::vector < RealD > low (Nstop * 10);
std::vector < RealD > high (Nstop * 10);
RealD cont = 0.;
while (1) {
cont = 0.;
std::vector < RealD > lme2 (Nm);
std::vector < RealD > lmd2 (Nm);
// Extend the factorisation by Nm more Lanczos steps; iter counts the
// cumulative number of steps across all passes.
for (uint64_t k = 0; k < Nm; ++k, iter++) {
step (lmd, lme, last, current, next, iter);
last = current;
current = next;
}
double t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL::Initial steps: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
std::
cout << GridLogMessage << "IRL::Initial steps:OrthoTime " <<
OrthoTime << "seconds" << std::endl;
// getting eigenvalues
// Repack into 1-based arrays in the layout Bisection::bisec expects.
lmd2.resize (iter + 2);
lme2.resize (iter + 2);
for (uint64_t k = 0; k < iter; ++k) {
lmd2[k + 1] = lmd[k];
lme2[k + 2] = lme[k];
}
t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL:: copy: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
{
// Split the Nstop monitored eigenvalues across MPI ranks: each rank
// bisects its own index slice at the top of the spectrum first, then
// the mirrored slice at the bottom.
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (Nstop / total) + 1;
int iu = (iter + 1) - (interval * node + 1);
int il = (iter + 1) - (interval * (node + 1));
std::vector < RealD > eval2 (iter + 3);
RealD eps2;
Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
eps2);
// diagonalize(eval2,lme2,iter,Nk,grid);
// Compare this pass's high-end eigenvalues with the previous pass;
// any relative change above eresid forces another restart (cont=1).
RealD diff = 0.;
for (int i = il; i <= iu; i++) {
if (initted)
diff =
fabs (eval2[i] - high[iu-i]) / (fabs (eval2[i]) +
fabs (high[iu-i]));
if (initted && (diff > eresid))
cont = 1.;
if (initted)
printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
high[iu-i], diff);
high[iu-i] = eval2[i];
}
// Same convergence test at the low end of the spectrum.
il = (interval * node + 1);
iu = (interval * (node + 1));
Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
eps2);
for (int i = il; i <= iu; i++) {
if (initted)
diff =
fabs (eval2[i] - low[i]) / (fabs (eval2[i]) +
fabs (low[i]));
if (initted && (diff > eresid))
cont = 1.;
if (initted)
printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
low[i], diff);
low[i] = eval2[i];
}
t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL:: diagonalize: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
}
// NOTE(review): dead loop -- the copy into eval is commented out, so
// callers receive no eigenvalues (see header note).
for (uint64_t k = 0; k < Nk; ++k) {
// eval[k] = eval2[k];
}
if (initted)
{
// Global OR of the per-rank continue flags: return (converged) only
// when no rank saw an eigenvalue move by more than eresid.
grid->GlobalSumVector (&cont, 1);
if (cont < 1.) return;
}
initted = true;
}
}
#if 0
/**
There is some matrix Q such that for any vector y
Q.e_1 = y and Q is unitary.
**/
template < class T >
static T orthQ (DenseMatrix < T > &Q, std::vector < T > y)
{
// Build a unitary Q whose first column is y, column by column from the
// running partial norms of y; returns the final accumulated norm.
// NOTE(review): for N == 1 the loop below never runs and 'tau' is
// returned uninitialised -- confirm callers guarantee N >= 2.
// (Disabled code: this definition sits inside an '#if 0' region.)
int N = y.size (); //Matrix Size
Fill (Q, 0.0);
T tau;
// First column of Q is y itself.
for (int i = 0; i < N; i++)
{
Q[i][0] = y[i];
}
// sig accumulates |y_0|^2 + ... + |y_j|^2; tau0/tau are the partial
// norms before/after including component j.
T sig = conj (y[0]) * y[0];
T tau0 = fabs (sqrt (sig));
for (int j = 1; j < N; j++)
{
sig += conj (y[j]) * y[j];
tau = abs (sqrt (sig));
if (abs (tau0) > 0.0)
{
T gam = conj ((y[j] / tau) / tau0);
for (int k = 0; k <= j - 1; k++)
{
Q[k][j] = -gam * y[k];
}
Q[j][j] = tau0 / tau;
}
else
{
// All leading components vanish: fall back to a unit column.
Q[j - 1][j] = 1.0;
}
tau0 = tau;
}
return tau;
}
/**
There is some matrix Q such that for any vector y
Q.e_k = y and Q is unitary.
**/
template < class T >
static T orthU (DenseMatrix < T > &Q, std::vector < T > y)
{
// Build Q with first column y via orthQ, then apply SL -- presumably a
// column permutation/shift so y becomes the last column instead (per the
// Q.e_k = y contract above); TODO confirm, SL is defined elsewhere.
// (Disabled code: this definition sits inside an '#if 0' region.)
T tau = orthQ (Q, y);
SL (Q);
return tau;
}
/**
Wind up with a matrix with the first con rows untouched
say con = 2
Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st colum
and the matrix is upper hessenberg
and with f and Q appropriately modidied with Q is the arnoldi factorization
**/
template < class T > static void Lock (DenseMatrix < T > &H, ///Hess mtx
DenseMatrix < T > &Q, ///Lock Transform
T val, ///value to be locked
int con, ///number already locked
RealD small, int dfg, bool herm)
{
// Deflation ("locking") transform on the trailing block of the upper
// Hessenberg matrix H: diagonalise the (M-con)x(M-con) trailing block,
// rotate the eigenvector nearest 'val' into the leading column, then
// chase the resulting bulge to restore Hessenberg form below row con.
// NOTE(review): parameters 'dfg' and 'herm' are unused in this body.
// (Disabled code: this definition sits inside an '#if 0' region.)
//ForceTridiagonal(H);
int M = H.dim;
DenseVector < T > vec;
Resize (vec, M - con);
DenseMatrix < T > AH;
Resize (AH, M - con, M - con);
AH = GetSubMtx (H, con, M, con, M);
DenseMatrix < T > QQ;
Resize (QQ, M - con, M - con);
Unity (Q);
Unity (QQ);
DenseVector < T > evals;
Resize (evals, M - con);
DenseMatrix < T > evecs;
Resize (evecs, M - con, M - con);
// Diagonalise the trailing block with the Wilkinson-shift QR routine.
Wilkinson < T > (AH, evals, evecs, small);
// Pick the computed eigenvalue closest to the locking target 'val'.
int k = 0;
RealD cold = abs (val - evals[k]);
for (int i = 1; i < M - con; i++)
{
RealD cnew = abs (val - evals[i]);
if (cnew < cold)
{
k = i;
cold = cnew;
}
}
vec = evecs[k];
ComplexD tau;
// Similarity-transform AH so the selected eigenvector is column one.
orthQ (QQ, vec);
//orthQM(QQ,AH,vec);
AH = Hermitian (QQ) * AH;
AH = AH * QQ;
// Embed the transformed block back into H and Q at offset 'con'.
for (int i = con; i < M; i++)
{
for (int j = con; j < M; j++)
{
Q[i][j] = QQ[i - con][j - con];
H[i][j] = AH[i - con][j - con];
}
}
// Bulge chase: sweep similarity transforms column by column (from the
// bottom up) to restore upper Hessenberg structure below row con.
for (int j = M - 1; j > con + 2; j--)
{
DenseMatrix < T > U;
Resize (U, j - 1 - con, j - 1 - con);
DenseVector < T > z;
Resize (z, j - 1 - con);
// NOTE(review): 'nm' is computed before z is filled and never used.
T nm = norm (z);
for (int k = con + 0; k < j - 1; k++)
{
z[k - con] = conj (H (j, k + 1));
}
normalise (z);
// Skip the transform if row j is already (numerically) Hessenberg.
RealD tmp = 0;
for (int i = 0; i < z.size () - 1; i++)
{
tmp = tmp + abs (z[i]);
}
if (tmp < small / ((RealD) z.size () - 1.0))
{
continue;
}
tau = orthU (U, z);
// Hb = (columns con+1..j-1 of H) * U, stored transposed.
DenseMatrix < T > Hb;
Resize (Hb, j - 1 - con, M);
for (int a = 0; a < M; a++)
{
for (int b = 0; b < j - 1 - con; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += H[a][con + 1 + c] * U[c][b];
} //sum += H(a,con+1+c)*U(c,b);}
Hb[b][a] = sum;
}
}
for (int k = con + 1; k < j; k++)
{
for (int l = 0; l < M; l++)
{
H[l][k] = Hb[k - 1 - con][l];
}
} //H(Hb[k-1-con][l] , l,k);}}
// Apply the same column update to the accumulated transform Q.
DenseMatrix < T > Qb;
Resize (Qb, M, M);
for (int a = 0; a < M; a++)
{
for (int b = 0; b < j - 1 - con; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += Q[a][con + 1 + c] * U[c][b];
} //sum += Q(a,con+1+c)*U(c,b);}
Qb[b][a] = sum;
}
}
for (int k = con + 1; k < j; k++)
{
for (int l = 0; l < M; l++)
{
Q[l][k] = Qb[k - 1 - con][l];
}
} //Q(Qb[k-1-con][l] , l,k);}}
// Left-multiply the affected rows of H by U^dagger to complete the
// similarity transform.
DenseMatrix < T > Hc;
Resize (Hc, M, M);
for (int a = 0; a < j - 1 - con; a++)
{
for (int b = 0; b < M; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += conj (U[c][a]) * H[con + 1 + c][b];
} //sum += conj( U(c,a) )*H(con+1+c,b);}
Hc[b][a] = sum;
}
}
for (int k = 0; k < M; k++)
{
for (int l = con + 1; l < j; l++)
{
H[l][k] = Hc[k][l - 1 - con];
}
} //H(Hc[k][l-1-con] , l,k);}}
}
}
#endif
};
}
#endif

View File

@@ -97,7 +97,7 @@ public:
RealD scale;
ConjugateGradient<FineField> CG(1.0e-3,400,false);
ConjugateGradient<FineField> CG(1.0e-4,2000,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
@@ -131,7 +131,10 @@ public:
RealD scale;
TrivialPrecon<FineField> simple_fine;
PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
// PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,10,DiracOp,simple_fine,30,30);
// PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,10,DiracOp,simple_fine,12,12);
// PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,10,10);
FineField noise(FineGrid);
FineField src(FineGrid);
FineField guess(FineGrid);
@@ -146,16 +149,16 @@ public:
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
for(int i=0;i<2;i++){
for(int i=0;i<3;i++){
// void operator() (const Field &src, Field &psi){
#if 1
std::cout << GridLogMessage << " inverting on noise "<<std::endl;
if (i==0)std::cout << GridLogMessage << " inverting on noise "<<std::endl;
src = noise;
guess=Zero();
GCR(src,guess);
subspace[b] = guess;
#else
std::cout << GridLogMessage << " inverting on zero "<<std::endl;
if (i==0)std::cout << GridLogMessage << " inverting on zero "<<std::endl;
src=Zero();
guess = noise;
GCR(src,guess);
@@ -167,7 +170,7 @@ public:
}
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<" <f|OpDagOp|f>"<<norm2(Mn)<<std::endl;
subspace[b] = noise;
}

View File

@@ -260,7 +260,8 @@ class GridLimeReader : public BinaryIO {
<< " / field= " << n2ck << " / rdiff= " << GRID_FIELD_NORM_CALC(FieldNormMetaData_,n2ck) << std::endl;
GRID_FIELD_NORM_CHECK(FieldNormMetaData_,n2ck);
}
assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
// assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
// find out if next field is a GridFieldNorm
return;

View File

@@ -122,7 +122,7 @@ public:
field.checksum = std::stoul(header["CHECKSUM"],0,16);
field.ensemble_id = header["ENSEMBLE_ID"];
field.ensemble_label = header["ENSEMBLE_LABEL"];
field.sequence_number = std::stol(header["SEQUENCE_NUMBER"]);
// field.sequence_number = std::stol(header["SEQUENCE_NUMBER"]);
field.creator = header["CREATOR"];
field.creator_hardware = header["CREATOR_HARDWARE"];
field.creation_date = header["CREATION_DATE"];

View File

@@ -596,16 +596,32 @@ template<int Index,class vobj> inline vobj transposeColour(const vobj &lhs){
//////////////////////////////////////////
// Trace lattice and non-lattice
//////////////////////////////////////////
#define GRID_UNOP(name) name
#define GRID_DEF_UNOP(op, name) \
template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
inline auto op(const T1 &arg) ->decltype(LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg)) \
{ \
return LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg); \
}
template<int Index,class vobj>
inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(vobj()))>
{
return traceIndex<SpinIndex>(lhs);
}
GridUnopClass(UnaryTraceSpin, traceIndex<SpinIndex>(a));
GRID_DEF_UNOP(traceSpin, UnaryTraceSpin);
template<int Index,class vobj>
inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(vobj()))>
{
return traceIndex<ColourIndex>(lhs);
}
GridUnopClass(UnaryTraceColour, traceIndex<ColourIndex>(a));
GRID_DEF_UNOP(traceColour, UnaryTraceColour);
template<int Index,class vobj>
inline auto traceSpin(const vobj &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs))>
{
@@ -617,6 +633,8 @@ inline auto traceColour(const vobj &lhs) -> Lattice<decltype(traceIndex<ColourIn
return traceIndex<ColourIndex>(lhs);
}
#undef GRID_UNOP
#undef GRID_DEF_UNOP
//////////////////////////////////////////
// Current types
//////////////////////////////////////////

View File

@@ -103,6 +103,18 @@ class PolyakovMod: public ObservableModule<PolyakovLogger<Impl>, NoParameters>{
PolyakovMod(): ObsBase(NoParameters()){}
};
template < class Impl >
class SpatialPolyakovMod: public ObservableModule<SpatialPolyakovLogger<Impl>, NoParameters>{
typedef ObservableModule<SpatialPolyakovLogger<Impl>, NoParameters> ObsBase;
using ObsBase::ObsBase; // for constructors
// acquire resource
virtual void initialize(){
this->ObservablePtr.reset(new SpatialPolyakovLogger<Impl>());
}
public:
SpatialPolyakovMod(): ObsBase(NoParameters()){}
};
template < class Impl >
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{

View File

@@ -2,11 +2,12 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/modules/polyakov_line.h
Source file: ./Grid/qcd/observables/polyakov_loop.h
Copyright (C) 2017
Copyright (C) 2025
Author: David Preti <david.preti@csic.es>
Author: Alexis Verney-Provatas <2414441@swansea.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -60,4 +61,43 @@ class PolyakovLogger : public HmcObservable<typename Impl::Field> {
}
};
template <class Impl>
class SpatialPolyakovLogger : public HmcObservable<typename Impl::Field> {
public:
// here forces the Impl to be of gauge fields
// if not the compiler will complain
INHERIT_GIMPL_TYPES(Impl);
// necessary for HmcObservable compatibility
typedef typename Impl::Field Field;
void TrajectoryComplete(int traj,
Field &U,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
// Save current numerical output precision
int def_prec = std::cout.precision();
// Assume that the dimensions are D=3+1
int Ndim = 3;
ComplexD polyakov;
// Iterate over the spatial directions and print the average spatial polyakov loop
// over them
for (int idx=0; idx<Ndim; idx++) {
polyakov = WilsonLoops<Impl>::avgPolyakovLoop(U, idx);
std::cout << GridLogMessage
<< std::setprecision(std::numeric_limits<Real>::digits10 + 1)
<< "Polyakov Loop in the " << idx << " spatial direction : [ " << traj << " ] "<< polyakov << std::endl;
}
// Return to original output precision
std::cout.precision(def_prec);
}
};
NAMESPACE_END(Grid);

View File

@@ -254,9 +254,9 @@ static void testGenerators(GroupName::Sp) {
}
}
template <int N>
static Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > >
ProjectOnGeneralGroup(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu, GroupName::Sp) {
template <class vtype, int N>
static Lattice<iScalar<iScalar<iMatrix<vtype, N> > > >
ProjectOnGeneralGroup(const Lattice<iScalar<iScalar<iMatrix<vtype, N> > > > &Umu, GroupName::Sp) {
return ProjectOnSpGroup(Umu);
}

View File

@@ -177,25 +177,43 @@ public:
}
//////////////////////////////////////////////////
// average over all x,y,z the temporal loop
// average Polyakov loop in mu direction over all directions != mu
//////////////////////////////////////////////////
static ComplexD avgPolyakovLoop(const GaugeField &Umu) { //assume Nd=4
GaugeMat Ut(Umu.Grid()), P(Umu.Grid());
static ComplexD avgPolyakovLoop(const GaugeField &Umu, const int mu) { //assume Nd=4
// Protect against bad value of mu [0, 3]
if ((mu < 0 ) || (mu > 3)) {
std::cout << GridLogError << "Index is not an integer inclusively between 0 and 3." << std::endl;
exit(1);
}
// U_loop is U_{mu}
GaugeMat U_loop(Umu.Grid()), P(Umu.Grid());
ComplexD out;
int T = Umu.Grid()->GlobalDimensions()[3];
int X = Umu.Grid()->GlobalDimensions()[0];
int Y = Umu.Grid()->GlobalDimensions()[1];
int Z = Umu.Grid()->GlobalDimensions()[2];
Ut = peekLorentz(Umu,3); //Select temporal direction
P = Ut;
for (int t=1;t<T;t++){
P = Gimpl::CovShiftForward(Ut,3,P);
// Number of sites in mu direction
int N_mu = Umu.Grid()->GlobalDimensions()[mu];
U_loop = peekLorentz(Umu, mu); //Select direction
P = U_loop;
for (int t=1;t<N_mu;t++){
P = Gimpl::CovShiftForward(U_loop,mu,P);
}
RealD norm = 1.0/(Nc*X*Y*Z*T);
out = sum(trace(P))*norm;
return out;
}
}
/////////////////////////////////////////////////
// overload for temporal Polyakov loop
/////////////////////////////////////////////////
static ComplexD avgPolyakovLoop(const GaugeField &Umu) {
return avgPolyakovLoop(Umu, 3);
}
//////////////////////////////////////////////////
// average over traced single links

View File

@@ -28,6 +28,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#pragma once
#ifndef MIN
#define MIN(x,y) ((x)>(y)?(y):(x))
#endif
// Introduce a class to gain deterministic bit reproducible reduction.
// make static; perhaps just a namespace is required.
NAMESPACE_BEGIN(Grid);

View File

@@ -24,7 +24,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
@@ -230,3 +234,4 @@ int main(int argc, char **argv)
#endif
} // main
#endif

View File

@@ -25,7 +25,11 @@ directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
@@ -231,5 +235,4 @@ int main(int argc, char **argv)
#endif
} // main
#endif

View File

@@ -24,7 +24,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
@@ -230,5 +234,4 @@ int main(int argc, char **argv)
#endif
} // main
#endif

View File

@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
int main(int argc, char **argv) {
using namespace Grid;
@@ -195,5 +199,4 @@ int main(int argc, char **argv) {
Grid_finalize();
} // main
#endif

View File

@@ -28,7 +28,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
#define MIXED_PRECISION
@@ -449,5 +453,4 @@ int main(int argc, char **argv) {
Grid_finalize();
} // main
#endif

View File

@@ -28,7 +28,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
#define MIXED_PRECISION
@@ -442,5 +446,4 @@ int main(int argc, char **argv) {
Grid_finalize();
} // main
#endif

View File

@@ -28,7 +28,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
using namespace Grid;
@@ -918,3 +922,5 @@ int main(int argc, char **argv) {
return 0;
#endif
} // main
#endif

View File

@@ -28,7 +28,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
using namespace Grid;
@@ -873,3 +877,5 @@ int main(int argc, char **argv) {
return 0;
#endif
} // main
#endif

View File

@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
int main(int argc, char **argv) {
using namespace Grid;
@@ -193,5 +197,4 @@ int main(int argc, char **argv) {
Grid_finalize();
} // main
#endif

View File

@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
NAMESPACE_BEGIN(Grid);
@@ -512,5 +516,4 @@ int main(int argc, char **argv) {
Grid_finalize();
} // main
#endif

View File

@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
int main(int argc, char **argv) {
using namespace Grid;
@@ -345,5 +349,4 @@ int main(int argc, char **argv) {
Grid_finalize();
} // main
#endif

View File

@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
NAMESPACE_BEGIN(Grid);
@@ -516,5 +520,4 @@ int main(int argc, char **argv) {
Grid_finalize();
} // main
#endif

View File

@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
NAMESPACE_BEGIN(Grid);
@@ -567,5 +571,4 @@ int main(int argc, char **argv) {
Grid_finalize();
} // main
#endif

View File

@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
int main(int argc, char **argv) {
using namespace Grid;
@@ -263,5 +267,4 @@ int main(int argc, char **argv) {
Grid_finalize();
} // main
#endif

View File

@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
int main(int argc, char **argv) {
using namespace Grid;
@@ -417,5 +421,4 @@ int main(int argc, char **argv) {
Grid_finalize();
} // main
#endif

View File

@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
NAMESPACE_BEGIN(Grid);
@@ -452,5 +456,4 @@ int main(int argc, char **argv) {
Grid_finalize();
} // main
#endif

View File

@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
NAMESPACE_BEGIN(Grid);
@@ -462,5 +466,4 @@ int main(int argc, char **argv) {
Grid_finalize();
} // main
#endif

View File

@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include<Grid/Grid.h>
@@ -264,5 +268,4 @@ int main(int argc, char **argv) {
Grid_finalize();
} // main
#endif

View File

@@ -0,0 +1,16 @@
#include <Grid/Grid.h>
#pragma once
#ifndef ENABLE_FERMION_INSTANTIATIONS
#include <iostream>
int main(void) {
std::cout << "This build of Grid was configured to exclude fermion instantiations, "
<< "which this example relies on. "
<< "Please reconfigure and rebuild Grid with --enable-fermion-instantiations"
<< "to run this example."
<< std::endl;
return 1;
}
#endif

View File

@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_benchmarks_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
using namespace Grid;
@@ -731,3 +734,5 @@ int main (int argc, char ** argv)
Grid_finalize();
}
#endif

View File

@@ -20,6 +20,9 @@
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_benchmarks_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
@@ -439,3 +442,4 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
GRID_ASSERT(norm2(src_e)<1.0e-4);
GRID_ASSERT(norm2(src_o)<1.0e-4);
}
#endif

View File

@@ -20,6 +20,10 @@
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_benchmarks_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
@@ -439,3 +443,5 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
GRID_ASSERT(norm2(src_e)<1.0e-4);
GRID_ASSERT(norm2(src_o)<1.0e-4);
}
#endif

View File

@@ -20,6 +20,9 @@
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_benchmarks_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
@@ -385,3 +388,5 @@ int main (int argc, char ** argv)
Grid_finalize();
exit(0);
}
#endif

View File

@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_benchmarks_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
using namespace std;
@@ -238,5 +241,4 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
}
}
#endif

View File

@@ -1,3 +1,7 @@
#include "disable_benchmarks_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
#include <sstream>
using namespace std;
@@ -155,3 +159,4 @@ int main (int argc, char ** argv)
Grid_finalize();
}
#endif

View File

@@ -20,6 +20,9 @@
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_benchmarks_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
@@ -129,3 +132,5 @@ int main (int argc, char ** argv)
Grid_finalize();
exit(0);
}
#endif

View File

@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_benchmarks_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
using namespace std;
@@ -149,3 +152,5 @@ int main (int argc, char ** argv)
Grid_finalize();
}
#endif

View File

@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_benchmarks_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
using namespace std;
@@ -172,5 +175,4 @@ void benchDw(std::vector<int> & latt4, int Ls)
// Dw.Report();
}
#endif

View File

@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_benchmarks_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
using namespace std;
@@ -110,3 +113,5 @@ int main (int argc, char ** argv)
Grid_finalize();
}
#endif

View File

@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_benchmarks_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
using namespace std;
@@ -112,3 +115,5 @@ int main (int argc, char ** argv)
Grid_finalize();
}
#endif

View File

@@ -26,6 +26,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_benchmarks_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
#include <Grid/algorithms/blas/BatchedBlas.h>
@@ -873,7 +877,7 @@ int main (int argc, char ** argv)
int do_su4=0;
int do_memory=1;
int do_comms =1;
int do_blas =1;
int do_blas =0;
int do_dslash=1;
int sel=4;
@@ -978,3 +982,5 @@ int main (int argc, char ** argv)
Grid_finalize();
fclose(FP);
}
#endif

View File

@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_benchmarks_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
using namespace std;
@@ -258,3 +261,5 @@ int main (int argc, char ** argv)
Grid_finalize();
}
#endif

View File

@@ -19,6 +19,9 @@ Author: Richard Rollins <rprollins@users.noreply.github.com>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_benchmarks_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
using namespace std;
@@ -161,3 +164,5 @@ void bench_wilson_eo (
double flops = (single_site_flops * volume * ncall)/2.0;
std::cout << flops/(t1-t0) << "\t\t";
}
#endif

View File

@@ -0,0 +1,16 @@
#include <Grid/Grid.h>
#pragma once
#ifndef ENABLE_FERMION_INSTANTIATIONS
#include <iostream>
int main(void) {
std::cout << "This build of Grid was configured to exclude fermion instantiations, "
<< "which this benchmark relies on. "
<< "Please reconfigure and rebuild Grid with --enable-fermion-instantiations"
<< "to run this benchmark."
<< std::endl;
return 1;
}
#endif

View File

@@ -172,6 +172,12 @@ case ${ac_TRACING} in
esac
############### fermions
AC_ARG_ENABLE([fermion-instantiations],
[AS_HELP_STRING([--enable-fermion-instantiations=yes|no],[enable fermion instantiations])],
[ac_FERMION_INSTANTIATIONS=${enable_fermion_instantiations}], [ac_FERMION_INSTANTIATIONS=yes])
AM_CONDITIONAL(BUILD_FERMION_INSTANTIATIONS, [ test "${ac_FERMION_INSTANTIATIONS}X" == "yesX" ])
AC_ARG_ENABLE([fermion-reps],
[AS_HELP_STRING([--enable-fermion-reps=yes|no],[enable extra fermion representation support])],
[ac_FERMION_REPS=${enable_fermion_reps}], [ac_FERMION_REPS=yes])
@@ -194,6 +200,9 @@ AM_CONDITIONAL(BUILD_ZMOBIUS, [ test "${ac_ZMOBIUS}X" == "yesX" ])
case ${ac_FERMION_REPS} in
yes) AC_DEFINE([ENABLE_FERMION_REPS],[1],[non QCD fermion reps]);;
esac
case ${ac_FERMION_INSTANTIATIONS} in
yes) AC_DEFINE([ENABLE_FERMION_INSTANTIATIONS],[1],[enable fermions]);;
esac
case ${ac_GPARITY} in
yes) AC_DEFINE([ENABLE_GPARITY],[1],[fermion actions with GPARITY BCs]);;
esac

View File

@@ -3,6 +3,9 @@
* without regression / tests being applied
*/
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
using namespace std;
@@ -310,5 +313,4 @@ int main (int argc, char ** argv)
Grid_finalize();
}
#endif

430
examples/Example_arnoldi.cc Normal file
View File

@@ -0,0 +1,430 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_padded_cell.cc
Copyright (C) 2023
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// copied here from Test_general_coarse_pvdagm.cc
#include <cstdlib>
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
#include <Grid/algorithms/iterative/BiCGSTAB.h>
using namespace std;
using namespace Grid;
// Hermitize a DWF operator by squaring it
template<class Matrix,class Field>
class SquaredLinearOperator : public LinearOperatorBase<Field> {
public:
  Matrix &_Mat;
public:
  // Wraps a (generally non-Hermitian) matrix M and exposes the Hermitian
  // combination Mdag M through Op/AdjOp/HermOp so Hermitian solvers and
  // eigensolvers can be applied to it.
  SquaredLinearOperator(Matrix &Mat): _Mat(Mat) {};
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  // Mdag M is self-adjoint, so Op and AdjOp both delegate to HermOp.
  void Op    (const Field &in, Field &out){ HermOp(in, out); }
  void AdjOp (const Field &in, Field &out){ HermOp(in, out); }
  // Raw application of the underlying M.
  void _Op    (const Field &in, Field &out){ _Mat.M(in, out); }
  // Raw application of the underlying Mdag.
  void _AdjOp (const Field &in, Field &out){ _Mat.Mdag(in, out); }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  void HermOp(const Field &in, Field &out){
    // out = Mdag M in
    Field scratch(in.Grid());
    _Op(in, scratch);
    _AdjOp(scratch, out);
  }
};
template<class Matrix,class Field>
class PVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  Matrix &_PV;
public:
  // Composite operator PVdag M built from a Dirac matrix and its
  // Pauli-Villars partner; HermOp applies the normal operator
  // Mdag PV PVdag M.
  PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  void Op (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    // out = PVdag M in
    Field scratch(in.Grid());
    _Mat.M(in, scratch);
    _PV.Mdag(scratch, out);
  }
  void AdjOp (const Field &in, Field &out){
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    // out = Mdag PV in
    Field scratch(in.Grid());
    _PV.M(in, scratch);
    _Mat.Mdag(scratch, out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  void HermOp(const Field &in, Field &out){
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    // Normal operator: apply PVdag M, then its adjoint.
    Field scratch(in.Grid());
    Op(in, scratch);
    AdjOp(scratch, out);
  }
};
template<class Matrix,class Field>
class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  Matrix &_PV;
  RealD shift;
public:
  // Implements the shifted operator  PVdag M + shift  and its adjoint
  // Mdag PV + shift  (shift is real, hence self-conjugate).
  // Members are initialized in declaration order to avoid -Wreorder.
  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  void Op (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  void AdjOp (const Field &in, Field &out){
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    // (PVdag M)^dag = Mdag PV : apply PV first, then Mdag.
    // BUGFIX: the previous code ran _PV.M(tmp,out) before tmp was ever
    // assigned, reading an uninitialized field and composing the
    // operators in the wrong order.
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
    out = out + shift * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  void HermOp(const Field &in, Field &out){
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    // Normal operator of the shifted system: AdjOp(Op(in)).
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
};
template<class Matrix, class Field>
class ShiftedComplexPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  Matrix &_PV;
  ComplexD shift;
public:
  // Implements the complex-shifted operator  PVdag M + shift  and its
  // adjoint  Mdag PV + conj(shift).
  // Members are initialized in declaration order to avoid -Wreorder.
  ShiftedComplexPVdagMLinearOperator(ComplexD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  void Op (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  void AdjOp (const Field &in, Field &out){
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    // (PVdag M)^dag = Mdag PV : apply PV first, then Mdag.
    // BUGFIX: the previous code read tmp before it was assigned and
    // composed the operators in the wrong order.
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
    // BUGFIX: the adjoint of (A + shift*I) is Adag + conj(shift)*I;
    // the complex shift must be conjugated here.
    out = out + std::conj(shift) * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  void HermOp(const Field &in, Field &out){
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    // Normal operator of the shifted system: AdjOp(Op(in)).
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
  // Update the shift without rebuilding the wrapped operators.
  void resetShift(ComplexD newShift) {
    shift = newShift;
  }
};
// Two-level multigrid V-cycle preconditioner: pre-smooth on the fine grid,
// coarse-grid correction through the Aggregates' restrict/prolong pair,
// then post-smooth on the updated residual.
template<class Fobj,class CComplex,int nbasis>
class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
  using LinearFunction<Lattice<Fobj> >::operator();
  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField FineField;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
  typedef LinearOperatorBase<FineField> FineOperator;
  typedef LinearFunction <FineField> FineSmoother;
  typedef LinearOperatorBase<CoarseVector> CoarseOperator;
  typedef LinearFunction <CoarseVector> CoarseSolver;
  Aggregates & _Aggregates;       // restrict/prolong maps between fine and coarse spaces
  FineOperator & _FineOperator;   // fine-grid operator, used to form residuals
  FineSmoother & _PreSmoother;
  FineSmoother & _PostSmoother;
  CoarseOperator & _CoarseOperator;
  CoarseSolver & _CoarseSolve;
  int level; void Level(int lv) {level = lv; };
  MGPreconditioner(Aggregates &Agg,
                   FineOperator &Fine,
                   FineSmoother &PreSmoother,
                   FineSmoother &PostSmoother,
                   CoarseOperator &CoarseOperator_,
                   CoarseSolver &CoarseSolve_)
    : _Aggregates(Agg),
      _FineOperator(Fine),
      _PreSmoother(PreSmoother),
      _PostSmoother(PostSmoother),
      _CoarseOperator(CoarseOperator_),
      _CoarseSolve(CoarseSolve_),
      level(1) { }
  // Apply one V-cycle; out approximates FineOperator^{-1} in.
  virtual void operator()(const FineField &in, FineField & out)
  {
    GridBase *CoarseGrid = _Aggregates.CoarseGrid;
    // auto CoarseGrid = _CoarseOperator.Grid();
    CoarseVector Csrc(CoarseGrid);
    CoarseVector Csol(CoarseGrid);
    FineField vec1(in.Grid());
    FineField vec2(in.Grid());
    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
    // std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
    double t;
    // Fine Smoother: out starts from zero, not from the input guess
    // out = in;
    out = Zero();
    t=-usecond();
    _PreSmoother(in,out);
    t+=usecond();
    std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;
    // Update the residual: vec1 = in - FineOp(out)
    _FineOperator.Op(out,vec1); sub(vec1, in ,vec1);
    // std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
    // Fine to Coarse: restrict the residual into the coarse space
    t=-usecond();
    _Aggregates.ProjectToSubspace (Csrc,vec1);
    t+=usecond();
    std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
    // Coarse correction: solve the coarse system for the restricted residual
    t=-usecond();
    Csol = Zero();
    _CoarseSolve(Csrc,Csol);
    //Csol=Zero();
    t+=usecond();
    std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
    // Coarse to Fine: prolong the coarse correction and accumulate into out
    t=-usecond();
    // _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
    _Aggregates.PromoteFromSubspace(Csol,vec1);
    add(out,out,vec1);
    t+=usecond();
    std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
    // Residual after coarse correction: vec1 = in - FineOp(out)
    _FineOperator.Op(out,vec1); sub(vec1 ,in , vec1);
    // std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
    // Fine Smoother: post-smooth the remaining residual and accumulate
    t=-usecond();
    // vec2=vec1;
    vec2=Zero();
    _PostSmoother(vec1,vec2);
    t+=usecond();
    std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
    add( out,out,vec2);
    std::cout<<GridLogMessage << "Done " <<std::endl;
  }
};
// Example driver: compares an Arnoldi eigensolver against the
// ImplicitlyRestartedLanczos on the Hermitian operator Dsq = Mdag M built
// from a domain-wall Dirac matrix.
// NOTE(review): reads a gauge configuration from a hard-coded absolute
// path; adjust `file` before running in another environment.
int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);
  const int Ls=16;
  // GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  std::vector<int> lat_size {16, 16, 16, 32};
  std::cout << "Lattice size: " << lat_size << std::endl;
  // 4d / 5d grids and their red-black (even-odd) counterparts
  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(lat_size,
                                                         GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                         GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  // Construct a coarsened grid (each spatial/temporal extent halved)
  // poare TODO: replace this with the following line?
  Coordinate clatt = lat_size;
  // Coordinate clatt = GridDefaultLatt(); // [PO] initial line before I edited it
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/2;
    // clatt[d] = clatt[d]/4;
  }
  GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
  GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
  // Fixed seeds so runs are reproducible
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  std::vector<int> cseeds({5,6,7,8});
  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
  // Random source plus workspace fields on the 5d grid
  LatticeFermion src(FGrid); random(RNG5,src);
  LatticeFermion result(FGrid); result=Zero();
  LatticeFermion ref(FGrid); ref=Zero();
  LatticeFermion tmp(FGrid);
  LatticeFermion err(FGrid);
  // Load the gauge configuration (NERSC format)
  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  // std::string file("ckpoint_lat.4000");
  std::string file("/Users/patrickoare/libraries/PETSc-Grid/ckpoint_lat.4000");
  NerscIO::readConfiguration(Umu,header,file);
  // Physical DWF operator and its Pauli-Villars partner (mass = 1)
  RealD mass=0.01;
  RealD M5=1.8;
  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
  // const int nbasis = 20; // size of approximate basis for low-mode space
  const int nbasis = 3; // size of approximate basis for low-mode space
  const int cb = 0 ;
  LatticeFermion prom(FGrid);
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;
  NextToNearestStencilGeometry5D geom(Coarse5d);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  // Wrap the Dirac operators in the linear-operator adaptors defined above
  typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
  typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
  typedef ShiftedComplexPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedComplexPVdagM_t;
  PVdagM_t PVdagM(Ddwf, Dpv);
  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);
  SquaredLinearOperator<DomainWallFermionD, LatticeFermionD> Dsq (Ddwf);
  NonHermitianLinearOperator<DomainWallFermionD, LatticeFermionD> DLinOp (Ddwf);
  // PowerMethod<LatticeFermion> PM; PM(PVdagM, src);
  // Krylov-space sizing shared by the Arnoldi and Lanczos runs
  int Nm = 10;
  int Nk = 6;
  // int Nm = 6; // Nm = 6 case is acting really strangely... with Nm = 6 and Nm = 3 it zeros out the Hessenberg and also makes it imaginary?
  // int Nk = 2;
  // int Nk = Nm+1; // if just running once
  // int maxIter = 5;
  // int maxIter = 1;
  int maxIter = 3;
  // int maxIter = 100;
  int Nstop = 6;
  Coordinate origin ({0,0,0,0});
  auto tmpSrc = peekSite(src, origin);
  std::cout << "[DEBUG] Source at origin = " << tmpSrc << std::endl;
  // Keep a copy of the source: the Arnoldi call below consumes src
  LatticeFermion src2 = src;
  // Run Lanczos and Arnoldi on a Hermitian matrix
  // Arnoldi Arn (Dsq, FGrid, 1e-8, false);
  // Arn(src, 1, Nm, -1);
  Arnoldi Arn (Dsq, FGrid, 1e-8, EvalNormLarge); // for comparison to Lanczos
  // Arn(src, maxIter, Nm, Nk, Nstop);
  // auto tmpSrcDup = peekSite(src, origin);
  // std::cout << "[DEBUG] Source at origin = " << tmpSrcDup << std::endl;
  // auto tmpSrc2Dup = peekSite(src2, origin);
  // std::cout << "[DEBUG] Source2 at origin = " << tmpSrc2Dup << std::endl;
  Arn(src, maxIter, Nm, Nk, Nstop);
  std::cout << "Hessenberg mat for symmetric N = " << Nm << std::endl;
  std::cout << Arn.getHessenbergMat() << std::endl;
  // ImplicitlyRestartedLanczosHermOpTester<LatticeFermionD> SimpleTester (Dsq);
  // ImplicitlyRestartedLanczos<LatticeFermionD> Lanc (Dsq, Dsq, SimpleTester, Nm, Nm, Nm, 1e-8, Nm);
  int Nconv;
  PlainHermOp DsqHermOp (Dsq);
  // std::vector<RealD> levals (Nm+1); std::vector<LatticeFermionD> levecs (Nm+1, src);
  // ImplicitlyRestartedLanczos<LatticeFermionD> Lanc (DsqHermOp, DsqHermOp, Nm, Nm, Nm + 1, 1e-8, Nm);
  std::vector<RealD> levals (Nm+1); std::vector<LatticeFermionD> levecs (Nm, src);
  ImplicitlyRestartedLanczos<LatticeFermionD> Lanc (DsqHermOp, DsqHermOp, Nstop, Nk, Nm, 1e-8, maxIter);
  std::cout << GridLogMessage << "Calculating with Lanczos" << std::endl;
  // auto tmpSrc1 = peekSite(src, origin);
  // std::cout << "[DEBUG] Source at origin = " << tmpSrc1 << std::endl;
  // auto tmpSrc2 = peekSite(src2, origin);
  // std::cout << "[DEBUG] Source2 at origin = " << tmpSrc2 << std::endl;
  // std::cout << "[DEBUG] Source norm2: " << norm2(src) << std::endl;
  std::cout << "running Lanczos now" << std::endl;
  Lanc.calc(levals, levecs, src2, Nconv);
  std::cout<<GridLogMessage << "*******************************************" << std::endl;
  std::cout<<GridLogMessage << "***************** RESULTS *****************" << std::endl;
  std::cout<<GridLogMessage << "*******************************************" << std::endl;
  std::cout << GridLogMessage << "Arnoldi eigenvalues: " << std::endl << Arn.getEvals() << std::endl;
  std::cout << GridLogMessage << "Lanczos eigenvalues: " << std::endl << levals << std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  Grid_finalize();
  return 0;
}

View File

@@ -3,6 +3,9 @@
* without regression / tests being applied
*/
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
using namespace std;
@@ -432,5 +435,4 @@ int main (int argc, char ** argv)
Grid_finalize();
}
#endif

View File

@@ -0,0 +1,405 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_padded_cell.cc
Copyright (C) 2023
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// Tests code written to read off the Krylov coefficients
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
#include <Grid/algorithms/iterative/BiCGSTAB.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h>
using namespace std;
using namespace Grid;
// Hermitize a DWF operator by squaring it
template<class Matrix,class Field>
class SquaredLinearOperator : public LinearOperatorBase<Field> {
public:
  Matrix &_Mat;
public:
  // Exposes the Hermitian product Mdag M of a wrapped matrix M, allowing
  // Hermitian-only solvers (e.g. CG) to be applied to a non-Hermitian M.
  SquaredLinearOperator(Matrix &Mat): _Mat(Mat) {};
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  // The squared operator is self-adjoint, so both directions reduce to HermOp.
  void Op    (const Field &in, Field &out){ HermOp(in, out); }
  void AdjOp (const Field &in, Field &out){ HermOp(in, out); }
  // Underlying M.
  void _Op    (const Field &in, Field &out){ _Mat.M(in, out); }
  // Underlying Mdag.
  void _AdjOp (const Field &in, Field &out){ _Mat.Mdag(in, out); }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  void HermOp(const Field &in, Field &out){
    // out = Mdag M in
    Field intermediate(in.Grid());
    _Op(in, intermediate);
    _AdjOp(intermediate, out);
  }
};
/**
* Computes the coefficients in the Krylov expansion for 1/D ~ \sum_{i=0}^N c_i D^i.
*
* Parameters
* ----------
* std::vector<double> &coeffs
* Polynomial coeffients to return, with indexing order (c_0, c_1, c_2, ..., c_n).
* LinearOperatorBase<FineField> &DiracOp
* Dirac operator D.
* FineField src
* Source field b.
* FineField psiStar
* Output approximation for D^{-1} b coming from a Krylov method.
* int N
* Dimension of the polynomial approximation (Krylov space K_{N-1} = {b, Db, D^2 b, ..., D^{N-1} b}).
*/
void poly_coeffs(std::vector<ComplexD> &coeffs, LinearOperatorBase<LatticeFermion> &DiracOp, LatticeFermion src,
                 LatticeFermion psiStar, GridCartesian* FGrid, int N, bool use_herm = false)
{
  // kryBasis is the orthonormalized Krylov basis {k0, k1, ..., k_{N-1}}
  // built from the canonical basis {b, Db, D^2 b, ..., D^{N-1} b}
  // (or powers of D^dag D when use_herm is set).
  std::vector<LatticeFermion> kryBasis;
  Eigen::VectorXcd psiStarCoeffs (N);
  // Normalize by 1 / ||src||; does not change the polynomial coefficients
  double srcNorm = 1 / std::sqrt(norm2(src));
  kryBasis.push_back(srcNorm * src); // normalized source
  psiStar = srcNorm * psiStar;       // psiStar is a by-value copy, safe to scale
  psiStarCoeffs(0) = innerProduct(kryBasis[0], psiStar);
  // orthonormalize canonical Krylov basis {b, Db, D^2 b, ..., D^{N-1} b} <--> {k_i}
  // and compute components <k_i | psi*>
  LatticeFermion tmp (FGrid);
  for (int i = 0; i < N - 1; i++) { // construct ONB for {b, Db, ..., D^{i+1} b}
    if (use_herm) {
      DiracOp.HermOp(kryBasis.back(), tmp); // tmp \in span{(D^\dag D)^{i+1} b} \oplus span{(D^\dag D)^i b, ..., D^\dag D b, b}
    } else {
      DiracOp.Op(kryBasis.back(), tmp); // tmp \in span{D^{i+1} b} \oplus span{D^i b, ..., Db, b}
    }
    // Classical Gram-Schmidt step against the basis built so far.
    for (int j = 0; j < i+1; j++) { // orthogonalize tmp with previous basis vectors
      ComplexD coeff = innerProduct(kryBasis[j], tmp); // <k_j | tmp>
      tmp -= coeff * kryBasis[j]; // subtract off |k_j><k_j | tmp>; now tmp is perp to |k_j>
    }
    double tmpNorm = 1 / std::sqrt(norm2(tmp));
    kryBasis.push_back(
      tmpNorm * tmp
    ); // normalize |k_i> and add to kryBasis
    psiStarCoeffs(i+1) = innerProduct(kryBasis[i+1], psiStar); // compute < k_i | psi* >
  }
  // To verify the basis is ONB
  // for (int i = 0; i < N; i++) {
  //   for (int j = 0; j < N; j++) {
  //     std::cout << "<ki|kj> for (i,j) = (" << i << ", " << j << ") = " << innerProduct(kryBasis[i], kryBasis[j]) << std::endl;
  //   }
  // }
  // Compute the change-of-basis matrix M(j,i) = <k_j | D^i b> between the
  // canonical Krylov basis and the orthonormal one (upper triangular).
  LatticeFermion tmp2 (FGrid);
  Eigen::MatrixXcd M = Eigen::MatrixXcd::Zero(N, N);
  tmp = kryBasis[0]; // current Krylov vector; starts with tmp = src (normalized)
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < i + 1; j++) { // fill column with components of kryVec. Only need j <= i to get orthonormal components
      M(j, i) = innerProduct(kryBasis[j], tmp);
    }
    if (use_herm) { // tmp --> D^\dag D(tmp)
      DiracOp.HermOp(tmp, tmp2);
      tmp = tmp2;
    } else { // tmp --> D(tmp). Note that DiracOp.Op(tmp, tmp) will cause a bug
      DiracOp.Op(tmp, tmp2);
      tmp = tmp2;
    }
  }
  // Solve M c = psiStarCoeffs for the polynomial coefficients and copy out
  Eigen::VectorXcd res (N);
  res = M.inverse() * psiStarCoeffs;
  for (int i = 0; i < N; i++) {
    coeffs[i] = res(i);
  }
}
// out file for poly coefficients (should it be complex?)
// class PolynomialFile: Serializable {
// public:
// GRID_SERIALIZABLE_CLASS_MEMBERS(OutputFile, std::vector< Real >, data);
// };
/**
 * Evaluates p(x) = sum_i coeffs[i] * x^i at a complex point x.
 *
 * Uses Horner's scheme instead of a term-by-term sum with std::pow:
 * one multiply-add per coefficient, no repeated power computations,
 * and better numerical conditioning. An empty coefficient list yields 0,
 * matching the original zero-initialized accumulator.
 *
 * @param x      evaluation point
 * @param coeffs polynomial coefficients ordered (c_0, c_1, ..., c_n)
 * @return       p(x)
 */
std::complex<double> poly_approx(std::complex<double> x, std::vector<std::complex<double>> coeffs) {
  std::complex<double> px{0.0, 0.0};
  // Horner: fold from the highest-order coefficient down.
  for (auto it = coeffs.rbegin(); it != coeffs.rend(); ++it) {
    px = px * x + *it;
  }
  return px;
}
/**
* Returns the approximation psi = \sum_i c_i D^i b resulting from a Krylov solver.
*
* Parameters
* ----------
* LatticeFermion &psi
* Approximation field, returned psi = \sum_i c_i D^i b.
* LatticeFermion src
* Source b used to generate the Krylov space K_n(D, b).
* LinearOperatorBase<LatticeFermion> &Linop
* Dirac operator used to generate the Krylov space K_n(D, b).
* std::vector<std::complex<double>> coeffs
* Polynomial coefficients returned from the solver.
*/
void krylovApprox(LatticeFermion &psi, LatticeFermion src, LinearOperatorBase<LatticeFermion> &Linop, std::vector<ComplexD> coeffs) {
  // Accumulate psi = sum_i coeffs[i] * D^i src, generating the powers of D
  // on the fly: `power` holds D^i src at the top of each iteration.
  psi = Zero();
  LatticeFermion power (psi.Grid());
  LatticeFermion scratch (psi.Grid());
  power = src;
  for (size_t i = 0; i < coeffs.size(); ++i) {
    psi = psi + coeffs[i] * power;
    Linop.Op(power, scratch); // scratch = D * power
    power = scratch;
  }
}
// Test driver: solves D psi = b (via CG on Dsq and via polynomial GCR),
// then reconstructs the Krylov polynomial coefficients of each solver both
// from the solver itself and directly with poly_coeffs, and compares them.
// NOTE(review): reads a gauge configuration from a hard-coded absolute
// path; adjust `file` before running in another environment.
int main (int argc, char ** argv)
{
  Grid_init(&argc, &argv);
  const int Ls = 8;
  std::vector<int> lat_size {16, 16, 16, 32};
  std::cout << "Lattice size: " << lat_size << std::endl;
  // 4d / 5d grids and their red-black (even-odd) counterparts
  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(lat_size,
                                                         GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                         GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  //////////////////////////////////////////////////////////////////////
  // You can manage seeds however you like.
  // Recommend SeedUniqueString.
  //////////////////////////////////////////////////////////////////////
  // std::vector<int> seeds4({1, 2, 3, 4});
  // GridParallelRNG RNG4(UGrid);
  // RNG4.SeedFixedIntegers(seeds4);
  // std::vector<int> seeds5({1, 2, 3, 4, 5});
  // GridParallelRNG RNG5(FGrid);
  // RNG5.SeedFixedIntegers(seeds5);
  // std::string outStrStem = "/Users/patrickoare/Dropbox (MIT)/research/multigrid/grid_out/";
  // Load the gauge configuration (NERSC format)
  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  std::string file("/Users/patrickoare/libraries/PETSc-Grid/ckpoint_lat.4000");
  NerscIO::readConfiguration(Umu, header, file);
  RealD mass=0.01;
  RealD M5=1.8;
  // RealD M5=1.0;
  RealD b=1.5;// Scale factor b+c=2, b-c=1
  RealD c=0.5;
  // load in Dirac operators that we'll use; square it to Hermitize
  // Dsq just needs to be a Hermitian operator so we can use CG on it
  DomainWallFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
  SquaredLinearOperator<DomainWallFermionD, LatticeFermionD> Dsq (Ddwf);
  NonHermitianLinearOperator<DomainWallFermionD, LatticeFermionD> DLinOp (Ddwf);
  LatticeFermion src (FGrid); src = 1.0; // Source to use
  LatticeFermion psiCG (FGrid); psiCG = Zero(); // Field to solve with for CG
  LatticeFermion psiGCR (FGrid); psiGCR = Zero(); // Field to solve with for GCR
  std::cout << GridLogMessage << "*******************************************" << std::endl;
  std::cout << GridLogMessage << "********** TESTING CG POLY COEFFS *********" << std::endl;
  std::cout << GridLogMessage << "*******************************************" << std::endl << std::endl;
  double tol = 1.0e-8;
  int N = 5; // max iterations (size of Krylov basis)
  // GCR variables
  int outer_iters = 1; // num restarts for GCR
  TrivialPrecon<LatticeFermionD> prec; // trivial preconditioner
  // Solve Dsq psiCG = src with polynomial-tracking CG
  ConjugateGradientPolynomial<LatticeFermion> CGP(tol, N, false);
  CGP(Dsq, src, psiCG);
  // Compute Krylov coeffs directly and compare
  std::vector<ComplexD> cg_coeffs (N);
  poly_coeffs(cg_coeffs, Dsq, src, psiCG, FGrid, N, true);
  PolynomialFile PF;
  // Use GCR solver, also get poly coeffs
  std::vector<ComplexD> gcr_sym_coeffs (N); // Can try N --> N + 3 to test to see if the last 3 comps are 0
  PGCRPolynomial<LatticeFermionD> GCRPolySym(tol, outer_iters, Dsq, prec, N+1, N, PF); // mmax sets the memory, note the last beta doesn't really matter for updating the polynomial
  GCRPolySym(src, psiGCR);
  // poly_coeffs(gcr_sym_coeffs, Dsq, src, psi, FGrid, N, true);
  poly_coeffs(gcr_sym_coeffs, Dsq, src, psiGCR, FGrid, N, true);
  std::cout << GridLogMessage << std::endl << "******** CG POLYNOMIAL COEFFICIENTS *******" << std::endl;
  std::cout << GridLogMessage << CGP.polynomial << std::endl << std::endl;
  std::cout << GridLogMessage << "****** DIRECT POLYNOMIAL COEFFICIENTS *****" << std::endl;
  std::cout << GridLogMessage << cg_coeffs << std::endl << std::endl;
  // TODO: try GCR with a Hermitian operator (Dsq)
  std::cout << GridLogMessage << "****** GCR COEFFICIENTS *****" << std::endl;
  std::cout << GridLogMessage << GCRPolySym.polynomial << std::endl << std::endl;
  std::cout << GridLogMessage << "****** DIRECT GCR COEFFICIENTS *****" << std::endl;
  std::cout << GridLogMessage << gcr_sym_coeffs << std::endl << std::endl;
  // test how good the decomposition is
  std::cout << "Testing fidelity of decomposition by computing ||psi* - sum_i c_i D^i b||^2!" << std::endl;
  LatticeFermion psiPrime (FGrid);
  // for CG
  krylovApprox(psiPrime, src, Dsq, cg_coeffs);
  std::cout << "CG with Dsq, ||psi - psiPrime||^2 = " << norm2(psiCG - psiPrime) << std::endl;
  // for GCR with alpha / beta computation
  krylovApprox(psiPrime, src, Dsq, GCRPolySym.polynomial);
  std::cout << "GCR with Dsq, ||psi - psiPrime||^2 = " << norm2(psiGCR - psiPrime) << std::endl;
  // for GCR with alpha / beta computation
  krylovApprox(psiPrime, src, Dsq, gcr_sym_coeffs);
  std::cout << "GCR direct with Dsq, ||psi - psiPrime||^2 = " << norm2(psiGCR - psiPrime) << std::endl;
  // std::vector<double> real_cg_diff (N);
  // for (int i = 0; i < N; i++) { real_cg_diff[i] = std::abs(cg_coeffs[i].real() - CGP.polynomial[i]); }
  // std::cout << GridLogMessage << "************* COEFF DIFFERENCE ************" << std::endl;
  // std::cout << GridLogMessage << real_cg_diff << std::endl << std::endl;
  // GCR polynomial reconstruction with Ddwf!
  std::cout << GridLogMessage << "*******************************************" << std::endl;
  std::cout << GridLogMessage << "********* TESTING GCR POLY COEFFS *********" << std::endl;
  std::cout << GridLogMessage << "*******************************************" << std::endl << std::endl;
  // re-init variables (normalized unit source, zeroed solution fields)
  src = 1.0;
  src = (1 / std::sqrt(norm2(src))) * src;
  psiGCR = Zero(); psiPrime = Zero();
  // test GCR poly on the non-Hermitian operator directly
  PGCRPolynomial<LatticeFermionD> GCRPoly(tol, outer_iters, DLinOp, prec, N+1, N, PF); // mmax sets the memory, note the last beta doesn't really matter for updating the polynomial
  GCRPoly(src, psiGCR);
  // Compute Krylov coeffs directly and compare
  // N = 1; // compare the N > 1 decomposition with the psi* resulting from N = 1
  std::vector<ComplexD> gcr_coeffs (N); // note N --> N + k should just give k coeffs that are 0; this works as intended
  poly_coeffs(gcr_coeffs, DLinOp, src, psiGCR, FGrid, N, false);
  std::cout << GridLogMessage << "******* GCR POLYNOMIAL COEFFICIENTS *******" << std::endl;
  std::cout << GridLogMessage << GCRPoly.polynomial << std::endl << std::endl;
  std::cout << GridLogMessage << "****** DIRECT POLYNOMIAL COEFFICIENTS *****" << std::endl;
  std::cout << GridLogMessage << gcr_coeffs << std::endl << std::endl;
  // test how good the decomposition is
  std::cout << "Testing fidelity of decomposition by computing ||psi* - sum_i c_i D^i b||^2!" << std::endl;
  // for GCR with alpha / beta computation
  krylovApprox(psiPrime, src, DLinOp, GCRPoly.polynomial);
  std::cout << "GCR with Dsq, ||psi - psiPrime||^2 = " << norm2(psiGCR - psiPrime) << std::endl;
  // for GCR with alpha / beta computation
  krylovApprox(psiPrime, src, DLinOp, gcr_coeffs);
  std::cout << "GCR direct with Dsq, ||psi - psiPrime||^2 = " << norm2(psiGCR - psiPrime) << std::endl;
  // TESTS TO DO THE N = 2 CASE DIRECTLY
  /*
  std::vector<std::complex<double>> alphas {
    std::complex(0.244300601, 0.00013007545),
    std::complex(0.285370971, -0.000160704481)
  };
  std::complex<double> beta00 (-0.184661284, -6.52153945e-05);
  LatticeFermion psi2 (FGrid);
  LatticeFermion Dsrc (FGrid);
  DLinOp.Op(src, Dsrc);
  std::complex<double> c1 = alphas[0] + alphas[1] * (1. + beta00);
  std::complex<double> c2 = -alphas[0] * alphas[1];
  psi2 = c1 * src + c2 * Dsrc;
  std::cout << "||b|| = " << norm2(src) << std::endl;
  std::cout << "||Db|| = " << norm2(Dsrc) << std::endl;
  // fail; so far this is giving something different than what's being computed in krylovApprox (idk how?)
  std::cout << "c1 and c2 are: " << c1 << " and " << c2 << std::endl;
  std::cout << "GCRPoly polynomial coeffs are (should equal c1 and c2): " << GCRPoly.polynomial << std::endl;
  std::cout << "||GCRpsi - psi2||_2^2 = " << norm2(psiGCR - psi2) << std::endl;
  // pass
  LatticeFermion src2 (FGrid);
  src2 = 1.0;
  src2 = (1 / std::sqrt(norm2(src2))) * src2;
  std::cout << "||ones - src|| (to verify that src is the same throughout, should be 0) = " << norm2(src2 - src) << std::endl;
  // pass
  krylovApprox(psiPrime, src, DLinOp, GCRPoly.polynomial);
  std::cout << "GCR with Dsq, ||psi2 - psiPrime||^2 = " << norm2(psi2 - psiPrime) << std::endl;
  std::vector<ComplexD> psi2_coeffs (N); // note N --> N + k should just give k coeffs that are 0; this works as intended
  poly_coeffs(psi2_coeffs, DLinOp, src, psi2, FGrid, N, false);
  krylovApprox(psiPrime, src, DLinOp, psi2_coeffs);
  std::cout << "GCR direct with Dsq, ||psi - psiPrime||^2 = " << norm2(psi2 - psiPrime) << std::endl;
  */
  // std::complex z (10.0, 0.0); // z = 10
  // std::cout << GridLogMessage << "************* GCR POLY(z = 10) *************" << std::endl;
  // std::cout << GridLogMessage << poly_approx(z, GCRPoly.polynomial) << std::endl;
  // std::cout << GridLogMessage << "************ DIRECT POLY(z = 10) ***********" << std::endl;
  // std::cout << GridLogMessage << poly_approx(z, gcr_coeffs) << std::endl;
  // std::vector<std::complex<double>> gcr_diff (N);
  // for (int i = 0; i < N; i++) { gcr_diff[i] = gcr_coeffs[i] - GCRPoly.polynomial[i]; }
  // std::cout << GridLogMessage << "*********** GCR COEFF DIFFERENCE **********" << std::endl;
  // std::cout << GridLogMessage << gcr_diff << std::endl << std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  Grid_finalize();
  return 0;
}

View File

@@ -0,0 +1,380 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_padded_cell.cc
Copyright (C) 2023
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// copied here from Test_general_coarse_pvdagm.cc
#include <cstdlib>
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
#include <Grid/algorithms/iterative/BiCGSTAB.h>
using namespace std;
using namespace Grid;
namespace Grid {
// Run-time parameters for the Lanczos/Krylov eigensolver, read from XML via
// Grid's serialisation macros (see initialize()).
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, mstep ,
Integer, Nstop,
Integer, Nk,
Integer, Np,
Integer, ReadEvec,
Integer, maxIter,
RealD, resid,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Default constructor: only mass is given an explicit default; the other
// members are expected to be filled in from the XML reader.
LanczosParameters() {
////////////////////////////// Default values
mass = 0;
/////////////////////////////////
}
// Construct directly from a serialisation reader (delegates to initialize).
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
// Populate this object from the "HMC" node of the reader.
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
// Placeholder: parameter printing is currently disabled (all lines commented).
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
// Write a lattice field to disk in SciDAC format.
//
// Parameters:
//   in    - field to write (must expose Grid()).
//   fname - output file path.
//
// Guarded on HAVE_LIME, consistent with the other writeFile helpers in this
// repository: the SciDAC writer requires the LIME library, so the previous
// unconditional "#if 1" guard broke builds configured without LIME.  When
// LIME is absent this function is a silent no-op, matching its siblings.
template <class T> void writeFile(T& in, std::string const fname){
#ifdef HAVE_LIME
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.close();
#endif
}
// Convenience aliases: this test works with the double-precision Wilson
// operator and its fermion field type.
typedef WilsonFermionD WilsonOp;
typedef typename WilsonFermionD::FermionField FermionField;
// Wraps a non-Hermitian matrix M and exposes its inverse as a linear
// operator: Op(in) solves M out = in via CG on the normal equations
// (Mdag M out = Mdag in).  The multigrid/coarsening entry points and the
// Hermitian entry points are unsupported and assert.
template<class Matrix,class Field>
class InvertNonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;  // underlying non-Hermitian matrix (not owned)
RealD _stp;    // stopping residual for the inner CG solve
public:
InvertNonHermitianLinearOperator(Matrix &Mat,RealD stp=1e-8): _Mat(Mat),_stp(stp){};
// Support for coarsening to a multigrid
// Unsupported for the inverse operator.
void OpDiag (const Field &in, Field &out) {
// _Mat.Mdiag(in,out);
// out = out + shift*in;
assert(0);
}
// Unsupported for the inverse operator.
void OpDir (const Field &in, Field &out,int dir,int disp) {
// _Mat.Mdir(in,out,dir,disp);
assert(0);
}
// Unsupported for the inverse operator.
void OpDirAll (const Field &in, std::vector<Field> &out){
// _Mat.MdirAll(in,out);
assert(0);
};
// Apply M^{-1}: solve (Mdag M) out = Mdag in with CG.
void Op (const Field &in, Field &out){
Field tmp(in.Grid());
// _Mat.M(in,out);
// RealD mass=-shift;
// WilsonCloverFermionD Dw(Umu, Grid, RBGrid, mass, csw_r, csw_t);
// NonHermitianLinearOperator<Matrix,Field> HermOp(_Mat);
// BiCGSTAB<Field> CG(_stp,10000);
_Mat.Mdag(in,tmp);
MdagMLinearOperator<Matrix,Field> HermOp(_Mat);
ConjugateGradient<Field> CG(_stp,10000);
CG(HermOp,tmp,out);
// out = out + shift * in;
}
// NOTE(review): applies Mdag of the underlying matrix, NOT the adjoint of
// the inverse -- confirm callers expect this asymmetry with Op().
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
// out = out + shift * in;
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
assert(0);
}
void HermOp(const Field &in, Field &out){
assert(0);
}
};
/**
 * Exercise the complex Schur decomposition helpers on a small Hessenberg
 * matrix produced by a single Arnoldi sweep: swap one adjacent eigenvalue
 * pair, bubble the last diagonal entry to the front one swap at a time,
 * then run the bulk reordering, re-checking the factorisation every step.
 */
template<class Field>
void testSchurFromHess(Arnoldi<Field>& Arn, Field& src, int Nlarge, int Nm, int Nk) {
  std::cout << GridLogMessage << "*******************************************" << std::endl;
  std::cout << GridLogMessage << "Testing Schur reordering, Nm = " << Nm << ", Nk = " << Nk << std::endl;
  std::cout << GridLogMessage << "*******************************************" << std::endl;
  std::cout << GridLogMessage << "Running Arnoldi for 1 iteration to get a Hessenberg." << std::endl;
  Arn(src, 1, Nlarge, Nm, Nlarge);
  Eigen::MatrixXcd hessenberg = Arn.getHessenbergMat();
  std::cout << GridLogMessage << "Hessenberg for use: " << std::endl << hessenberg << std::endl;
  ComplexSchurDecomposition decomp (hessenberg, true);
  std::cout << "Schur decomp holds? " << decomp.checkDecomposition() << std::endl;
  std::cout << GridLogMessage << "S = " << std::endl << decomp.getMatrixS() << std::endl;
  // Exchange one adjacent pair of diagonal entries of S.
  std::cout << GridLogMessage << "Swapping S(3, 3) with S(4, 4)" << std::endl;
  decomp.swapEvals(3);
  std::cout << GridLogMessage << "S after swap = " << std::endl << decomp.getMatrixS() << std::endl;
  std::cout << "Schur decomp still holds? " << decomp.checkDecomposition() << std::endl;
  // Now move last diagonal element all the way to the front.
  std::cout << GridLogMessage << "Moving last eval to front. S at start = " << std::endl << decomp.getMatrixS() << std::endl;
  for (int swapIdx = Nk - 2; swapIdx >= 0; --swapIdx) {
    decomp.swapEvals(swapIdx);
    std::cout << GridLogMessage << "S after swap of index " << swapIdx << " = " << std::endl << decomp.getMatrixS() << std::endl;
    std::cout << "Schur decomp still holds? " << decomp.checkDecomposition() << std::endl;
  }
  std::cout << GridLogMessage << "Testing Schur reorder" << std::endl;
  decomp.schurReorder(Nk);
  std::cout << GridLogMessage << "S after reorder = " << std::endl << decomp.getMatrixS() << std::endl;
  std::cout << "Schur decomp still holds? " << decomp.checkDecomposition() << std::endl;
}
// Driver: read a NERSC gauge configuration and Lanczos parameters from XML,
// build a Wilson operator, run the Krylov-Schur eigensolver on it, and write
// out eigenvector densities in SciDAC format.
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=16;
// GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
// std::vector<int> lat_size {32, 32, 32, 32};
// std::cout << "Lattice size: " << lat_size << std::endl;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
// GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
// GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
// 4d Wilson run: the "fermion" grids are simply the 4d gauge grids.
GridCartesian * FGrid = UGrid;
GridRedBlackCartesian * FrbGrid = UrbGrid;
// Construct a coarsened grid
// TODO(poare): replace this with the following line?
Coordinate clatt = GridDefaultLatt();
// Coordinate clatt = GridDefaultLatt(); // [PO] initial line before I edited it
// Blocking factor of 2 in every direction for the coarse grid.
for(int d=0;d<clatt.size();d++){
std::cout << GridLogMessage<< clatt[d] <<std::endl;
clatt[d] = clatt[d]/2;
// clatt[d] = clatt[d]/4;
}
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
// Fixed seeds so runs are reproducible.
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::vector<int> cseeds({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
LatticeFermion result(FGrid); result=Zero();
LatticeFermion ref(FGrid); ref=Zero();
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
// Gauge field read from a NERSC-format file named "config" in the CWD.
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
std::string file("config");
// std::string file("Users/patrickoare/libraries/PETSc-Grid/ckpoint_lat.4000");
NerscIO::readConfiguration(Umu,header,file);
// Read solver parameters from LanParams.xml, then echo them back out to
// LanParams.xml.out as a record of what was used.
LanczosParameters LanParams;
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
// Defaults below are overwritten by LanParams further down.
RealD mass=0.01;
RealD M5=1.8;
// PowerMethod<LatticeFermion> PM; PM(PVdagM, src);
int Nm = 50;
int Nk = 12;
int Np = 38;
// int Nk = Nm+1; // if just running once
int maxIter = 10000;
int Nstop = 10;
RealD resid = 1.0e-5;
// Antiperiodic boundary condition in the time direction.
std::vector<Complex> boundary = {1,1,1,-1};
WilsonOp::ImplParams Params(boundary);
// DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
// DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
mass=LanParams.mass;
std::cout << GridLogIRL<< "mass "<<mass<<std::endl;
WilsonOp WilsonOperator(Umu,*UGrid,*UrbGrid,mass,Params);
// const int nbasis = 20; // size of approximate basis for low-mode space
const int nbasis = 3; // size of approximate basis for low-mode space
const int cb = 0 ;
LatticeFermion prom(FGrid);
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
typedef LittleDiracOperator::CoarseVector CoarseVector;
NextToNearestStencilGeometry5D geom(Coarse5d);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
// typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
// typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
// typedef ShiftedComplexPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedComplexPVdagM_t;
// PVdagM_t PVdagM(Ddwf, Dpv);
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);
// SquaredLinearOperator<DomainWallFermionD, LatticeFermionD> Dsq (Ddwf);
// NonHermitianLinearOperator<DomainWallFermionD, LatticeFermionD> DLinOp (Ddwf);
// Candidate operators to feed the eigensolver; only one is used below.
NonHermitianLinearOperator<WilsonOp,FermionField> Dwilson(WilsonOperator); /// <-----
InvertNonHermitianLinearOperator<WilsonOp,FermionField> Iwilson(WilsonOperator); /// <-----
MdagMLinearOperator<WilsonOp,FermionField> HermOp(WilsonOperator); /// <-----
Gamma5HermitianLinearOperator <WilsonOp,LatticeFermion> HermOp2(WilsonOperator); /// <----
// PowerMethod<LatticeFermion> PM; PM(PVdagM, src);
// Override the hard-coded defaults with the values from LanParams.xml.
resid=LanParams.resid;
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
maxIter=LanParams.maxIter;
Nm = Nk + Np;
int Nu=16;
std::vector<LatticeFermion> src(Nu,FGrid);
for(int i=0;i<Nu;i++) random(RNG5,src[i]);
// Optionally replace the random start vector with a previously saved evec.
if(LanParams.ReadEvec) {
std::string evecs_file="evec_in";
std::cout << GridLogIRL<< "Reading evecs from "<<evecs_file<<std::endl;
emptyUserRecord record;
Grid::ScidacReader RD;
RD.open(evecs_file);
RD.readScidacFieldRecord(src[0],record);
RD.close();
}
Coordinate origin ({0,0,0,0});
auto tmpSrc = peekSite(src[0], origin);
std::cout << "[DEBUG] Source at origin = " << tmpSrc << std::endl;
LatticeFermion src2 = src[0];
// Run KrylovSchur and Arnoldi on a Hermitian matrix
std::cout << GridLogMessage << "Running Krylov Schur" << std::endl;
// KrylovSchur KrySchur (Dsq, FGrid, 1e-8, EvalNormLarge);
// KrylovSchur KrySchur (Dsq, FGrid, 1e-8,EvalImNormSmall);
// KrySchur(src, maxIter, Nm, Nk, Nstop);
// KrylovSchur KrySchur (HermOp2, UGrid, resid,EvalNormSmall);
// Hacked, really EvalImagSmall
#if 1
// Shifted non-Hermitian Wilson operator branch.
RealD shift=1.5;
KrylovSchur KrySchur (Dwilson, UGrid, resid,EvalImNormSmall);
KrySchur(src[0], maxIter, Nm, Nk, Nstop,&shift);
#else
// Inverse-operator branch (targets small eigenvalues of M).
KrylovSchur KrySchur (Iwilson, UGrid, resid,EvalImNormSmall);
KrySchur(src[0], maxIter, Nm, Nk, Nstop);
#endif
std::cout << GridLogMessage << "evec.size= " << KrySchur.evecs.size()<< std::endl;
// Accumulate the first Nstop eigenvectors into src[0] for the summed output.
src[0]=KrySchur.evecs[0];
for (int i=1;i<Nstop;i++) src[0]+=KrySchur.evecs[i];
// Write per-eigenvector local densities |v_i(x)|^2 to disk.
for (int i=0;i<Nstop;i++)
{
std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
auto evdensity = localInnerProduct(KrySchur.evecs[i],KrySchur.evecs[i] );
writeFile(evdensity,evfile);
}
// Write the summed eigenvector field as well.
{
std::string evfile ("./evec_"+std::to_string(mass)+"_sum");
// auto evdensity = localInnerProduct(evec[i],evec[i] );
writeFile(src[0],evfile);
}
/*
std::cout << GridLogMessage << "Running Arnoldi" << std::endl;
// Arnoldi Arn (Dsq, FGrid, 1e-8);
Arnoldi Arn (DLinOp, FGrid, 1e-8);
testSchurFromHess<LatticeFermion>(Arn, src, 10, 6, 4);
Arnoldi Arn2 (DLinOp, FGrid, 1e-8);
testSchurFromHess<LatticeFermion>(Arn2, src, 16, 12, 8);
*/
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage << "Done "<< std::endl;
Grid_finalize();
return 0;
}

View File

@@ -0,0 +1,498 @@
/*************************************************************************************
Runs the Krylov-Schur algorithm on a (pre-conditioned) domain-wall fermion operator
to determine part of its spectrum.
Usage :
$ ./Example_spec_kryschur <Nm> <Nk> <maxiter> <Nstop> <inFile> <outDir> <?rf>
Nm = Maximum size of approximation subspace.
Nk = Size of truncation subspace
maxiter = Maximum number of iterations.
Nstop = Stop when Nstop eigenvalues have converged.
inFile = Gauge configuration to read in.
outDir = Directory to write output to.
rf = (Optional) RitzFilter to sort with. Takes in any string in
{EvalNormSmall, EvalNormLarge, EvalReSmall, EvalReLarge, EvalImSmall, EvalImLarge}
Output:
${outDir}/evals.txt = Contains all eigenvalues. Each line is formatted as `$idx $eval $ritz`, where:
- $idx is the index of the eigenvalue.
- $eval is the eigenvalue, formatted as "(re,im)".
- $ritz is the Ritz estimate of the eigenvalue (deviation from being a true eigenvalue)
${outDir}/evec${idx} = Eigenvector $idx written out in SCIDAC format (if LIME is enabled).
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_padded_cell.cc
Copyright (C) 2023
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Patrick Oare <poare@bnl.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <cstdlib>
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
#include <Grid/algorithms/iterative/BiCGSTAB.h>
#include <Grid/parallelIO/IldgIOtypes.h>
#include <Grid/parallelIO/IldgIO.h>
using namespace std;
using namespace Grid;
namespace Grid {
// Run-time parameters for the Lanczos/Krylov eigensolver, read from XML via
// Grid's serialisation macros (see initialize()).
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, mstep ,
Integer, Nstop,
Integer, Nk,
Integer, Np,
Integer, ReadEvec,
RealD, resid,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Default constructor is intentionally empty: all values are expected to
// be supplied through initialize().
LanczosParameters() {
/////////////////////////////////
}
// Construct directly from a serialisation reader (delegates to initialize).
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
// Populate this object from the "HMC" node of the reader.
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
read(TheReader, "HMC", *this);
}
// Placeholder: parameter printing is currently disabled (all lines commented).
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
// Persist a lattice field in SciDAC format.  Compiles to a no-op when Grid
// was built without LIME, since the SciDAC writer depends on it.
template <class T> void writeFile(T& in, std::string const fname){
#ifdef HAVE_LIME
  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
  Grid::ScidacWriter writer(in.Grid()->IsBoss());
  Grid::emptyUserRecord rec;
  writer.open(fname);
  writer.writeScidacFieldRecord(in,rec,0); // Lexico
  writer.close();
#endif
}
/**
 * Writes the eigensystem of a Krylov-Schur object to a directory.
 *
 * Parameters
 * ----------
 * KrylovSchur<Field>& KS
 *     Solver whose eigenvalues / Ritz estimates are written.  Taken by
 *     reference: the object holds a std::vector of lattice fields, so the
 *     previous pass-by-value copied the entire eigenvector set on each call.
 * std::string outDir
 *     Directory to write to.  Eigenvalues go to ${outDir}/evals.txt, one
 *     line per value formatted as "<idx> <eval> <ritz>".
 */
template <class Field>
void writeEigensystem(KrylovSchur<Field>& KS, std::string outDir) {
int Nk = KS.getNk();
std::cout << GridLogMessage << "Writing output to directory: " << outDir << std::endl;
// Write evals
std::string evalPath = outDir + "/evals.txt";
std::ofstream fEval;
fEval.open(evalPath);
if (!fEval) { // fail loudly rather than silently dropping the spectrum
std::cout << GridLogMessage << "Warning: could not open " << evalPath << " for writing" << std::endl;
return;
}
Eigen::VectorXcd evals = KS.getEvals();
std::vector<RealD> ritz = KS.getRitzEstimates();
for (int i = 0; i < Nk; i++) {
// write eigenvalues and Ritz estimates
fEval << i << " " << evals(i) << " " << ritz[i];
if (i < Nk - 1) { fEval << "\n"; }
}
fEval.close();
// Write evecs (TODO: very heavy on storage costs! Don't write them all out)
// std::vector<Field> evecs = KS.getEvecs();
// for (int i = 0; i < Nk; i++) {
// std::string fName = outDir + "/evec" + std::to_string(i);
// writeFile(evecs[i], fName); // using method from Grid/HMC/ComputeWilsonFlow.cc
// }
}
// Hermitize a DWF operator: despite the class name, HermOp applies the
// normal operator Mdag M (one application of M, then one of Mdag), not M^2.
template<class Matrix,class Field>
class SquaredLinearOperator : public LinearOperatorBase<Field> {
public:
Matrix &_Mat;  // underlying operator (not owned)
public:
SquaredLinearOperator(Matrix &Mat): _Mat(Mat) {};
// Coarsening hooks are unsupported and assert.
void OpDiag (const Field &in, Field &out) { assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
// Both Op and AdjOp forward to HermOp, since Mdag M is self-adjoint.
void Op (const Field &in, Field &out){
// std::cout << "Op is overloaded as HermOp" << std::endl;
HermOp(in, out);
}
void AdjOp (const Field &in, Field &out){
HermOp(in, out);
}
// Raw application of M.
void _Op (const Field &in, Field &out){
// std::cout << "Op: M "<<std::endl;
_Mat.M(in, out);
}
// Raw application of Mdag.
void _AdjOp (const Field &in, Field &out){
// std::cout << "AdjOp: Mdag "<<std::endl;
_Mat.Mdag(in, out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
// out = Mdag M in.
void HermOp(const Field &in, Field &out){
// std::cout << "HermOp: Mdag M Mdag M"<<std::endl;
Field tmp(in.Grid());
_Op(in,tmp);
_AdjOp(tmp,out);
}
};
// Preconditioned operator G = PV^dag M, where _PV is presumably the
// Pauli-Villars operator (constructed with mass 1.0 at the call site).
// AdjOp applies G^dag = M^dag PV, and HermOp the composition G^dag G.
template<class Matrix,class Field>
class PVdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;  // fermion operator M (not owned)
Matrix &_PV;   // Pauli-Villars-like operator (not owned)
public:
PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
// Coarsening hooks are unsupported and assert.
void OpDiag (const Field &in, Field &out) { assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
// out = PV^dag M in
void Op (const Field &in, Field &out){
std::cout << "Op: PVdag M "<<std::endl;
Field tmp(in.Grid());
_Mat.M(in,tmp);
_PV.Mdag(tmp,out);
}
// out = M^dag PV in  (the adjoint of Op)
void AdjOp (const Field &in, Field &out){
std::cout << "AdjOp: Mdag PV "<<std::endl;
Field tmp(in.Grid());
_PV.M(in,tmp);
_Mat.Mdag(tmp,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
// out = M^dag PV PV^dag M in -- Hermitian normal operator of Op.
void HermOp(const Field &in, Field &out){
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
Field tmp(in.Grid());
// _Mat.M(in,tmp);
// _PV.Mdag(tmp,out);
// _PV.M(out,tmp);
// _Mat.Mdag(tmp,out);
Op(in,tmp);
AdjOp(tmp,out);
// std::cout << "HermOp done "<<norm2(out)<<std::endl;
}
};
// Real-shifted preconditioned operator:  Op(in)    = PV^dag M in + shift*in,
//                                        AdjOp(in) = M^dag PV in + shift*in
// (shift is real, so it is its own conjugate under the adjoint).
template<class Matrix,class Field>
class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;  // fermion operator M (not owned)
Matrix &_PV;   // Pauli-Villars-like operator (not owned)
RealD shift;   // real shift added to the operator
public:
// Initialiser list matches member declaration order (construction order).
ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
// Coarsening hooks are unsupported and assert.
void OpDiag (const Field &in, Field &out) { assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
// out = PV^dag M in + shift*in
void Op (const Field &in, Field &out){
std::cout << "Op: PVdag M "<<std::endl;
Field tmp(in.Grid());
_Mat.M(in,tmp);
_PV.Mdag(tmp,out);
out = out + shift * in;
}
// out = M^dag PV in + shift*in  (adjoint of Op).
// BUGFIX: the previous body applied _PV.M to the *uninitialised* tmp and
// had the two applications in the wrong order; the adjoint of PV^dag M is
// M^dag PV, i.e. apply PV first, then M^dag (cf. PVdagMLinearOperator).
void AdjOp (const Field &in, Field &out){
std::cout << "AdjOp: Mdag PV "<<std::endl;
Field tmp(in.Grid());
_PV.M(in,tmp);
_Mat.Mdag(tmp,out);
out = out + shift * in;
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
// out = AdjOp(Op(in)) -- Hermitian normal operator of the shifted Op.
void HermOp(const Field &in, Field &out){
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
Field tmp(in.Grid());
Op(in,tmp);
AdjOp(tmp,out);
}
};
// Complex-shifted preconditioned operator:  Op(in) = PV^dag M in + shift*in.
// NOTE(review): AdjOp adds `shift` rather than conj(shift); the true adjoint
// of (PV^dag M + s) is M^dag PV + conj(s).  Behaviour kept as-is for real
// shifts -- confirm intent before using genuinely complex shifts.
template<class Matrix, class Field>
class ShiftedComplexPVdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;   // fermion operator M (not owned)
Matrix &_PV;    // Pauli-Villars-like operator (not owned)
ComplexD shift; // complex shift added to the operator
public:
// Initialiser list matches member declaration order (construction order).
ShiftedComplexPVdagMLinearOperator(ComplexD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
// Coarsening hooks are unsupported and assert.
void OpDiag (const Field &in, Field &out) { assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
// out = PV^dag M in + shift*in
void Op (const Field &in, Field &out){
std::cout << "Op: PVdag M "<<std::endl;
Field tmp(in.Grid());
_Mat.M(in,tmp);
_PV.Mdag(tmp,out);
out = out + shift * in;
}
// out = M^dag PV in + shift*in.
// BUGFIX: the previous body applied _PV.M to the *uninitialised* tmp and
// had the two applications in the wrong order; the adjoint of PV^dag M is
// M^dag PV, i.e. apply PV first, then M^dag (cf. PVdagMLinearOperator).
void AdjOp (const Field &in, Field &out){
std::cout << "AdjOp: Mdag PV "<<std::endl;
Field tmp(in.Grid());
_PV.M(in,tmp);
_Mat.Mdag(tmp,out);
out = out + shift * in;
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
// out = AdjOp(Op(in)) -- normal operator of the shifted Op.
void HermOp(const Field &in, Field &out){
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
Field tmp(in.Grid());
Op(in,tmp);
AdjOp(tmp,out);
}
// Change the shift in place (e.g. when scanning shift values).
void resetShift(ComplexD newShift) {
shift = newShift;
}
};
#if 0
// Two-level multigrid V-cycle preconditioner (currently compiled out):
// pre-smooth, project residual to the coarse grid, coarse solve, prolongate
// the correction back, then post-smooth on the new residual.
template<class Fobj,class CComplex,int nbasis>
class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField FineField;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
typedef LinearOperatorBase<FineField> FineOperator;
typedef LinearFunction <FineField> FineSmoother;
typedef LinearOperatorBase<CoarseVector> CoarseOperator;
typedef LinearFunction <CoarseVector> CoarseSolver;
// All collaborators are held by reference; none are owned.
Aggregates & _Aggregates;
FineOperator & _FineOperator;
FineSmoother & _PreSmoother;
FineSmoother & _PostSmoother;
CoarseOperator & _CoarseOperator;
CoarseSolver & _CoarseSolve;
int level; void Level(int lv) {level = lv; };
MGPreconditioner(Aggregates &Agg,
FineOperator &Fine,
FineSmoother &PreSmoother,
FineSmoother &PostSmoother,
CoarseOperator &CoarseOperator_,
CoarseSolver &CoarseSolve_)
: _Aggregates(Agg),
_FineOperator(Fine),
_PreSmoother(PreSmoother),
_PostSmoother(PostSmoother),
_CoarseOperator(CoarseOperator_),
_CoarseSolve(CoarseSolve_),
level(1) { }
// Apply one V-cycle: out approximates _FineOperator^{-1} in.
virtual void operator()(const FineField &in, FineField & out)
{
GridBase *CoarseGrid = _Aggregates.CoarseGrid;
// auto CoarseGrid = _CoarseOperator.Grid();
CoarseVector Csrc(CoarseGrid);
CoarseVector Csol(CoarseGrid);
FineField vec1(in.Grid());
FineField vec2(in.Grid());
std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
// std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
double t;
// Fine Smoother
// out = in;
out = Zero();
t=-usecond();
_PreSmoother(in,out);
t+=usecond();
std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;
// Update the residual: vec1 = in - A*out
_FineOperator.Op(out,vec1); sub(vec1, in ,vec1);
// std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
// Fine to Coarse
t=-usecond();
_Aggregates.ProjectToSubspace (Csrc,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
// Coarse correction
t=-usecond();
Csol = Zero();
_CoarseSolve(Csrc,Csol);
//Csol=Zero();
t+=usecond();
std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
// Coarse to Fine: prolongate the coarse correction and add it in.
t=-usecond();
// _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
_Aggregates.PromoteFromSubspace(Csol,vec1);
add(out,out,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
// Residual: vec1 = in - A*out after the coarse correction
_FineOperator.Op(out,vec1); sub(vec1 ,in , vec1);
// std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
// Fine Smoother: post-smooth on the remaining residual, accumulate into out.
t=-usecond();
// vec2=vec1;
vec2=Zero();
_PostSmoother(vec1,vec2);
t+=usecond();
std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
add( out,out,vec2);
std::cout<<GridLogMessage << "Done " <<std::endl;
}
};
#endif
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
// Usage : $ ./Example_spec_kryschur <Nm> <Nk> <maaxiter> <Nstop> <inFile> <outDir>
std::string NmStr = argv[1];
std::string NkStr = argv[2];
std::string maxIterStr = argv[3];
std::string NstopStr = argv[4];
std::string file = argv[5];
std::string outDir = argv[6];
RitzFilter RF;
if (argc == 8) {
std::string rf = argv[7];
RF = selectRitzFilter(rf);
} else {
RF = EvalReSmall;
}
std::cout << "Sorting eigenvalues using " << rfToString(RF) << std::endl;
//const int Ls=16;
const int Ls = 8;
// GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
//std::vector<int> lat_size {16, 16, 16, 32};
std::vector<int> lat_size {8, 8, 8, 8};
std::cout << "Lattice size: " << lat_size << std::endl;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(lat_size,
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeFermion src(FGrid); random(RNG5,src);
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
NerscIO::readConfiguration(Umu,header,file);
// RealD mass=0.01;
RealD mass=0.001;
RealD M5=1.8;
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
typedef ShiftedComplexPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedComplexPVdagM_t;
PVdagM_t PVdagM(Ddwf, Dpv);
ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);
SquaredLinearOperator<DomainWallFermionD, LatticeFermionD> Dsq (Ddwf);
NonHermitianLinearOperator<DomainWallFermionD, LatticeFermionD> DLinOp (Ddwf);
int Nm = std::stoi(NmStr);
int Nk = std::stoi(NkStr);
int maxIter = std::stoi(maxIterStr);
int Nstop = std::stoi(NstopStr);
std::cout << GridLogMessage << "Runnning Krylov Schur. Nm = " << Nm << ", Nk = " << Nk << ", maxIter = " << maxIter
<< ", Nstop = " << Nstop << std::endl;
KrylovSchur KrySchur (PVdagM, FGrid, 1e-8, RF); // use preconditioned PV^\dag D_{dwf}
// KrylovSchur KrySchur (DLinOp, FGrid, 1e-8, RF); // use D_{dwf}
KrySchur(src, maxIter, Nm, Nk, Nstop);
std::cout<<GridLogMessage << "*******************************************" << std::endl;
std::cout<<GridLogMessage << "***************** RESULTS *****************" << std::endl;
std::cout<<GridLogMessage << "*******************************************" << std::endl;
std::cout << GridLogMessage << "Krylov Schur eigenvalues: " << std::endl << KrySchur.getEvals() << std::endl;
writeEigensystem(KrySchur, outDir);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage << "Done "<< std::endl;
Grid_finalize();
return 0;
}

View File

@@ -3,6 +3,9 @@
* without regression / tests being applied
*/
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
using namespace std;
@@ -535,5 +538,4 @@ int main (int argc, char ** argv)
Grid_finalize();
}
#endif

View File

@@ -3,6 +3,9 @@
* without regression / tests being applied
*/
#include "disable_examples_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
using namespace std;
@@ -429,5 +432,4 @@ int main (int argc, char ** argv)
Grid_finalize();
}
#endif

View File

@@ -0,0 +1,383 @@
/*************************************************************************************
Script for studying the Wilson eigenvectors resulting from the Krylov-Schur process.
Usage :
$ ./Example_spec_kryschur <Nm> <Nk> <maxiter> <Nstop> <inFile> <outDir> <?rf>
Nm = Maximum size of approximation subspace.
Nk = Size of truncation subspace
maxiter = Maximum number of iterations.
Nstop = Stop when Nstop eigenvalues have converged.
inFile = Gauge configuration to read in.
outDir = Directory to write output to.
rf = (Optional) RitzFilter to sort with. Takes in any string in
{EvalNormSmall, EvalNormLarge, EvalReSmall, EvalReLarge, EvalImSmall, EvalImLarge}
Output:
${outDir}/evals.txt = Contains all eigenvalues. Each line is formatted as `$idx $eval $ritz`, where:
- $idx is the index of the eigenvalue.
- $eval is the eigenvalue, formatted as "(re,im)".
- $ritz is the Ritz estimate of the eigenvalue (deviation from being a true eigenvalue)
${outDir}/evec${idx} = Eigenvector $idx written out in SCIDAC format (if LIME is enabled).
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_padded_cell.cc
Copyright (C) 2023
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Patrick Oare <poare@bnl.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <cstdlib>
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
#include <Grid/algorithms/iterative/BiCGSTAB.h>
#include <Grid/parallelIO/IldgIOtypes.h>
#include <Grid/parallelIO/IldgIO.h>
using namespace std;
using namespace Grid;
// Write a single lattice field to disk in SCIDAC (LIME) format.
// Compiles to a no-op when Grid was configured without LIME support.
template <class T> void writeFile(T& in, std::string const fname){
#ifdef HAVE_LIME
  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
  Grid::emptyUserRecord record;
  // Only the boss rank drives the writer.
  Grid::ScidacWriter writer(in.Grid()->IsBoss());
  writer.open(fname);
  writer.writeScidacFieldRecord(in,record,0); // Lexico
  writer.close();
#endif
}
// Read a single lattice field from a SCIDAC (LIME) file into `out`.
// Compiles to a no-op when Grid was configured without LIME support.
template <class T> void readFile(T& out, std::string const fname){
#ifdef HAVE_LIME
  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
  std::cout << Grid::GridLogMessage << "Reads at: " << fname << std::endl;
  Grid::emptyUserRecord record;
  // Unlike ScidacWriter, the reader takes no boss flag.
  // Grid::ScidacReader SR(out.Grid()->IsBoss());
  Grid::ScidacReader reader;
  reader.open(fname);
  reader.readScidacFieldRecord(out, record);
  reader.close();
#endif
}
/**
 * Writes the eigensystem of a Krylov-Schur object to a directory.
 *
 * Output:
 *   ${outDir}/evals.txt   one line per eigenvalue: "$idx $eval $ritz"
 *   ${outDir}/evec${idx}  eigenvector idx in SCIDAC format (if LIME enabled)
 *
 * Parameters
 * ----------
 * KrylovSchur<Field>& KS
 *     Solver holding the converged eigensystem. Passed by reference:
 *     a by-value copy would duplicate the whole eigenvector basis,
 *     which for lattice fields can be many gigabytes.
 * std::string outDir
 *     Directory to write to (must already exist).
 */
template <class Field>
void writeEigensystem(KrylovSchur<Field>& KS, std::string outDir) {
  int Nk = KS.getNk();
  std::cout << GridLogMessage << "Writing output to directory: " << outDir << std::endl;

  // Write eigenvalues together with their Ritz estimates.
  std::string evalPath = outDir + "/evals.txt";
  std::ofstream fEval;
  fEval.open(evalPath);
  Eigen::VectorXcd evals = KS.getEvals();
  std::vector<RealD> ritz = KS.getRitzEstimates();
  for (int i = 0; i < Nk; i++) {
    fEval << i << " " << evals(i) << " " << ritz[i];
    if (i < Nk - 1) { fEval << "\n"; } // no trailing newline on the last line
  }
  fEval.close();

  // Write eigenvectors.
  int Nevecs = Nk; // don't write all of them
  std::vector<Field> evecs = KS.getEvecs();
  for (int i = 0; i < Nevecs; i++) {
    std::string fName = outDir + "/evec" + std::to_string(i);
    writeFile(evecs[i], fName); // using method from Grid/HMC/ComputeWilsonFlow.cc
  }
}
// Hermitize a non-Hermitian Dirac operator: exposes the normal operator
// M^dag M (gamma5-free Hermitian positive semi-definite form) through
// Op/AdjOp/HermOp. NOTE: despite the class name, this is M^dag M, not (M)^2.
template<class Matrix,class Field>
class SquaredLinearOperator : public LinearOperatorBase<Field> {
public:
  Matrix &_Mat;   // underlying (non-Hermitian) operator M
public:
  SquaredLinearOperator(Matrix &Mat): _Mat(Mat) {};
  // Directional / diagonal pieces are not needed for eigensolvers: disabled.
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  // Op and AdjOp are both overloaded as the Hermitian M^dag M,
  // since the composite operator is self-adjoint.
  void Op (const Field &in, Field &out){
    HermOp(in, out);
  }
  void AdjOp (const Field &in, Field &out){
    HermOp(in, out);
  }
  // _Op/_AdjOp give access to the raw factors M and M^dag.
  void _Op (const Field &in, Field &out){
    _Mat.M(in, out);
  }
  void _AdjOp (const Field &in, Field &out){
    _Mat.Mdag(in, out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  // out = M^dag M in
  void HermOp(const Field &in, Field &out){
    Field tmp(in.Grid());
    _Op(in,tmp);
    _AdjOp(tmp,out);
  }
};
// Pauli-Villars preconditioned operator: Op = PV^dag M, AdjOp = M^dag PV,
// and HermOp is the associated normal operator M^dag PV PV^dag M.
template<class Matrix,class Field>
class PVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;  // Dirac operator M
  Matrix &_PV;   // Pauli-Villars operator (unit mass)
public:
  PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
  // Directional / diagonal pieces are not required here: disabled.
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  // out = PV^dag M in
  void Op (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    Field work(in.Grid());
    _Mat.M(in,work);
    _PV.Mdag(work,out);
  }
  // out = M^dag PV in  (the adjoint of Op)
  void AdjOp (const Field &in, Field &out){
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field work(in.Grid());
    _PV.M(in,work);
    _Mat.Mdag(work,out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  // out = AdjOp(Op(in)) = M^dag PV PV^dag M in
  void HermOp(const Field &in, Field &out){
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field work(in.Grid());
    Op(in,work);
    AdjOp(work,out);
  }
};
// Real-shifted Pauli-Villars operator: Op = PV^dag M + shift.
// The shift is real, so AdjOp = M^dag PV + shift (no conjugation needed).
template<class Matrix,class Field>
class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;  // Dirac operator M
  Matrix &_PV;   // Pauli-Villars operator
  RealD shift;   // real spectral shift
public:
  // Init-list reordered to match declaration order (silences -Wreorder;
  // no behavioral change since the members are independent).
  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  // out = PV^dag M in + shift*in
  void Op (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  // out = M^dag PV in + shift*in
  void AdjOp (const Field &in, Field &out){
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    // BUGFIX: previous code applied _PV.M(tmp,out) before tmp was ever
    // assigned (uninitialized read) and then discarded the Mdag result.
    // Correct order: tmp = PV in, then out = M^dag tmp.
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
    out = out + shift * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  // out = AdjOp(Op(in)) — the normal operator of the shifted PV^dag M.
  void HermOp(const Field &in, Field &out){
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
};
// Complex-shifted Pauli-Villars operator: Op = PV^dag M + shift.
// For the adjoint, the scalar term must be conjugated:
// AdjOp = M^dag PV + conj(shift).
template<class Matrix, class Field>
class ShiftedComplexPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;   // Dirac operator M
  Matrix &_PV;    // Pauli-Villars operator
  ComplexD shift; // complex spectral shift
public:
  // Init-list reordered to match declaration order (silences -Wreorder).
  ShiftedComplexPVdagMLinearOperator(ComplexD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  // out = PV^dag M in + shift*in
  void Op (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  // out = M^dag PV in + conj(shift)*in
  void AdjOp (const Field &in, Field &out){
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    // BUGFIX: previous code applied _PV.M(tmp,out) before tmp was ever
    // assigned (uninitialized read) and then discarded the Mdag result.
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
    // BUGFIX: adjoint of (A + s) is (A^dag + conj(s)); the shift was
    // previously added unconjugated, making HermOp non-Hermitian.
    out = out + std::conj(shift) * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  // out = AdjOp(Op(in)) — the normal operator of the shifted PV^dag M.
  void HermOp(const Field &in, Field &out){
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
  // Re-target the operator at a new shift (e.g. when scanning the spectrum).
  void resetShift(ComplexD newShift) {
    shift = newShift;
  }
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
// Usage : $ ./Example_wilson_evecs ${inFile}
std::string file = argv[1];
const int Ls=16;
// GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
//std::vector<int> lat_size {16, 16, 16, 32};
std::vector<int> lat_size {32, 32, 32, 32};
std::cout << "Lattice size: " << lat_size << std::endl;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(lat_size,
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
// GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
// GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
GridCartesian * FGrid = UGrid;
GridRedBlackCartesian * FrbGrid = UrbGrid;
std::vector<int> seeds4({1,2,3,4});
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
LatticeFermion src(FGrid); random(RNG4, src);
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
NerscIO::readConfiguration(Umu, header, file);
std::cout << GridLogMessage << "Loaded configuration" << std::endl;
// RealD mass = 0.01;
RealD M5 = 1.8;
// Wilson mass
RealD mass = -1.6;
std::cout << GridLogMessage << "masses specified" << std::endl;
std::vector<Complex> boundary = {1,1,1,-1};
WilsonFermionD::ImplParams Params(boundary);
// DomainWallFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
// NonHermitianLinearOperator<DomainWallFermionD, LatticeFermionD> DLinOp (Ddwf);
// WilsonFermionD Dwilson(Umu, *FGrid, *FrbGrid, mass);
WilsonFermionD Dwilson(Umu, *UGrid, *UrbGrid, mass, Params);
NonHermitianLinearOperator<WilsonFermionD, LatticeFermionD> DLinOp (Dwilson);
std::cout << GridLogMessage << "Dirac operator defined" << std::endl;
std::string eigenPath = "/home/poare/lqcd/multigrid/spectra/32cube-rho0.124-tau4/U_smr_3.000000/Nm72_Nk24_8111835.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov/";
std::cout << GridLogMessage << "Loading eigenvalues" << std::endl;
std::ifstream evalFile(eigenPath + "evals.txt");
std::string str;
std::vector<ComplexD> evals;
while (std::getline(evalFile, str)) {
std::cout << GridLogMessage << "Reading line: " << str << std::endl;
int i1 = str.find("(") + 1;
int i2 = str.find(",") + 1;
int i3 = str.find(")");
std::cout << "i1,i2,i3 = " << i1 << "," << i2 << "," << i3 << std::endl;
std::string reStr = str.substr(i1, i2 - i1);
std::string imStr = str.substr(i2, i3 - i2);
std::cout << GridLogMessage << "Parsed re = " << reStr << " and im = " << imStr << std::endl;
// ComplexD z (std::stof(reStr), std::stof(imStr));
ComplexD z (std::stod(reStr), std::stod(imStr));
evals.push_back(z);
}
std::cout << GridLogMessage << "Eigenvalues: " << evals << std::endl;
int Nevecs = 24;
std::vector<LatticeFermion> evecs;
LatticeFermion evec (FGrid);
for (int i = 0; i < Nevecs; i++) {
std::string evecPath = eigenPath + "evec" + std::to_string(i);
readFile(evec, evecPath);
evecs.push_back(evec);
}
std::cout << GridLogMessage << "Evecs loaded" << std::endl;
// Compute < evec | D - \lambda | evec >
std::cout << GridLogMessage << "Testing eigenvectors" << std::endl;
LatticeFermion Devec (FGrid);
ComplexD ritz;
for (int i = 0; i < Nevecs; i++) {
Devec = Zero();
DLinOp.Op(evecs[i], Devec);
ritz = std::sqrt(norm2(Devec - evals[i] * evecs[i]));
std::cout << GridLogMessage << "i = " << i << ", || (D - lambda) |vi> || = " << ritz << std::endl;
}
// Eigen::MatrixXcd Dw_evecs;
// Dw_evecs = Eigen::MatrixXcd::Zero(Nevecs, Nevecs);
// for (int i = 0; i < Nevecs; i++) {
// Linop.Op(evecs[i], Devec);
// for (int j = 0; j < Nevecs; j++) {
// }
// }
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage << "Done "<< std::endl;
Grid_finalize();
return 0;
}

View File

@@ -0,0 +1,374 @@
/*************************************************************************************
Runs the Krylov-Schur algorithm on a Wilson fermion operator to determine part of its spectrum.
TODO rename this file: really is running the topology change jobs on Aurora.
Usage :
$ ./Example_spec_kryschur <Nm> <Nk> <maxiter> <Nstop> <inFile> <outDir> <?rf>
Nm = Maximum size of approximation subspace.
Nk = Size of truncation subspace
maxiter = Maximum number of iterations.
Nstop = Stop when Nstop eigenvalues have converged.
inFile = Gauge configuration to read in.
outDir = Directory to write output to.
rf = (Optional) RitzFilter to sort with. Takes in any string in
{EvalNormSmall, EvalNormLarge, EvalReSmall, EvalReLarge, EvalImSmall, EvalImLarge}
Output:
${outDir}/evals.txt = Contains all eigenvalues. Each line is formatted as `$idx $eval $ritz`, where:
- $idx is the index of the eigenvalue.
- $eval is the eigenvalue, formatted as "(re,im)".
- $ritz is the Ritz estimate of the eigenvalue (deviation from being a true eigenvalue)
${outDir}/evec${idx} = Eigenvector $idx written out in SCIDAC format (if LIME is enabled).
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_padded_cell.cc
Copyright (C) 2023
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Patrick Oare <poare@bnl.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <cstdlib>
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
#include <Grid/algorithms/iterative/BiCGSTAB.h>
#include <Grid/parallelIO/IldgIOtypes.h>
#include <Grid/parallelIO/IldgIO.h>
using namespace std;
using namespace Grid;
// Write one lattice field to disk in SCIDAC (LIME) format; no-op if
// Grid was built without LIME.
template <class T> void writeFile(T& in, std::string const fname){
#ifdef HAVE_LIME
  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
  Grid::emptyUserRecord record;
  // Only the boss rank drives the writer.
  Grid::ScidacWriter writer(in.Grid()->IsBoss());
  writer.open(fname);
  writer.writeScidacFieldRecord(in,record,0); // Lexico
  writer.close();
#endif
}
/**
 * Writes the eigensystem of a Krylov-Schur object to a directory.
 *
 * Output:
 *   ${outDir}/evals.txt   one line per eigenvalue: "$idx $eval $ritz"
 *   ${outDir}/evec${idx}  eigenvector idx in SCIDAC format (if LIME enabled)
 *
 * Parameters
 * ----------
 * KrylovSchur<Field>& KS
 *     Solver holding the converged eigensystem. Passed by reference:
 *     a by-value copy would duplicate the whole eigenvector basis,
 *     which for lattice fields can be many gigabytes.
 * std::string outDir
 *     Directory to write to (must already exist).
 */
template <class Field>
void writeEigensystem(KrylovSchur<Field>& KS, std::string outDir) {
  int Nk = KS.getNk();
  std::cout << GridLogMessage << "Writing output to directory: " << outDir << std::endl;

  // Write eigenvalues together with their Ritz estimates.
  std::string evalPath = outDir + "/evals.txt";
  std::ofstream fEval;
  fEval.open(evalPath);
  Eigen::VectorXcd evals = KS.getEvals();
  std::vector<RealD> ritz = KS.getRitzEstimates();
  for (int i = 0; i < Nk; i++) {
    fEval << i << " " << evals(i) << " " << ritz[i];
    if (i < Nk - 1) { fEval << "\n"; } // no trailing newline on the last line
  }
  fEval.close();

  // Write eigenvectors.
  int Nevecs = Nk; // don't write all of them
  std::vector<Field> evecs = KS.getEvecs();
  for (int i = 0; i < Nevecs; i++) {
    std::string fName = outDir + "/evec" + std::to_string(i);
    writeFile(evecs[i], fName); // using method from Grid/HMC/ComputeWilsonFlow.cc
  }
}
// Hermitize a non-Hermitian Dirac operator: exposes the normal operator
// M^dag M (Hermitian positive semi-definite) through Op/AdjOp/HermOp.
// NOTE: despite the class name, this is M^dag M, not (M)^2.
template<class Matrix,class Field>
class SquaredLinearOperator : public LinearOperatorBase<Field> {
public:
  Matrix &_Mat;   // underlying (non-Hermitian) operator M
public:
  SquaredLinearOperator(Matrix &Mat): _Mat(Mat) {};
  // Directional / diagonal pieces are not needed for eigensolvers: disabled.
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  // Op and AdjOp are both overloaded as the Hermitian M^dag M,
  // since the composite operator is self-adjoint.
  void Op (const Field &in, Field &out){
    HermOp(in, out);
  }
  void AdjOp (const Field &in, Field &out){
    HermOp(in, out);
  }
  // _Op/_AdjOp give access to the raw factors M and M^dag.
  void _Op (const Field &in, Field &out){
    _Mat.M(in, out);
  }
  void _AdjOp (const Field &in, Field &out){
    _Mat.Mdag(in, out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  // out = M^dag M in
  void HermOp(const Field &in, Field &out){
    Field tmp(in.Grid());
    _Op(in,tmp);
    _AdjOp(tmp,out);
  }
};
// Pauli-Villars preconditioned operator: Op = PV^dag M, AdjOp = M^dag PV,
// and HermOp is the associated normal operator M^dag PV PV^dag M.
template<class Matrix,class Field>
class PVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;  // Dirac operator M
  Matrix &_PV;   // Pauli-Villars operator (unit mass)
public:
  PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
  // Directional / diagonal pieces are not required here: disabled.
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  // out = PV^dag M in
  void Op (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    Field work(in.Grid());
    _Mat.M(in,work);
    _PV.Mdag(work,out);
  }
  // out = M^dag PV in  (the adjoint of Op)
  void AdjOp (const Field &in, Field &out){
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field work(in.Grid());
    _PV.M(in,work);
    _Mat.Mdag(work,out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  // out = AdjOp(Op(in)) = M^dag PV PV^dag M in
  void HermOp(const Field &in, Field &out){
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field work(in.Grid());
    Op(in,work);
    AdjOp(work,out);
  }
};
// Real-shifted Pauli-Villars operator: Op = PV^dag M + shift.
// The shift is real, so AdjOp = M^dag PV + shift (no conjugation needed).
template<class Matrix,class Field>
class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;  // Dirac operator M
  Matrix &_PV;   // Pauli-Villars operator
  RealD shift;   // real spectral shift
public:
  // Init-list reordered to match declaration order (silences -Wreorder;
  // no behavioral change since the members are independent).
  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  // out = PV^dag M in + shift*in
  void Op (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  // out = M^dag PV in + shift*in
  void AdjOp (const Field &in, Field &out){
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    // BUGFIX: previous code applied _PV.M(tmp,out) before tmp was ever
    // assigned (uninitialized read) and then discarded the Mdag result.
    // Correct order: tmp = PV in, then out = M^dag tmp.
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
    out = out + shift * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  // out = AdjOp(Op(in)) — the normal operator of the shifted PV^dag M.
  void HermOp(const Field &in, Field &out){
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
};
// Complex-shifted Pauli-Villars operator: Op = PV^dag M + shift.
// For the adjoint, the scalar term must be conjugated:
// AdjOp = M^dag PV + conj(shift).
template<class Matrix, class Field>
class ShiftedComplexPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;   // Dirac operator M
  Matrix &_PV;    // Pauli-Villars operator
  ComplexD shift; // complex spectral shift
public:
  // Init-list reordered to match declaration order (silences -Wreorder).
  ShiftedComplexPVdagMLinearOperator(ComplexD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  // out = PV^dag M in + shift*in
  void Op (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  // out = M^dag PV in + conj(shift)*in
  void AdjOp (const Field &in, Field &out){
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    // BUGFIX: previous code applied _PV.M(tmp,out) before tmp was ever
    // assigned (uninitialized read) and then discarded the Mdag result.
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
    // BUGFIX: adjoint of (A + s) is (A^dag + conj(s)); the shift was
    // previously added unconjugated, making HermOp non-Hermitian.
    out = out + std::conj(shift) * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  // out = AdjOp(Op(in)) — the normal operator of the shifted PV^dag M.
  void HermOp(const Field &in, Field &out){
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
  // Re-target the operator at a new shift (e.g. when scanning the spectrum).
  void resetShift(ComplexD newShift) {
    shift = newShift;
  }
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
// Usage : $ ./Example_spec_kryschur <Nm> <Nk> <maaxiter> <Nstop> <inFile> <outDir>
std::string NmStr = argv[1];
std::string NkStr = argv[2];
std::string maxIterStr = argv[3];
std::string NstopStr = argv[4];
std::string file = argv[5];
std::string outDir = argv[6];
// RitzFilter RF;
// if (argc == 8) {
// std::string rf = argv[7];
// RF = selectRitzFilter(rf);
// } else {
// RF = EvalReSmall;
// }
// RitzFilter RF;
std::string rf = argv[7];
RitzFilter RF = selectRitzFilter(rf);
std::cout << "Sorting eigenvalues using " << rfToString(RF) << std::endl;
const int Ls=16;
// GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
//std::vector<int> lat_size {16, 16, 16, 32};
std::vector<int> lat_size {32, 32, 32, 32};
std::cout << "Lattice size: " << lat_size << std::endl;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(lat_size,
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
// GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
// GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
GridCartesian * FGrid = UGrid;
GridRedBlackCartesian * FrbGrid = UrbGrid;
std::vector<int> seeds4({1,2,3,4});
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
LatticeFermion src(FGrid); random(RNG4, src);
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
NerscIO::readConfiguration(Umu, header, file);
std::cout << GridLogMessage << "Loaded configuration" << std::endl;
// RealD mass = 0.01;
RealD M5 = 1.8;
// Wilson mass
RealD mass = -1.6;
std::cout << GridLogMessage << "masses specified" << std::endl;
std::vector<Complex> boundary = {1,1,1,-1};
WilsonFermionD::ImplParams Params(boundary);
// DomainWallFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
// NonHermitianLinearOperator<DomainWallFermionD, LatticeFermionD> DLinOp (Ddwf);
// WilsonFermionD Dwilson(Umu, *FGrid, *FrbGrid, mass);
WilsonFermionD Dwilson(Umu, *UGrid, *UrbGrid, mass, Params);
NonHermitianLinearOperator<WilsonFermionD, LatticeFermionD> DLinOp (Dwilson);
std::cout << GridLogMessage << "Dirac operator defined" << std::endl;
// Define PV^dag D (if we want)
// DomainWallFermionD Dpv(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, 1.0, M5);
// typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
// PVdagM_t PVdagM(Ddwf, Dpv);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
// SquaredLinearOperator<WilsonFermionD, LatticeFermionD> Dsq (DWilson);
// NonHermitianLinearOperator<WilsonFermionD, LatticeFermionD> DLinOp (DWilson);
int Nm = std::stoi(NmStr);
int Nk = std::stoi(NkStr);
int maxIter = std::stoi(maxIterStr);
int Nstop = std::stoi(NstopStr);
std::cout << GridLogMessage << "Runnning Krylov Schur. Nm = " << Nm << ", Nk = " << Nk << ", maxIter = " << maxIter
<< ", Nstop = " << Nstop << std::endl;
// KrylovSchur KrySchur (PVdagM, FGrid, 1e-8, RF); // use PV^\dag M
KrylovSchur KrySchur (DLinOp, FGrid, 1e-8, RF); // use Ddwf
KrySchur(src, maxIter, Nm, Nk, Nstop);
std::cout << GridLogMessage << "Checking eigensystem." << std::endl;
KrySchur.checkRitzEstimate();
std::cout<<GridLogMessage << "*******************************************" << std::endl;
std::cout<<GridLogMessage << "***************** RESULTS *****************" << std::endl;
std::cout<<GridLogMessage << "*******************************************" << std::endl;
std::cout << GridLogMessage << "Krylov Schur eigenvalues: " << std::endl << KrySchur.getEvals() << std::endl;
writeEigensystem(KrySchur, outDir);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage << "Done "<< std::endl;
Grid_finalize();
return 0;
}

View File

@@ -0,0 +1,15 @@
// Stub for tests/examples that require fermion instantiations.
// When Grid is configured without ENABLE_FERMION_INSTANTIATIONS this
// header supplies a main() that explains how to rebuild; the including
// test wraps its own code in #ifdef ENABLE_FERMION_INSTANTIATIONS.
#pragma once
// Grid.h defines ENABLE_FERMION_INSTANTIATIONS (via config) — it must be
// included before the #ifndef check below.
#include <Grid/Grid.h>
#ifndef ENABLE_FERMION_INSTANTIATIONS
#include <iostream>
int main(void) {
  // BUGFIX: added missing space before "to run" (message previously read
  // "...--enable-fermion-instantiationsto run this example.").
  std::cout << "This build of Grid was configured to exclude fermion instantiations, "
            << "which this example relies on. "
            << "Please reconfigure and rebuild Grid with --enable-fermion-instantiations "
            << "to run this example."
            << std::endl;
  return 1;
}
#endif

View File

@@ -25,6 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_tests_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
using namespace std;
@@ -273,8 +276,6 @@ void TestWhat(What & Ddwf,
err = phi-chi;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<< std::endl;
}
#endif

View File

@@ -30,6 +30,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
* Reimplement the badly named "multigrid" lanczos as compressed Lanczos using the features
* in Grid that were intended to be used to support blocked Aggregates, from
*/
#include "disable_tests_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
@@ -256,3 +259,4 @@ int main (int argc, char ** argv) {
Grid_finalize();
}
#endif

View File

@@ -25,6 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_tests_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
using namespace std;
@@ -237,3 +240,5 @@ int main (int argc, char ** argv)
Grid_finalize();
}
#endif

View File

@@ -25,6 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_tests_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
using namespace std;
@@ -222,3 +225,5 @@ int main (int argc, char ** argv)
Grid_finalize();
}
#endif

View File

@@ -25,6 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "disable_tests_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
using namespace std;
@@ -118,3 +121,4 @@ int main (int argc, char ** argv)
Grid_finalize();
}
#endif
#endif

View File

@@ -24,6 +24,8 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
#include "disable_tests_without_instantiations.h"
#ifdef ENABLE_FERMION_INSTANTIATIONS
#include <Grid/Grid.h>
#include <Grid/qcd/utils/A2Autils.h>
@@ -157,3 +159,5 @@ int main(int argc, char *argv[])
return EXIT_SUCCESS;
}
#endif

View File

@@ -128,6 +128,10 @@ int main (int argc, char ** argv)
typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
HermFineMatrix FineHermOp(HermOpEO);
LatticeFermionD src(FrbGrid);
src = ComplexD(1.0);
PowerMethod<LatticeFermionD> PM; PM(HermOpEO,src);
////////////////////////////////////////////////////////////
///////////// Coarse basis and Little Dirac Operator ///////
////////////////////////////////////////////////////////////
@@ -150,7 +154,7 @@ int main (int argc, char ** argv)
std::cout << "**************************************"<<std::endl;
std::cout << "Create Subspace"<<std::endl;
std::cout << "**************************************"<<std::endl;
Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.);
Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,35.,0.01,500);// <== last run
std::cout << "**************************************"<<std::endl;
std::cout << "Refine Subspace"<<std::endl;
@@ -185,7 +189,7 @@ int main (int argc, char ** argv)
std::cout << "**************************************"<<std::endl;
typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
Chebyshev<CoarseVector> IRLCheby(0.05,40.0,101); // 1 iter
Chebyshev<CoarseVector> IRLCheby(0.01,16.0,201); // 1 iter
MrhsHermMatrix MrhsCoarseOp (mrhs);
CoarseVector pm_src(CoarseMrhs);
@@ -193,10 +197,10 @@ int main (int argc, char ** argv)
PowerMethod<CoarseVector> cPM;
cPM(MrhsCoarseOp,pm_src);
int Nk=nrhs;
int Nm=Nk*3;
// int Nk=36;
// int Nm=144;
// int Nk=16;
// int Nm=Nk*3;
int Nk=32;
int Nm=128;
int Nstop=Nk;
int Nconv_test_interval=1;
@@ -210,7 +214,7 @@ int main (int argc, char ** argv)
nrhs,
Nk,
Nm,
1e-4,10);
1e-4,100);
int Nconv;
std::vector<RealD> eval(Nm);
@@ -231,8 +235,6 @@ int main (int argc, char ** argv)
std::cout << "**************************************"<<std::endl;
std::cout << " Recompute coarse evecs "<<std::endl;
std::cout << "**************************************"<<std::endl;
evec.resize(Nm,Coarse5d);
eval.resize(Nm);
for(int r=0;r<nrhs;r++){
random(CRNG,c_src[r]);
}
@@ -243,7 +245,7 @@ int main (int argc, char ** argv)
// Deflation guesser object
///////////////////////
std::cout << "**************************************"<<std::endl;
std::cout << " Reimport coarse evecs "<<std::endl;
std::cout << " Reimport coarse evecs "<<evec.size()<<" "<<eval.size()<<std::endl;
std::cout << "**************************************"<<std::endl;
MultiRHSDeflation<CoarseVector> MrhsGuesser;
MrhsGuesser.ImportEigenBasis(evec,eval);
@@ -252,9 +254,11 @@ int main (int argc, char ** argv)
// Extra HDCG parameters
//////////////////////////
int maxit=3000;
ConjugateGradient<CoarseVector> CG(2.0e-1,maxit,false);
RealD lo=2.0;
int ord = 9;
// ConjugateGradient<CoarseVector> CG(2.0e-1,maxit,false);
// ConjugateGradient<CoarseVector> CG(1.0e-2,maxit,false);
ConjugateGradient<CoarseVector> CG(5.0e-2,maxit,false);
RealD lo=0.2;
int ord = 7;
DoNothingGuesser<CoarseVector> DoNothing;
HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing);
@@ -300,6 +304,19 @@ int main (int argc, char ** argv)
ConjugateGradient<LatticeFermionD> CGfine(1.0e-8,30000,false);
CGfine(HermOpEO, src, result);
}
{
std::cout << "**************************************"<<std::endl;
std::cout << "Calling MdagM CG"<<std::endl;
std::cout << "**************************************"<<std::endl;
LatticeFermion result(FGrid); result=Zero();
LatticeFermion src(FGrid); random(RNG5,src);
result=Zero();
MdagMLinearOperator<MobiusFermionD, LatticeFermionD> HermOp(Ddwf);
ConjugateGradient<LatticeFermionD> CGfine(1.0e-8,30000,false);
CGfine(HermOp, src, result);
}
#endif
Grid_finalize();
return 0;

View File

@@ -368,7 +368,10 @@ int main (int argc, char ** argv)
TrivialPrecon<CoarseVector> simple;
NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
// PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10);
PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(3.0e-2, 100, LinOpCoarse,simple,10,10);
// PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(3.0e-2, 100, LinOpCoarse,simple,12,12); // 35 outer
// PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(5.0e-2, 100, LinOpCoarse,simple,12,12); // 36 outer, 12s
// PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-1, 100, LinOpCoarse,simple,12,12); // 36 ; 11s
PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(3.0e-1, 100, LinOpCoarse,simple,12,12);
L2PGCR.Level(3);
c_res=Zero();
L2PGCR(c_src,c_res);
@@ -400,7 +403,7 @@ int main (int argc, char ** argv)
LinOpCoarse,
L2PGCR);
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,16,16);
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,100,PVdagM,TwoLevelPrecon,10,10);
L1PGCR.Level(1);
f_res=Zero();

View File

@@ -0,0 +1,493 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_padded_cell.cc
Copyright (C) 2023
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
#include <Grid/algorithms/iterative/BiCGSTAB.h>
using namespace std;
using namespace Grid;
// Non-Hermitian operator PVdag M built from two Dirac operators:
// Op applies PVdag * M, AdjOp applies its adjoint Mdag * PV, and
// HermOp composes the two into the Hermitian normal operator
// Mdag PV PVdag M.
template<class Matrix,class Field>
class PVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat; // Dirac operator M (light mass)
  Matrix &_PV;  // Pauli-Villars operator (unit mass)
public:
  PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
  // Directional/diagonal decompositions are not needed by this driver.
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  // out = PVdag M in
  void Op (const Field &in, Field &out){
    Field mid(in.Grid());
    _Mat.M(in,mid);
    _PV.Mdag(mid,out);
  }
  // out = Mdag PV in  (the adjoint of Op)
  void AdjOp (const Field &in, Field &out){
    Field mid(in.Grid());
    _PV.M(in,mid);
    _Mat.Mdag(mid,out);
  }
  // Not implemented for this non-Hermitian wrapper.
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    assert(0);
  }
  // out = Mdag PV PVdag M in : the Hermitian normal operator.
  void HermOp(const Field &in, Field &out){
    Field mid(in.Grid());
    Op(in,mid);
    AdjOp(mid,out);
  }
};
// Non-Hermitian operator Mdag PV: the reversed ordering of
// PVdagMLinearOperator. HermOp applies the normal operator
// PVdag M Mdag PV.
template<class Matrix,class Field>
class MdagPVLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat; // Dirac operator M
  Matrix &_PV;  // Pauli-Villars operator
public:
  MdagPVLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
  // Directional/diagonal decompositions are unused here.
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  // out = Mdag PV in
  void Op (const Field &in, Field &out){
    Field mid(in.Grid());
    _PV.M(in,mid);
    _Mat.Mdag(mid,out);
  }
  // out = PVdag M in  (the adjoint of Op)
  void AdjOp (const Field &in, Field &out){
    Field mid(in.Grid());
    _Mat.M(in,mid);
    _PV.Mdag(mid,out);
  }
  // Not implemented for this non-Hermitian wrapper.
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    assert(0);
  }
  // out = PVdag M Mdag PV in : the Hermitian normal operator.
  void HermOp(const Field &in, Field &out){
    Field mid(in.Grid());
    Op(in,mid);
    AdjOp(mid,out);
  }
};
// Shifted operator (PVdag M + shift). The real shift regulates the
// low-mode spectrum so the operator can serve as a multigrid smoother.
// Op applies (PVdag M + shift), AdjOp its adjoint (Mdag PV + shift),
// and HermOp the Hermitian composition AdjOp(Op(in)).
template<class Matrix,class Field>
class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat; // Dirac operator M
  Matrix &_PV;  // Pauli-Villars operator
  RealD shift;  // real spectral shift
public:
  // Initialiser list follows member declaration order (avoids -Wreorder).
  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  // out = (PVdag M + shift) in
  void Op (const Field &in, Field &out){
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  // out = (Mdag PV + shift) in -- the adjoint of Op.
  // BUGFIX: the original applied _PV.M to an uninitialised tmp and then
  // wrote Mdag(in) into tmp afterwards, so "out" never held the adjoint;
  // apply PV first, then Mdag, matching PVdagMLinearOperator::AdjOp.
  void AdjOp (const Field &in, Field &out){
    Field tmp(in.Grid());
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
    out = out + shift * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  // out = AdjOp(Op(in)) : Hermitian normal operator of the shifted system.
  void HermOp(const Field &in, Field &out){
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
};
// Two-level multigrid preconditioner using (possibly distinct) left and
// right singular-vector aggregates: pre-smooth, restrict the residual to
// the coarse grid, solve there, prolongate the correction back, then
// post-smooth on the updated residual.
template<class Fobj,class CComplex,int nbasis>
class MGPreconditionerSVD : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField FineField;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
typedef LinearOperatorBase<FineField> FineOperator;
typedef LinearFunction <FineField> FineSmoother;
typedef LinearOperatorBase<CoarseVector> CoarseOperator;
typedef LinearFunction <CoarseVector> CoarseSolver;
// Restriction uses _FineToCoarse; prolongation uses _CoarseToFine.
// The two aggregates may differ (e.g. right vs left singular vectors).
Aggregates & _FineToCoarse;
Aggregates & _CoarseToFine;
FineOperator & _FineOperator;
FineSmoother & _PreSmoother;
FineSmoother & _PostSmoother;
CoarseOperator & _CoarseOperator;
CoarseSolver & _CoarseSolve;
int level; void Level(int lv) {level = lv; }; // multigrid level label (logging only)
MGPreconditionerSVD(Aggregates &FtoC,
Aggregates &CtoF,
FineOperator &Fine,
FineSmoother &PreSmoother,
FineSmoother &PostSmoother,
CoarseOperator &CoarseOperator_,
CoarseSolver &CoarseSolve_)
: _FineToCoarse(FtoC),
_CoarseToFine(CtoF),
_FineOperator(Fine),
_PreSmoother(PreSmoother),
_PostSmoother(PostSmoother),
_CoarseOperator(CoarseOperator_),
_CoarseSolve(CoarseSolve_),
level(1) { }
// Apply one V-cycle: out approximates the solution of _FineOperator out = in.
// Statement order is load-bearing: each stage works on the residual left
// by the previous one.
virtual void operator()(const FineField &in, FineField & out)
{
GridBase *CoarseGrid = _FineToCoarse.CoarseGrid;
// auto CoarseGrid = _CoarseOperator.Grid();
CoarseVector Csrc(CoarseGrid);
CoarseVector Csol(CoarseGrid);
FineField vec1(in.Grid());  // scratch: residuals and prolongated correction
FineField vec2(in.Grid());  // scratch: post-smoother output
std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
// std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
double t;
// Fine Smoother
// out = in;
out = Zero();
t=-usecond();
_PreSmoother(in,out);
t+=usecond();
std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;
// Update the residual: vec1 = in - A out
_FineOperator.Op(out,vec1); sub(vec1, in ,vec1);
// std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
// Fine to Coarse
t=-usecond();
_FineToCoarse.ProjectToSubspace (Csrc,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
// Coarse correction
t=-usecond();
Csol = Zero();
_CoarseSolve(Csrc,Csol);
//Csol=Zero();
t+=usecond();
std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
// Coarse to Fine: prolongate the coarse correction and accumulate into out
t=-usecond();
// _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
_CoarseToFine.PromoteFromSubspace(Csol,vec1);
add(out,out,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
// Residual after the coarse correction: vec1 = in - A out
_FineOperator.Op(out,vec1); sub(vec1 ,in , vec1);
// std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
// Fine Smoother on the remaining residual; accumulate its output into out
t=-usecond();
// vec2=vec1;
vec2=Zero();
_PostSmoother(vec1,vec2);
t+=usecond();
std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
add( out,out,vec2);
std::cout<<GridLogMessage << "Done " <<std::endl;
}
};
// Driver: builds a two-level multigrid preconditioner for the Pauli-Villars
// preconditioned domain-wall system PVdag M, using separate left (U) and
// right (V) singular-vector subspaces combined into one aggregate; checks
// the coarsened operator against the fine operator, then solves with
// nested PGCR (outer level 1, smoother level 2, coarse solve level 3).
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=16;
// Four- and five-dimensional fine grids, full and red-black.
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
// Construct a coarsened grid
// Halve each direction (2^4 site blocks).
Coordinate clatt = GridDefaultLatt();
for(int d=0;d<clatt.size();d++){
clatt[d] = clatt[d]/2;
// clatt[d] = clatt[d]/4;
}
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
// Fixed seeds so runs are reproducible.
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::vector<int> cseeds({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
LatticeFermion src(FGrid); random(RNG5,src);
LatticeFermion result(FGrid); result=Zero();
LatticeFermion ref(FGrid); ref=Zero();
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
// Gauge configuration is read from a NERSC checkpoint in the working directory.
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
std::string file("ckpoint_lat.4000");
NerscIO::readConfiguration(Umu,header,file);
RealD mass=0.01;
RealD M5=1.8;
// M at the light mass, and the Pauli-Villars operator at unit mass.
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
const int nbasis = 30;
const int cb = 0 ;
NextToNearestStencilGeometry5D geom(Coarse5d);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
typedef MdagPVLinearOperator<DomainWallFermionD,LatticeFermionD> MdagPV_t;
typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
PVdagM_t PVdagM(Ddwf,Dpv);
MdagPV_t MdagPV(Ddwf,Dpv);
// Iteration-count history for the smoother shift / setup choices tried:
// ShiftedPVdagM_t ShiftedPVdagM(2.0,Ddwf,Dpv); // 355
// ShiftedPVdagM_t ShiftedPVdagM(1.0,Ddwf,Dpv); // 246
// ShiftedPVdagM_t ShiftedPVdagM(0.5,Ddwf,Dpv); // 183
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 145
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 134
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 127 -- NULL space via inverse iteration
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 57 -- NULL space via inverse iteration; 3 iterations
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 57 , tighter inversion
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 49 iters
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 70 iters; asymmetric
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 58; Loosen coarse, tighten fine
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 56 ...
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 51 ... with 24 vecs
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 31 ... with 24 vecs and 2^4 blocking
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 43 ... with 16 vecs and 2^4 blocking, sloppier
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35 ... with 20 vecs and 2^4 blocking
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35 ... with 20 vecs and 2^4 blocking, looser coarse
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 64 ... with 20 vecs, Christoph setup, and 2^4 blocking, looser coarse
ShiftedPVdagM_t ShiftedPVdagM(0.01,Ddwf,Dpv); //
// Run power method on HOA??
PowerMethod<LatticeFermion> PM;
// PM(PVdagM,src);
// PM(MdagPV,src);
// Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
Subspace V(Coarse5d,FGrid,cb);
Subspace U(Coarse5d,FGrid,cb);
// Breeds right singular vectors with call to HermOp (V)
V.CreateSubspaceChebyshev(RNG5,PVdagM,
nbasis,
4000.0,0.003,
500);
// Breeds left singular vectors with call to HermOp (U)
// U.CreateSubspaceChebyshev(RNG5,PVdagM,
U.CreateSubspaceChebyshev(RNG5,MdagPV,
nbasis,
4000.0,0.003,
500);
// Stack V then U into one 2*nbasis aggregate used for both restrict/prolong.
typedef Aggregation<vSpinColourVector,vTComplex,2*nbasis> CombinedSubspace;
CombinedSubspace CombinedUV(Coarse5d,FGrid,cb);
for(int b=0;b<nbasis;b++){
CombinedUV.subspace[b] = V.subspace[b];
CombinedUV.subspace[b+nbasis] = U.subspace[b];
}
// Diagnostics: matrix elements of PVdagM in the U/V bases.
// NOTE: these loops overwrite src with PVdagM applied to basis vectors.
int bl, br;
std::cout <<" <V| PVdagM| V> " <<std::endl;
for(bl=0;bl<nbasis;bl++){
for(br=0;br<nbasis;br++){
PVdagM.Op(V.subspace[br],src);
std::cout <<bl<<" "<<br<<"\t"<<innerProduct(V.subspace[bl],src)<<std::endl;
}}
std::cout <<" <V| PVdagM| U> " <<std::endl;
for(bl=0;bl<nbasis;bl++){
for(br=0;br<nbasis;br++){
PVdagM.Op(U.subspace[br],src);
std::cout <<bl<<" "<<br<<"\t"<<innerProduct(V.subspace[bl],src)<<std::endl;
}}
std::cout <<" <U| PVdagM| V> " <<std::endl;
for(bl=0;bl<nbasis;bl++){
for(br=0;br<nbasis;br++){
PVdagM.Op(V.subspace[br],src);
std::cout <<bl<<" "<<br<<"\t"<<innerProduct(U.subspace[bl],src)<<std::endl;
}}
std::cout <<" <U| PVdagM| U> " <<std::endl;
for(bl=0;bl<nbasis;bl++){
for(br=0;br<nbasis;br++){
PVdagM.Op(U.subspace[br],src);
std::cout <<bl<<" "<<br<<"\t"<<innerProduct(U.subspace[bl],src)<<std::endl;
}}
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperatorV;
typedef LittleDiracOperatorV::CoarseVector CoarseVectorV;
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,2*nbasis> LittleDiracOperator;
typedef LittleDiracOperator::CoarseVector CoarseVector;
// How much of each U vector lies inside span(V)? (project+promote round trip)
V.Orthogonalise();
for(int b =0 ; b<nbasis;b++){
CoarseVectorV c_src (Coarse5d);
V.ProjectToSubspace (c_src,U.subspace[b]);
V.PromoteFromSubspace(c_src,src);
std::cout << " Completeness of U in V ["<< b<<"] "<< std::sqrt(norm2(src)/norm2(U.subspace[b]))<<std::endl;
}
CoarseVector c_src (Coarse5d);
CoarseVector c_res (Coarse5d);
CoarseVector c_proj(Coarse5d);
// Coarsen PVdagM onto the combined 2*nbasis subspace.
LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d);
LittleDiracOpPV.CoarsenOperator(PVdagM,CombinedUV,CombinedUV);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
// Consistency check: coarse M applied to the all-ones coarse vector should
// match the projection of fine PVdagM applied to the promoted vector.
Complex one(1.0);
c_src = one; // 1 in every element for vector 1.
blockPromote(c_src,err,CombinedUV.subspace);
LatticeFermion prom(FGrid);
prom=Zero();
for(int b=0;b<nbasis*2;b++){
prom=prom+CombinedUV.subspace[b];
}
std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
std::cout<<GridLogMessage<<"prom "<<norm2(prom)<<std::endl;
PVdagM.Op(prom,tmp);
blockProject(c_proj,tmp,CombinedUV.subspace);
std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
LittleDiracOpPV.M(c_src,c_res);
std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << " c_res "<< norm2(c_res) <<std::endl;
std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
c_proj = c_proj - c_res;
std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
/**********
* Some solvers
**********
*/
///////////////////////////////////////
// Coarse grid solver test
///////////////////////////////////////
std::cout<<GridLogMessage<<"******************* "<<std::endl;
std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
std::cout<<GridLogMessage<<"******************* "<<std::endl;
TrivialPrecon<CoarseVector> simple;
NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
// PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10);
PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-2, 10, LinOpCoarse,simple,20,20);
L2PGCR.Level(3);
c_res=Zero();
L2PGCR(c_src,c_res);
////////////////////////////////////////
// Fine grid smoother
////////////////////////////////////////
std::cout<<GridLogMessage<<"******************* "<<std::endl;
std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
std::cout<<GridLogMessage<<"******************* "<<std::endl;
TrivialPrecon<LatticeFermionD> simple_fine;
// NonHermitianLinearOperator<PVdagM_t,LatticeFermionD> LinOpSmooth(PVdagM);
// Single sloppy PGCR sweep on the shifted operator acts as the smoother.
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedPVdagM,simple_fine,16,16);
SmootherGCR.Level(2);
LatticeFermionD f_src(FGrid);
LatticeFermionD f_res(FGrid);
f_src = one; // 1 in every element for vector 1.
f_res=Zero();
SmootherGCR(f_src,f_res);
// Assemble the two-level preconditioner and run the outer level-1 solve.
typedef MGPreconditionerSVD<vSpinColourVector, vTComplex,nbasis*2> TwoLevelMG;
TwoLevelMG TwoLevelPrecon(CombinedUV,CombinedUV,
PVdagM,
simple_fine,
SmootherGCR,
LinOpCoarse,
L2PGCR);
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,20,20);
L1PGCR.Level(1);
f_res=Zero();
L1PGCR(f_src,f_res);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage << "Done "<< std::endl;
Grid_finalize();
return 0;
}

View File

@@ -0,0 +1,492 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_padded_cell.cc
Copyright (C) 2023
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
#include <Grid/algorithms/iterative/BiCGSTAB.h>
using namespace std;
using namespace Grid;
// Non-Hermitian operator PVdag M. Op applies PVdag * M, AdjOp applies
// Mdag * PV, HermOp the normal operator Mdag PV PVdag M, and
// HermOpAndNorm additionally reports n1 = Re<in|HermOp in> and
// n2 = |HermOp in|^2.
template<class Matrix,class Field>
class PVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat; // Dirac operator M (light mass)
  Matrix &_PV;  // Pauli-Villars operator (unit mass)
public:
  PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
  // Directional/diagonal decompositions are not needed by this driver.
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  // out = PVdag M in
  void Op (const Field &in, Field &out){
    Field mid(in.Grid());
    _Mat.M(in,mid);
    _PV.Mdag(mid,out);
  }
  // out = Mdag PV in  (the adjoint of Op)
  void AdjOp (const Field &in, Field &out){
    Field mid(in.Grid());
    _PV.M(in,mid);
    _Mat.Mdag(mid,out);
  }
  // Apply HermOp, then return n1 = Re<in,out> and n2 = |out|^2.
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    HermOp(in,out);
    ComplexD ip = innerProduct(in,out);
    n1 = real(ip);
    n2 = norm2(out);
  }
  // out = Mdag PV PVdag M in : the Hermitian normal operator.
  void HermOp(const Field &in, Field &out){
    Field mid(in.Grid());
    Op(in,mid);
    AdjOp(mid,out);
  }
};
// Non-Hermitian operator Mdag PV (reversed ordering of
// PVdagMLinearOperator). HermOp applies the normal operator
// PVdag M Mdag PV; HermOpAndNorm also returns n1 = Re<in|HermOp in>
// and n2 = |HermOp in|^2.
template<class Matrix,class Field>
class MdagPVLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat; // Dirac operator M
  Matrix &_PV;  // Pauli-Villars operator
public:
  MdagPVLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  // out = Mdag PV in
  void Op (const Field &in, Field &out){
    Field tmp(in.Grid());
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
  }
  // out = PVdag M in  (the adjoint of Op)
  void AdjOp (const Field &in, Field &out){
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
  }
  // BUGFIX: the original computed innerProduct(in,out) and norm2(out)
  // without first applying the operator, reading "out" uninitialised.
  // Apply HermOp first, matching PVdagMLinearOperator::HermOpAndNorm.
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    HermOp(in,out);
    ComplexD dot = innerProduct(in,out);
    n1=real(dot);
    n2=norm2(out);
  }
  // out = PVdag M Mdag PV in : the Hermitian normal operator.
  void HermOp(const Field &in, Field &out){
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
};
// Shifted operator (PVdag M + shift): the real shift regulates the
// low-mode spectrum so the operator can serve as a multigrid smoother.
// Op applies (PVdag M + shift), AdjOp its adjoint (Mdag PV + shift),
// and HermOp the Hermitian composition AdjOp(Op(in)).
template<class Matrix,class Field>
class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat; // Dirac operator M
  Matrix &_PV;  // Pauli-Villars operator
  RealD shift;  // real spectral shift
public:
  // Initialiser list follows member declaration order (avoids -Wreorder).
  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
  // out = (PVdag M + shift) in
  void Op (const Field &in, Field &out){
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  // out = (Mdag PV + shift) in -- the adjoint of Op.
  // BUGFIX: the original applied _PV.M to an uninitialised tmp and then
  // wrote Mdag(in) into tmp afterwards, so "out" never held the adjoint;
  // apply PV first, then Mdag, matching PVdagMLinearOperator::AdjOp.
  void AdjOp (const Field &in, Field &out){
    Field tmp(in.Grid());
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
    out = out + shift * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  // out = AdjOp(Op(in)) : Hermitian normal operator of the shifted system.
  void HermOp(const Field &in, Field &out){
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
};
// Two-level multigrid preconditioner using (possibly distinct) left and
// right singular-vector aggregates: pre-smooth, restrict the residual to
// the coarse grid, solve there, prolongate the correction back, then
// post-smooth on the updated residual.
template<class Fobj,class CComplex,int nbasis>
class MGPreconditionerSVD : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField FineField;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
typedef LinearOperatorBase<FineField> FineOperator;
typedef LinearFunction <FineField> FineSmoother;
typedef LinearOperatorBase<CoarseVector> CoarseOperator;
typedef LinearFunction <CoarseVector> CoarseSolver;
// Restriction uses _FineToCoarse; prolongation uses _CoarseToFine.
// The two aggregates may differ (e.g. right vs left singular vectors).
Aggregates & _FineToCoarse;
Aggregates & _CoarseToFine;
FineOperator & _FineOperator;
FineSmoother & _PreSmoother;
FineSmoother & _PostSmoother;
CoarseOperator & _CoarseOperator;
CoarseSolver & _CoarseSolve;
int level; void Level(int lv) {level = lv; }; // multigrid level label (logging only)
MGPreconditionerSVD(Aggregates &FtoC,
Aggregates &CtoF,
FineOperator &Fine,
FineSmoother &PreSmoother,
FineSmoother &PostSmoother,
CoarseOperator &CoarseOperator_,
CoarseSolver &CoarseSolve_)
: _FineToCoarse(FtoC),
_CoarseToFine(CtoF),
_FineOperator(Fine),
_PreSmoother(PreSmoother),
_PostSmoother(PostSmoother),
_CoarseOperator(CoarseOperator_),
_CoarseSolve(CoarseSolve_),
level(1) { }
// Apply one V-cycle: out approximates the solution of _FineOperator out = in.
// Statement order is load-bearing: each stage works on the residual left
// by the previous one.
virtual void operator()(const FineField &in, FineField & out)
{
GridBase *CoarseGrid = _FineToCoarse.CoarseGrid;
// auto CoarseGrid = _CoarseOperator.Grid();
CoarseVector Csrc(CoarseGrid);
CoarseVector Csol(CoarseGrid);
FineField vec1(in.Grid());  // scratch: residuals and prolongated correction
FineField vec2(in.Grid());  // scratch: post-smoother output
std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
// std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
double t;
// Fine Smoother
// out = in;
out = Zero();
t=-usecond();
_PreSmoother(in,out);
t+=usecond();
std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;
// Update the residual: vec1 = in - A out
_FineOperator.Op(out,vec1); sub(vec1, in ,vec1);
// std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
// Fine to Coarse
t=-usecond();
_FineToCoarse.ProjectToSubspace (Csrc,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
// Coarse correction
t=-usecond();
Csol = Zero();
_CoarseSolve(Csrc,Csol);
//Csol=Zero();
t+=usecond();
std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
// Coarse to Fine: prolongate the coarse correction and accumulate into out
t=-usecond();
// _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
_CoarseToFine.PromoteFromSubspace(Csol,vec1);
add(out,out,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
// Residual after the coarse correction: vec1 = in - A out
_FineOperator.Op(out,vec1); sub(vec1 ,in , vec1);
// std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
// Fine Smoother on the remaining residual; accumulate its output into out
t=-usecond();
// vec2=vec1;
vec2=Zero();
_PostSmoother(vec1,vec2);
t+=usecond();
std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
add( out,out,vec2);
std::cout<<GridLogMessage << "Done " <<std::endl;
}
};
// Driver variant: as the Chebyshev-setup version, but breeds the 20-vector
// subspaces with plain CreateSubspace (inverse-iteration style setup) and
// builds a two-level multigrid preconditioner for PVdag M solved with
// nested PGCR (outer level 1, smoother level 2, coarse solve level 3).
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=16;
// Four- and five-dimensional fine grids, full and red-black.
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
// Construct a coarsened grid
// Halve each direction (2^4 site blocks).
Coordinate clatt = GridDefaultLatt();
for(int d=0;d<clatt.size();d++){
clatt[d] = clatt[d]/2;
// clatt[d] = clatt[d]/4;
}
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
// Fixed seeds so runs are reproducible.
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::vector<int> cseeds({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
LatticeFermion src(FGrid); random(RNG5,src);
LatticeFermion result(FGrid); result=Zero();
LatticeFermion ref(FGrid); ref=Zero();
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
// Gauge configuration is read from a NERSC checkpoint in the working directory.
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
std::string file("ckpoint_lat.4000");
NerscIO::readConfiguration(Umu,header,file);
RealD mass=0.01;
RealD M5=1.8;
// M at the light mass, and the Pauli-Villars operator at unit mass.
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
const int nbasis = 20;
const int cb = 0 ;
NextToNearestStencilGeometry5D geom(Coarse5d);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
typedef MdagPVLinearOperator<DomainWallFermionD,LatticeFermionD> MdagPV_t;
typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
PVdagM_t PVdagM(Ddwf,Dpv);
MdagPV_t MdagPV(Ddwf,Dpv);
// Iteration-count history for the smoother shift / setup choices tried:
// ShiftedPVdagM_t ShiftedPVdagM(2.0,Ddwf,Dpv); // 355
// ShiftedPVdagM_t ShiftedPVdagM(1.0,Ddwf,Dpv); // 246
// ShiftedPVdagM_t ShiftedPVdagM(0.5,Ddwf,Dpv); // 183
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 145
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 134
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 127 -- NULL space via inverse iteration
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 57 -- NULL space via inverse iteration; 3 iterations
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 57 , tighter inversion
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 49 iters
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 70 iters; asymmetric
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 58; Loosen coarse, tighten fine
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 56 ...
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 51 ... with 24 vecs
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 31 ... with 24 vecs and 2^4 blocking
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 43 ... with 16 vecs and 2^4 blocking, sloppier
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35 ... with 20 vecs and 2^4 blocking
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35 ... with 20 vecs and 2^4 blocking, looser coarse
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 64 ... with 20 vecs, Christoph setup, and 2^4 blocking, looser coarse
ShiftedPVdagM_t ShiftedPVdagM(0.01,Ddwf,Dpv); //
// Run power method on HOA??
PowerMethod<LatticeFermion> PM;
// PM(PVdagM,src);
// PM(MdagPV,src);
// Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
Subspace V(Coarse5d,FGrid,cb);
Subspace U(Coarse5d,FGrid,cb);
// Breeds right singular vectors with call to HermOp (V)
V.CreateSubspace(RNG5,PVdagM,nbasis);
// Breeds left singular vectors with call to HermOp (U)
// U.CreateSubspaceChebyshev(RNG5,MdagPV,
// NOTE(review): U is bred with PVdagM here, not MdagPV as the comment
// above suggests -- confirm this is intentional.
U.CreateSubspace(RNG5,PVdagM,nbasis);
// Stack V then U into one 2*nbasis aggregate used for both restrict/prolong.
typedef Aggregation<vSpinColourVector,vTComplex,2*nbasis> CombinedSubspace;
CombinedSubspace CombinedUV(Coarse5d,FGrid,cb);
for(int b=0;b<nbasis;b++){
CombinedUV.subspace[b] = V.subspace[b];
CombinedUV.subspace[b+nbasis] = U.subspace[b];
}
// Diagnostics: matrix elements of PVdagM in the U/V bases.
// NOTE: these loops overwrite src with PVdagM applied to basis vectors.
int bl, br;
std::cout <<" <V| PVdagM| V> " <<std::endl;
for(bl=0;bl<nbasis;bl++){
for(br=0;br<nbasis;br++){
PVdagM.Op(V.subspace[br],src);
std::cout <<bl<<" "<<br<<"\t"<<innerProduct(V.subspace[bl],src)<<std::endl;
}}
std::cout <<" <V| PVdagM| U> " <<std::endl;
for(bl=0;bl<nbasis;bl++){
for(br=0;br<nbasis;br++){
PVdagM.Op(U.subspace[br],src);
std::cout <<bl<<" "<<br<<"\t"<<innerProduct(V.subspace[bl],src)<<std::endl;
}}
std::cout <<" <U| PVdagM| V> " <<std::endl;
for(bl=0;bl<nbasis;bl++){
for(br=0;br<nbasis;br++){
PVdagM.Op(V.subspace[br],src);
std::cout <<bl<<" "<<br<<"\t"<<innerProduct(U.subspace[bl],src)<<std::endl;
}}
std::cout <<" <U| PVdagM| U> " <<std::endl;
for(bl=0;bl<nbasis;bl++){
for(br=0;br<nbasis;br++){
PVdagM.Op(U.subspace[br],src);
std::cout <<bl<<" "<<br<<"\t"<<innerProduct(U.subspace[bl],src)<<std::endl;
}}
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperatorV;
typedef LittleDiracOperatorV::CoarseVector CoarseVectorV;
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,2*nbasis> LittleDiracOperator;
typedef LittleDiracOperator::CoarseVector CoarseVector;
// How much of each U vector lies inside span(V)? (project+promote round trip)
V.Orthogonalise();
for(int b =0 ; b<nbasis;b++){
CoarseVectorV c_src (Coarse5d);
V.ProjectToSubspace (c_src,U.subspace[b]);
V.PromoteFromSubspace(c_src,src);
std::cout << " Completeness of U in V ["<< b<<"] "<< std::sqrt(norm2(src)/norm2(U.subspace[b]))<<std::endl;
}
CoarseVector c_src (Coarse5d);
CoarseVector c_res (Coarse5d);
CoarseVector c_proj(Coarse5d);
// Coarsen PVdagM onto the combined 2*nbasis subspace.
LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d);
LittleDiracOpPV.CoarsenOperator(PVdagM,CombinedUV,CombinedUV);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
// Consistency check: coarse M applied to the all-ones coarse vector should
// match the projection of fine PVdagM applied to the promoted vector.
Complex one(1.0);
c_src = one; // 1 in every element for vector 1.
blockPromote(c_src,err,CombinedUV.subspace);
LatticeFermion prom(FGrid);
prom=Zero();
for(int b=0;b<nbasis*2;b++){
prom=prom+CombinedUV.subspace[b];
}
std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
std::cout<<GridLogMessage<<"prom "<<norm2(prom)<<std::endl;
PVdagM.Op(prom,tmp);
blockProject(c_proj,tmp,CombinedUV.subspace);
std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
LittleDiracOpPV.M(c_src,c_res);
std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << " c_res "<< norm2(c_res) <<std::endl;
std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
c_proj = c_proj - c_res;
std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
/**********
* Some solvers
**********
*/
///////////////////////////////////////
// Coarse grid solver test
///////////////////////////////////////
std::cout<<GridLogMessage<<"******************* "<<std::endl;
std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
std::cout<<GridLogMessage<<"******************* "<<std::endl;
TrivialPrecon<CoarseVector> simple;
NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
// PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10);
PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-2, 10, LinOpCoarse,simple,20,20);
L2PGCR.Level(3);
c_res=Zero();
L2PGCR(c_src,c_res);
////////////////////////////////////////
// Fine grid smoother
////////////////////////////////////////
std::cout<<GridLogMessage<<"******************* "<<std::endl;
std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
std::cout<<GridLogMessage<<"******************* "<<std::endl;
TrivialPrecon<LatticeFermionD> simple_fine;
// NonHermitianLinearOperator<PVdagM_t,LatticeFermionD> LinOpSmooth(PVdagM);
// Single sloppy PGCR sweep on the shifted operator acts as the smoother.
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedPVdagM,simple_fine,16,16);
SmootherGCR.Level(2);
LatticeFermionD f_src(FGrid);
LatticeFermionD f_res(FGrid);
f_src = one; // 1 in every element for vector 1.
f_res=Zero();
SmootherGCR(f_src,f_res);
// Assemble the two-level preconditioner and run the outer level-1 solve.
typedef MGPreconditionerSVD<vSpinColourVector, vTComplex,nbasis*2> TwoLevelMG;
TwoLevelMG TwoLevelPrecon(CombinedUV,CombinedUV,
PVdagM,
simple_fine,
SmootherGCR,
LinOpCoarse,
L2PGCR);
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,20,20);
L1PGCR.Level(1);
f_res=Zero();
L1PGCR(f_src,f_res);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage << "Done "<< std::endl;
Grid_finalize();
return 0;
}

View File

@@ -0,0 +1,479 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_padded_cell.cc
Copyright (C) 2023
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
#include <Grid/algorithms/iterative/BiCGSTAB.h>
using namespace std;
using namespace Grid;
// Composite non-Hermitian operator D = PV^dag M built from two fermion
// matrices. Op applies PV^dag M, AdjOp applies the adjoint M^dag PV, and
// HermOp applies the normal-equations operator M^dag PV PV^dag M by
// composing Op followed by AdjOp.
template<class Matrix,class Field>
class PVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat; // M
  Matrix &_PV;  // Pauli-Villars operator
public:
  PVdagMLinearOperator(Matrix &Mat,Matrix &PV) : _Mat(Mat), _PV(PV) {}
  // Diagonal / directional decompositions are not provided for this composite.
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll(const Field &in, std::vector<Field> &out) { assert(0); }
  // out = PV^dag M in
  void Op (const Field &in, Field &out){
    Field scratch(in.Grid());
    _Mat.M(in,scratch);
    _PV.Mdag(scratch,out);
  }
  // out = M^dag PV in
  void AdjOp (const Field &in, Field &out){
    Field scratch(in.Grid());
    _PV.M(in,scratch);
    _Mat.Mdag(scratch,out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    assert(0);
  }
  // out = M^dag PV PV^dag M in
  void HermOp(const Field &in, Field &out){
    Field scratch(in.Grid());
    Op(in,scratch);
    AdjOp(scratch,out);
  }
};
// Composite non-Hermitian operator D = M^dag PV -- the adjoint ordering of
// PVdagMLinearOperator. Op applies M^dag PV, AdjOp applies PV^dag M, and
// HermOp applies PV^dag M M^dag PV by composing Op followed by AdjOp.
template<class Matrix,class Field>
class MdagPVLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat; // M
  Matrix &_PV;  // Pauli-Villars operator
public:
  MdagPVLinearOperator(Matrix &Mat,Matrix &PV) : _Mat(Mat), _PV(PV) {}
  // Diagonal / directional decompositions are not provided for this composite.
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll(const Field &in, std::vector<Field> &out) { assert(0); }
  // out = M^dag PV in
  void Op (const Field &in, Field &out){
    Field work(in.Grid());
    _PV.M(in,work);
    _Mat.Mdag(work,out);
  }
  // out = PV^dag M in
  void AdjOp (const Field &in, Field &out){
    Field work(in.Grid());
    _Mat.M(in,work);
    _PV.Mdag(work,out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    assert(0);
  }
  // out = PV^dag M M^dag PV in
  void HermOp(const Field &in, Field &out){
    Field work(in.Grid());
    Op(in,work);
    AdjOp(work,out);
  }
};
// Shifted composite operator D_s = PV^dag M + shift, with a real shift that
// regularises the non-Hermitian PV^dag M system (used as the smoother
// operator in the multigrid tests below).
template<class Matrix,class Field>
class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat; // M
  Matrix &_PV;  // Pauli-Villars operator
  RealD shift;  // real shift added to PV^dag M
public:
  // Initializer list follows member declaration order (_Mat, _PV, shift);
  // the original listed shift first, triggering -Wreorder.
  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV)
    : _Mat(Mat), _PV(PV), shift(_shift) {}
  // Diagonal / directional decompositions are not provided for this composite.
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll(const Field &in, std::vector<Field> &out) { assert(0); }
  // out = (PV^dag M + shift) in
  void Op (const Field &in, Field &out){
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  // out = (M^dag PV + shift) in -- the adjoint of Op (shift is real).
  // BUG FIX: the original body executed _PV.M(tmp,out) BEFORE tmp was
  // assigned, reading an uninitialised field and producing the wrong
  // operator ordering. The calls now run in the correct order, matching
  // the unshifted MdagPVLinearOperator::Op.
  void AdjOp (const Field &in, Field &out){
    Field tmp(in.Grid());
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
    out = out + shift * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
  // out = AdjOp(Op(in)) : normal-equations operator of the shifted system.
  void HermOp(const Field &in, Field &out){
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
};
// Two-level multigrid V-cycle preconditioner in "SVD" form: it carries two
// distinct aggregates, _U (left singular vectors, used to restrict the
// residual) and _V (right singular vectors, used to prolong the coarse
// correction), so the coarse operator approximates Uc^dag M Vc.
// Cycle: pre-smooth -> restrict residual with U -> coarse solve ->
// prolong correction with V -> post-smooth on the new residual.
template<class Fobj,class CComplex,int nbasis>
class MGPreconditionerSVD : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField FineField;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
typedef LinearOperatorBase<FineField> FineOperator;
typedef LinearFunction <FineField> FineSmoother;
typedef LinearOperatorBase<CoarseVector> CoarseOperator;
typedef LinearFunction <CoarseVector> CoarseSolver;
///////////////////////////////
// SVD is M = U S Vdag
//
// Define a subset of Vc and Uc in Complex_f,c matrix
// - these are the coarsening, non-square matrices
//
// Solve a coarse approx to
//
// M psi = eta
//
// via
//
// Uc^dag U S Vdag Vc Vc^dag psi = Uc^dag eta
//
// M_coarse Vc^dag psi = M_coarse psi_c = eta_c
//
///////////////////////////////
Aggregates & _U;               // left (restriction) subspace
Aggregates & _V;               // right (prolongation) subspace
FineOperator & _FineOperator;  // fine-grid operator M
FineSmoother & _PreSmoother;
FineSmoother & _PostSmoother;
CoarseOperator & _CoarseOperator;
CoarseSolver & _CoarseSolve;
// Multigrid level label, used only for log annotation by the solvers.
int level; void Level(int lv) {level = lv; };
// All collaborators are held by reference; caller keeps them alive.
MGPreconditionerSVD(Aggregates &U,
Aggregates &V,
FineOperator &Fine,
FineSmoother &PreSmoother,
FineSmoother &PostSmoother,
CoarseOperator &CoarseOperator_,
CoarseSolver &CoarseSolve_)
: _U(U),
_V(V),
_FineOperator(Fine),
_PreSmoother(PreSmoother),
_PostSmoother(PostSmoother),
_CoarseOperator(CoarseOperator_),
_CoarseSolve(CoarseSolve_),
level(1) { }
// Apply one V-cycle: out approximates M^{-1} in.
virtual void operator()(const FineField &in, FineField & out)
{
GridBase *CoarseGrid = _U.CoarseGrid;
// auto CoarseGrid = _CoarseOperator.Grid();
CoarseVector Csrc(CoarseGrid);
CoarseVector Csol(CoarseGrid);
FineField vec1(in.Grid());
FineField vec2(in.Grid());
std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
// std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
double t;
// Fine Smoother
// out = in;
out = Zero();
t=-usecond();
_PreSmoother(in,out);
t+=usecond();
std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;
// Update the residual
// NOTE(review): vec1 aliases both source and destination here; assumes
// Grid's sub(ret,a,b) = a-b supports ret aliasing b -- do not reorder.
_FineOperator.Op(out,vec1); sub(vec1, in ,vec1);
// std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
// Uc^dag U S Vdag Vc Vc^dag psi = Uc^dag eta
// Fine to Coarse -- restrict with the LEFT subspace U
t=-usecond();
_U.ProjectToSubspace (Csrc,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
// Coarse correction
t=-usecond();
Csol = Zero();
_CoarseSolve(Csrc,Csol);
//Csol=Zero();
t+=usecond();
std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
// Coarse to Fine -- prolong with the RIGHT subspace V
t=-usecond();
// _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
_V.PromoteFromSubspace(Csol,vec1);
add(out,out,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
// Residual after the coarse correction
_FineOperator.Op(out,vec1); sub(vec1 ,in , vec1);
// std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
// Fine Smoother -- post-smooth the remaining residual and accumulate
// vec2=vec1;
vec2=Zero();
t=-usecond();
_PostSmoother(vec1,vec2);
t+=usecond();
std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
add( out,out,vec2);
std::cout<<GridLogMessage << "Done " <<std::endl;
}
};
// Test driver: two-level PGCR multigrid for the non-Hermitian PV^dag M
// domain-wall system. Builds a Chebyshev near-null subspace V, coarsens
// PV^dag M onto a 2^4-blocked grid, and solves with a V-cycle-preconditioned
// outer GCR. Requires gauge configuration "ckpoint_lat.4000" on disk.
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=16;
// 4d gauge grids and Ls=16 5d domain-wall grids.
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
// Construct a coarsened grid (factor-2 blocking in each 4d direction;
// the coarse 5th dimension is collapsed to 1).
Coordinate clatt = GridDefaultLatt();
for(int d=0;d<clatt.size();d++){
clatt[d] = clatt[d]/2;
// clatt[d] = clatt[d]/4;
}
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
// Fixed seeds so the test is reproducible run to run.
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::vector<int> cseeds({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
LatticeFermion src(FGrid); random(RNG5,src);
LatticeFermion result(FGrid); result=Zero();
LatticeFermion ref(FGrid); ref=Zero();
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
// Read the gauge field from a NERSC checkpoint file.
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
std::string file("ckpoint_lat.4000");
NerscIO::readConfiguration(Umu,header,file);
// Physical DWF operator (mass 0.01) and its Pauli-Villars partner (mass 1).
RealD mass=0.01;
RealD M5=1.8;
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
const int nbasis = 60;
const int cb = 0 ;
NextToNearestStencilGeometry5D geom(Coarse5d);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
typedef MdagPVLinearOperator<DomainWallFermionD,LatticeFermionD> MdagPV_t;
typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
PVdagM_t PVdagM(Ddwf,Dpv);
MdagPV_t MdagPV(Ddwf,Dpv);
// Iteration-count log of earlier shift / basis / blocking experiments;
// kept as a tuning record.
// ShiftedPVdagM_t ShiftedPVdagM(2.0,Ddwf,Dpv); // 355
// ShiftedPVdagM_t ShiftedPVdagM(1.0,Ddwf,Dpv); // 246
// ShiftedPVdagM_t ShiftedPVdagM(0.5,Ddwf,Dpv); // 183
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 145
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 134
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 127 -- NULL space via inverse iteration
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 57 -- NULL space via inverse iteration; 3 iterations
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 57 , tighter inversion
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 49 iters
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 70 iters; asymmetric
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 58; Loosen coarse, tighten fine
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 56 ...
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 51 ... with 24 vecs
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 31 ... with 24 vecs and 2^4 blocking
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 43 ... with 16 vecs and 2^4 blocking, sloppier
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35 ... with 20 vecs and 2^4 blocking
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35 ... with 20 vecs and 2^4 blocking, looser coarse
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 64 ... with 20 vecs, Christoph setup, and 2^4 blocking, looser coarse
ShiftedPVdagM_t ShiftedPVdagM(0.01,Ddwf,Dpv); //
// Power method estimates the largest eigenvalue of each normal operator.
// Run power method on HOA??
PowerMethod<LatticeFermion> PM;
PM(PVdagM,src);
PM(MdagPV,src);
// Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
Subspace V(Coarse5d,FGrid,cb);
// Subspace U(Coarse5d,FGrid,cb);
// Breeds right singular vectors with call to HermOp
V.CreateSubspaceChebyshev(RNG5,PVdagM,
nbasis,
4000.0,0.003,
300);
// Disabled left-singular-vector (U) variant; current run uses V for both
// restriction and prolongation.
// Breeds left singular vectors with call to HermOp
// U.CreateSubspaceChebyshev(RNG5,MdagPV,
// nbasis,
// 4000.0,0.003,
// 300);
// U.subspace=V.subspace;
// typedef Aggregation<vSpinColourVector,vTComplex,2*nbasis> CombinedSubspace;
// CombinedSubspace CombinedUV(Coarse5d,FGrid,cb);
// for(int b=0;b<nbasis;b++){
// CombinedUV.subspace[b] = V.subspace[b];
// CombinedUV.subspace[b+nbasis] = U.subspace[b];
// }
// typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,2*nbasis> LittleDiracOperator;
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
typedef LittleDiracOperator::CoarseVector CoarseVector;
// Coarsen PV^dag M onto the blocked grid using V on both sides.
LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d);
LittleDiracOpPV.CoarsenOperator(PVdagM,V,V);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
// Consistency check: fine operator projected to the subspace should match
// the coarsened ("little") operator applied to the same coarse vector.
std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
CoarseVector c_src (Coarse5d);
CoarseVector c_res (Coarse5d);
CoarseVector c_proj(Coarse5d);
Complex one(1.0);
c_src = one; // 1 in every element for vector 1.
// blockPromote(c_src,err,CoarseToFine.subspace);
LatticeFermion prom(FGrid);
prom=Zero();
for(int b=0;b<nbasis;b++){
prom=prom+V.subspace[b];
}
std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
std::cout<<GridLogMessage<<"prom "<<norm2(prom)<<std::endl;
PVdagM.Op(prom,tmp);
blockProject(c_proj,tmp,V.subspace);
std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
LittleDiracOpPV.M(c_src,c_res);
std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << " c_res "<< norm2(c_res) <<std::endl;
std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
c_proj = c_proj - c_res;
std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
/**********
* Some solvers
**********
*/
///////////////////////////////////////
// Coarse grid solver test
///////////////////////////////////////
std::cout<<GridLogMessage<<"******************* "<<std::endl;
std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
std::cout<<GridLogMessage<<"******************* "<<std::endl;
TrivialPrecon<CoarseVector> simple;
NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
// PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10);
PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L3PGCR(1.0e-4, 10, LinOpCoarse,simple,20,20);
L3PGCR.Level(3);
c_res=Zero();
L3PGCR(c_src,c_res);
////////////////////////////////////////
// Fine grid smoother
////////////////////////////////////////
std::cout<<GridLogMessage<<"******************* "<<std::endl;
std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
std::cout<<GridLogMessage<<"******************* "<<std::endl;
TrivialPrecon<LatticeFermionD> simple_fine;
// NonHermitianLinearOperator<PVdagM_t,LatticeFermionD> LinOpSmooth(PVdagM);
// One iteration of GCR against the shifted operator acts as the smoother.
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedPVdagM,simple_fine,16,16);
SmootherGCR.Level(2);
LatticeFermionD f_src(FGrid);
LatticeFermionD f_res(FGrid);
f_src = one; // 1 in every element for vector 1.
f_res=Zero();
SmootherGCR(f_src,f_res);
// Assemble the two-level preconditioner (V used for both restriction and
// prolongation) and run the outer level-1 GCR to 1e-8.
// typedef MGPreconditionerSVD<vSpinColourVector, vTComplex,nbasis*2> TwoLevelMG;
typedef MGPreconditionerSVD<vSpinColourVector, vTComplex,nbasis> TwoLevelMG;
// TwoLevelMG TwoLevelPrecon(CombinedUV,CombinedUV,
TwoLevelMG TwoLevelPrecon(V,V,
PVdagM,
simple_fine,
SmootherGCR,
LinOpCoarse,
L3PGCR);
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,16,16);
L1PGCR.Level(1);
f_res=Zero();
L1PGCR(f_src,f_res);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage << "Done "<< std::endl;
Grid_finalize();
return 0;
}

View File

@@ -0,0 +1,333 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_padded_cell.cc
Copyright (C) 2023
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
#include <Grid/algorithms/iterative/BiCGSTAB.h>
using namespace std;
using namespace Grid;
// Two-level multigrid V-cycle preconditioner using a single aggregate for
// both restriction and prolongation.
// Cycle: pre-smooth -> restrict residual -> coarse solve -> prolong
// correction -> post-smooth on the updated residual.
template<class Fobj,class CComplex,int nbasis>
class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField FineField;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
typedef LinearOperatorBase<FineField> FineOperator;
typedef LinearFunction <FineField> FineSmoother;
typedef LinearOperatorBase<CoarseVector> CoarseOperator;
typedef LinearFunction <CoarseVector> CoarseSolver;
Aggregates & _Aggregates;      // near-null subspace: restrict & prolong
FineOperator & _FineOperator;  // fine-grid operator M
FineSmoother & _PreSmoother;
FineSmoother & _PostSmoother;
CoarseOperator & _CoarseOperator;
CoarseSolver & _CoarseSolve;
// Multigrid level label, used only for log annotation by the solvers.
int level; void Level(int lv) {level = lv; };
// All collaborators are held by reference; caller keeps them alive.
MGPreconditioner(Aggregates &Agg,
FineOperator &Fine,
FineSmoother &PreSmoother,
FineSmoother &PostSmoother,
CoarseOperator &CoarseOperator_,
CoarseSolver &CoarseSolve_)
: _Aggregates(Agg),
_FineOperator(Fine),
_PreSmoother(PreSmoother),
_PostSmoother(PostSmoother),
_CoarseOperator(CoarseOperator_),
_CoarseSolve(CoarseSolve_),
level(1) { }
// Apply one V-cycle: out approximates M^{-1} in.
virtual void operator()(const FineField &in, FineField & out)
{
GridBase *CoarseGrid = _Aggregates.CoarseGrid;
// auto CoarseGrid = _CoarseOperator.Grid();
CoarseVector Csrc(CoarseGrid);
CoarseVector Csol(CoarseGrid);
FineField vec1(in.Grid());
FineField vec2(in.Grid());
std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
// std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
double t;
// Fine Smoother
// out = in;
out = Zero();
t=-usecond();
_PreSmoother(in,out);
t+=usecond();
std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;
// Update the residual
// NOTE(review): vec1 aliases both source and destination here; assumes
// Grid's sub(ret,a,b) = a-b supports ret aliasing b -- do not reorder.
_FineOperator.Op(out,vec1); sub(vec1, in ,vec1);
// std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
// Fine to Coarse
t=-usecond();
_Aggregates.ProjectToSubspace (Csrc,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
// Coarse correction
t=-usecond();
Csol = Zero();
_CoarseSolve(Csrc,Csol);
//Csol=Zero();
t+=usecond();
std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
// Coarse to Fine
t=-usecond();
// _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
_Aggregates.PromoteFromSubspace(Csol,vec1);
add(out,out,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
// Residual after the coarse correction
_FineOperator.Op(out,vec1); sub(vec1 ,in , vec1);
// std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
// Fine Smoother -- post-smooth the remaining residual and accumulate
t=-usecond();
// vec2=vec1;
vec2=Zero();
_PostSmoother(vec1,vec2);
t+=usecond();
std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
add( out,out,vec2);
std::cout<<GridLogMessage << "Done " <<std::endl;
}
};
// Test driver: two-level PGCR multigrid for a 4d Wilson-clover operator
// (csw=0, so effectively Wilson at mass -0.92). Builds an nbasis GCR
// near-null subspace, doubles it with gamma5-multiplied copies, coarsens,
// and solves with a V-cycle-preconditioned outer GCR. Requires gauge
// configuration "ckpoint_lat" on disk.
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
// Ls is unused in this 4d test; kept from the 5d variant of this driver.
const int Ls=16;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
// 4d operator: the "fine" grid is just the gauge grid.
GridCartesian * FGrid = UGrid;
GridRedBlackCartesian * FrbGrid = UrbGrid;
// Construct a coarsened grid (factor-2 blocking in each direction).
Coordinate clatt = GridDefaultLatt();
for(int d=0;d<clatt.size();d++){
clatt[d] = clatt[d]/2;
//clatt[d] = clatt[d]/4;
}
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
// Fixed seeds so the test is reproducible run to run.
std::vector<int> seeds4({1,2,3,4});
std::vector<int> cseeds({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG CRNG(Coarse4d);CRNG.SeedFixedIntegers(cseeds);
Complex one(1.0);
LatticeFermion src(FGrid); src=one;
LatticeFermion result(FGrid); result=Zero();
LatticeFermion ref(FGrid); ref=Zero();
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
LatticeFermion precsrc(FGrid);
// Read the gauge field from a NERSC checkpoint file.
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
std::string file("ckpoint_lat");
NerscIO::readConfiguration(Umu,header,file);
// csw=0 disables the clover term; mass chosen near-critical.
RealD csw =0.0;
RealD mass=-0.92;
WilsonCloverFermionD Dw(Umu,*UGrid,*UrbGrid,mass,csw,csw);
const int nbasis = 20;
const int cb = 0 ;
LatticeFermion prom(FGrid);
// The coarse operator uses the gamma5-doubled 2*nbasis subspace.
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,2*nbasis> LittleDiracOperator;
typedef LittleDiracOperator::CoarseVector CoarseVector;
NearestStencilGeometry4D geom(Coarse4d);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
// Warning: This routine calls Linop.Op, not LinOpo.HermOp
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
Subspace Aggregates(Coarse4d,FGrid,cb);
NonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> LinOpDw(Dw);
ShiftedNonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> ShiftedLinOpDw(Dw,0.01);
// Breed nbasis near-null vectors via GCR inverse iteration.
Aggregates.CreateSubspaceGCR(RNG4,
LinOpDw,
nbasis);
// Double the subspace with gamma5-multiplied copies (exploits
// gamma5-hermiticity of the Wilson operator).
typedef Aggregation<vSpinColourVector,vTComplex,2*nbasis> CombinedSubspace;
CombinedSubspace CombinedUV(Coarse4d,UGrid,cb);
for(int b=0;b<nbasis;b++){
Gamma G5(Gamma::Algebra::Gamma5);
CombinedUV.subspace[b] = Aggregates.subspace[b];
CombinedUV.subspace[b+nbasis] = G5*Aggregates.subspace[b];
}
LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse4d);
LittleDiracOp.CoarsenOperator(LinOpDw,CombinedUV);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
// Consistency check: fine operator projected to the subspace should match
// the coarsened ("little") operator applied to the same coarse vector.
std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
CoarseVector c_src (Coarse4d);
CoarseVector c_res (Coarse4d);
CoarseVector c_proj(Coarse4d);
std::vector<LatticeFermion> subspace(2*nbasis,FGrid);
subspace=CombinedUV.subspace;
c_src = one; // 1 in every element for vector 1.
blockPromote(c_src,err,subspace);
prom=Zero();
for(int b=0;b<2*nbasis;b++){
prom=prom+subspace[b];
}
err=err-prom;
std::cout<<GridLogMessage<<"Promoted back from subspace: err "<<norm2(err)<<std::endl;
std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
std::cout<<GridLogMessage<<"prom "<<norm2(prom)<<std::endl;
LinOpDw.Op(prom,tmp);
blockProject(c_proj,tmp,subspace);
std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
LittleDiracOp.M(c_src,c_res);
std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << " c_res "<< norm2(c_res) <<std::endl;
std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
// std::cout<<GridLogMessage<<" Little "<< c_res<<std::endl;
std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
// std::cout<<GridLogMessage<<" Big "<< c_proj<<std::endl;
c_proj = c_proj - c_res;
std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
// std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;
/**********
* Some solvers
**********
*/
// CG reference solve on the normal equations (result is discarded; this
// only provides a baseline iteration count).
// NOTE(review): HermOp is templated on WilsonFermionD but bound to a
// WilsonCloverFermionD -- binds via the base class; confirm intended.
// CG
{
MdagMLinearOperator<WilsonFermionD,LatticeFermion> HermOp(Dw);
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
Dw.Mdag(src,precsrc);
CG(HermOp,precsrc,result);
result=Zero();
}
///////////////////////////////////////
// Coarse grid solver test
///////////////////////////////////////
std::cout<<GridLogMessage<<"******************* "<<std::endl;
std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
std::cout<<GridLogMessage<<"******************* "<<std::endl;
TrivialPrecon<CoarseVector> simple;
NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOp);
// Small shift regularises the coarse solve.
ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LittleDiracOp,0.001);
// ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LittleDiracOp,0.01);
// ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LinOpCoarse,0.001);
// PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10);
// PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-1, 100, LinOpCoarse,simple,30,30);
PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(2.0e-1, 50, ShiftedLinOpCoarse,simple,50,50);
L2PGCR.Level(3);
c_res=Zero();
L2PGCR(c_src,c_res);
////////////////////////////////////////
// Fine grid smoother
////////////////////////////////////////
std::cout<<GridLogMessage<<"******************* "<<std::endl;
std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
std::cout<<GridLogMessage<<"******************* "<<std::endl;
TrivialPrecon<LatticeFermionD> simple_fine;
// One iteration of GCR against the shifted fine operator as the smoother.
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.1,1,ShiftedLinOpDw,simple_fine,4,4);
SmootherGCR.Level(2);
LatticeFermionD f_src(FGrid);
LatticeFermionD f_res(FGrid);
f_src = one; // 1 in every element for vector 1.
f_res=Zero();
SmootherGCR(f_src,f_res);
// Assemble the two-level preconditioner over the doubled subspace and run
// the outer level-1 GCR to 1e-8.
typedef MGPreconditioner<vSpinColourVector, vTComplex,2*nbasis> TwoLevelMG;
TwoLevelMG TwoLevelPrecon(CombinedUV,
LinOpDw,
simple_fine,
SmootherGCR,
LinOpCoarse,
L2PGCR);
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,LinOpDw,TwoLevelPrecon,16,16);
L1PGCR.Level(1);
f_res=Zero();
L1PGCR(f_src,f_res);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage << "Done "<< std::endl;
Grid_finalize();
return 0;
}

View File

@@ -0,0 +1,326 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_padded_cell.cc
Copyright (C) 2023
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
#include <Grid/algorithms/iterative/BiCGSTAB.h>
using namespace std;
using namespace Grid;
// Two-level multigrid V-cycle preconditioner using a single aggregate for
// both restriction and prolongation (duplicate of the class in the
// previous test file in this diff).
// Cycle: pre-smooth -> restrict residual -> coarse solve -> prolong
// correction -> post-smooth on the updated residual.
template<class Fobj,class CComplex,int nbasis>
class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField FineField;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
typedef LinearOperatorBase<FineField> FineOperator;
typedef LinearFunction <FineField> FineSmoother;
typedef LinearOperatorBase<CoarseVector> CoarseOperator;
typedef LinearFunction <CoarseVector> CoarseSolver;
Aggregates & _Aggregates;      // near-null subspace: restrict & prolong
FineOperator & _FineOperator;  // fine-grid operator M
FineSmoother & _PreSmoother;
FineSmoother & _PostSmoother;
CoarseOperator & _CoarseOperator;
CoarseSolver & _CoarseSolve;
// Multigrid level label, used only for log annotation by the solvers.
int level; void Level(int lv) {level = lv; };
// All collaborators are held by reference; caller keeps them alive.
MGPreconditioner(Aggregates &Agg,
FineOperator &Fine,
FineSmoother &PreSmoother,
FineSmoother &PostSmoother,
CoarseOperator &CoarseOperator_,
CoarseSolver &CoarseSolve_)
: _Aggregates(Agg),
_FineOperator(Fine),
_PreSmoother(PreSmoother),
_PostSmoother(PostSmoother),
_CoarseOperator(CoarseOperator_),
_CoarseSolve(CoarseSolve_),
level(1) { }
// Apply one V-cycle: out approximates M^{-1} in.
virtual void operator()(const FineField &in, FineField & out)
{
GridBase *CoarseGrid = _Aggregates.CoarseGrid;
// auto CoarseGrid = _CoarseOperator.Grid();
CoarseVector Csrc(CoarseGrid);
CoarseVector Csol(CoarseGrid);
FineField vec1(in.Grid());
FineField vec2(in.Grid());
std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
// std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
double t;
// Fine Smoother
// out = in;
out = Zero();
t=-usecond();
_PreSmoother(in,out);
t+=usecond();
std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;
// Update the residual
// NOTE(review): vec1 aliases both source and destination here; assumes
// Grid's sub(ret,a,b) = a-b supports ret aliasing b -- do not reorder.
_FineOperator.Op(out,vec1); sub(vec1, in ,vec1);
// std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
// Fine to Coarse
t=-usecond();
_Aggregates.ProjectToSubspace (Csrc,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
// Coarse correction
t=-usecond();
Csol = Zero();
_CoarseSolve(Csrc,Csol);
//Csol=Zero();
t+=usecond();
std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
// Coarse to Fine
t=-usecond();
// _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
_Aggregates.PromoteFromSubspace(Csol,vec1);
add(out,out,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
// Residual after the coarse correction
_FineOperator.Op(out,vec1); sub(vec1 ,in , vec1);
// std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
// Fine Smoother -- post-smooth the remaining residual and accumulate
t=-usecond();
// vec2=vec1;
vec2=Zero();
_PostSmoother(vec1,vec2);
t+=usecond();
std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
add( out,out,vec2);
std::cout<<GridLogMessage << "Done " <<std::endl;
}
};
// Two-level multigrid test driver for the Wilson-clover operator:
//  1. read a NERSC gauge configuration,
//  2. build a GCR-generated near-null subspace and coarsen the operator,
//  3. sanity-check the coarse (little Dirac) operator against the fine one,
//  4. run a CG reference solve, then a two-level PGCR with MG preconditioning.
int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  const int Ls=16;

  // Four-dimensional fine grids; this test is 4d so FGrid aliases UGrid.
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = UGrid;
  GridRedBlackCartesian * FrbGrid = UrbGrid;

  // Coarse grid: block the fine lattice by 2 in every direction.
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/2;
    //    clatt[d] = clatt[d]/4;
  }
  GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());

  // Fixed seeds so runs are reproducible.
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> cseeds({5,6,7,8});
  GridParallelRNG RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG CRNG(Coarse4d);CRNG.SeedFixedIntegers(cseeds);

  Complex one(1.0);
  LatticeFermion src(FGrid);    src=one;
  LatticeFermion result(FGrid); result=Zero();
  LatticeFermion ref(FGrid);    ref=Zero();
  LatticeFermion tmp(FGrid);
  LatticeFermion err(FGrid);
  LatticeFermion precsrc(FGrid);
  LatticeGaugeField Umu(UGrid);

  // Gauge field read from disk; "ckpoint_lat" must exist in the run directory.
  FieldMetaData header;
  std::string file("ckpoint_lat");
  NerscIO::readConfiguration(Umu,header,file);

  // csw=0 reduces the clover term to zero; mass is tuned near critical.
  RealD csw =0.0;
  RealD mass=-0.92;
  WilsonCloverFermionD Dw(Umu,*UGrid,*UrbGrid,mass,csw,csw);

  const int nbasis = 40;  // number of near-null vectors in the coarse basis
  const int cb = 0 ;
  LatticeFermion prom(FGrid);

  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;

  NearestStencilGeometry4D geom(Coarse4d);

  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;

  // Warning: This routine calls Linop.Op, not LinOp.HermOp
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace Aggregates(Coarse4d,FGrid,cb);

  NonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion>        LinOpDw(Dw);
  ShiftedNonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> ShiftedLinOpDw(Dw,0.01);

  // Generate the near-null subspace with GCR solves on the non-Hermitian operator.
  Aggregates.CreateSubspaceGCR(RNG4,
                               LinOpDw,
                               nbasis);

  LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse4d);
  LittleDiracOp.CoarsenOperator(LinOpDw,Aggregates);

  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;

  // Consistency check: projecting D(prom) to the subspace must agree with
  // applying the little Dirac operator to the coarse source.
  CoarseVector c_src (Coarse4d);
  CoarseVector c_res (Coarse4d);
  CoarseVector c_proj(Coarse4d);

  std::vector<LatticeFermion> subspace(nbasis,FGrid);
  subspace=Aggregates.subspace;

  c_src = one;  // 1 in every element for vector 1.
  blockPromote(c_src,err,subspace);

  prom=Zero();
  for(int b=0;b<nbasis;b++){
    prom=prom+subspace[b];
  }
  err=err-prom;
  std::cout<<GridLogMessage<<"Promoted back from subspace: err "<<norm2(err)<<std::endl;
  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;

  LinOpDw.Op(prom,tmp);
  blockProject(c_proj,tmp,subspace);
  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;

  LittleDiracOp.M(c_src,c_res);
  std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << "  c_res "<< norm2(c_res) <<std::endl;
  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
  //  std::cout<<GridLogMessage<<" Little "<< c_res<<std::endl;
  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
  //  std::cout<<GridLogMessage<<" Big "<< c_proj<<std::endl;
  c_proj = c_proj - c_res;
  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
  //  std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;

  /**********
   * Some solvers
   **********
   */

  // CG reference solve of the normal equations: (Mdag M) x = Mdag src.
  // The result is discarded afterwards; this is a timing/sanity run only.
  {
    // BUGFIX: the operator template parameter must match the actual fermion
    // type of Dw (WilsonCloverFermionD); it previously said WilsonFermionD.
    MdagMLinearOperator<WilsonCloverFermionD,LatticeFermion> HermOp(Dw);
    ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
    Dw.Mdag(src,precsrc);
    CG(HermOp,precsrc,result);
    result=Zero();
  }

  ///////////////////////////////////////
  // Coarse grid solver test
  ///////////////////////////////////////
  std::cout<<GridLogMessage<<"*******************  "<<std::endl;
  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
  std::cout<<GridLogMessage<<"*******************  "<<std::endl;
  TrivialPrecon<CoarseVector> simple;
  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector>        LinOpCoarse(LittleDiracOp);
  ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LittleDiracOp,0.001);
  //  ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LittleDiracOp,0.01);
  //  ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LinOpCoarse,0.001);
  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10);
  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-1, 100, LinOpCoarse,simple,30,30);
  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(2.0e-1, 50, ShiftedLinOpCoarse,simple,50,50);
  L2PGCR.Level(3);
  c_res=Zero();
  L2PGCR(c_src,c_res);

  ////////////////////////////////////////
  // Fine grid smoother
  ////////////////////////////////////////
  std::cout<<GridLogMessage<<"*******************  "<<std::endl;
  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
  std::cout<<GridLogMessage<<"*******************  "<<std::endl;
  TrivialPrecon<LatticeFermionD> simple_fine;
  // One-iteration shifted-operator GCR acting as the MG smoother.
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.1,1,ShiftedLinOpDw,simple_fine,6,6);
  SmootherGCR.Level(2);

  LatticeFermionD f_src(FGrid);
  LatticeFermionD f_res(FGrid);

  f_src = one;  // 1 in every element for vector 1.
  f_res=Zero();
  SmootherGCR(f_src,f_res);

  // Outer flexible GCR preconditioned by the two-level V-cycle.
  typedef MGPreconditioner<vSpinColourVector, vTComplex,nbasis> TwoLevelMG;
  TwoLevelMG TwoLevelPrecon(Aggregates,
                            LinOpDw,
                            simple_fine,
                            SmootherGCR,
                            LinOpCoarse,
                            L2PGCR);
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,LinOpDw,TwoLevelPrecon,16,16);
  L1PGCR.Level(1);
  f_res=Zero();
  L1PGCR(f_src,f_res);

  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  Grid_finalize();
  return 0;
}

View File

@@ -0,0 +1,320 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_padded_cell.cc
Copyright (C) 2023
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
#include <Grid/algorithms/iterative/BiCGSTAB.h>
using namespace std;
using namespace Grid;
// Two-level multigrid V-cycle preconditioner:
//   pre-smooth -> coarse-grid correction -> post-smooth,
// intended for use inside an outer (flexible) GCR solve.
// Fobj/CComplex/nbasis fix the fine field type and coarse basis size via
// the Aggregation class.  All collaborators are held by reference; the
// caller owns them and must keep them alive while this object is in use.
template<class Fobj,class CComplex,int nbasis>
class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField FineField;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
typedef LinearOperatorBase<FineField> FineOperator;
typedef LinearFunction <FineField> FineSmoother;
typedef LinearOperatorBase<CoarseVector> CoarseOperator;
typedef LinearFunction <CoarseVector> CoarseSolver;
Aggregates & _Aggregates;
FineOperator & _FineOperator;
FineSmoother & _PreSmoother;
FineSmoother & _PostSmoother;
CoarseOperator & _CoarseOperator;
CoarseSolver & _CoarseSolve;
// Multigrid level label; only used for identification/logging by callers.
int level; void Level(int lv) {level = lv; };
MGPreconditioner(Aggregates &Agg,
FineOperator &Fine,
FineSmoother &PreSmoother,
FineSmoother &PostSmoother,
CoarseOperator &CoarseOperator_,
CoarseSolver &CoarseSolve_)
: _Aggregates(Agg),
_FineOperator(Fine),
_PreSmoother(PreSmoother),
_PostSmoother(PostSmoother),
_CoarseOperator(CoarseOperator_),
_CoarseSolve(CoarseSolve_),
level(1) { }
// Apply one V-cycle so that out approximates A^{-1} in for the fine
// operator A (= _FineOperator.Op).  'out' is fully overwritten.
virtual void operator()(const FineField &in, FineField & out)
{
GridBase *CoarseGrid = _Aggregates.CoarseGrid;
// auto CoarseGrid = _CoarseOperator.Grid();
CoarseVector Csrc(CoarseGrid);
CoarseVector Csol(CoarseGrid);
FineField vec1(in.Grid());
FineField vec2(in.Grid());
std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
// std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
double t;
// Fine pre-smoother, starting from a zero guess.
// out = in;
out = Zero();
t=-usecond();
_PreSmoother(in,out);
t+=usecond();
std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;
// Update the residual: vec1 = in - A*out
_FineOperator.Op(out,vec1); sub(vec1, in ,vec1);
// std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
// Fine to Coarse: restrict the residual onto the coarse basis.
t=-usecond();
_Aggregates.ProjectToSubspace (Csrc,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
// Coarse correction: (approximately) solve the coarse system for Csol.
t=-usecond();
Csol = Zero();
_CoarseSolve(Csrc,Csol);
//Csol=Zero();
t+=usecond();
std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
// Coarse to Fine: prolong the coarse correction and add it to out.
t=-usecond();
// _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
_Aggregates.PromoteFromSubspace(Csol,vec1);
add(out,out,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
// Recompute the residual after the coarse correction.
_FineOperator.Op(out,vec1); sub(vec1 ,in , vec1);
// std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
// Fine post-smoother on the remaining residual, again from zero.
t=-usecond();
// vec2=vec1;
vec2=Zero();
_PostSmoother(vec1,vec2);
t+=usecond();
std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
add( out,out,vec2);
std::cout<<GridLogMessage << "Done " <<std::endl;
}
};
// Two-level multigrid test using a chirally doubled coarse basis:
// nbasis near-null vectors are generated once, then doubled to 2*nbasis
// by appending their Gamma5 images before coarsening the operator.
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=16;
// Four-dimensional fine grids; this test is 4d so FGrid aliases UGrid.
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = UGrid;
GridRedBlackCartesian * FrbGrid = UrbGrid;
// Construct a coarsened grid (blocking factor 2 per direction)
Coordinate clatt = GridDefaultLatt();
for(int d=0;d<clatt.size();d++){
clatt[d] = clatt[d]/2;
// clatt[d] = clatt[d]/4;
}
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
// Fixed seeds for reproducible fine and coarse RNGs.
std::vector<int> seeds4({1,2,3,4});
std::vector<int> cseeds({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG CRNG(Coarse4d);CRNG.SeedFixedIntegers(cseeds);
LatticeFermion src(FGrid); random(RNG4,src);
LatticeFermion result(FGrid); result=Zero();
LatticeFermion ref(FGrid); ref=Zero();
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
LatticeGaugeField Umu(UGrid);
// Gauge field read from disk; "ckpoint_lat" must exist in the run directory.
FieldMetaData header;
std::string file("ckpoint_lat");
NerscIO::readConfiguration(Umu,header,file);
// csw=0 reduces the clover term to zero; mass is tuned near critical.
RealD csw =0.0;
RealD mass=-0.92;
WilsonCloverFermionD Dw(Umu,*UGrid,*UrbGrid,mass,csw,csw);
// nbasis generated vectors; the coarse operator uses 2*nbasis after
// chiral doubling below.
const int nbasis = 20;
const int cb = 0 ;
LatticeFermion prom(FGrid);
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,2*nbasis> LittleDiracOperator;
typedef LittleDiracOperator::CoarseVector CoarseVector;
NearestStencilGeometry4D geom(Coarse4d);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
// Warning: This routine calls Linop.Op, not LinOpo.HermOp
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
Subspace Aggregates(Coarse4d,FGrid,cb);
MdagMLinearOperator<WilsonCloverFermionD,LatticeFermion> MdagMOpDw(Dw);
NonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> LinOpDw(Dw);
ShiftedNonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> ShiftedLinOpDw(Dw,0.5);
// Subspace built here from the Hermitian MdagM operator (GCR variant kept
// commented out for comparison).
// Aggregates.CreateSubspaceGCR(RNG4,
// LinOpDw,
// nbasis);
Aggregates.CreateSubspace(RNG4,MdagMOpDw,nbasis);
// Chiral doubling: coarse basis = { v_b } U { Gamma5 v_b }, b < nbasis.
typedef Aggregation<vSpinColourVector,vTComplex,2*nbasis> CombinedSubspace;
CombinedSubspace CombinedUV(Coarse4d,UGrid,cb);
for(int b=0;b<nbasis;b++){
Gamma G5(Gamma::Algebra::Gamma5);
CombinedUV.subspace[b] = Aggregates.subspace[b];
CombinedUV.subspace[b+nbasis] = G5*Aggregates.subspace[b];
}
LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse4d);
LittleDiracOp.CoarsenOperator(LinOpDw,CombinedUV);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
// Consistency check: project D(prom) to the subspace and compare with the
// little Dirac operator applied to the coarse source.
CoarseVector c_src (Coarse4d);
CoarseVector c_res (Coarse4d);
CoarseVector c_proj(Coarse4d);
std::vector<LatticeFermion> subspace(2*nbasis,FGrid);
subspace=CombinedUV.subspace;
Complex one(1.0);
c_src = one; // 1 in every element for vector 1.
blockPromote(c_src,err,subspace);
prom=Zero();
for(int b=0;b<2*nbasis;b++){
prom=prom+subspace[b];
}
err=err-prom;
std::cout<<GridLogMessage<<"Promoted back from subspace: err "<<norm2(err)<<std::endl;
std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
std::cout<<GridLogMessage<<"prom "<<norm2(prom)<<std::endl;
LinOpDw.Op(prom,tmp);
blockProject(c_proj,tmp,subspace);
std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
LittleDiracOp.M(c_src,c_res);
std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << " c_res "<< norm2(c_res) <<std::endl;
std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
// std::cout<<GridLogMessage<<" Little "<< c_res<<std::endl;
std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
// std::cout<<GridLogMessage<<" Big "<< c_proj<<std::endl;
c_proj = c_proj - c_res;
std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
// std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;
/**********
* Some solvers
**********
*/
///////////////////////////////////////
// Coarse grid solver test
///////////////////////////////////////
std::cout<<GridLogMessage<<"******************* "<<std::endl;
std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
std::cout<<GridLogMessage<<"******************* "<<std::endl;
TrivialPrecon<CoarseVector> simple;
NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOp);
// PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10);
PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-2, 100, LinOpCoarse,simple,30,30);
L2PGCR.Level(3);
c_res=Zero();
L2PGCR(c_src,c_res);
////////////////////////////////////////
// Fine grid smoother
////////////////////////////////////////
std::cout<<GridLogMessage<<"******************* "<<std::endl;
std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
std::cout<<GridLogMessage<<"******************* "<<std::endl;
TrivialPrecon<LatticeFermionD> simple_fine;
// Single-iteration shifted-operator GCR used as the MG smoother.
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedLinOpDw,simple_fine,4,4);
SmootherGCR.Level(2);
LatticeFermionD f_src(FGrid);
LatticeFermionD f_res(FGrid);
f_src = one; // 1 in every element for vector 1.
f_res=Zero();
SmootherGCR(f_src,f_res);
// Outer flexible GCR preconditioned by the two-level V-cycle over the
// chirally doubled (2*nbasis) subspace.
typedef MGPreconditioner<vSpinColourVector, vTComplex,2*nbasis> TwoLevelMG;
TwoLevelMG TwoLevelPrecon(CombinedUV,
LinOpDw,
simple_fine,
SmootherGCR,
LinOpCoarse,
L2PGCR);
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,LinOpDw,TwoLevelPrecon,32,32);
L1PGCR.Level(1);
f_res=Zero();
L1PGCR(f_src,f_res);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage << "Done "<< std::endl;
Grid_finalize();
return 0;
}

View File

@@ -0,0 +1,312 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_padded_cell.cc
Copyright (C) 2023
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
#include <Grid/algorithms/iterative/BiCGSTAB.h>
using namespace std;
using namespace Grid;
// Two-level multigrid V-cycle preconditioner:
//   pre-smooth -> coarse-grid correction -> post-smooth,
// intended for use inside an outer (flexible) GCR solve.
// Fobj/CComplex/nbasis fix the fine field type and coarse basis size via
// the Aggregation class.  All collaborators are held by reference; the
// caller owns them and must keep them alive while this object is in use.
template<class Fobj,class CComplex,int nbasis>
class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField FineField;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
typedef LinearOperatorBase<FineField> FineOperator;
typedef LinearFunction <FineField> FineSmoother;
typedef LinearOperatorBase<CoarseVector> CoarseOperator;
typedef LinearFunction <CoarseVector> CoarseSolver;
Aggregates & _Aggregates;
FineOperator & _FineOperator;
FineSmoother & _PreSmoother;
FineSmoother & _PostSmoother;
CoarseOperator & _CoarseOperator;
CoarseSolver & _CoarseSolve;
// Multigrid level label; only used for identification/logging by callers.
int level; void Level(int lv) {level = lv; };
MGPreconditioner(Aggregates &Agg,
FineOperator &Fine,
FineSmoother &PreSmoother,
FineSmoother &PostSmoother,
CoarseOperator &CoarseOperator_,
CoarseSolver &CoarseSolve_)
: _Aggregates(Agg),
_FineOperator(Fine),
_PreSmoother(PreSmoother),
_PostSmoother(PostSmoother),
_CoarseOperator(CoarseOperator_),
_CoarseSolve(CoarseSolve_),
level(1) { }
// Apply one V-cycle so that out approximates A^{-1} in for the fine
// operator A (= _FineOperator.Op).  'out' is fully overwritten.
virtual void operator()(const FineField &in, FineField & out)
{
GridBase *CoarseGrid = _Aggregates.CoarseGrid;
// auto CoarseGrid = _CoarseOperator.Grid();
CoarseVector Csrc(CoarseGrid);
CoarseVector Csol(CoarseGrid);
FineField vec1(in.Grid());
FineField vec2(in.Grid());
std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
// std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
double t;
// Fine pre-smoother, starting from a zero guess.
// out = in;
out = Zero();
t=-usecond();
_PreSmoother(in,out);
t+=usecond();
std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;
// Update the residual: vec1 = in - A*out
_FineOperator.Op(out,vec1); sub(vec1, in ,vec1);
// std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
// Fine to Coarse: restrict the residual onto the coarse basis.
t=-usecond();
_Aggregates.ProjectToSubspace (Csrc,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
// Coarse correction: (approximately) solve the coarse system for Csol.
t=-usecond();
Csol = Zero();
_CoarseSolve(Csrc,Csol);
//Csol=Zero();
t+=usecond();
std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
// Coarse to Fine: prolong the coarse correction and add it to out.
t=-usecond();
// _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
_Aggregates.PromoteFromSubspace(Csol,vec1);
add(out,out,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
// Recompute the residual after the coarse correction.
_FineOperator.Op(out,vec1); sub(vec1 ,in , vec1);
// std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
// Fine post-smoother on the remaining residual, again from zero.
t=-usecond();
// vec2=vec1;
vec2=Zero();
_PostSmoother(vec1,vec2);
t+=usecond();
std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
add( out,out,vec2);
std::cout<<GridLogMessage << "Done " <<std::endl;
}
};
// Two-level multigrid test with a 40-vector coarse basis built from the
// Hermitian MdagM operator (no chiral doubling in this variant).
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=16;
// Four-dimensional fine grids; this test is 4d so FGrid aliases UGrid.
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = UGrid;
GridRedBlackCartesian * FrbGrid = UrbGrid;
// Construct a coarsened grid (blocking factor 2 per direction)
Coordinate clatt = GridDefaultLatt();
for(int d=0;d<clatt.size();d++){
clatt[d] = clatt[d]/2;
// clatt[d] = clatt[d]/4;
}
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
// Fixed seeds for reproducible fine and coarse RNGs.
std::vector<int> seeds4({1,2,3,4});
std::vector<int> cseeds({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG CRNG(Coarse4d);CRNG.SeedFixedIntegers(cseeds);
LatticeFermion src(FGrid); random(RNG4,src);
LatticeFermion result(FGrid); result=Zero();
LatticeFermion ref(FGrid); ref=Zero();
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
LatticeGaugeField Umu(UGrid);
// Gauge field read from disk; "ckpoint_lat" must exist in the run directory.
FieldMetaData header;
std::string file("ckpoint_lat");
NerscIO::readConfiguration(Umu,header,file);
// csw=0 reduces the clover term to zero; mass is tuned near critical.
RealD csw =0.0;
RealD mass=-0.92;
WilsonCloverFermionD Dw(Umu,*UGrid,*UrbGrid,mass,csw,csw);
const int nbasis = 40;
const int cb = 0 ;
LatticeFermion prom(FGrid);
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
typedef LittleDiracOperator::CoarseVector CoarseVector;
NearestStencilGeometry4D geom(Coarse4d);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
// Warning: This routine calls Linop.Op, not LinOpo.HermOp
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
Subspace Aggregates(Coarse4d,FGrid,cb);
MdagMLinearOperator<WilsonCloverFermionD,LatticeFermion> MdagMOpDw(Dw);
NonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> LinOpDw(Dw);
ShiftedNonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> ShiftedLinOpDw(Dw,0.5);
// Subspace built from the Hermitian MdagM operator (GCR variant kept
// commented out for comparison).
// Aggregates.CreateSubspaceGCR(RNG4,
// LinOpDw,
// nbasis);
Aggregates.CreateSubspace(RNG4,MdagMOpDw,nbasis);
LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse4d);
LittleDiracOp.CoarsenOperator(LinOpDw,Aggregates);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
// Consistency check: project D(prom) to the subspace and compare with the
// little Dirac operator applied to the coarse source.
CoarseVector c_src (Coarse4d);
CoarseVector c_res (Coarse4d);
CoarseVector c_proj(Coarse4d);
std::vector<LatticeFermion> subspace(nbasis,FGrid);
subspace=Aggregates.subspace;
Complex one(1.0);
c_src = one; // 1 in every element for vector 1.
blockPromote(c_src,err,subspace);
prom=Zero();
for(int b=0;b<nbasis;b++){
prom=prom+subspace[b];
}
err=err-prom;
std::cout<<GridLogMessage<<"Promoted back from subspace: err "<<norm2(err)<<std::endl;
std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
std::cout<<GridLogMessage<<"prom "<<norm2(prom)<<std::endl;
LinOpDw.Op(prom,tmp);
blockProject(c_proj,tmp,subspace);
std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
LittleDiracOp.M(c_src,c_res);
std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << " c_res "<< norm2(c_res) <<std::endl;
std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
// std::cout<<GridLogMessage<<" Little "<< c_res<<std::endl;
std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
// std::cout<<GridLogMessage<<" Big "<< c_proj<<std::endl;
c_proj = c_proj - c_res;
std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
// std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;
/**********
* Some solvers
**********
*/
///////////////////////////////////////
// Coarse grid solver test
///////////////////////////////////////
std::cout<<GridLogMessage<<"******************* "<<std::endl;
std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
std::cout<<GridLogMessage<<"******************* "<<std::endl;
TrivialPrecon<CoarseVector> simple;
NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOp);
// PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10);
PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-2, 100, LinOpCoarse,simple,30,30);
L2PGCR.Level(3);
c_res=Zero();
L2PGCR(c_src,c_res);
////////////////////////////////////////
// Fine grid smoother
////////////////////////////////////////
std::cout<<GridLogMessage<<"******************* "<<std::endl;
std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
std::cout<<GridLogMessage<<"******************* "<<std::endl;
TrivialPrecon<LatticeFermionD> simple_fine;
// Single-iteration shifted-operator GCR used as the MG smoother.
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedLinOpDw,simple_fine,6,6);
SmootherGCR.Level(2);
LatticeFermionD f_src(FGrid);
LatticeFermionD f_res(FGrid);
f_src = one; // 1 in every element for vector 1.
f_res=Zero();
SmootherGCR(f_src,f_res);
// Outer flexible GCR preconditioned by the two-level V-cycle.
typedef MGPreconditioner<vSpinColourVector, vTComplex,nbasis> TwoLevelMG;
TwoLevelMG TwoLevelPrecon(Aggregates,
LinOpDw,
simple_fine,
SmootherGCR,
LinOpCoarse,
L2PGCR);
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,LinOpDw,TwoLevelPrecon,32,32);
L1PGCR.Level(1);
f_res=Zero();
L1PGCR(f_src,f_res);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage << "Done "<< std::endl;
Grid_finalize();
return 0;
}

View File

@@ -490,7 +490,7 @@ public:
}
}
GRID_ASSERT(s==nshift);
assert(s==nshift);
coalescedWrite(gStaple_v[ss],stencil_ss);
}
);

View File

@@ -0,0 +1,16 @@
#include <Grid/Grid.h>
// When Grid is built without fermion instantiations, compile a stub main
// that reports the misconfiguration instead of the real test.
// (Removed a stray `#pragma once`: it is an include guard and has no place
// in a translation unit that defines main.)
#ifndef ENABLE_FERMION_INSTANTIATIONS
#include <iostream>
int main(void) {
  // BUGFIX: added the missing space before "to run" so the message does not
  // read "...instantiationsto run this test."
  std::cout << "This build of Grid was configured to exclude fermion instantiations, "
            << "which this test relies on. "
            << "Please reconfigure and rebuild Grid with --enable-fermion-instantiations "
            << "to run this test."
            << std::endl;
  return 1;
}
#endif

View File

@@ -1,14 +1,18 @@
<?xml version="1.0"?>
<grid>
<LanczosParameters>
<mass>0.00107</mass>
<mass>0</mass>
<mstep>-0.025</mstep>
<M5>1.8</M5>
<Ls>48</Ls>
<Nstop>10</Nstop>
<Nk>15</Nk>
<Np>85</Np>
<ChebyLow>0.003</ChebyLow>
<ChebyHigh>60</ChebyHigh>
<ChebyOrder>201</ChebyOrder>
<Nstop>5</Nstop>
<Nk>5</Nk>
<Np>5</Np>
<ReadEvec>0</ReadEvec>
<maxIter>10000</maxIter>
<resid>1e-10</resid>
<ChebyLow>1</ChebyLow>
<ChebyHigh>100</ChebyHigh>
<ChebyOrder>51</ChebyOrder>
</LanczosParameters>
</grid>

View File

@@ -32,9 +32,13 @@ directory
using namespace std;
using namespace Grid;
//typedef WilsonFermionD FermionOp;
#if 0
typedef DomainWallFermionD FermionOp;
typedef typename DomainWallFermionD::FermionField FermionField;
#else
typedef MobiusFermionD FermionOp;
typedef typename MobiusFermionD::FermionField FermionField;
#endif
template <class T> void writeFile(T& in, std::string const fname){
#ifdef HAVE_LIME
@@ -179,12 +183,14 @@ int main(int argc, char** argv) {
Np=LanParams.Np;
int Nm = Nk + Np;
int MaxIt = 100;
RealD resid = 1.0e-4;
int MaxIt = 10000;
RealD resid = 1.0e-5;
RealD mob_b=1.5;
//while ( mass > - 5.0){
FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
// FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.);
MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <-----
// Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf);

View File

@@ -113,6 +113,9 @@ struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, resid,
Integer, Nstop,
Integer, Nk,
Integer, Np,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
@@ -204,7 +207,6 @@ int main(int argc, char** argv) {
int Nstop = 5;
int Nk = 10;
int Np = 90;
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
@@ -226,10 +228,14 @@ int main(int argc, char** argv) {
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
mass=LanParams.mass;
resid=LanParams.resid;
int Nm = Nk + Np;
while ( mass > - 5.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass);

View File

@@ -0,0 +1,377 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/lanczos/Test_wilson_bilanczos.cc
Copyright (C) 2025
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <cstdlib>
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
#include <Grid/algorithms/iterative/BiCGSTAB.h>
using namespace std;
using namespace Grid;
namespace Grid {
// Serializable parameter bundle for the Lanczos driver: target mass and
// scan step, eigensolver sizes (Nstop/Nk/Np), restart/IO flags, stopping
// criteria, and the Chebyshev filter window/order.
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, mstep ,
Integer, Nstop,
Integer, Nk,
Integer, Np,
Integer, ReadEvec,
Integer, maxIter,
RealD, resid,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
LanczosParameters() {
////////////////////////////// Default values
// Only mass is defaulted here; all other members are expected to be
// populated by deserialization before use.
mass = 0;
/////////////////////////////////
}
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
// Populate this struct from a reader.
// NOTE(review): this reads the "HMC" node, but the struct is written out
// elsewhere under the "LanczosParameters" tag (and the sample XML uses
// <LanczosParameters>) -- confirm the node name matches the parameter
// files actually in use.
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
// Intentionally silent for now; the commented lines show the intended output.
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
template <class T> void writeFile(T& in, std::string const fname){
#if 1
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.close();
#endif
// What is the appropriate way to throw error?
}
// This test targets the 4d double-precision Wilson fermion operator.
typedef WilsonFermionD WilsonOp;
typedef typename WilsonFermionD::FermionField FermionField;
// Linear operator wrapper that applies an (approximate) inverse of a
// non-Hermitian matrix M.  Op(in,out) solves M out = in via the normal
// equations: tmp = Mdag in, then CG on (MdagM) out = tmp, so out ~= M^{-1} in.
// Only Op/AdjOp are implemented; the multigrid/coarsening hooks and the
// Hermitian entry points assert out.
// NOTE(review): AdjOp applies Mdag itself, NOT the inverse of Mdag -- confirm
// this asymmetry is intentional for whichever Krylov driver consumes it.
template<class Matrix,class Field>
class InvertNonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;  // underlying non-Hermitian matrix, held by reference
RealD _stp;    // CG residual tolerance used inside Op()
public:
InvertNonHermitianLinearOperator(Matrix &Mat,RealD stp=1e-8): _Mat(Mat),_stp(stp){};
// Support for coarsening to a multigrid
// Deliberately unimplemented for this test: abort if ever called.
void OpDiag (const Field &in, Field &out) {
// _Mat.Mdiag(in,out);
// out = out + shift*in;
assert(0);
}
// Unimplemented: abort if ever called.
void OpDir (const Field &in, Field &out,int dir,int disp) {
// _Mat.Mdir(in,out,dir,disp);
assert(0);
}
// Unimplemented: abort if ever called.
void OpDirAll (const Field &in, std::vector<Field> &out){
// _Mat.MdirAll(in,out);
assert(0);
};
// out ~= M^{-1} in, computed by CG on the normal equations.
// NOTE(review): CG uses the incoming `out` as its initial guess -- it is
// not zeroed here, so callers must pass a well-defined field; confirm.
void Op (const Field &in, Field &out){
Field tmp(in.Grid());
_Mat.Mdag(in,tmp);
MdagMLinearOperator<Matrix,Field> HermOp(_Mat);
ConjugateGradient<Field> CG(_stp,10000);
CG(HermOp,tmp,out);
}
// out = Mdag in (the adjoint of M, not of the inverse -- see class note).
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
// out = out + shift * in;
}
// Hermitian interface is not supported by this wrapper.
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
assert(0);
}
// Hermitian interface is not supported by this wrapper.
void HermOp(const Field &in, Field &out){
assert(0);
}
};
/// @brief Exercise eigenvalue swapping and reordering on a complex Schur
///        decomposition obtained from one Arnoldi-generated Hessenberg matrix.
///
/// Runs a single Arnoldi iteration to populate a Hessenberg matrix, forms its
/// complex Schur decomposition, then (1) swaps one adjacent diagonal pair,
/// (2) bubbles the last diagonal entry of the leading Nk block to the front
/// one adjacent swap at a time, and (3) invokes the bulk schurReorder entry
/// point -- calling checkDecomposition() after every step.
///
/// @param Arn     Arnoldi process wrapping the operator under test
/// @param src     starting vector for the Arnoldi iteration
/// @param Nlarge  size argument forwarded to the Arnoldi call -- semantics
///                defined by Arnoldi::operator() (not visible here)
/// @param Nm      subspace size reported in the banner and passed to Arnoldi
/// @param Nk      extent of the leading block being reordered
template<class Field>
void testSchurFromHess(Arnoldi<Field>& Arn, Field& src, int Nlarge, int Nm, int Nk) {
  std::cout << GridLogMessage << "*******************************************" << std::endl;
  std::cout << GridLogMessage << "Testing Schur reordering, Nm = " << Nm << ", Nk = " << Nk << std::endl;
  std::cout << GridLogMessage << "*******************************************" << std::endl;

  // One Arnoldi sweep is enough to fill the Hessenberg matrix we need.
  std::cout << GridLogMessage << "Running Arnoldi for 1 iteration to get a Hessenberg." << std::endl;
  Arn(src, 1, Nlarge, Nm, Nlarge);
  Eigen::MatrixXcd H = Arn.getHessenbergMat();
  std::cout << GridLogMessage << "Hessenberg for use: " << std::endl << H << std::endl;

  ComplexSchurDecomposition decomp(H, true);
  std::cout << "Schur decomp holds? " << decomp.checkDecomposition() << std::endl;
  std::cout << GridLogMessage << "S = " << std::endl << decomp.getMatrixS() << std::endl;

  // Single adjacent swap: exchange diagonal entries 3 and 4.
  std::cout << GridLogMessage << "Swapping S(3, 3) with S(4, 4)" << std::endl;
  decomp.swapEvals(3);
  std::cout << GridLogMessage << "S after swap = " << std::endl << decomp.getMatrixS() << std::endl;
  std::cout << "Schur decomp still holds? " << decomp.checkDecomposition() << std::endl;

  // Bubble the last diagonal element of the Nk block to the front via
  // successive adjacent swaps, validating the factorisation each time.
  std::cout << GridLogMessage << "Moving last eval to front. S at start = " << std::endl << decomp.getMatrixS() << std::endl;
  for (int swapIdx = Nk - 2; swapIdx >= 0; --swapIdx) {
    decomp.swapEvals(swapIdx);
    std::cout << GridLogMessage << "S after swap of index " << swapIdx << " = " << std::endl << decomp.getMatrixS() << std::endl;
    std::cout << "Schur decomp still holds? " << decomp.checkDecomposition() << std::endl;
  }

  // Finally exercise the bulk reorder entry point on the leading Nk block.
  std::cout << GridLogMessage << "Testing Schur reorder" << std::endl;
  decomp.schurReorder(Nk);
  std::cout << GridLogMessage << "S after reorder = " << std::endl << decomp.getMatrixS() << std::endl;
  std::cout << "Schur decomp still holds? " << decomp.checkDecomposition() << std::endl;
}
// Test driver: load a NERSC gauge configuration and LanParams.xml, build a 4d
// Wilson operator with antiperiodic time boundary, then run the Lanczos
// bidiagonalization pair (plain followed by thick-restarted) on a source
// vector.  A KrylovSchur path and SciDAC eigenvector dumps are retained under
// "#if 0" for reference.
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=16;   // 5th-dimension extent; unused below since FGrid aliases the 4d UGrid
// GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
// std::vector<int> lat_size {32, 32, 32, 32};
// std::cout << "Lattice size: " << lat_size << std::endl;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
// GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
// GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
// Fermion grids alias the 4d gauge grids (4d Wilson: no 5th dimension).
GridCartesian * FGrid = UGrid;
GridRedBlackCartesian * FrbGrid = UrbGrid;
// Construct a coarsened grid
// poare TODO: replace this with the following line?
Coordinate clatt = GridDefaultLatt();
// Coordinate clatt = GridDefaultLatt(); // [PO] initial line before I edited it
// Halve the lattice extent in every direction to define the coarse grid.
for(int d=0;d<clatt.size();d++){
std::cout << GridLogMessage<< clatt[d] <<std::endl;
clatt[d] = clatt[d]/2;
// clatt[d] = clatt[d]/4;
}
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
// Fixed RNG seeds for reproducible runs.
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::vector<int> cseeds({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
// Scratch fields -- several of these are scaffolding and never read below.
LatticeFermion result(FGrid); result=Zero();
LatticeFermion ref(FGrid); ref=Zero();
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
// Gauge configuration: read in NERSC format from the file "config".
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
std::string file("config");
// std::string file("Users/patrickoare/libraries/PETSc-Grid/ckpoint_lat.4000");
NerscIO::readConfiguration(Umu,header,file);
// Read solver parameters, echo them to the log and to LanParams.xml.out.
LanczosParameters LanParams;
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
RealD mass=0.01;   // placeholder; overwritten from LanParams below
RealD M5=1.8;      // domain-wall height; unused in this 4d Wilson setup
// PowerMethod<LatticeFermion> PM; PM(PVdagM, src);
// Solver-size defaults; all of these are overwritten from LanParams below.
int Nm = 50;
int Nk = 12;
int Np = 38;
// int Nk = Nm+1; // if just running once
int maxIter = 10000;
int Nstop = 10;
RealD resid = 1.0e-5;
// Antiperiodic boundary condition in the time direction.
std::vector<Complex> boundary = {1,1,1,-1};
WilsonOp::ImplParams Params(boundary);
// DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
// DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
mass=LanParams.mass;
std::cout << GridLogIRL<< "mass "<<mass<<std::endl;
WilsonOp WilsonOperator(Umu,*UGrid,*UrbGrid,mass,Params);
// const int nbasis = 20; // size of approximate basis for low-mode space
const int nbasis = 3; // size of approximate basis for low-mode space
const int cb = 0 ;
LatticeFermion prom(FGrid);
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
typedef LittleDiracOperator::CoarseVector CoarseVector;
NextToNearestStencilGeometry5D geom(Coarse5d);
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
// typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
// typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
// typedef ShiftedComplexPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedComplexPVdagM_t;
// PVdagM_t PVdagM(Ddwf, Dpv);
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);
// SquaredLinearOperator<DomainWallFermionD, LatticeFermionD> Dsq (Ddwf);
// NonHermitianLinearOperator<DomainWallFermionD, LatticeFermionD> DLinOp (Ddwf);
// Operator wrappers: non-Hermitian D plus Hermitian MdagM / g5-D variants
// (only Dwilson is consumed by the active solver path below).
NonHermitianLinearOperator<WilsonOp,FermionField> Dwilson(WilsonOperator); /// <-----
// InvertNonHermitianLinearOperator<WilsonOp,FermionField> Iwilson(WilsonOperator); /// <-----
MdagMLinearOperator<WilsonOp,FermionField> HermOp(WilsonOperator); /// <-----
Gamma5HermitianLinearOperator <WilsonOp,LatticeFermion> HermOp2(WilsonOperator); /// <----
// PowerMethod<LatticeFermion> PM; PM(PVdagM, src);
// Override the defaults with the values read from LanParams.xml.
resid=LanParams.resid;
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
maxIter=LanParams.maxIter;
Nm = Nk + Np;
// Nu random sources; only src[0] is consumed by the solvers below.
int Nu=16;
std::vector<LatticeFermion> src(Nu,FGrid);
for(int i=0;i<Nu;i++) random(RNG5,src[i]);
// Optionally replace the first source with a field read from "evec_in".
if(LanParams.ReadEvec) {
std::string evecs_file="evec_in";
std::cout << GridLogIRL<< "Reading evecs from "<<evecs_file<<std::endl;
emptyUserRecord record;
Grid::ScidacReader RD;
RD.open(evecs_file);
RD.readScidacFieldRecord(src[0],record);
RD.close();
}
// Debug: inspect the source at the lattice origin.
Coordinate origin ({0,0,0,0});
auto tmpSrc = peekSite(src[0], origin);
std::cout << "[DEBUG] Source at origin = " << tmpSrc << std::endl;
LatticeFermion src2 = src[0];
// Run KrylovSchur and Arnoldi on a Hermitian matrix
std::cout << GridLogMessage << "Running Krylov Schur" << std::endl;
#if 0
// Disabled KrylovSchur path (shifted or inverted-operator variants).
#if 1
RealD shift=1.5;
KrylovSchur KrySchur (Dwilson, UGrid, resid,EvalImNormSmall);
KrySchur(src[0], maxIter, Nm, Nk, Nstop,&shift);
#else
KrylovSchur KrySchur (Iwilson, UGrid, resid,EvalImNormSmall);
KrySchur(src[0], maxIter, Nm, Nk, Nstop);
#endif
std::cout << GridLogMessage << "evec.size= " << KrySchur.evecs.size()<< std::endl;
#else
// Active path: plain Lanczos bidiagonalization, then the thick-restarted
// variant, both on the same source vector.
LanczosBidiagonalization<FermionField> LB(Dwilson, UGrid);
LB.run(src[0], Nm, resid);
RestartedLanczosBidiagonalization<FermionField> IRLBA(Dwilson, UGrid, Nstop, Nm, resid, maxIter,false);
IRLBA.run(src[0]);
#endif
#if 0
// Disabled: dump KrylovSchur eigenvector densities and their sum to SciDAC
// files (depends on KrySchur, which only exists in the disabled branch above).
src[0]=KrySchur.evecs[0];
for (int i=1;i<Nstop;i++) src[0]+=KrySchur.evecs[i];
for (int i=0;i<Nstop;i++)
{
std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
auto evdensity = localInnerProduct(KrySchur.evecs[i],KrySchur.evecs[i] );
writeFile(evdensity,evfile);
}
{
std::string evfile ("./evec_"+std::to_string(mass)+"_sum");
// auto evdensity = localInnerProduct(evec[i],evec[i] );
writeFile(src[0],evfile);
}
#endif
/*
std::cout << GridLogMessage << "Running Arnoldi" << std::endl;
// Arnoldi Arn (Dsq, FGrid, 1e-8);
Arnoldi Arn (DLinOp, FGrid, 1e-8);
testSchurFromHess<LatticeFermion>(Arn, src, 10, 6, 4);
Arnoldi Arn2 (DLinOp, FGrid, 1e-8);
testSchurFromHess<LatticeFermion>(Arn2, src, 16, 12, 8);
*/
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage << "Done "<< std::endl;
Grid_finalize();
return 0;
}

View File

@@ -6,7 +6,7 @@ Source file: ./tests/Test_dwf_lanczos.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -27,6 +27,9 @@ directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/parallelIO/IldgIOtypes.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
using namespace std;
using namespace Grid;
@@ -38,18 +41,111 @@ typedef typename WilsonFermionD::FermionField FermionField;
RealD AllZero(RealD x) { return 0.; }
template <class T> void writeFile(T& in, std::string const fname){
#if 1
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.close();
#endif
// What is the appropriate way to throw error?
}
namespace Grid {
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, mstep ,
Integer, Nstop,
Integer, Nk,
Integer, Np,
Integer, ReadEvec,
RealD, resid,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Integer, StartTrajectory,
// Integer, Trajectories, /* @brief Number of sweeps in this run */
// bool, MetropolisTest,
// Integer, NoMetropolisUntil,
// std::string, StartingType,
// Integer, SW,
// RealD, Kappa,
// IntegratorParameters, MD)
LanczosParameters() {
////////////////////////////// Default values
mass = 0;
// MetropolisTest = true;
// NoMetropolisUntil = 10;
// StartTrajectory = 0;
// SW = 2;
// Trajectories = 10;
// StartingType = "HotStart";
/////////////////////////////////
}
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
int Ndir=4;
auto mpi_layout = GridDefaultMpi();
std::vector<int> nblock(4,1);
std::vector<int> mpi_split(4,1);
//Interested in avoiding degeneracy only for now
nblock[3]=2;
int mrhs=1;
for(int i =0;i<Ndir;i++){
mpi_split[i] = mpi_layout[i] / nblock[i];
mrhs *= nblock[i];
}
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian* UrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
mpi_split,
*UGrid);
GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian* FGrid = UGrid;
GridRedBlackCartesian* FrbGrid = UrbGrid;
printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid,
FrbGrid);
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
@@ -61,7 +157,16 @@ int main(int argc, char** argv) {
RNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
SU<Nc>::HotConfiguration(RNG4, Umu);
// SU<Nc>::HotConfiguration(RNG4, Umu);
// SU<Nc>::ColdConfiguration(Umu);
FieldMetaData header;
std::string file("./config");
// int precision32 = 0;
// int tworow = 0;
// NerscIO::writeConfiguration(Umu,file,tworow,precision32);
NerscIO::readConfiguration(Umu,header,file);
/*
std::vector<LatticeColourMatrix> U(4, UGrid);
@@ -70,30 +175,100 @@ int main(int argc, char** argv) {
}
*/
RealD mass = -0.1;
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
MdagMLinearOperator<FermionOp,LatticeFermion> HermOp(WilsonOperator); /// <-----
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
int Nstop = 10;
int Nu = 1;
int Nk = 20;
int Np = 80;
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
const int Nstop = 20;
const int Nk = 60;
const int Np = 60;
const int Nm = Nk + Np;
const int MaxIt = 10000;
RealD resid = 1.0e-6;
RealD mass = -1.0;
LanczosParameters LanParams;
#if 1
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
#else
{
LanParams.mass = mass;
}
#endif
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
mass=LanParams.mass;
resid=LanParams.resid;
Nstop=LanParams.Nstop;
Nu = mrhs;
Nk=LanParams.Nk;
Np=LanParams.Np;
Nm = Nk + Np;
// FermionField src(FGrid);
std::vector<FermionField> src(Nu,FGrid);
for(int i =0;i<Nu;i++) gaussian(RNG5, src[i]);
if(LanParams.ReadEvec) {
std::string evecs_file="evec_in";
std::cout << GridLogIRL<< "Reading evecs from "<<evecs_file<<std::endl;
emptyUserRecord record;
Grid::ScidacReader RD;
RD.open(evecs_file);
RD.readScidacFieldRecord(src[0],record);
RD.close();
}
std::vector<Complex> boundary = {1,1,1,-1};
// std::vector<Complex> boundary = {1,1,1,1};
FermionOp::ImplParams Params(boundary);
GridCartesian * SFGrid = SGrid;
GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SFGrid);
// GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,SGrid);
LatticeGaugeField s_Umu(SGrid);
Grid_split (Umu,s_Umu);
while ( mass > - 2.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass,Params);
MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
FermionOp WilsonSplit(s_Umu,*SFGrid,*SFrbGrid,mass,Params);
MdagMLinearOperator<FermionOp,FermionField> SHermOp(WilsonSplit); /// <-----
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
std::vector<double> Coeffs{0, 1.};
Polynomial<FermionField> PolyX(Coeffs);
Chebyshev<FermionField> Cheby(0.0, 10., 12);
// Chebyshev<FermionField> Cheby(0.5, 60., 31);
// RealD, ChebyLow,
// RealD, ChebyHigh,
// Integer, ChebyOrder)
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
PlainHermOp<FermionField> Op (HermOp);
PlainHermOp<FermionField> Op2 (HermOp2);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
// ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
// SimpleLanczos<FermionField> IRL(Op,Nstop, Nk, Nm, resid, MaxIt);
ImplicitlyRestartedBlockLanczos<FermionField> IRBL(HermOp, SHermOp,
FrbGrid,SFrbGrid,mrhs,
Cheby,
Nstop, Nstop*2,
Nu, Nk, Nm,
resid, MaxIt,
IRBLdiagonaliseWithEigen);
IRBL.split_test=1;
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
@@ -101,9 +276,40 @@ int main(int argc, char** argv) {
};
int Nconv;
IRL.calc(eval, evec, src, Nconv);
// IRL.calc(eval, evec, src, Nconv);
IRBL.calc(eval, evec, src, Nconv,LanczosType::irbl);
std::cout << eval << std::endl;
std::cout << mass <<" : " << eval << std::endl;
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
FermionField tmp(FGrid);
FermionField sav(FGrid);
sav=evec[0];
for (int i = 0; i < Nstop ; i++) {
tmp = g5*evec[i];
dot = innerProduct(tmp,evec[i]);
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
// if ( i<1)
{
std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
auto evdensity = localInnerProduct(evec[i],evec[i] );
writeFile(evdensity,evfile);
}
if (i>0) sav += evec[i];
}
{
std::string evfile ("./evec_"+std::to_string(mass)+"_sum");
// auto evdensity = localInnerProduct(evec[i],evec[i] );
writeFile(sav,evfile);
}
for(int i =0;i<Nu;i++) src[i]=evec[i];
for(int i=Nu;i<Nstop;i++) src[i%Nu] +=evec[i];
// src = evec[0]+evec[1]+evec[2];
// src += evec[3]+evec[4]+evec[5];
// src += evec[6]+evec[7]+evec[8];
mass += LanParams.mstep;
}
Grid_finalize();
}

View File

@@ -27,6 +27,7 @@ directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/parallelIO/IldgIOtypes.h>
using namespace std;
using namespace Grid;
@@ -38,11 +39,32 @@ typedef typename WilsonFermionD::FermionField FermionField;
RealD AllZero(RealD x) { return 0.; }
template <class T> void writeFile(T& in, std::string const fname){
#if 1
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.close();
#endif
// What is the appropriate way to throw error?
}
namespace Grid {
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, mstep ,
Integer, Nstop,
Integer, Nk,
Integer, Np,
Integer, ReadEvec,
Integer, maxIter,
RealD, resid,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
@@ -115,12 +137,13 @@ int main(int argc, char** argv) {
LatticeGaugeField Umu(UGrid);
// SU<Nc>::HotConfiguration(RNG4, Umu);
// SU<Nc>::ColdConfiguration(Umu);
FieldMetaData header;
std::string file("./config");
int precision32 = 0;
int tworow = 0;
// int precision32 = 0;
// int tworow = 0;
// NerscIO::writeConfiguration(Umu,file,tworow,precision32);
NerscIO::readConfiguration(Umu,header,file);
@@ -158,10 +181,32 @@ int main(int argc, char** argv) {
}
mass=LanParams.mass;
resid=LanParams.resid;
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
MaxIt=LanParams.maxIter;
Nm = Nk + Np;
FermionField src(FGrid);
gaussian(RNG5, src);
if(LanParams.ReadEvec) {
std::string evecs_file="evec_in";
std::cout << GridLogIRL<< "Reading evecs from "<<evecs_file<<std::endl;
emptyUserRecord record;
Grid::ScidacReader RD;
RD.open(evecs_file);
RD.readScidacFieldRecord(src,record);
RD.close();
}
std::vector<Complex> boundary = {1,1,1,-1};
// std::vector<Complex> boundary = {1,1,1,1};
FermionOp::ImplParams Params(boundary);
while ( mass > - 5.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
while ( mass > - 2.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass,Params);
MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
@@ -179,11 +224,10 @@ while ( mass > - 5.0){
PlainHermOp<FermionField> Op (HermOp);
PlainHermOp<FermionField> Op2 (HermOp2);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
// SimpleLanczos<FermionField> IRL(Op,Nstop, Nk, Nm, resid, MaxIt);
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
@@ -192,19 +236,46 @@ while ( mass > - 5.0){
int Nconv;
IRL.calc(eval, evec, src, Nconv);
// IRL.calc(eval, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
FermionField tmp(FGrid);
FermionField sav(FGrid);
sav=evec[0];
for (int i = 0; i < Nstop ; i++) {
tmp = g5*evec[i];
dot = innerProduct(tmp,evec[i]);
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
// if ( i<1)
{
std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
auto evdensity = localInnerProduct(evec[i],evec[i] );
writeFile(evdensity,evfile);
// if(LanParams.ReadEvec) {
// std::string evecs_file="evec_in";
{
std::cout << GridLogIRL<< "Reading evecs from "<<evfile<<std::endl;
emptyUserRecord record;
Grid::ScidacReader RD;
RD.open(evfile);
RD.readScidacFieldRecord(evdensity,record);
RD.close();
}
}
if (i>0) sav += evec[i];
}
{
std::string evfile ("./evec_"+std::to_string(mass)+"_sum");
// auto evdensity = localInnerProduct(evec[i],evec[i] );
writeFile(sav,evfile);
}
src = evec[0]+evec[1]+evec[2];
mass += -0.1;
src += evec[3]+evec[4]+evec[5];
src += evec[6]+evec[7]+evec[8];
mass += LanParams.mstep;
}
Grid_finalize();