Changed batchedInnerProduct for portability

Verbosity reduction batched inner product for reorthogonalization
RestartedLanczosBidiagonalization seems to have been fixed
2026-06-23 20:23:17 +01:00 · 2026-03-17 18:54:18 -04:00 · 2026-03-17 13:02:16 -04:00 · 2026-03-16 14:34:56 -04:00 · 2026-03-13 19:12:54 -04:00 · 2026-03-12 10:49:21 -04:00
90 changed files with 10441 additions and 201 deletions
@@ -54,22 +54,24 @@ Version.h: version-cache
 include Make.inc
 include Eigen.inc
-extra_sources+=$(WILS_FERMION_FILES)
+if BUILD_FERMION_INSTANTIATIONS
-extra_sources+=$(STAG_FERMION_FILES)
+  extra_sources+=$(WILS_FERMION_FILES)
  extra_sources+=$(STAG_FERMION_FILES)
 if BUILD_ZMOBIUS
-  extra_sources+=$(ZWILS_FERMION_FILES)
+    extra_sources+=$(ZWILS_FERMION_FILES)
 endif
 if BUILD_GPARITY
-  extra_sources+=$(GP_FERMION_FILES)
+    extra_sources+=$(GP_FERMION_FILES)
 endif
 if BUILD_FERMION_REPS
-  extra_sources+=$(ADJ_FERMION_FILES)
+    extra_sources+=$(ADJ_FERMION_FILES)
-  extra_sources+=$(TWOIND_FERMION_FILES)
+    extra_sources+=$(TWOIND_FERMION_FILES)
 endif
 if BUILD_SP
    extra_sources+=$(SP_FERMION_FILES)
 if BUILD_FERMION_REPS
-    extra_sources+=$(SP_TWOIND_FERMION_FILES)
+      extra_sources+=$(SP_TWOIND_FERMION_FILES)
 endif
 endif
 endif
@@ -75,6 +75,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
 #include <Grid/algorithms/iterative/SimpleLanczos.h>
 #include <Grid/algorithms/iterative/PowerMethod.h>
 #include <Grid/algorithms/iterative/AdefGeneric.h>
 #include <Grid/algorithms/iterative/AdefMrhs.h>
@@ -83,4 +84,9 @@ NAMESPACE_CHECK(PowerMethod);
 NAMESPACE_CHECK(multigrid);
 #include <Grid/algorithms/FFT.h>
 #include <Grid/algorithms/iterative/KrylovSchur.h>
 #include <Grid/algorithms/iterative/Arnoldi.h>
 #include <Grid/algorithms/iterative/LanczosBidiagonalization.h>
 #include <Grid/algorithms/iterative/RestartedLanczosBidiagonalization.h>
 #endif
@@ -28,6 +28,7 @@ Author: Peter Boyle <pboyle@bnl.gov>
 #pragma once
 #ifdef GRID_HIP
 #include <hip/hip_version.h>
 #include <hipblas/hipblas.h>
 #endif
 #ifdef GRID_CUDA
@@ -255,16 +256,29 @@ public:
    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
 #if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
    auto err = hipblasZgemmBatched(gridblasHandle,
 				   hOpA,
 				   hOpB,
 				   m,n,k,
-				   (hipblasDoubleComplex *) &alpha_p[0],
+				   (hipDoubleComplex *) &alpha_p[0],
-				   (hipblasDoubleComplex **)&Amk[0], lda,
+				   (hipDoubleComplex **)&Amk[0], lda,
-				   (hipblasDoubleComplex **)&Bkn[0], ldb,
+				   (hipDoubleComplex **)&Bkn[0], ldb,
-				   (hipblasDoubleComplex *) &beta_p[0],
+				   (hipDoubleComplex *) &beta_p[0],
-				   (hipblasDoubleComplex **)&Cmn[0], ldc,
+				   (hipDoubleComplex **)&Cmn[0], ldc,
 				   batchCount);
 #else
    auto err = hipblasZgemmBatched(gridblasHandle,
                                   hOpA,
                                   hOpB,
                                   m,n,k,
                                   (hipblasDoubleComplex *) &alpha_p[0],
                                   (hipblasDoubleComplex **)&Amk[0], lda,
                                   (hipblasDoubleComplex **)&Bkn[0], ldb,
                                   (hipblasDoubleComplex *) &beta_p[0],
                                   (hipblasDoubleComplex **)&Cmn[0], ldc,
                                   batchCount);
 #endif
    //	 std::cout << " hipblas return code " <<(int)err<<std::endl;
    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
@@ -503,17 +517,30 @@ public:
    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
 #if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
    auto err = hipblasCgemmBatched(gridblasHandle,
 				   hOpA,
 				   hOpB,
 				   m,n,k,
-				   (hipblasComplex *) &alpha_p[0],
+				   (hipComplex *) &alpha_p[0],
-				   (hipblasComplex **)&Amk[0], lda,
+				   (hipComplex **)&Amk[0], lda,
-				   (hipblasComplex **)&Bkn[0], ldb,
+				   (hipComplex **)&Bkn[0], ldb,
-				   (hipblasComplex *) &beta_p[0],
+				   (hipComplex *) &beta_p[0],
-				   (hipblasComplex **)&Cmn[0], ldc,
+				   (hipComplex **)&Cmn[0], ldc,
 				   batchCount);
 #else
    auto err = hipblasCgemmBatched(gridblasHandle,
                                   hOpA,
                                   hOpB,
                                   m,n,k,
                                   (hipblasComplex *) &alpha_p[0],
                                   (hipblasComplex **)&Amk[0], lda,
                                   (hipblasComplex **)&Bkn[0], ldb,
                                   (hipblasComplex *) &beta_p[0],
                                   (hipblasComplex **)&Cmn[0], ldc,
                                   batchCount);
 #endif
    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
@@ -1094,11 +1121,19 @@ public:
    GRID_ASSERT(info.size()==batchCount);
 #ifdef GRID_HIP
 #if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
    auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
-				    (hipblasDoubleComplex **)&Ann[0], (int)n,
+				    (hipDoubleComplex **)&Ann[0], (int)n,
 				    (int*) &ipiv[0],
 				    (int*) &info[0],
 				    (int)batchCount);
 #else
    auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
                                    (hipblasDoubleComplex **)&Ann[0], (int)n,
                                    (int*) &ipiv[0],
                                    (int*) &info[0],
                                    (int)batchCount);
 #endif
    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
@@ -1124,11 +1159,20 @@ public:
    GRID_ASSERT(info.size()==batchCount);
 #ifdef GRID_HIP
 #if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
    auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
-				    (hipblasComplex **)&Ann[0], (int)n,
+				    (hipComplex **)&Ann[0], (int)n,
 				    (int*) &ipiv[0],
 				    (int*) &info[0],
 				    (int)batchCount);
 #else
    auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
                                    (hipblasComplex **)&Ann[0], (int)n,
                                    (int*) &ipiv[0],
                                    (int*) &info[0],
                                    (int)batchCount);
 #endif
    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
@@ -1201,12 +1245,22 @@ public:
    GRID_ASSERT(Cnn.size()==batchCount);
 #ifdef GRID_HIP
 #if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
    auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
-				    (hipblasDoubleComplex **)&Ann[0], (int)n,
+				    (hipDoubleComplex **)&Ann[0], (int)n,
 				    (int*) &ipiv[0],
-				    (hipblasDoubleComplex **)&Cnn[0], (int)n,
+				    (hipDoubleComplex **)&Cnn[0], (int)n,
 				    (int*) &info[0],
 				    (int)batchCount);
 #else
    auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
                                    (hipblasDoubleComplex **)&Ann[0], (int)n,
                                    (int*) &ipiv[0],
                                    (hipblasDoubleComplex **)&Cnn[0], (int)n,
                                    (int*) &info[0],
                                    (int)batchCount);
 #endif
    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
@@ -1235,12 +1289,21 @@ public:
    GRID_ASSERT(Cnn.size()==batchCount);
 #ifdef GRID_HIP
 #if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >=7)
    auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
-				    (hipblasComplex **)&Ann[0], (int)n,
+				    (hipComplex **)&Ann[0], (int)n,
 				    (int*) &ipiv[0],
-				    (hipblasComplex **)&Cnn[0], (int)n,
+				    (hipComplex **)&Cnn[0], (int)n,
 				    (int*) &info[0],
 				    (int)batchCount);
 #else
    auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
                                    (hipblasComplex **)&Ann[0], (int)n,
                                    (int*) &ipiv[0],
                                    (hipblasComplex **)&Cnn[0], (int)n,
                                    (int*) &info[0],
                                    (int)batchCount);
 #endif
    GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_CUDA
@@ -92,8 +92,8 @@ class TwoLevelCGmrhs
  // Vector case
  virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
  {
-    //    SolveSingleSystem(src,x);
+    SolveSingleSystem(src,x);
-    SolvePrecBlockCG(src,x);
+	// SolvePrecBlockCG(src,x);
  }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -0,0 +1,433 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid 
 Source file: ./lib/algorithms/iterative/Arnoldi.h
 Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Patrick Oare <poare@bnl.gov>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_ARNOLDI_H
 #define GRID_ARNOLDI_H
 NAMESPACE_BEGIN(Grid); 
 //Moved to KrylovSchur
 #if 0
 /**
 * Options for which Ritz values to keep in implicit restart.
 */
 enum RitzFilter {
  EvalNormSmall,           // Keep evals with smallest norm
  EvalNormLarge,           // Keep evals with largest norm
  EvalReSmall,             // Keep evals with smallest real part
  EvalReLarge              // Keep evals with largest real part
 };
 /**
 * Comparison functor selected by a RitzFilter, suitable for std::sort on
 * complex Ritz values. Returns true when z1 should be ordered before z2.
 *
 * NOTE(review): the two "Re" filters currently order by |Im z| (flagged as a
 * "Terrible hack" by the author), which does not match the RitzFilter
 * descriptions above -- confirm which ordering is intended.
 */
 struct ComplexComparator
 {
  RitzFilter f;
  ComplexComparator (RitzFilter _f) : f(_f) {}
  bool operator()(std::complex<double> z1, std::complex<double> z2) const {
    // These must be computed before the switch: statements placed between
    // `switch (...) {` and the first case label are never executed.
    const double absIm1 = std::abs(std::imag(z1));
    const double absIm2 = std::abs(std::imag(z2));
    switch (f) {
      case EvalNormSmall:
        return std::abs(z1) < std::abs(z2);
      case EvalNormLarge:
        return std::abs(z1) > std::abs(z2);
 // Terrible hack
 //        return std::abs(std::real(z1)) < std::abs(std::real(z2));
 //	if ( std::abs(std::real(z1))  >4.) tmp1 +=1.;
 //	if ( std::abs(std::real(z2))  >4.) tmp2 +=1.;
      case EvalReSmall:
        return absIm1 < absIm2;
 //        return std::abs(std::imag(z1)) < std::abs(std::imag(z2));
      case EvalReLarge:
        return absIm1 > absIm2;
 //        return std::abs(std::real(z1)) > std::abs(std::real(z2));
      default:
        assert(0);
        return false;      // keeps every path returning a value when asserts are disabled
    }
  }
 };
 #endif
 /**
 * Implementation of the Arnoldi algorithm.
 */
 template<class Field> 
 class Arnoldi {
  private:
    std::string cname = std::string("Arnoldi");
    int MaxIter;   // Max iterations
    RealD Tolerance;
    RealD ssq;
    RealD rtol;
    int Nm;           // Number of basis vectors to track (equals MaxIter if no restart)
    int Nk;           // Number of basis vectors to keep every restart (equals -1 if no restart)
    int Nstop;       // Stop after converging Nstop eigenvectors.
    LinearOperatorBase<Field> &Linop;
    GridBase *Grid;
    RealD approxLambdaMax;
    RealD beta_k;
    Field f;
    std::vector<Field> basis;               // orthonormal Arnoldi basis
    Eigen::MatrixXcd Hess;                  // Hessenberg matrix of size Nbasis (after construction)
    Eigen::MatrixXcd Qt;                    // Transpose of basis rotation which projects out high modes.
    Eigen::VectorXcd evals;                 // evals of Hess
    Eigen::MatrixXcd littleEvecs;           // Nm x Nm evecs matrix
    std::vector<Field> evecs;               // Vector of evec fields
    RitzFilter ritzFilter;                        // how to sort evals
  public:       
    /**
     * Builds an Arnoldi solver for the operator _Linop on grid _Grid.
     *
     * _Tolerance is the relative tolerance (scaled by the estimated largest
     * eigenvalue in operator()); filter selects how Ritz values are sorted.
     * The initializer list follows the member declaration order, which is the
     * order in which the members are actually initialized.
     */
    Arnoldi(LinearOperatorBase<Field> &_Linop, GridBase *_Grid, RealD _Tolerance, RitzFilter filter = EvalReSmall)
      : MaxIter(-1), Tolerance(_Tolerance), ssq (0.0), rtol (0.0), Nm(-1), Nk(-1), Nstop (-1),
          Linop(_Linop), Grid(_Grid), approxLambdaMax (0.0), beta_k (0.0), f(_Grid),
          evals (0), evecs (), ritzFilter(filter)
    {
      f = Zero();
    }
    /**
     * Runs the Arnoldi loop with(out) implicit restarting. For each iteration:
     *   - Runs an Arnoldi step.
     *   - Computes the eigensystem of the Hessenberg matrix.
     *   - Performs implicit restarting.
     *
     * Parameters
     * ----------
     * v0           : start vector (must have non-zero norm, see approxMaxEval).
     * _maxIter     : maximum number of restart iterations.
     * _Nm          : size of the Krylov basis built before each restart.
     * _Nk          : number of basis vectors kept at each restart.
     * _Nstop       : stop once at least this many Ritz estimates are below rtol.
     * doubleOrthog : forwarded to arnoldiIteration.
     */
    void operator()(const Field& v0, int _maxIter, int _Nm, int _Nk, int _Nstop, bool doubleOrthog = false) {
      MaxIter = _maxIter;
      Nm = _Nm; Nk = _Nk;
      Nstop = _Nstop;
      ssq = norm2(v0);
      // Store the estimate in the member (a local of the same name used to shadow it,
      // leaving the member at its constructor value).
      approxLambdaMax = approxMaxEval(v0);
      rtol = Tolerance * approxLambdaMax;
      ComplexComparator compareComplex (ritzFilter);
      std::cout << GridLogMessage << "Comparing Ritz values with: " << ritzFilter << std::endl;
      int start = 1;
      Field startVec = v0;
      littleEvecs = Eigen::MatrixXcd::Zero(Nm, Nm);
      for (int i = 0; i < MaxIter; i++) {
        std::cout << GridLogMessage << "Restart Iteration " << i << std::endl;
        // Perform Arnoldi steps to compute Krylov basis and Rayleigh quotient (Hess)
        arnoldiIteration(startVec, Nm, start, doubleOrthog);
        startVec = f;
        // compute eigensystem and sort evals
        compute_eigensystem(Hess);
        std::cout << GridLogMessage << "Eigenvalues after Arnoldi step: " << std::endl << evals << std::endl;
        std::sort(evals.begin(), evals.end(), compareComplex);
        std::cout << GridLogMessage << "Ritz values after sorting (first Nk preserved): " << std::endl << evals << std::endl;
        // Implicit restart to de-weight unwanted eigenvalues
        implicitRestart(_Nm, _Nk);      // probably can delete _Nm and _Nk from function args
        start = Nk;
        // check convergence and return if needed.
        int Nconv = converged();
        std::cout << GridLogMessage << "Number of evecs converged: " << Nconv << std::endl;
        if (Nconv >= Nstop || i == MaxIter - 1) {
          std::cout << GridLogMessage << "Converged with " << Nconv << " / " << Nstop << " eigenvectors on iteration " 
                        << i << "." << std::endl;
          basisRotate(evecs, Qt, 0, Nk, 0, Nk, Nm);
          std::cout << GridLogMessage << "Eigenvalues [first " << Nconv << " converged]: " << std::endl << evals << std::endl;
          return;
        }
      }      
    }
    /**
     * Estimates the magnitude of the largest eigenvalue of Linop.Op by repeated
     * application of the operator; the estimate is used to normalize the residual
     * in the convergence test.
     *
     * Parameters
     * ----------
     * Field& v0
     *  Source field to start with. Must have non-zero norm.
     * int MAX_ITER (default = 50)
     *  Number of operator applications.
     *
     * Returns
     * -------
     * RealD
     *  Ratio |A^{n+1} v0| / |A^n v0| from the final application (0 if MAX_ITER <= 0).
     */
    RealD approxMaxEval(const Field& v0, int MAX_ITER = 50) {
      assert (norm2(v0) > 1e-8);                        // must have relatively large source norm to start
      Field iterate (Grid);
      Field applied (Grid);
      iterate = v0;
      RealD prevNorm = std::sqrt(norm2(iterate));
      RealD ratio = 0.0;
      for (int it = 0; it < MAX_ITER; it++) {
        Linop.Op(iterate, applied);                     // separate output field: do not apply Op in place
        iterate = applied;
        RealD nextNorm = std::sqrt(norm2(iterate));     // |A^{n+1} v0|
        ratio = nextNorm / prevNorm;                    // |A^{n+1} v0| / |A^n v0|
        std::cout << GridLogDebug << "Approx for max eval: " << ratio << std::endl;
        prevNorm = nextNorm;
      }
      return ratio;
    }
    /**
     * Constructs (or extends) the Arnoldi basis and the Hessenberg matrix Hess
     * for the Krylov space generated by Linop.Op. (TODO make private)
     * 
     * Parameters
     * ----------
     * v0 : Field&
     *  Source to generate Krylov basis. Only read when start == 1.
     * Nm : int
     *  Final size of the basis desired. 
     *  NOTE(review): no early exit on a vanishing residual is visible in this function;
     *  a zero norm of w would divide by zero below -- confirm whether breakdown needs handling.
     * start        : int (default = 1)
     *  If 1, starts a fresh factorization: pushes the normalized source onto basis and zeroes Hess and f.
     *  Otherwise assumes basis already holds `start` vectors and that Hess holds the leading Nk x Nk block.
     *  NOTE(review): the start == 1 branch does not clear basis first -- confirm callers never
     *  re-enter with start == 1 on a non-empty basis (e.g. Nk == 1 on a restart).
     * doubleOrthog : bool (default = false)
     *  Whether to double orthogonalize the basis (for numerical cancellations) or not. 
     *  Currently has no effect (see the TODO in the loop).
     */
    void arnoldiIteration(const Field& v0, int Nm, int start = 1, bool doubleOrthog = false)
    {
      ComplexD coeff;
      Field w (Grid);           // A acting on last Krylov vector. 
      if (start == 1) {       // initialize everything that we need.
        RealD v0Norm = 1 / std::sqrt(ssq);
        basis.push_back(v0Norm * v0);                // normalized source
        Hess = Eigen::MatrixXcd::Zero(Nm, Nm);
        f = Zero();
      } else {
        assert( start == basis.size() );      // should be starting at the end of basis (start = Nk)
        // copy first: Hess is resized below, so its old contents must be saved
        Eigen::MatrixXcd HessCp = Hess;
        Hess = Eigen::MatrixXcd::Zero(Nm, Nm);
        Hess(Eigen::seqN(0, Nk), Eigen::seqN(0, Nk)) = HessCp;
      }
      // Construct next Arnoldi vector by normalizing w_i = Dv_i - \sum_j v_j h_{ji}
      for (int i = start - 1; i < Nm; i++) {
        Linop.Op(basis.back(), w);
        for (int j = 0; j < basis.size(); j++) {
          coeff = innerProduct(basis[j], w);       // coeff = h_{ji}, stored at Hess(j, i). Note that since {vi} is ONB it's OK to subtract it off after. 
          Hess(j, i) = coeff;
          w -= coeff * basis[j];
        }
        if (doubleOrthog) {
          // TODO implement
        }
        // add w_i to the pile
        if (i < Nm - 1) {
          coeff = std::sqrt(norm2(w));
          Hess(i+1, i) = coeff;
          basis.push_back(
            (1.0/coeff) * w
          );
        }
        // after iterations, update f and beta_k = ||f||
        f = w;                                // make sure f is not normalized
        beta_k = std::sqrt(norm2(f));         // beta_k = ||f_k|| determines convergence.
      }
      std::cout << GridLogMessage << "|f|^2 after Arnoldi step = " << norm2(f) << std::endl;
      std::cout << GridLogDebug << "Computed Hessenberg matrix = " << std::endl << Hess << std::endl;
      return;
    }
    /**
     * Approximates the eigensystem of the linear operator from the eigensystem of the
     * small matrix S. Fills evals and littleEvecs from S, then rebuilds evecs as the
     * corresponding linear combinations of the current basis vectors.
     * 
     * TODO implement in parent class eventually.
     * 
     * Parameters
     * ----------
     * Eigen::MatrixXcd& S
     *  Small matrix to diagonalize; operator() passes the Hessenberg matrix Hess.
     */
    void compute_eigensystem(Eigen::MatrixXcd& S)
    {
      std::cout << GridLogMessage << "Computing eigenvalues." << std::endl;
      evecs.clear();
      Eigen::ComplexEigenSolver<Eigen::MatrixXcd> solver;
      solver.compute(S);
      evals = solver.eigenvalues();
      littleEvecs = solver.eigenvectors();
      // Lift every small eigenvector to a lattice field: evec_k = sum_j littleEvecs(j,k) * basis[j]
      for (int col = 0; col < evals.size(); col++) {
        Eigen::VectorXcd weights = littleEvecs.col(col);
        Field ritzVec (basis[0].Grid());
        ritzVec = Zero();
        for (int row = 0; row < basis.size(); row++) {
          ritzVec = ritzVec + weights[row] * basis[row];
        }
        evecs.push_back(ritzVec);
      }
      std::cout << GridLogMessage << "Eigenvalues: " << std::endl << evals << std::endl;
    }
    /**
     * Verifies the Arnoldi factorization D V = V H + f e_m^\dag with the last-computed 
     * V, H, f. 
     */
    // RealD verifyFactorization() {
    //   int k = basis.size();         // number of basis vectors, also the size of H.
    //   std::vector<Field> factorized (k, Zero());
    //   Field tmp (FGrid); tmp = Zero();
    //   for (int i = 0; i < basis.size(); i++) {
    //     Linop.Op(basis[i], tmp);
    //   }
    //   // basisRotate(basis, Q, 0, Nk, 0, Nk, Nm);
    //   // Linop.Op(, )
    // }
    /* Getters: each returns a copy of the corresponding internal state and is
       const so it can be called on a const Arnoldi object. */
    Eigen::MatrixXcd    getHessenbergMat()  const { return Hess; }
    Field               getF()              const { return f; }
    std::vector<Field>  getBasis()          const { return basis; }
    Eigen::VectorXcd    getEvals()          const { return evals; }
    std::vector<Field>  getEvecs()          const { return evecs; }
    /**
     * Implements implicit restarting for Arnoldi. Assumes eigenvalues are sorted
     * so that the first Nk entries of evals are the ones to keep; the remaining
     * Nm - Nk are used as shifts.
     * 
     * Parameters
     * ----------
     * int _Nm
     *  Size of the basis before the restart (Hessenberg is Nm x Nm).
     * int _Nk
     *  Number of basis vectors to keep at each restart. Must satisfy 0 < _Nk < _Nm,
     *  because Hess(Nk, Nk-1) and basis[Nk] are read below.
     */
    void implicitRestart(int _Nm, int _Nk) {
      assert ( _Nk > 0 && _Nk < _Nm );
      Nm = _Nm; Nk = _Nk;
      std::cout << GridLogMessage << "Computing QR Factorizations." << std::endl;
      Eigen::MatrixXcd Q = Eigen::MatrixXcd::Identity(Nm, Nm);
      Eigen::MatrixXcd Qi (Nm, Nm);
      for (int i = Nk; i < Nm; i++) {        // keep the first Nk eigenvalues and iterate through the last Nm - Nk
        // Useful debugging output
        std::cout << GridLogDebug << "Computing QR factorization for i = " << i << std::endl;
        std::cout << GridLogDebug << "Eval shift = " << evals[i] << std::endl;
        std::cout << GridLogDebug << "Hess before rotation: " << Hess << std::endl;
        // QR factorize the shifted Hessenberg matrix and accumulate the rotation
        Eigen::HouseholderQR<Eigen::MatrixXcd> QR (Hess - evals[i] * Eigen::MatrixXcd::Identity(Nm, Nm));
        Qi = QR.householderQ();
        Q = Q * Qi;
        Hess = Qi.adjoint() * Hess * Qi;
        std::cout << GridLogDebug << "Qt up to i = " << Q.transpose() << std::endl;
      }
      std::cout << GridLogDebug << "Hess after all rotations: " << std::endl << Hess << std::endl; 
      // form Arnoldi vector f: f is normal to the basis vectors and its norm \beta is used to determine the Ritz estimate. 
      // NOTE(review): basis[Nk] is read here before basis is rotated by Qt below -- confirm
      // that the unrotated vector is the intended one.
      std::complex<double> beta = Hess(Nk, Nk-1);
      std::complex<double> sigma = Q(Nm-1, Nk-1);
      f = basis[Nk] * beta + f * sigma;
      // NOTE(review): the member beta_k (read by converged()) is not refreshed from the
      // restarted f here -- confirm that is intended.
      std::cout << GridLogMessage << "|f|^2 after implicit restart = " << norm2(f) << std::endl;
      // Rotate basis by Qt
      Qt = Q.transpose();
      basisRotate(basis, Qt, 0, Nk + 1, 0, Nm, Nm);
      // rotate
      basisRotate(evecs, Qt, 0, Nk + 1, 0, Nm, Nm);
      // Truncate the basis and restart
      basis = std::vector<Field> (basis.begin(), basis.begin() + Nk);
      // evecs = std::vector<Field> (evecs.begin(), evecs.begin() + Nk);
      // .eval() materializes the leading block into a temporary first: assigning a view of
      // Hess to Hess itself would resize the destination while the source still refers to it.
      Hess = Hess(Eigen::seqN(0, Nk), Eigen::seqN(0, Nk)).eval();
      std::cout << "evecs size: " << evecs.size() << std::endl;
    }
    /**
     * Computes the number of Arnoldi eigenvectors that have converged. An eigenvector s is considered converged 
     * for a tolerance epsilon if 
     *    r(s) := |\beta e_m^T s| < epsilon
     * where beta is the norm of f_{m+1}.
     * 
     * Parameters
     * ----------
     * 
     * Returns
     * -------
     * int : Number of converged eigenvectors.
     */
    int converged() {
      int Nconv = 0;
      for (int k = 0; k < evecs.size(); k++) {
        RealD emTs = std::abs(littleEvecs(Nm - 1, k));           // e_m^T s
        RealD ritzEstimate = beta_k * emTs;
        // TODO should be ritzEstimate < Tolerance * lambda_max
        std::cout << GridLogMessage << "Ritz estimate for evec " << k << " = " << ritzEstimate << std::endl;
        if (ritzEstimate < rtol) {
          Nconv++;
        }
      }
      return Nconv;
    }
 };
 NAMESPACE_END(Grid);
 #endif
@@ -0,0 +1,277 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./lib/algorithms/iterative/ConjugateGradientTimeslice.h
 Copyright (C) 2015
 Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 			   /*  END LEGAL */
 #ifndef GRID_CONJUGATE_GRADIENT_TIMESLICE_H
 #define GRID_CONJUGATE_GRADIENT_TIMESLICE_H
 NAMESPACE_BEGIN(Grid);
 /////////////////////////////////////////////////////////////
 // Base classes for iterative processes based on operators
 // single input vec, single output vec.
 /////////////////////////////////////////////////////////////
 /**
 * Simple modification of conjugate gradient that outputs the residual as a function 
 * of time, in order to study the large wavelength behavior of the solver. 
 */
 template <class Field>
 class ConjugateGradientTimeslice : public OperatorFunction<Field> {
 public:
  using OperatorFunction<Field>::operator();
  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                           // Defaults true.
  RealD Tolerance;
  Integer MaxIterations;
  Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
  RealD TrueResidual;
  // Stores the relative tolerance, the iteration cap and whether a failure to
  // converge triggers an assert. Initializers follow the member declaration order;
  // the result fields start at zero instead of being left indeterminate.
  ConjugateGradientTimeslice(RealD tol, Integer maxit, bool err_on_no_conv = true)
    : ErrorOnNoConverge(err_on_no_conv),
      Tolerance(tol),
      MaxIterations(maxit),
      IterationsToComplete(0),
      TrueResidual(0.0)
  {};
  // Per-iteration hook, called from operator() once per CG iteration with the
  // iteration number k and the two step coefficients a and b computed in that
  // iteration. The default implementation does nothing; override to log.
  virtual void LogIteration(int k,RealD a,RealD b){
    //    std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
  };
  // Hook called at the very start of operator(). The default implementation
  // prints a banner line to std::cout.
  virtual void LogBegin(void){
    std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
  };
    // Solves Linop * psi = src with conjugate gradient, using psi as the initial guess.
    //
    // On return psi holds the solution estimate. IterationsToComplete is set on every
    // return path; TrueResidual is set on the converged / trivial paths.
    // Converges when the recursive residual satisfies |r|^2 <= Tolerance^2 * |src|^2.
    // If MaxIterations is exhausted and ErrorOnNoConverge is set, asserts.
    //
    // The per-timeslice source norm (ssqt) is computed during setup but is not used
    // further in this function yet.
    void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
      this->LogBegin();
      GRID_TRACE("ConjugateGradientTimeslice");
    GridStopWatch PreambleTimer;
    GridStopWatch ConstructTimer;
    GridStopWatch NormTimer;
    GridStopWatch AssignTimer;
    PreambleTimer.Start();
    psi.Checkerboard() = src.Checkerboard();
    conformable(psi, src);
    // d and b are only filled by HermOpAndNorm when the guess is non-zero; zero them so
    // the setup log below never streams indeterminate values.
    RealD cp, c, a, d = 0.0, b = 0.0, ssq, qq;
    //RealD b_pred;
    // Was doing copies
    ConstructTimer.Start();
    Field p  (src.Grid());
    Field mmp(src.Grid());
    Field r  (src.Grid());
    ConstructTimer.Stop();
    // Initial residual computation & set up
    NormTimer.Start();
    ssq = norm2(src);                 // Norm of source vector ||b||^2
    auto ssqtx = localNorm2(src);     // Norm |b(x, t)|^2 as a field (was used without a declaration)
    // NOTE(review): sliceSum may require the element type of ssqt to match the scalar
    // object type of ssqtx rather than RealD -- confirm against the sliceSum declaration.
    std::vector<RealD> ssqt;          // Norm of source not summed over time slices, ssq(t) = \sum_x |b(x, t)|^2
    sliceSum(ssqtx, ssqt, Tdir);      // TODO make sure Tdir is globally defined
    RealD guess = norm2(psi);         // Norm of initial guess ||psi||^2
    NormTimer.Stop();
    assert(std::isnan(guess) == 0);
    AssignTimer.Start();
    if ( guess == 0.0 ) {
      r = src;
      p = r;
      a = ssq;
    } else { 
      Linop.HermOpAndNorm(psi, mmp, d, b);        // mmp = A psi
      r = src - mmp;      // Initial residual r0 = b - A guess
      p = r;              // initial conj vector p0 = r0
      a = norm2(p);
    }
    cp = a;
    AssignTimer.Stop();
    // Handle trivial case of zero src
    if (ssq == 0.){
      psi = Zero();
      IterationsToComplete = 1;
      TrueResidual = 0.;
      return;
    }
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:   src " << ssq << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:    mp " << d << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:   mmp " << b << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:  cp,r " << cp << std::endl;
    std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient:     p " << a << std::endl;
    RealD rsq = Tolerance * Tolerance * ssq;
    // Check if guess is really REALLY good :)
    if (cp <= rsq) {
      TrueResidual = std::sqrt(a/ssq);
      std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
      IterationsToComplete = 0;
      return;
    }
    std::cout << GridLogIterative << std::setprecision(8)
              << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
    PreambleTimer.Stop();
    GridStopWatch LinalgTimer;
    GridStopWatch InnerTimer;
    GridStopWatch AxpyNormTimer;
    GridStopWatch LinearCombTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;
    RealD usecs = -usecond();
    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {
      GridStopWatch IterationTimer;
      IterationTimer.Start();
      c = cp;
      MatrixTimer.Start();
      Linop.HermOp(p, mmp);         // Computes mmp = Ap
      MatrixTimer.Stop();
      LinalgTimer.Start();
      InnerTimer.Start();
      ComplexD dc  = innerProduct(p,mmp);         // p^\dagger A p
      InnerTimer.Stop();
      d = dc.real();
      a = c / d;
      AxpyNormTimer.Start();
      // r_k --> r_{k+1} = r_k - \alpha_k A p_k; cp receives the value returned by axpy_norm,
      // used below as ||r_{k+1}||^2.
      cp = axpy_norm(r, -a, mmp, r);
      AxpyNormTimer.Stop();
      b = cp / c;
      LinearCombTimer.Start();
      {
        autoView( psi_v , psi, AcceleratorWrite);
        autoView( p_v   , p,   AcceleratorWrite);
        autoView( r_v   , r,   AcceleratorWrite);
        // psi += a * p ; p = b * p + r  (psi is updated with the old p first)
        accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
            coalescedWrite(psi_v[ss], a      *  p_v(ss) + psi_v(ss));
            coalescedWrite(p_v[ss]  , b      *  p_v(ss) + r_v  (ss));
        });
      }
      LinearCombTimer.Stop();
      LinalgTimer.Stop();
      LogIteration(k,a,b);
      IterationTimer.Stop();
      if ( (k % 500) == 0 ) {
        std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k
                << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
      } else { 
        std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
                << " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl;
      }
      // Stopping condition
      if (cp <= rsq) {
        usecs +=usecond();
        SolverTimer.Stop();
        // Recompute the residual from scratch: p = A psi - src
        Linop.HermOpAndNorm(psi, mmp, d, qq);
        p = mmp - src;
        GridBase *grid = src.Grid();
        RealD DwfFlops = (1452. )*grid->gSites()*4*k
                       + (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra
        RealD srcnorm = std::sqrt(norm2(src));
        RealD resnorm = std::sqrt(norm2(p));
        RealD true_residual = resnorm / srcnorm;
        std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k 
          << "\tComputed residual " << std::sqrt(cp / ssq)
          << "\tTrue residual " << true_residual
          << "\tTarget " << Tolerance << std::endl;
        //	std::cout << GridLogMessage << "\tPreamble   " << PreambleTimer.Elapsed() <<std::endl;
        std::cout << GridLogMessage << "\tSolver Elapsed    " << SolverTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "Time breakdown "<<std::endl;
        std::cout << GridLogPerformance << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\t\tInner      " << InnerTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\t\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
        std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
        std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
        if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
        IterationsToComplete = k;
        TrueResidual = true_residual;
        return;
      }
    }
    // Failed. Calculate true residual before giving up                                                         
    // Linop.HermOpAndNorm(psi, mmp, d, qq);
    //    p = mmp - src;
    //TrueResidual = sqrt(norm2(p)/ssq);
    //    TrueResidual = 1;
    std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations
              <<" residual "<< std::sqrt(cp / ssq)<< std::endl;
    SolverTimer.Stop();
    std::cout << GridLogMessage << "\tPreamble   " << PreambleTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tConstruct  " << ConstructTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tNorm       " << NormTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tAssign     " << AssignTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tSolver     " << SolverTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
    std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage<< "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;
    std::cout << GridLogPerformance << "\t\tInner      " << InnerTimer.Elapsed() <<std::endl;
    std::cout << GridLogPerformance << "\t\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
    std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;
  }
 };
 NAMESPACE_END(Grid);
 #endif
@@ -53,6 +53,18 @@ enum IRLdiagonalisation {
  IRLdiagonaliseWithEigen
 };
 enum IRLeigsort { 
  IRLeigsortMax,
  IRLeigsortSqMin
 };
 #if 0
 // Strict ordering by magnitude: true when a^2 is strictly smaller than b^2.
 bool square_comp(RealD a, RealD b){
 	const RealD lhs_sq = a*a;
 	const RealD rhs_sq = b*b;
 	return lhs_sq < rhs_sq;
 }
 #endif
 template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public ImplicitlyRestartedLanczosTester<Field>
 {
 public:
@@ -119,9 +131,10 @@ class ImplicitlyRestartedLanczos {
  /////////////////////////
  // Constructor
  /////////////////////////
-  
+ public:
-public:       
+  IRLeigsort EigSort;
  //////////////////////////////////////////////////////////////////
  // PAB:
  //////////////////////////////////////////////////////////////////
@@ -154,6 +167,7 @@ public:
    Nstop(_Nstop)  ,      Nk(_Nk),      Nm(_Nm),
    eresid(_eresid),      betastp(_betastp),
    MaxIter(_MaxIter)  ,      MinRestart(_MinRestart),
    EigSort(IRLeigsortMax), 
    orth_period(_orth_period), diagonalisation(_diagonalisation)  { };
    ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
@@ -170,6 +184,7 @@ public:
    Nstop(_Nstop)  ,      Nk(_Nk),      Nm(_Nm),
    eresid(_eresid),      betastp(_betastp),
    MaxIter(_MaxIter)  ,      MinRestart(_MinRestart),
    EigSort(IRLeigsortMax), 
    orth_period(_orth_period), diagonalisation(_diagonalisation)  { };
  ////////////////////////////////
@@ -316,8 +331,12 @@ until convergence
      // sorting
      //////////////////////////////////
      eval2_copy = eval2;
 //      if (EigSort==IRLeigsortMax)
 //      std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),square_comp);
 //      else
      std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater<RealD>());
      std::cout<<GridLogIRL <<" evals sorted "<<std::endl;
 //      eval2_copy = eval2;
      const int chunk=8;
      for(int io=0; io<k2;io+=chunk){
 	std::cout<<GridLogIRL << "eval "<< std::setw(3) << io ;
@@ -333,6 +352,7 @@ until convergence
      //////////////////////////////////
      Qt = Eigen::MatrixXd::Identity(Nm,Nm);
      for(int ip=k2; ip<Nm; ++ip){ 
 //        std::cout<<GridLogIRL <<"QR decompose "<<eval2[ip]<<std::endl;
 	QR_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
      }
      std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
@@ -375,7 +395,8 @@ until convergence
 	//  power of two search pattern;  not every evalue in eval2 is assessed.
 	int allconv =1;
-	for(int jj = 1; jj<=Nstop; jj*=2){
+//	for(int jj = 1; jj<=Nstop; jj*=2){
 	for(int jj = 1; jj<=Nstop; jj++){
 	  int j = Nstop-jj;
 	  RealD e = eval2_copy[j]; // Discard the evalue
 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	    
@@ -0,0 +1,276 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./Grid/algorithms/iterative/LanczosBidiagonalization.h
 Copyright (C) 2015
 Author: Chulwoo Jung <chulwoo@bnl.gov>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_LANCZOS_BIDIAGONALIZATION_H
 #define GRID_LANCZOS_BIDIAGONALIZATION_H
 NAMESPACE_BEGIN(Grid);
 /**
 * Lanczos Bidiagonalization (Golub-Kahan)
 *
 * For a linear operator A with adjoint A^dag, constructs the bidiagonal
 * decomposition:
 *
 *   A  V_m = U_m B_m
 *   A^dag U_m = V_m B_m^T + beta_{m+1} v_{m+1} e_m^T
 *
 * where:
 *   V_m = [v_1, ..., v_m]  right Lanczos vectors (orthonormal)
 *   U_m = [u_1, ..., u_m]  left  Lanczos vectors (orthonormal)
 *   B_m is upper bidiagonal with diag(alpha_1,...,alpha_m) and
 *       superdiag(beta_2,...,beta_m)
 *
 * The singular values of A are approximated by those of B_m.
 * The singular values of B_m are the square roots of the eigenvalues of
 * the symmetric tridiagonal matrix B_m^T B_m.
 *
 * Usage:
 *   LanczosBidiagonalization<Field> lb(Linop, grid, tol);
 *   lb.run(src, Nmax);     // optional third argument: bool reorth (default true)
 *   lb.computeSVD();       // then access results via getters.
 */
 template <class Field>
 class LanczosBidiagonalization {
 public:
  LinearOperatorBase<Field> &Linop;  // operator A: Op() applies A, AdjOp() applies A^dag
  GridBase *Grid;                    // grid used to construct every work field
  int Nm;           // number of Lanczos steps taken
  RealD Tolerance;  // convergence threshold on beta_{k+1} / alpha_k
  std::vector<Field>  V;       // right Lanczos vectors v_1 ... v_m
  std::vector<Field>  U;       // left  Lanczos vectors u_1 ... u_m
  std::vector<RealD>  alpha;   // diagonal of bidiagonal matrix
  std::vector<RealD>  beta;    // super-diagonal (beta[k] couples u_k and v_{k+1})
  // SVD of the bidiagonal matrix (filled after computeSVD())
  Eigen::VectorXd  singularValues;
  Eigen::MatrixXd  leftSVecs;   // columns are left  singular vectors of B
  Eigen::MatrixXd  rightSVecs;  // columns are right singular vectors of B
 public:
  /**
   * Parameters
   * ----------
   * _Linop : operator to bidiagonalise (held by reference)
   * _Grid  : grid on which the Lanczos vectors live
   * _tol   : run() stops once beta_{k+1} / alpha_k drops below this value
   */
  LanczosBidiagonalization(LinearOperatorBase<Field> &_Linop, GridBase *_Grid,
                           RealD _tol = 1.0e-8)
    // Initialisers follow declaration order (Nm is declared before Tolerance).
    : Linop(_Linop), Grid(_Grid), Nm(0), Tolerance(_tol)
  {}
  /**
   * Run the Golub-Kahan Lanczos bidiagonalization.
   *
   * Any previous basis is discarded.  The iteration stops at the first of:
   *   - Nmax steps taken,
   *   - alpha_k < 1e-14 (breakdown; that step is not recorded),
   *   - beta_{k+1} / alpha_k < Tolerance (converged),
   *   - beta_{k+1} < 1e-14 (breakdown; no further vector can be built).
   * On exit Nm holds the number of completed steps and alpha, beta and U
   * have Nm entries; V has Nm or Nm+1 entries.
   *
   * Parameters
   * ----------
   * src  : starting vector (need not be normalised, must be non-zero)
   * Nmax : maximum number of Lanczos steps
   * reorth : if true, full reorthogonalisation of both V and U bases
   */
  void run(const Field &src, int Nmax, bool reorth = true)
  {
    // One norm evaluation serves both the sanity check and the scaling.
    RealD nrm = std::sqrt(norm2(src));
    assert(nrm > 0.0);
    V.clear(); U.clear();
    alpha.clear(); beta.clear();
    Nm = 0;
    Field p(Grid), r(Grid);
    // --- initialise: v_1 = src / ||src|| ---
    Field v(Grid);
    v = src;
    v = (1.0 / nrm) * v;
    V.push_back(v);
    for (int k = 0; k < Nmax; ++k) {
      // p = A v_k
      Linop.Op(V[k], p);
      // p = p - beta_k * u_{k-1}   (remove previous left vector)
      if (k > 0) {
        p = p - beta[k-1] * U[k-1];
      }
      // alpha_k = ||p||
      RealD ak = std::sqrt(norm2(p));
      if (ak < 1.0e-14) {
        std::cout << GridLogMessage
                  << "LanczosBidiagonalization: lucky breakdown at step "
                  << k << " (alpha = " << ak << ")" << std::endl;
        break;
      }
      alpha.push_back(ak);
      // u_k = p / alpha_k
      Field u(Grid);
      u = (1.0 / ak) * p;
      // full reorthogonalisation of u against previous U
      if (reorth) {
        for (int j = 0; j < (int)U.size(); ++j) {
          ComplexD ip = innerProduct(U[j], u);
          u = u - ip * U[j];
        }
        // Renormalise only if something is left after the projection.
        RealD unrm = std::sqrt(norm2(u));
        if (unrm > 1.0e-14) u = (1.0 / unrm) * u;
      }
      U.push_back(u);
      // r = A^dag u_k - alpha_k * v_k
      Linop.AdjOp(U[k], r);
      r = r - ak * V[k];
      // full reorthogonalisation of r against previous V
      if (reorth) {
        for (int j = 0; j < (int)V.size(); ++j) {
          ComplexD ip = innerProduct(V[j], r);
          r = r - ip * V[j];
        }
      }
      // beta_{k+1} = ||r||
      RealD bk = std::sqrt(norm2(r));
      beta.push_back(bk);
      Nm = k + 1;
      std::cout << GridLogMessage
                << "LanczosBidiagonalization step " << k
                << "  alpha = " << ak
                << "  beta  = " << bk << std::endl;
      // convergence: residual beta / alpha small enough
      if (bk / ak < Tolerance) {
        std::cout << GridLogMessage
                  << "LanczosBidiagonalization converged at step " << k
                  << "  (beta/alpha = " << bk / ak << ")" << std::endl;
        break;
      }
      if (k == Nmax - 1) break;   // no v_{k+2} needed after last step
      // Breakdown in beta: reachable when Tolerance is zero or negative.
      // Dividing by bk would fill the basis with inf/NaN, so stop instead.
      if (bk < 1.0e-14) {
        std::cout << GridLogMessage
                  << "LanczosBidiagonalization: lucky breakdown at step "
                  << k << " (beta = " << bk << ")" << std::endl;
        break;
      }
      // v_{k+1} = r / beta_{k+1}
      Field vnext(Grid);
      vnext = (1.0 / bk) * r;
      V.push_back(vnext);
    }
  }
  /**
   * Compute the SVD of the bidiagonal matrix B using Eigen.
   * B is Nm x Nm with alpha on the diagonal and beta[k] at (k, k+1).
   * Singular values are stored in descending order.
   */
  void computeSVD()
  {
    int m = Nm;
    Eigen::MatrixXd B = Eigen::MatrixXd::Zero(m, m);
    for (int k = 0; k < m; ++k) {
      B(k, k) = alpha[k];
      // The last beta couples to a vector outside the m x m block.
      if (k + 1 < m && k < (int)beta.size())
        B(k, k+1) = beta[k];
    }
    Eigen::JacobiSVD<Eigen::MatrixXd> svd(B,
        Eigen::ComputeThinU | Eigen::ComputeThinV);
    singularValues = svd.singularValues();   // already sorted descending
    leftSVecs      = svd.matrixU();
    rightSVecs     = svd.matrixV();
    std::cout << GridLogMessage
              << "LanczosBidiagonalization: singular values of B_" << m
              << std::endl;
    for (int k = 0; k < m; ++k)
      std::cout << GridLogMessage << "  sigma[" << k << "] = "
                << singularValues(k) << std::endl;
  }
  /**
   * Return the k-th approximate left singular vector of A in the full
   * lattice space.  computeSVD() must have been called first, and after
   * the most recent run() so that the stored SVD matches the basis.
   */
  Field leftSingularVector(int k)
  {
    assert(k >= 0 && k < (int)leftSVecs.cols());
    assert((int)leftSVecs.rows() == Nm);   // SVD must belong to the current basis
    Field svec(Grid);
    svec = Zero();
    for (int j = 0; j < Nm; ++j)
      svec = svec + leftSVecs(j, k) * U[j];
    return svec;
  }
  /**
   * Return the k-th approximate right singular vector of A in the full
   * lattice space.  computeSVD() must have been called first, and after
   * the most recent run() so that the stored SVD matches the basis.
   */
  Field rightSingularVector(int k)
  {
    assert(k >= 0 && k < (int)rightSVecs.cols());
    assert((int)rightSVecs.rows() == Nm);  // SVD must belong to the current basis
    Field svec(Grid);
    svec = Zero();
    for (int j = 0; j < Nm; ++j)
      svec = svec + rightSVecs(j, k) * V[j];
    return svec;
  }
  /**
   * Verify the bidiagonalization: returns max residual
   *   max_k || A v_k - alpha_k u_k - beta_{k-1} u_{k-1} ||
   * Applies the operator once per stored step and logs each residual.
   */
  RealD verify()
  {
    Field tmp(Grid);
    RealD maxres = 0.0;
    for (int k = 0; k < Nm; ++k) {
      Linop.Op(V[k], tmp);
      tmp = tmp - alpha[k] * U[k];
      if (k > 0 && k-1 < (int)beta.size())
        tmp = tmp - beta[k-1] * U[k-1];
      RealD res = std::sqrt(norm2(tmp));
      if (res > maxres) maxres = res;
      std::cout << GridLogMessage
                << "LanczosBidiagonalization verify step " << k
                << "  ||A v_k - alpha_k u_k - beta_{k-1} u_{k-1}|| = "
                << res << std::endl;
    }
    return maxres;
  }
  /* Getters */
  int                       getNm()           const { return Nm; }
  const std::vector<Field>& getV()            const { return V; }
  const std::vector<Field>& getU()            const { return U; }
  const std::vector<RealD>& getAlpha()        const { return alpha; }
  const std::vector<RealD>& getBeta()         const { return beta; }
  Eigen::VectorXd           getSingularValues() const { return singularValues; }
 };
 NAMESPACE_END(Grid);
 #endif
@@ -60,18 +60,29 @@ public:
  void Level(int lv) { level=lv; };
-  PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) : 
+  PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax, int _nstep) : 
    Tolerance(tol), 
    MaxIterations(maxit),
    Linop(_Linop),
    Preconditioner(Prec),
    mmax(_mmax),
-    nstep(_nstep)
+    nstep(_nstep)         // what is nstep vs mmax? one is the number of inner iterations
  { 
    level=1;
    verbose=1;
  };
  // virtual method stubs for updating GCR polynomial
  // Hook for subclasses that track the GCR polynomial.
  // This base version only prints a marker line.
  virtual void LogBegin(void){
    std::cout << "GCR::LogBegin() "<<std::endl;
  };
  // Per-iteration hook: receives the iteration index k, the step coefficient a
  // and the orthogonalisation coefficients betas.  This base version ignores
  // its arguments and only prints a marker line.
  virtual void LogIteration(int k, ComplexD a, std::vector<ComplexD> betas){
    std::cout << "GCR::LogIteration() "<<std::endl;
  };
  // Completion hook: receives the alphas and the per-iteration betas.
  // This base version ignores its arguments and only prints a marker line.
  virtual void LogComplete(std::vector<ComplexD>& alphas, std::vector<std::vector<ComplexD>>& betas) {
    std::cout << "GCR::LogComplete() "<<std::endl;
  };
  void operator() (const Field &src, Field &psi){
    //    psi=Zero();
@@ -96,19 +107,18 @@ public:
      GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
      if(cp<rsq) {
        SolverTimer.Stop();
-	SolverTimer.Stop();
+        Linop.Op(psi,r);
        axpy(r,-1.0,src,r);
        RealD tr = norm2(r);
        GCRLogLevel<<"PGCR: Converged on iteration " <<steps
          << " computed residual "<<sqrt(cp/ssq)
          << " true residual "    <<sqrt(tr/ssq)
          << " target "           <<Tolerance <<std::endl;
-	Linop.Op(psi,r);
+        GCRLogLevel<<"PGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
-	axpy(r,-1.0,src,r);
+        return;
 	RealD tr = norm2(r);
 	GCRLogLevel<<"PGCR: Converged on iteration " <<steps
 		 << " computed residual "<<sqrt(cp/ssq)
 		 << " true residual "    <<sqrt(tr/ssq)
 		 << " target "           <<Tolerance <<std::endl;
 	GCRLogLevel<<"PGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
 	return;
      }
    }
@@ -135,9 +145,9 @@ public:
    ////////////////////////////////
    // history for flexible orthog
    ////////////////////////////////
-    std::vector<Field> q(mmax,grid);
+    std::vector<Field> q(mmax,grid);        // q = Ap
-    std::vector<Field> p(mmax,grid);
+    std::vector<Field> p(mmax,grid);        // store mmax conjugate momenta
-    std::vector<RealD> qq(mmax);
+    std::vector<RealD> qq(mmax);            // qq = (Ap)^2 = <p|A^\dagger A |p> (denom of \alpha)
    GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
@@ -155,7 +165,9 @@ public:
    LinalgTimer.Start();
    r=src-Az;
    LinalgTimer.Stop();
-    GCRLogLevel<< "PGCR true residual r = src - A psi   "<<norm2(r) <<std::endl;
+    GCRLogLevel<< "PGCR true residual r = src - A psi   "<< norm2(r) <<std::endl;
    this->LogBegin();       // initialize polynomial GCR if needed (TODO think about placement of this)
    /////////////////////
    // p = Prec(r)
@@ -178,32 +190,45 @@ public:
    p[0]= z;
    q[0]= Az;
    qq[0]= zAAz;
    std::cout << "||init p - src||: " << norm2(p[0] - src) << std::endl;   // for debugging
    cp =norm2(r);
    LinalgTimer.Stop();
    std::vector<ComplexD> all_alphas;
    std::vector<std::vector<ComplexD>> all_betas;
    for(int k=0;k<nstep;k++){
      steps++;
      int kp     = k+1;
-      int peri_k = k %mmax;
+      int peri_k = k %mmax;     // only store mmax vectors; just roll around if needed
      int peri_kp= kp%mmax;
      // std::cout << "peri_kp = " << peri_kp << std::endl;
      LinalgTimer.Start();
      rq= innerProduct(q[peri_k],r); // what if rAr not real?
-      a = rq/qq[peri_k];
+      a = rq/qq[peri_k];              // compute alpha_j
-      axpy(psi,a,p[peri_k],psi);         
+      all_alphas.push_back(a);
-      cp = axpy_norm(r,-a,q[peri_k],r);
+      axpy(psi,a,p[peri_k],psi);      // update psi --> psi + \alpha p
      cp = axpy_norm(r,-a,q[peri_k],r);       // update r --> r - \alpha D p. Note q = Dp
      LinalgTimer.Stop();
-      GCRLogLevel<< "PGCR step["<<steps<<"]  resid " << cp << " target " <<rsq<<std::endl; 
+      // LogIterationA(k + 1, a);
-      if((k==nstep-1)||(cp<rsq)){
+      GCRLogLevel<< "GCR step["<<steps<<"]  resid " << cp << " target " <<rsq<<std::endl; 
-	return cp;
+
-      }
+      // moving this to end of loop so that it doesn't exit beforehand
      // TODO if I want to uncomment this, I have to split the LogIteration again and put LogIterationA() beforehand
      // if((k==nstep-1)||(cp<rsq)){
      //   return cp;
      // }
      PrecTimer.Start();
@@ -221,22 +246,205 @@ public:
      q[peri_kp]=Az;
      p[peri_kp]=z;
      // Field Dsrc (grid);
      // Linop.Op(src, Dsrc);
      // std::cout << "||q[peri_kp] - D(src)||: " << norm2(q[peri_kp] - Dsrc) << std::endl;   // for debugging
          // // delete after testing
          // std::cout << "Testing Dsq on one for GCR: " << std::endl;
          // Field myField (grid);
          // myField = 1.0;
          // Field out1 (grid); Field out2 (grid);
          // Linop.HermOp(myField, out1);
          // Linop.Op(myField, out2);
          // std::cout << "Dsq.Hermop(ones) has norm " << norm2(out1) << std::endl;
          // std::cout << "Dsq.Op(ones) has norm " << norm2(out2) << std::endl;
      // basically northog = k+1 if mmax is large
      int northog = ((kp)>(mmax-1))?(mmax-1):(kp);  // if more than mmax done, we orthog all mmax history.
      // std::cout << "northog: " << northog << std::endl;
      std::vector<ComplexD> betas (northog);
      // std::cout << "peri_kp: " << peri_kp << std::endl;
      // we iterate backwards counting down from the current k+1 index (peri_kp) because we 
      for(int back=0;back<northog;back++){
 	int peri_back=(k-back)%mmax;   	  GRID_ASSERT((k-back)>=0);
-	b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
+        // b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
-	p[peri_kp]=p[peri_kp]+b*p[peri_back];
+        b=-(innerProduct(q[peri_back],Az))/qq[peri_back];     // TODO try complex beta
-	q[peri_kp]=q[peri_kp]+b*q[peri_back];
+        p[peri_kp]=p[peri_kp]+b*p[peri_back];
        q[peri_kp]=q[peri_kp]+b*q[peri_back];
        // LogIterationB(peri_back, b);
        // betas[back] = b;    // may need to change the indexing if I ever do it with restarts
        // std::cout << "[DEBUG] pushing beta for back = " << back << ", peri_back = " << peri_back << std::endl;
        betas[peri_back] = b;    // may need to change the indexing if I ever do it with restarts
      }
      qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
      LinalgTimer.Stop();
      // log iteration and update GCR polynomial if necessary.
      all_betas.push_back(betas);
      LogIteration(k + 1, a, betas);
      // finish if necessary
      if((k==nstep-1)||(cp<rsq)){
        std::cout << "All alphas: " << std::endl << all_alphas << std::endl;
        std::cout << "All betas: " << std::endl << all_betas << std::endl;
        LogComplete(all_alphas, all_betas);
        std::cout << "Exiting GCR." << std::endl;
        return cp;
      }
    }
    GRID_ASSERT(0); // never reached
    return cp;
  }
 };
 // Serializable record of a GCR polynomial, filled in by PGCRPolynomial:
 //   data   : one coefficient vector per logged iteration (appended in LogIteration)
 //   betas  : one vector of beta coefficients per iteration (appended in LogComplete)
 //   alphas : one alpha coefficient per iteration (appended in LogComplete)
 class PolynomialFile: Serializable {
  public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(PolynomialFile, 
      std::vector<std::vector<std::complex<double>>>, data,
      std::vector<std::vector<std::complex<double>>>, betas,
      std::vector<std::complex<double>>,              alphas
    );
 };
 // Optionally record the GCR polynomial. [PO]: TODO
 /**
  * GCR solver that additionally records the polynomial it builds.
  *
  * The Log* hooks of the base solver are overridden to track, as coefficient
  * vectors in powers of A, the search directions (poly_p), the residual
  * (poly_r) and the solution (polynomial).  Coefficient snapshots and the
  * raw alphas/betas are appended to the PolynomialFile passed in.
  *
  * NOTE(review): the bookkeeping indexes betas by absolute iteration number,
  * i.e. it presumably assumes the solver history never wraps (mmax large
  * enough) -- confirm before using with short histories.
  */
 template <class Field>
 class PGCRPolynomial : public PrecGeneralisedConjugateResidualNonHermitian<Field> {
 public:
  std::vector<ComplexD> ak;                   // alpha recorded at each iteration
  std::vector<std::vector<ComplexD>> bk;      // betas recorded at each iteration
  // std::vector<ComplexD> poly_p;
  std::vector<std::vector<ComplexD>> poly_p;  // poly_p[j] = coefficients of p_j
  std::vector<ComplexD> poly_Ap;        // polynomial in Ap_j (only store it for last p)
  std::vector<ComplexD> poly_r;               // coefficients of the residual
  std::vector<ComplexD> polynomial;           // coefficients of the solution
  PolynomialFile& PF;                         // output record, held by reference
 public:
  PGCRPolynomial(RealD tol, Integer maxit,LinearOperatorBase<Field> &_Linop, LinearFunction<Field> &Prec, int _mmax, int _nstep, PolynomialFile& _PF)
    : PrecGeneralisedConjugateResidualNonHermitian<Field>(tol, maxit, _Linop, Prec, _mmax, _nstep), PF(_PF)
  {};
  // Applies the recorded polynomial in A = Linop to src:
  //   psi = sum_n polynomial[n] A^n src
  // Requires at least one recorded coefficient.
  void PolyOp(const Field &src, Field &psi)
  {
    GRID_ASSERT(polynomial.size() > 0);     // polynomial[0] is read unconditionally below
    Field tmp(src.Grid());
    Field AtoN(src.Grid());
    AtoN = src;
    psi=AtoN*polynomial[0];
    for(int n=1;n<(int)polynomial.size();n++){
      tmp = AtoN;
      this->Linop.Op(tmp,AtoN);               // iterate A^n
      psi = psi + polynomial[n]*AtoN;       // psi += poly_n A^n src
    }
  }
  // Replays the recorded alphas/betas on src without recomputing inner products:
  //   x_{k+1} = x_k + a_k p_k
  //   r_{k+1} = r_k - a_k A p_k
  //   p_{k+1} = r_{k+1} + sum_{i<=k} bk[k][i] p_i
  // NOTE(review): seeds every direction with r, i.e. presumably the
  // unpreconditioned recurrence; still marked as needing debugging upstream.
  void PGCRsequence(const Field &src, Field &x)
  {
    Field Ap(src.Grid());
    Field r(src.Grid());
    std::vector<Field> p;
    p.push_back(src);
    r=src;
    x=Zero();
    x.Checkerboard()=src.Checkerboard();
    const int nsteps = (int)ak.size();
    for(int k=0;k<nsteps;k++){
      x = x + ak[k]*p[k];
      this->Linop.Op(p[k], Ap);
      r = r - ak[k] * Ap;
      p.push_back(r);
      // bk[k] holds the betas logged together with ak[k]; entry i multiplies p_i.
      // (The previous `bk[i, k+1]` was a comma expression, i.e. bk[k+1].)
      if (k < (int)bk.size()) {
        const std::vector<ComplexD> &betas_k = bk[k];
        for (int i = 0; i <= k && i < (int)betas_k.size(); i++) {
          p[k+1] = p[k+1] + betas_k[i] * p[i];
        }
      }
    }
  }
  // Convenience wrapper: zero the guess, then run the solver.
  void Solve(const Field &src, Field &psi)
  {
    psi=Zero();
    this->operator()(src, psi);
  }
  // Resets all polynomial bookkeeping at the start of a solve.
  // PF is not cleared here; records from earlier solves stay in it.
  void LogBegin(void) override
  {
    std::cout << "PGCR::LogBegin() "<<std::endl;
    ak.resize(0);
    bk.resize(0);
    polynomial.resize(0);
    // Drop state left over from a previous solve before seeding below;
    // otherwise the push_backs would append to stale vectors.
    poly_Ap.resize(0);
    poly_p.resize(0);
    poly_r.resize(0);
    poly_Ap.push_back(0.0);     // start with (0.0); during first iteration should change to (0.0, 1.0)
    std::vector<ComplexD> p0_tmp;
    p0_tmp.push_back(1.0);
    poly_p.push_back(p0_tmp);   // p_0 = 1
    poly_r.push_back(1.0);      // r_0 = 1
  };
  // Updates vector psi and r and initializes vector p[k+1].
  // k is 1-based: the call with k uses p_{k-1} and produces p_k.
  void LogIteration(int k, ComplexD a, std::vector<ComplexD> betas) override {
    std::cout << "PGCR::LogIteration(k = " << k << ")" << std::endl;
    ak.push_back(a);
    bk.push_back(betas);
    // update Ap by pushing p[k] to the right
    poly_Ap.push_back(0.0);   // need to pad the end with an element
    poly_Ap[0] = 0.0;         // technically this should be unnecessary, as the first component is never set
    for(int i = 0; i < k; i++){
      poly_Ap[i+1]=poly_p[k-1][i];        // A\vec{p} = (0, \vec{p}) bc A shifts components of p to the right
    }
    // update psi_{k+1} --> psi_k + a_k p_k
    polynomial.push_back(0.0);
    for(int i = 0; i < k; i++) {
      polynomial[i] += a * poly_p[k-1][i];
    }
    {
      // Snapshot of the solution polynomial after this iteration.
      std::vector<std::complex<double>> poly_stdcmplx(polynomial.begin(), polynomial.end());
      PF.data.push_back(poly_stdcmplx);
    }
    //  r_{k+1} --> r_k - a_k A p_k
    //  p_{k+1} --> r_k + \sum_{i=0}^k \beta_{ik} p_i, input betas = (\beta_{ik})_i
    poly_r.push_back(0.0);        // should be of size k+1 if we start with k = 1
    std::vector<ComplexD> p_next (k + 1, ComplexD(0.0));     // p_{k+1} = same size as r_{k+1}
    for(int i = 0; i < k + 1; i++){
      poly_r[i] = poly_r[i] - a * poly_Ap[i];     // update r_{k+1} --> r_k - \alpha_k A p_k
      p_next[i] = poly_r[i];                 // init new vector as r_{k+1}
    }
    // p_{k+1} --> p_{k+1} + \sum_i \beta_{ij} p_i
    int nbeta = (int)betas.size();
    std::cout << "Betas: " << betas << std::endl;
    for (int j = 0; j < nbeta; j++) {
      for (int i = 0; i < j+1; i++) {
        p_next[i] += betas[j] * poly_p[j][i];
      }
    }
    poly_p.push_back(p_next);                 // add p_{k+1} to the list of p's
  }
  /** Logs all alphas and betas to complete the iterations. */
  void LogComplete(std::vector<ComplexD>& alphas, std::vector<std::vector<ComplexD>>& betas) override {
    std::cout << "PGCR::LogComplete() "<<std::endl;
    // Both lists hold one entry per iteration; bound by both to stay in range.
    for (int i = 0; i < (int)alphas.size() && i < (int)betas.size(); i++) {
      PF.alphas.push_back(std::complex<double>(alphas[i].real(), alphas[i].imag()));
      std::vector<std::complex<double>> beta_stdcmplx(betas[i].begin(), betas[i].end());
      PF.betas.push_back(beta_stdcmplx);
    }
  };
 };
 NAMESPACE_END(Grid);
 #endif
@@ -0,0 +1,753 @@
 /*************************************************************************************
 Grid physics library, www.github.com/paboyle/Grid
 Source file: ./Grid/algorithms/iterative/RestartedLanczosBidiagonalization.h
 Copyright (C) 2015
 Author: Chulwoo Jung <chulwoo@bnl.gov>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #ifndef GRID_RESTARTED_LANCZOS_BIDIAGONALIZATION_H
 #define GRID_RESTARTED_LANCZOS_BIDIAGONALIZATION_H
 NAMESPACE_BEGIN(Grid);
 /**
 * Implicitly Restarted Lanczos Bidiagonalization (IRLBA)
 *
 * Computes the p largest (or p smallest) singular triplets of a linear
 * operator A using the Golub-Kahan-Lanczos bidiagonalization with implicit
 * restart via thick-restart / QR shifts.
 *
 * Algorithm (Baglama & Reichel, SIAM J. Sci. Comput. 27(1):19-42, 2005):
 *
 *   Outer loop:
 *     1. Extend the p-step (or seed) bidiagonalization to k steps:
 *           A  V_k = U_k B_k
 *           A^dag U_k = V_k B_k^T + beta_{k+1} v_{k+1} e_k^T
 *     2. Compute SVD:  B_k = X Sigma Y^T
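 *     3. Check convergence of the p desired singular values via
 *           |beta_{k+1} * x_{k,i}|  <  tol * sigma_i
 *        where x_{k,i} is the last component of the i-th left singular vector
 *        of B_k (in this implementation A V = U B holds exactly, so the
 *        residual lies in the A^dag direction).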
 *     4. Apply k-p implicit QR shifts to implicitly compress the basis
 *        to p steps (Sorensen-Lehoucq thick restart):
 *           B_p^+ = X_p^T B_k Y_p   (upper bidiagonal, p x p)
 *        and update the lattice vectors:
 *           V_p^+ = V_k Y_p
 *           U_p^+ = U_k X_p
 *        The new residual coupling is
 *           beta_p^+ v_{p+1}^+ = beta_{k+1} v_{k+1} * (e_k^T Y_p)_p
 *                               + B_k(p,p+1) * (orthogonal tail from QR)
 *     5. Go to step 1.
 *
 * Template parameter
 * ------------------
 *   Field : lattice field type (must support Grid algebra operations)
 *
 * Usage
 * -----
 *   RestartedLanczosBidiagonalization<Field> irlba(Linop, grid, p, k, tol, maxIter);
 *   irlba.run(src);
 *   // Results available via getters.
 */
 template <class Field>
 class RestartedLanczosBidiagonalization {
 public:
  LinearOperatorBase<Field> &Linop;
  GridBase *Grid;
  int    Nk;       // number of desired singular triplets
  int    Nm;       // Lanczos basis size (Nm > Nk)
  RealD  Tolerance;
  int    MaxIter;
  bool   largest; // if true, target largest singular values; otherwise smallest
  // Converged singular triplets (filled after run())
  std::vector<RealD>  singularValues;   // sigma_0 >= sigma_1 >= ...
  std::vector<Field>  leftVectors;      // approximate left singular vectors
  std::vector<Field>  rightVectors;     // approximate right singular vectors
 private:
  // Working bases (size up to Nm+1)
  std::vector<Field>  V;    // right Lanczos vectors
  std::vector<Field>  U;    // left  Lanczos vectors
  std::vector<RealD>  alpha;
  std::vector<RealD>  beta;
  // After a thick restart, the column at index restart_col of U^dag A V
  // has extra non-zero entries (rows 0..restart_col-2) beyond what the
  // upper bidiagonal captures.  fvec[j] = <U[j] | A V[restart_col]> for
  // j = 0..restart_col-1.  (fvec[restart_col-1] == beta[restart_col-1].)
  // restart_col == -1 means no restart has occurred yet (pure bidiagonal).
  std::vector<RealD>  fvec;
  int                 restart_col;
 public:
  /**
   * Parameters
   * ----------
   * _Linop   : operator whose singular triplets are sought (held by reference)
   * _Grid    : grid on which the Lanczos vectors live
   * _Nk      : number of desired singular triplets
   * _Nm      : Lanczos basis size, must exceed _Nk
   * _tol     : relative convergence tolerance
   * _maxIt   : maximum number of restart cycles
   * _largest : target the largest singular values if true, else the smallest
   */
  RestartedLanczosBidiagonalization(LinearOperatorBase<Field> &_Linop,
                                    GridBase *_Grid,
                                    int _Nk, int _Nm,
                                    RealD _tol   = 1.0e-8,
                                    int   _maxIt = 300,
                                    bool  _largest = true)
    : Linop(_Linop), Grid(_Grid),
      Nk(_Nk), Nm(_Nm),
      Tolerance(_tol), MaxIter(_maxIt),
      largest(_largest),
      restart_col(-1)   // "no restart yet", so the member is defined before run()
  {
    assert(Nm > Nk);
  }
  /**
   * Run IRLBA starting from src.
   *
   * src need not be normalised but must have non-zero norm.  All previous
   * results and working bases are discarded first.
   *
   * The routine returns in one of three ways:
   *   - all Nk desired values pass the convergence test: the Nk triplets
   *     are extracted from the full Nm-step basis;
   *   - the restart coupling drops below 1e-14 (lucky breakdown): the Nk
   *     triplets are extracted from the restarted Nk-step basis;
   *   - MaxIter cycles elapse: the best available approximations are
   *     extracted (at most Nk of them) WITHOUT having passed the test.
   * In every case singularValues, leftVectors, rightVectors are filled
   * through extractTriplets().
   */
  void run(const Field &src)
  {
    assert(norm2(src) > 0.0);
    singularValues.clear();
    leftVectors.clear();
    rightVectors.clear();
    // Allocate working bases
    V.clear(); U.clear();
    alpha.clear(); beta.clear();
    fvec.clear(); restart_col = -1;
    V.reserve(Nm + 1);
    U.reserve(Nm);
    // Seed: v_0 = src / ||src||
    Field vtmp(Grid);
    vtmp = src;
    RealD nrm = std::sqrt(norm2(vtmp));
    vtmp = (1.0 / nrm) * vtmp;
    V.push_back(vtmp);
    int pStart = 0;  // current basis size at start of extension
    RealD betaRestart = 0.0; // coupling from previous restart
    for (int iter = 0; iter < MaxIter; ++iter) {
      // ----------------------------------------------------------------
      // Step 1: extend from pStart steps to Nm steps
      // ----------------------------------------------------------------
      extendBasis(pStart, Nm, betaRestart);
 //      verify();
      // ----------------------------------------------------------------
      // Step 2: SVD of the Nm x Nm B matrix.
      // iter=0 (pStart==0): B is exactly bidiagonal — use buildBidiagonal.
      // iter>0 (pStart==Nk): after a thick restart, column restart_col of
      // U^dag A V has extra off-diagonal entries captured by fvec; use
      // buildFullB so the Ritz values and restart vectors are computed from
      // the exact projected matrix A V = U B_full.
      // ----------------------------------------------------------------
      Eigen::MatrixXd B = (pStart == 0) ? buildBidiagonal(Nm) : buildFullB(Nm);
      Eigen::JacobiSVD<Eigen::MatrixXd> svd(B,
          Eigen::ComputeThinU | Eigen::ComputeThinV);
      Eigen::VectorXd sigma = svd.singularValues();  // descending
      Eigen::MatrixXd X     = svd.matrixU();          // Nm x Nm left SVecs of B
      Eigen::MatrixXd Y     = svd.matrixV();          // Nm x Nm right SVecs of B
      // If targeting smallest, reorder so desired ones come first
      Eigen::VectorXi order = sortOrder(sigma);
      // ----------------------------------------------------------------
      // Step 3: check convergence of the Nk desired singular values
      // ----------------------------------------------------------------
      // NOTE(review): assumes extendBasis left at least one entry in beta — confirm.
      RealD betaK = beta.back();  // beta_{k+1}
      // In our convention A V = U B (exact), the residual is in the A^dag
      // direction: A^dag u_j - sigma_j v_j = betaK * X[Nm-1,j] * V[Nm].
      // Convergence criterion: |betaK * X[Nm-1, idx]| < tol * sigma_idx.
      int nconv = 0;
      for (int i = 0; i < Nk; ++i) {
        int idx = order(i);
        RealD res = std::abs(betaK * X(Nm - 1, idx));
        // Floor on sigma keeps the threshold positive for a vanishing value.
        RealD thr = Tolerance * std::max(sigma(idx), 1.0e-14);
        std::cout << GridLogMessage
                  << "IRLBA iter " << iter
                  << "  sigma[" << i << "] = " << sigma(idx)
                  << "  res = " << res
                  << "  thr = " << thr << std::endl;
        // nconv counts only the leading run of converged values.
        if (res < thr) ++nconv;
        else break;  // residuals not strictly ordered but break is conservative
      }
      if (nconv >= Nk) {
        std::cout << GridLogMessage
                  << "IRLBA converged: " << nconv << " singular values after "
                  << iter + 1 << " iterations." << std::endl;
        // Collect converged triplets
        extractTriplets(Nm, sigma, X, Y, order, Nk);
        return;
      }
      // ----------------------------------------------------------------
      // Step 4: implicit restart — compress to Nk steps
      // ----------------------------------------------------------------
      // betaRestart is presumably updated by this call (it is tested right after).
      implicitRestart(Nm, Nk, sigma, X, Y, order, betaK, betaRestart);
 //      verify();
      // Lucky breakdown: exact invariant subspace found; convergence is exact.
      // B_p^+ = diag(alpha[0..Nk-1]); extract directly from restart basis.
      if (betaRestart < 1.0e-14) {
        std::cout << GridLogMessage
                  << "IRLBA: lucky breakdown after restart (betaRestart = 0)."
                  << " Extracting " << Nk << " exact Ritz triplets." << std::endl;
        // Re-run SVD on the p-step diagonal B^+ to get sorted Ritz triplets.
        Eigen::MatrixXd Bp = buildBidiagonal(Nk);
        Eigen::JacobiSVD<Eigen::MatrixXd> svdp(Bp,
            Eigen::ComputeThinU | Eigen::ComputeThinV);
        Eigen::VectorXi ordp = sortOrder(svdp.singularValues());
        extractTriplets(Nk, svdp.singularValues(), svdp.matrixU(),
                        svdp.matrixV(), ordp, Nk);
        return;
      }
      // The next cycle extends the compressed Nk-step basis.
      pStart = Nk;
    }
    std::cout << GridLogMessage
              << "IRLBA: did not converge in " << MaxIter
              << " iterations. Returning best approximations." << std::endl;
    // Return best available approximations, sized by the current alpha.
    Eigen::MatrixXd B = buildFullB((int)alpha.size());
    Eigen::JacobiSVD<Eigen::MatrixXd> svd(B,
        Eigen::ComputeThinU | Eigen::ComputeThinV);
    Eigen::VectorXd sigma = svd.singularValues();
    Eigen::MatrixXd X     = svd.matrixU();
    Eigen::MatrixXd Y     = svd.matrixV();
    Eigen::VectorXi order = sortOrder(sigma);
    // Never request more triplets than the basis holds.
    int nout = std::min(Nk, (int)alpha.size());
    extractTriplets((int)alpha.size(), sigma, X, Y, order, nout);
  }
  /* Getters */
  /// Number of singular triplets currently held in the output vectors.
  int getNk() const
  {
    return static_cast<int>(singularValues.size());
  }
  /// Read-only view of the stored singular values.
  const std::vector<RealD> &getSingularValues() const
  {
    return singularValues;
  }
  /// Read-only view of the stored left singular vectors.
  const std::vector<Field> &getLeftVectors() const
  {
    return leftVectors;
  }
  /// Read-only view of the stored right singular vectors.
  const std::vector<Field> &getRightVectors() const
  {
    return rightVectors;
  }
  /**
   * Print B_k and U^dag A V to verify the bidiagonalization relation
   *   A V_m = U_m B_m   (exact in our GK convention)
   * On the first call (pStart=0), max|B - U^dag A V| should be ~machine epsilon.
   * After a restart and extension, the column p of U^dag A V deviates from B
   * by O(betaK): this is expected because the thick restart breaks the Krylov
   * structure at column p, introducing off-diagonal terms proportional to betaK.
   * These terms vanish as betaK -> 0 (convergence), so the algorithm is correct.
   */
  void verify()
  {
    int m  = (int)alpha.size();
    int nU = (int)U.size();
    int nV = (int)V.size();
    if (m == 0) { std::cout << GridLogMessage << "IRLBA verify: empty basis" << std::endl; return; }
    // Build reference matrix Bref (nU x nV):
    //   Columns 0..m-1 : buildFullB(m)  (bidiagonal + fvec column at restart_col)
    //   Column  m      : residual column, two cases:
    //     (a) restart_col == m (right after implicitRestart, before extendBasis):
    //         V[m] = sgn*V_old[Nm], so <U[i]|A|V[m]> = fvec[i] for all i
    //     (b) otherwise (pure GK or after extendBasis):
    //         only entry (m-1, m) = beta[m-1]  (GK recurrence residual)
    Eigen::MatrixXd Bref = Eigen::MatrixXd::Zero(nU, nV);
    {
      Eigen::MatrixXd Bfull = buildFullB(m);
      int cols = std::min(m, nV);
      Bref.block(0, 0, m, cols) = Bfull.block(0, 0, m, cols);
    }
    if (nV > m && m > 0) {
      if (restart_col == m && (int)fvec.size() == m) {
        // Case (a): right after implicitRestart
        for (int i = 0; i < m; ++i) Bref(i, m) = fvec[i];
      } else if ((int)beta.size() >= m) {
        // Case (b): standard GK residual column
        Bref(m - 1, m) = beta[m - 1];
      }
    }
    // Compute M[i,j] = <U[i] | A | V[j]>
    Eigen::MatrixXd M = Eigen::MatrixXd::Zero(nU, nV);
    Field Avj(Grid);
    for (int j = 0; j < nV; ++j) {
      Linop.Op(V[j], Avj);
      for (int i = 0; i < nU; ++i) {
        ComplexD ip = innerProduct(U[i], Avj);
        M(i, j) = ip.real();
      }
    }
    // Print Bref
    std::cout << GridLogMessage
              << "IRLBA verify: Bref (" << nU << "x" << nV << "):" << std::endl;
    for (int i = 0; i < nU; ++i) {
      std::cout << GridLogMessage << "  row " << i << ": ";
      for (int j = 0; j < nV; ++j) std::cout << Bref(i,j) << " ";
      std::cout << std::endl;
    }
    // Print U^dag A V
    std::cout << GridLogMessage
              << "IRLBA verify: U^dag A V (" << nU << "x" << nV << "):" << std::endl;
    for (int i = 0; i < nU; ++i) {
      std::cout << GridLogMessage << "  row " << i << ": ";
      for (int j = 0; j < nV; ++j) std::cout << M(i,j) << " ";
      std::cout << std::endl;
    }
    // Max deviation over the full nU x nV matrix
    RealD maxdev = (Bref - M).cwiseAbs().maxCoeff();
    std::cout << GridLogMessage
              << "IRLBA verify: max|Bref - U^dag A V| = " << maxdev << std::endl;
    // Beta
    std::cout << GridLogMessage << "IRLBA verify: beta[0.." << (int)beta.size()-1 << "] = ";
    for (auto b : beta) std::cout << b << " ";
    std::cout << std::endl;
  }
 private:
  // ------------------------------------------------------------------
  // Build the m x m upper-bidiagonal matrix from alpha[0..m-1], beta[0..m-2]
  // ------------------------------------------------------------------
  Eigen::MatrixXd buildBidiagonal(int m) const
  {
    // Diagonal from alpha[0..m-1]; super-diagonal entry (d, d+1) from beta[d]
    // for as many entries as both the matrix size and beta.size() allow.
    Eigen::MatrixXd B = Eigen::MatrixXd::Zero(m, m);
    for (int d = 0; d < m; ++d) {
      B(d, d) = alpha[d];
    }
    const int nsuper = std::min(m - 1, (int)beta.size());
    for (int d = 0; d < nsuper; ++d) {
      B(d, d + 1) = beta[d];
    }
    return B;
  }
  // ------------------------------------------------------------------
  // Build the full m x m B matrix, including the non-bidiagonal column
  // at restart_col that arises after a thick restart.
  //
  // After restart, A V[restart_col] has projections onto all U[0..restart_col-1]
  // (not just U[restart_col-1]).  These are stored in fvec[0..restart_col-1]
  // and make column restart_col of U^dag A V non-bidiagonal.
  // ------------------------------------------------------------------
  Eigen::MatrixXd buildFullB(int m) const
  {
    // Start from the plain bidiagonal matrix, then overwrite rows
    // 0..restart_col-1 of column restart_col with the stored couplings fvec.
    // Nothing is added when restart_col is negative or lies outside the
    // m x m matrix, or when fvec is empty.
    Eigen::MatrixXd B = buildBidiagonal(m);
    if (restart_col >= 0 && restart_col < m) {
      // Never read past the end of fvec, even if it is shorter than restart_col.
      const int nfill = std::min(restart_col, (int)fvec.size());
      for (int j = 0; j < nfill; ++j) {
        B(j, restart_col) = fvec[j];
        // Row, column and value are separated by spaces so that the column
        // index and the value cannot run together in the log.
        std::cout << GridLogDebug << "buildFullB: B  " << j << " " << restart_col
                  << " " << B(j, restart_col) << std::endl;
      }
    }
    return B;
  }
  // ------------------------------------------------------------------
  // Return a permutation vector that puts the desired Nk singular values
  // first (largest first if largest==true, smallest first otherwise).
  // Eigen's JacobiSVD already returns sigma in descending order, so for
  // largest we just return 0,1,...,m-1; for smallest we reverse.
  // ------------------------------------------------------------------
  Eigen::VectorXi sortOrder(const Eigen::VectorXd &sigma) const
  {
    // Only the length of sigma is used: the result is the identity
    // permutation when `largest` is set and the reversed one otherwise.
    const int count = (int)sigma.size();
    Eigen::VectorXi perm(count);
    for (int pos = 0; pos < count; ++pos) {
      perm(pos) = largest ? pos : (count - 1 - pos);
    }
    return perm;
  }
  // ------------------------------------------------------------------
  // Extend the Lanczos bidiagonalization from pStart to kEnd steps.
  // On first call pStart==0 (V[0] already set).
  // On restart calls V[0..pStart], U[0..pStart-1], alpha[0..pStart-1],
  // beta[0..pStart-1] are already set; betaRestart is the coupling
  // beta_{pStart} that drives the first new U step.
  // ------------------------------------------------------------------
  void extendBasis(int pStart, int kEnd, RealD betaRestart)
  {
    // Truncate containers to pStart (Lattice has no default constructor)
    if ((int)V.size() > pStart + 1) V.erase(V.begin() + pStart + 1, V.end());
    if ((int)U.size() > pStart)     U.erase(U.begin() + pStart,     U.end());
    alpha.resize(pStart);
    beta.resize(pStart);
    Field p(Grid), r(Grid);
    for (int k = pStart; k < kEnd; ++k) {
      // p = A v_k
      Linop.Op(V[k], p);
      // Remove previous left vector coupling
      if (k > 0) {
        p = p - beta[k - 1] * U[k - 1];
      }
      // On the first step after a restart, beta[pStart-1] was already set;
      // but V[pStart] was already constructed including the beta correction,
      // so no extra subtraction needed here beyond the standard recurrence.
      // Reorthogonalize p against U, then alpha_k = ||p||, u_k = p/alpha_k
      reorthogonalize(p, U);
      RealD ak = std::sqrt(norm2(p));
      if (ak < 1.0e-14) {
        std::cout << GridLogMessage
                  << "IRLBA extendBasis: lucky breakdown at step " << k
                  << " (alpha = " << ak << ")" << std::endl;
        alpha.push_back(ak);
        Field zero(Grid); zero = Zero();
        U.push_back(zero);
        beta.push_back(0.0);
        V.push_back(zero);
        break;
      }
      alpha.push_back(ak);
      Field u(Grid);
      u = (1.0 / ak) * p;
      U.push_back(u);
      // r = A^dag u_k - alpha_k v_k, reorthogonalize, then beta_{k+1} = ||r||
      Linop.AdjOp(U[k], r);
      r = r - ak * V[k];
      reorthogonalize(r, V);
      RealD bk = std::sqrt(norm2(r));
      beta.push_back(bk);
      std::cout << GridLogMessage
                << "IRLBA extend step " << k
                << "  alpha = " << ak
                << "  beta  = " << bk << std::endl;
      // Always push v_{k+1} (needed as residual direction for restart)
      if (bk < 1.0e-14) {
        std::cout << GridLogMessage
                  << "IRLBA extendBasis: lucky breakdown (beta = 0) at step "
                  << k << std::endl;
        Field zero(Grid); zero = Zero();
        V.push_back(zero);
        break;
      }
      Field vnext(Grid);
      vnext = (1.0 / bk) * r;
      V.push_back(vnext);
      if (k == kEnd - 1) break;  // v_{k+1} pushed above; stop here
    }
  }
 public:
  // ------------------------------------------------------------------
  // Block reorthogonalization helpers.
  // Declared public because CUDA extended lambdas cannot live inside
  // private/protected member functions.
  //
  // batchInnerProducts: computes c[j] = <basis[j], vec> for all j.
  //   Launches one per-site kernel per basis vector, followed by a single
  //   explicit accelerator_barrier().  All per-site results are then copied
  //   to the host in one transfer, summed there, and globally reduced with
  //   one GlobalSumVector call over the n coefficients.
  //
  // batchUpdate: computes vec -= sum_j c[j]*basis[j] in one GPU kernel.
  //
  // reorthogonalize: two-pass Classical Gram-Schmidt (CGS2) using the
  //   two helpers above.  Each pass costs 2 GPU syncs (1 IP + 1 update)
  //   instead of 2n syncs per pass in the old sequential MGS.
  // ------------------------------------------------------------------
  // Compute c[j] = <basis[j], vec> for every vector in `basis`.
  //
  //   vec   : field that is projected onto the basis (read only)
  //   basis : vectors to project against (read only)
  //   c     : output; resized to basis.size() and overwritten
  //
  // An empty basis leaves `c` empty and returns immediately.
  void batchInnerProducts(const Field &vec,
                          const std::vector<Field> &basis,
                          std::vector<ComplexD> &c)
  {
    int n = (int)basis.size();
    c.resize(n);
    if (n == 0) return;
    typedef typename Field::vector_object         vobj;
    // inner_t is whatever innerProduct() yields for two site objects.
    typedef decltype(innerProduct(vobj(), vobj())) inner_t;
    typedef decltype(basis[0].View(AcceleratorRead)) View;
    GridBase *grid = vec.Grid();
    uint64_t oSites = grid->oSites();
    uint64_t nsimd  = grid->Nsimd();
    // all_ip[j * oSites + ss] = per-site inner product of basis[j] and vec at site ss.
    // Layout: n contiguous blocks of oSites each.
    deviceVector<inner_t> all_ip((uint64_t)n * oSites);
    inner_t *all_ip_p = &all_ip[0];
    // Open one read view per basis vector on the host side and copy each view
    // object into the device-side array so that kernels can index it by j.
    hostVector<View>   h_basis_v(n);
    deviceVector<View> d_basis_v(n);
    for (int j = 0; j < n; ++j) {
      h_basis_v[j] = basis[j].View(AcceleratorRead);
      acceleratorPut(d_basis_v[j], h_basis_v[j]);
    }
    View *basis_vp = &d_basis_v[0];
    // Queue n per-site kernels to the accelerator stream — no intermediate barriers.
    // NOTE(review): whether accelerator_for itself synchronises depends on the
    // macro definition, which is not visible here — confirm.
    autoView(vec_v, vec, AcceleratorRead);
    for (int j = 0; j < n; ++j) {
      // Local copies of the loop index and site count for use inside the kernel body.
      int      jj      = j;
      uint64_t oSites_ = oSites;
      accelerator_for(ss, oSites, nsimd, {
        auto x = coalescedRead(basis_vp[jj][ss]);
        auto y = coalescedRead(vec_v[ss]);
        coalescedWrite(all_ip_p[jj * oSites_ + ss], innerProduct(x, y));
      });
    }
    // ONE sync after all n kernels
    accelerator_barrier();
    // Copy all per-site results to host
    hostVector<inner_t> all_ip_h((uint64_t)n * oSites);
    acceleratorCopyFromDevice(all_ip_p, &all_ip_h[0], (uint64_t)n * oSites * sizeof(inner_t));
    // Reduce on host: sum over oSites, then collapse SIMD lanes via Reduce(TensorRemove(...))
    // TensorRemove strips the iSinglet tensor wrapper to expose the SIMD scalar type.
    // Reduce sums all nsimd lanes and returns a plain scalar (RealD or ComplexD).
    std::vector<ComplexD> raw(n);
    for (int j = 0; j < n; ++j) {
      inner_t sum = Zero();
      for (uint64_t ss = 0; ss < oSites; ++ss)
        sum += all_ip_h[(uint64_t)j * oSites + ss];
      raw[j] = ComplexD(Reduce(TensorRemove(sum)));
    }
    // Combine the n local sums in a single GlobalSumVector call.
    grid->GlobalSumVector(&raw[0], n);
    for (int j = 0; j < n; ++j) c[j] = raw[j];
    // Close the basis views opened above (vec_v is handled by autoView).
    for (int j = 0; j < n; ++j) h_basis_v[j].ViewClose();
  }
  // Apply vec -= sum_k c[k] * basis[k] in one kernel launch.
  //
  //   vec   : field updated in place
  //   basis : vectors being subtracted (read only)
  //   c     : one complex coefficient per basis vector; c[k] is read for
  //           k = 0..basis.size()-1, so it must hold at least that many entries
  //
  // An empty basis is a no-op.
  void batchUpdate(Field &vec,
                   const std::vector<Field> &basis,
                   const std::vector<ComplexD> &c)
  {
    int n = (int)basis.size();
    if (n == 0) return;
    typedef typename Field::vector_object vobj;
    typedef decltype(basis[0].View(AcceleratorRead)) View;
    GridBase *grid = vec.Grid();
    uint64_t oSites = grid->oSites();
    uint64_t nsimd  = grid->Nsimd();
    // Split complex coefficients into real/imag double arrays on device.
    // Using doubles avoids potential ComplexD-device-code compatibility issues.
    hostVector<double>   h_re(n), h_im(n);
    deviceVector<double> d_re(n), d_im(n);
    for (int k = 0; k < n; ++k) {
      h_re[k] = c[k].real();
      h_im[k] = c[k].imag();
    }
    acceleratorCopyToDevice(&h_re[0], &d_re[0], n * sizeof(double));
    acceleratorCopyToDevice(&h_im[0], &d_im[0], n * sizeof(double));
    double *re_p = &d_re[0];
    double *im_p = &d_im[0];
    // Basis views
    // (opened on the host, then copied into a device-side array indexed by k)
    hostVector<View>   h_basis_v(n);
    deviceVector<View> d_basis_v(n);
    for (int k = 0; k < n; ++k) {
      h_basis_v[k] = basis[k].View(AcceleratorRead);
      acceleratorPut(d_basis_v[k], h_basis_v[k]);
    }
    View *basis_vp = &d_basis_v[0];
    // Single kernel: vec[ss] -= sum_k (re[k] + i*im[k]) * basis[k][ss]
    autoView(vec_v, vec, AcceleratorWrite);
    accelerator_for(ss, oSites, nsimd, {
      auto v = coalescedRead(vec_v[ss]);
      for (int k = 0; k < n; ++k) {
        auto b = coalescedRead(basis_vp[k][ss]);
        // Real part is subtracted directly; the imaginary part goes through timesI.
        v = v - re_p[k] * b - timesI(im_p[k] * b);
      }
      coalescedWrite(vec_v[ss], v);
    });
    // Close the basis views opened above (vec_v is handled by autoView).
    for (int k = 0; k < n; ++k) h_basis_v[k].ViewClose();
  }
  // ------------------------------------------------------------------
  // Full reorthogonalization using two-pass Classical Gram-Schmidt (CGS2).
  // Each pass calls batchInnerProducts (1 GPU sync) + batchUpdate (1 sync),
  // replacing the old 2n GPU syncs per pass from sequential MGS.
  // ------------------------------------------------------------------
  void reorthogonalize(Field &vec, const std::vector<Field> &basis)
  {
    // Two identical projection sweeps over the whole basis; an empty basis
    // means zero sweeps, i.e. vec is left untouched.
    const int nsweeps = basis.empty() ? 0 : 2;
    std::vector<ComplexD> coeffs;
    int sweep = 0;
    while (sweep < nsweeps) {
      batchInnerProducts(vec, basis, coeffs);  // coeffs[j] = <basis[j], vec>
      batchUpdate(vec, basis, coeffs);         // vec -= sum_j coeffs[j] * basis[j]
      ++sweep;
    }
  }
  // ------------------------------------------------------------------
  // Restart: given the k-step bidiagonalization and the SVD of its
  // projected matrix, compress to p steps by a thick restart.
  //
  // The p desired Ritz vectors (selected through `order`) become the new
  // bases U[0..p-1] and V[0..p-1]; the remaining Ritz directions are
  // dropped.  No implicit QR shifts are applied to the bidiagonal matrix.
  //
  // After this call:
  //   V[0..Nk],  U[0..Nk-1],  alpha[0..Nk-1],  beta[0..Nk-1]  are updated.
  //   betaRestart  ← new beta_Nk coupling for the next extension.
  // ------------------------------------------------------------------
  void implicitRestart(int k, int p,
                       const Eigen::VectorXd &sigma,
                       const Eigen::MatrixXd &X,
                       const Eigen::MatrixXd &Y,
                       const Eigen::VectorXi &order,
                       RealD betaK,
                       RealD &betaRestart)
  {
    // Thick restart (Baglama & Reichel, Sec. 2.2).
    //
    // With B_k = X Sigma Y^T the compressed bases are the Ritz vectors
    //   V^+_i = V_k y_{order(i)},   U^+_i = U_k x_{order(i)},   i = 0..p-1,
    // so that A V^+_i = sigma_{order(i)} U^+_i: the new projected matrix is
    // diag(sigma_{order(0)}, ..., sigma_{order(p-1)}) with zero internal betas.
    //
    // From A^dag U_k = V_k B_k^T + betaK V[k] e_{k-1}^T and B_k^T x_j = sigma_j y_j:
    //   A^dag U^+_j - sigma_{order(j)} V^+_j = betaK * X(k-1, order(j)) * V[k].
    // The coupling of the last kept pair (j = p-1) fixes
    //   betaRestart = |betaK * X(k-1, order(p-1))|,
    //   V[p]        = sign(betaK * X(k-1, order(p-1))) * V[k].
    //
    // ---- Ritz vectors: one left/right pair per kept triplet ----
    std::vector<Field> Vnew, Unew;
    Vnew.reserve(p + 1);
    Unew.reserve(p);
    for (int t = 0; t < p; ++t) {
      const int col = order(t);
      Field vt(Grid); vt = Zero();
      for (int j = 0; j < k; ++j) {
        vt = vt + Y(j, col) * V[j];
      }
      Vnew.push_back(vt);
      Field ut(Grid); ut = Zero();
      for (int j = 0; j < k; ++j) {
        ut = ut + X(j, col) * U[j];
      }
      Unew.push_back(ut);
    }
    // ---- Residual couplings ----
    // All p couplings are kept in fvec (multiplied by the sign chosen for
    // V[p]) so that buildFullB can fill column p of the projected matrix.
    // fvec[p-1] equals |lastCoupling| by construction.
    const RealD lastCoupling = betaK * X(k - 1, order(p - 1));
    betaRestart = std::abs(lastCoupling);
    const RealD sign = (lastCoupling >= 0.0) ? 1.0 : -1.0;
    fvec.resize(p);
    for (int j = 0; j < p; ++j) {
      fvec[j] = betaK * X(k - 1, order(j)) * sign;
    }
    restart_col = p;
    // ---- New V[p]: the old residual direction, or zero on breakdown ----
    Field vp(Grid);
    const bool coupled = betaRestart > 1.0e-14;
    if (coupled) {
      vp = sign * V[k];
    } else {
      betaRestart = 0.0;
      vp = Zero();
    }
    Vnew.push_back(vp);
    // ---- New coefficients ----
    // alpha holds the kept singular values; every beta is zero except the
    // last one, which carries betaRestart.
    std::vector<RealD> alphaNext(p);
    for (int t = 0; t < p; ++t) {
      alphaNext[t] = sigma(order(t));
    }
    std::vector<RealD> betaNext(p, 0.0);
    betaNext[p - 1] = betaRestart;
    // ---- Commit ----
    V = Vnew;
    U = Unew;
    alpha = alphaNext;
    beta  = betaNext;
    std::cout << GridLogMessage
              << "IRLBA restart: compressed to " << p << " steps,"
              << "  new beta_p = " << betaRestart << std::endl;
  }
  // ------------------------------------------------------------------
  // Extract the desired singular triplets into the public output vectors.
  // ------------------------------------------------------------------
  void extractTriplets(int m,
                       const Eigen::VectorXd &sigma,
                       const Eigen::MatrixXd &X,
                       const Eigen::MatrixXd &Y,
                       const Eigen::VectorXi &order,
                       int nout)
  {
    // Rebuild the three output containers from scratch: for each of the
    // first `nout` entries of `order`, store sigma and the linear
    // combinations of U (weights from X) and V (weights from Y).
    singularValues.resize(nout);
    leftVectors.clear();
    leftVectors.reserve(nout);
    rightVectors.clear();
    rightVectors.reserve(nout);
    // Never combine more basis vectors than are actually stored.
    const int nLeft  = std::min(m, (int)U.size());
    const int nRight = std::min(m, (int)V.size());
    for (int t = 0; t < nout; ++t) {
      const int col = order(t);
      singularValues[t] = sigma(col);
      Field left(Grid); left = Zero();
      for (int j = 0; j < nLeft; ++j) {
        left = left + X(j, col) * U[j];
      }
      leftVectors.push_back(left);
      Field right(Grid); right = Zero();
      for (int j = 0; j < nRight; ++j) {
        right = right + Y(j, col) * V[j];
      }
      rightVectors.push_back(right);
    }
  }
 };
 NAMESPACE_END(Grid);
 #endif
@@ -0,0 +1,931 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
    Copyright (C) 2015
 Author: Chulwoo Jung <chulwoo@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #ifndef GRID_LANC_H
 #define GRID_LANC_H
 #include <string.h>		//memset
 #ifdef USE_LAPACK
 #ifdef USE_MKL
 #include<mkl_lapack.h>
 #else
 void LAPACK_dstegr (char *jobz, char *range, int *n, double *d, double *e,
 		    double *vl, double *vu, int *il, int *iu, double *abstol,
 		    int *m, double *w, double *z, int *ldz, int *isuppz,
 		    double *work, int *lwork, int *iwork, int *liwork,
 		    int *info);
 //#include <lapacke/lapacke.h>
 #endif
 #endif
 //#include <Grid/algorithms/densematrix/DenseMatrix.h>
 // eliminate temporary vector in calc()
 #define MEM_SAVE
 namespace Grid
 {
  // Container for the static bisection routine bisec().
  // The only compiled member is bisec(); get_eig2() is disabled by #if 0.
  struct Bisection
  {
    // Disabled code: it refers to names (BETHA, A, H, mag, swap) that are not
    // declared in this struct and would not compile if enabled as-is.
 #if 0
    static void get_eig2 (int row_num, std::vector < RealD > &ALPHA,
 			  std::vector < RealD > &BETA,
 			  std::vector < RealD > &eig)
    {
      int i, j;
        std::vector < RealD > evec1 (row_num + 3);
        std::vector < RealD > evec2 (row_num + 3);
      RealD eps2;
        ALPHA[1] = 0.;
        BETHA[1] = 0.;
      for (i = 0; i < row_num - 1; i++)
 	{
 	  ALPHA[i + 1] = A[i * (row_num + 1)].real ();
 	  BETHA[i + 2] = A[i * (row_num + 1) + 1].real ();
 	}
      ALPHA[row_num] = A[(row_num - 1) * (row_num + 1)].real ();
        bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-10, 1e-10, evec1, eps2);
        bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-16, 1e-16, evec2, eps2);
      // Do we really need to sort here?
      int begin = 1;
      int end = row_num;
      int swapped = 1;
      while (swapped)
 	{
 	  swapped = 0;
 	  for (i = begin; i < end; i++)
 	    {
 	      if (mag (evec2[i]) > mag (evec2[i + 1]))
 		{
 		  swap (evec2 + i, evec2 + i + 1);
 		  swapped = 1;
 		}
 	    }
 	  end--;
 	  for (i = end - 1; i >= begin; i--)
 	    {
 	      if (mag (evec2[i]) > mag (evec2[i + 1]))
 		{
 		  swap (evec2 + i, evec2 + i + 1);
 		  swapped = 1;
 		}
 	    }
 	  begin++;
 	}
      for (i = 0; i < row_num; i++)
 	{
 	  for (j = 0; j < row_num; j++)
 	    {
 	      if (i == j)
 		H[i * row_num + j] = evec2[i + 1];
 	      else
 		H[i * row_num + j] = 0.;
 	    }
 	}
    }
 #endif
    // Bisection search for the values x[m1..m2].
    // NOTE(review): presumably the classic Sturm-count bisection for the
    // eigenvalues of a symmetric tridiagonal matrix with diagonal c[1..n]
    // and off-diagonal b[2..n] — confirm against the caller.
    //
    // All arrays are indexed from 1; element 0 is never touched.
    //   c      : read at indices 1..n
    //   b      : read at indices 1..n; b[1] is overwritten with 0
    //   n      : number of entries used in c and b
    //   m1,m2  : inclusive index range of the values to compute
    //   eps1   : absolute tolerance; replaced by the internally computed
    //            value when it is <= 0
    //   relfeh : relative tolerance used in the stopping test
    //   x      : output, entries m1..m2 are written
    //   eps2   : output, set to 0.5*eps1 + 7*(relfeh * bound)
    // Side effect: prints one line per computed value with printf.
    static void bisec (std::vector < RealD > &c,
 		       std::vector < RealD > &b,
 		       int n,
 		       int m1,
 		       int m2,
 		       RealD eps1,
 		       RealD relfeh, std::vector < RealD > &x, RealD & eps2)
    {
      // wu[i] holds the best lower bound found so far for value i.
      std::vector < RealD > wu (n + 2);
      RealD h, q, x1, xu, x0, xmin, xmax;
      int i, a, k;
      b[1] = 0.0;
      // Enclosing interval [xmin, xmax] from c[i] -/+ (|b[i]| + |b[i+1]|).
      xmin = c[n] - fabs (b[n]);
      xmax = c[n] + fabs (b[n]);
      for (i = 1; i < n; i++)
 	{
 	  h = fabs (b[i]) + fabs (b[i + 1]);
 	  if (c[i] + h > xmax)
 	    xmax = c[i] + h;
 	  if (c[i] - h < xmin)
 	    xmin = c[i] - h;
 	}
      // The upper bound is doubled before use.
      xmax *= 2.;
      eps2 = relfeh * ((xmin + xmax) > 0.0 ? xmax : -xmin);
      if (eps1 <= 0.0)
 	eps1 = eps2;
      eps2 = 0.5 * eps1 + 7.0 * (eps2);
      x0 = xmax;
      // Every requested value starts with the full interval.
      for (i = m1; i <= m2; i++)
 	{
 	  x[i] = xmax;
 	  wu[i] = xmin;
 	}
      // Values are located from index m2 down to m1.
      for (k = m2; k >= m1; k--)
 	{
 	  // Lower end xu: first wu[i] above xmin found scanning i = k down to m1
 	  // (setting i = m1 - 1 terminates the scan).
 	  xu = xmin;
 	  i = k;
 	  do
 	    {
 	      if (xu < wu[i])
 		{
 		  xu = wu[i];
 		  i = m1 - 1;
 		}
 	      i--;
 	    }
 	  while (i >= m1);
 	  if (x0 > x[k])
 	    x0 = x[k];
 	  // Halve [xu, x0] until it is narrower than the mixed tolerance.
 	  while ((x0 - xu) > 2 * relfeh * (fabs (xu) + fabs (x0)) + eps1)
 	    {
 	      x1 = (xu + x0) / 2;
 	      // a counts the negative terms of the recurrence
 	      //   q_i = c[i] - x1 - b[i]^2 / q_{i-1},
 	      // with |b[i]| / relfeh substituted when q_{i-1} is exactly zero.
 	      a = 0;
 	      q = 1.0;
 	      for (i = 1; i <= n; i++)
 		{
 		  q =
 		    c[i] - x1 -
 		    ((q != 0.0) ? b[i] * b[i] / q : fabs (b[i]) / relfeh);
 		  if (q < 0)
 		    a++;
 		}
 //      printf("x1=%0.14e a=%d\n",x1,a);
 	      if (a < k)
 		{
 		  // x1 becomes the new lower end; the bounds stored for other
 		  // indices are tightened with it as well.
 		  if (a < m1)
 		    {
 		      xu = x1;
 		      wu[m1] = x1;
 		    }
 		  else
 		    {
 		      xu = x1;
 		      wu[a + 1] = x1;
 		      if (x[a] > x1)
 			x[a] = x1;
 		    }
 		}
 	      else
 		x0 = x1;
 	    }
 	  printf ("x0=%0.14e xu=%0.14e k=%d\n", x0, xu, k);
 	  // The result is the midpoint of the final interval.
 	  x[k] = (x0 + xu) / 2;
 	}
    }
  };
 /////////////////////////////////////////////////////////////
 // Implicitly restarted lanczos
 /////////////////////////////////////////////////////////////
  template < class Field > class SimpleLanczos
  {
    const RealD small = 1.0e-16;
  public:
    int lock;
    int get;
    int Niter;
    int converged;
    int Nstop;			// Number of evecs checked for convergence
    int Nk;			// Number of converged sought
    int Np;			// Np -- Number of spare vecs in kryloc space
    int Nm;			// Nm -- total number of vectors
    RealD OrthoTime;
    RealD eresid;
 //    SortEigen < Field > _sort;
    LinearFunction < Field > &_Linop;
 //    OperatorFunction < Field > &_poly;
    /////////////////////////
    // Constructor
    /////////////////////////
    // Empty placeholder: performs no work.
    void init ()
    {
    }
 //    void Abort (int ff, std::vector < RealD > &evals, DenseVector < Denstd::vector  < RealD > >&evecs);
    // Store the operator and the run parameters.
    //   Linop   : operator applied to the Lanczos vectors
    //   _Nstop  : number of eigenvectors checked for convergence
    //   _Nk     : number of converged vectors sought
    //   _Nm     : total number of vectors; must be larger than _Nk
    //   _eresid : residual tolerance
    //   _Niter  : maximum number of iterations
    // Np, the number of spare vectors, is derived as _Nm - _Nk and asserted
    // to be positive.  The initializers follow the member declaration order.
    SimpleLanczos (LinearFunction < Field > &Linop,
                   int _Nstop,
                   int _Nk,
                   int _Nm,
                   RealD _eresid,
                   int _Niter)
      : Niter (_Niter),
        Nstop (_Nstop),
        Nk (_Nk),
        Np (_Nm - _Nk),
        Nm (_Nm),
        eresid (_eresid),
        _Linop (Linop)
    {
      assert (Np > 0);
    }
    /////////////////////////
    // Sanity checked this routine (step) against Saad.
    /////////////////////////
    void RitzMatrix (std::vector < Field > &evec, int k)
    {
      if (1)
 	return;
      GridBase *grid = evec[0].Grid();
      Field w (grid);
      std::cout << GridLogMessage << "RitzMatrix " << std::endl;
      for (int i = 0; i < k; i++)
 	{
 	  _Linop(evec[i], w);
 //      _poly(_Linop,evec[i],w);
 	  std::cout << GridLogMessage << "[" << i << "] ";
 	  for (int j = 0; j < k; j++)
 	    {
 	      ComplexD in = innerProduct (evec[j], w);
 	      if (fabs ((double) i - j) > 1)
 		{
 		  if (abs (in) > 1.0e-9)
 		    {
 		      std::cout << GridLogMessage << "oops" << std::endl;
 		      abort ();
 		    }
 		  else
 		    std::cout << GridLogMessage << " 0 ";
 		}
 	      else
 		{
 		  std::cout << GridLogMessage << " " << in << " ";
 		}
 	    }
 	  std::cout << GridLogMessage << std::endl;
 	}
    }
    void step (std::vector < RealD > &lmd,
 	       std::vector < RealD > &lme,
 	       Field & last, Field & current, Field & next, uint64_t k)
    {
      if (lmd.size () <= k)
 	lmd.resize (k + Nm);
      if (lme.size () <= k)
 	lme.resize (k + Nm);
 //      _poly(_Linop,current,next );   // 3. wk:=Avk−βkv_{k−1}
      _Linop(current, next);	// 3. wk:=Avk−βkv_{k−1}
      if (k > 0)
 	{
 	  next -= lme[k - 1] * last;
 	}
 //      std::cout<<GridLogMessage << "<last|next>" << innerProduct(last,next) <<std::endl;
      ComplexD zalph = innerProduct (current, next);	// 4. αk:=(wk,vk)
      RealD alph = real (zalph);
      next = next - alph * current;	// 5. wk:=wk−αkvk
 //      std::cout<<GridLogMessage << "<current|next>" << innerProduct(current,next) <<std::endl;
      RealD beta = normalise (next);	// 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
      // 7. vk+1 := wk/βk+1
 //       norm=beta;
      int interval = Nm / 100 + 1;
      if ((k % interval) == 0)
 	std::
 	  cout << GridLogMessage << k << " : alpha = " << zalph << " beta " <<
 	  beta << std::endl;
      const RealD tiny = 1.0e-20;
      if (beta < tiny)
 	{
 	  std::cout << GridLogMessage << " beta is tiny " << beta << std::
 	    endl;
 	}
      lmd[k] = alph;
      lme[k] = beta;
    }
    // Apply one sweep of plane rotations with shift Dsh to the tridiagonal
    // matrix stored as lmd (diagonal) and lme (off-diagonal), and accumulate
    // the same rotations into Qt.
    // NOTE(review): this looks like one shifted QR step on a symmetric
    // tridiagonal matrix — confirm against the caller.
    //
    //   lmd, lme   : updated in place for indices kmin-1 .. kmax-1
    //   Nk         : number of entries of Qt rotated per index pair
    //   Nm         : stride used to index Qt as Qt[i + Nm * k]
    //   Qt         : accumulated rotations, updated in place
    //   Dsh        : shift used to build the first rotation
    //   kmin, kmax : 1-based range of the active block; the first rotation
    //                acts on indices kmin-1 and kmin
    // The parameters Nk and Nm hide the class members of the same name.
    void qr_decomp (std::vector < RealD > &lmd,
 		    std::vector  < RealD > &lme,
 		    int Nk,
 		    int Nm,
 		    std::vector  < RealD > &Qt, RealD Dsh, int kmin, int kmax)
    {
      int k = kmin - 1;
      // x carries the entry created outside the tridiagonal band by each
      // rotation; it is consumed by the next one.
      RealD x;
      // First rotation: built from (lmd[k] - Dsh, lme[k]).
      RealD Fden = 1.0 / hypot (lmd[k] - Dsh, lme[k]);
      RealD c = (lmd[k] - Dsh) * Fden;
      RealD s = -lme[k] * Fden;
      RealD tmpa1 = lmd[k];
      RealD tmpa2 = lmd[k + 1];
      RealD tmpb = lme[k];
      lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
      lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
      lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
      x = -s * lme[k + 1];
      lme[k + 1] = c * lme[k + 1];
      // Rotate the pair of Qt entries at offsets Nm*k and Nm*(k+1) for each i < Nk.
      for (int i = 0; i < Nk; ++i)
 	{
 	  RealD Qtmp1 = Qt[i + Nm * k];
 	  RealD Qtmp2 = Qt[i + Nm * (k + 1)];
 	  Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
 	  Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
 	}
      // Givens transformations
      // Each one is built from (x, lme[k-1]) and removes the out-of-band entry x.
      for (int k = kmin; k < kmax - 1; ++k)
 	{
 	  RealD Fden = 1.0 / hypot (x, lme[k - 1]);
 	  RealD c = lme[k - 1] * Fden;
 	  RealD s = -x * Fden;
 	  RealD tmpa1 = lmd[k];
 	  RealD tmpa2 = lmd[k + 1];
 	  RealD tmpb = lme[k];
 	  lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
 	  lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
 	  lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
 	  lme[k - 1] = c * lme[k - 1] - s * x;
 	  // The last rotation of the sweep creates no new out-of-band entry.
 	  if (k != kmax - 2)
 	    {
 	      x = -s * lme[k + 1];
 	      lme[k + 1] = c * lme[k + 1];
 	    }
 	  for (int i = 0; i < Nk; ++i)
 	    {
 	      RealD Qtmp1 = Qt[i + Nm * k];
 	      RealD Qtmp2 = Qt[i + Nm * (k + 1)];
 	      Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
 	      Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
 	    }
 	}
    }
 #if 0
 #ifdef USE_LAPACK
 #ifdef USE_MKL
 #define LAPACK_INT MKL_INT
 #else
 #define LAPACK_INT long long
 #endif
    // Eigenvalues of the tridiagonal matrix (diagonal lmd, off-diagonal lme)
    // via the LAPACK dstegr routine, with the index range split across the
    // processors of `grid`.
    // This function sits inside an enclosing "#if 0" and is not compiled.
    //
    //   lmd, lme : diagonal and off-diagonal entries, copied into local arrays
    //   N1       : matrix size actually used (NN)
    //   N2       : not referenced in the body
    //   grid     : supplies the processor count/rank and the global sum
    //
    // NOTE(review): the results stay in the local array evals_tmp and are
    // never written back to lmd or returned.
    // NOTE(review): in the non-MKL branch LAPACK_INT is `long long` while the
    // LAPACK_dstegr prototype declared in this file takes `int *`; the printf
    // calls also format LAPACK_INT values with %d.  Both would need fixing
    // before this code is enabled.
    void diagonalize_lapack (std::vector  < RealD > &lmd, std::vector  < RealD > &lme, int N1,	// all
 			     int N2,	// get
 			     GridBase * grid)
    {
      const int size = Nm;
      LAPACK_INT NN = N1;
      // Variable-length arrays sized by NN.
      double evals_tmp[NN];
      double DD[NN];
      double EE[NN];
      // Copy the diagonal into DD and evals_tmp, and the sub-diagonal into EE.
      for (int i = 0; i < NN; i++)
 	for (int j = i - 1; j <= i + 1; j++)
 	  if (j < NN && j >= 0)
 	    {
 	      if (i == j)
 		DD[i] = lmd[i];
 	      if (i == j)
 		evals_tmp[i] = lmd[i];
 	      if (j == (i - 1))
 		EE[j] = lme[j];
 	    }
      LAPACK_INT evals_found;
      // Workspace sizes: lwork = max(18*NN, 1 + 4*NN + NN*NN).
      LAPACK_INT lwork =
 	((18 * NN) >
 	 (1 + 4 * NN + NN * NN) ? (18 * NN) : (1 + 4 * NN + NN * NN));
      LAPACK_INT liwork = 3 + NN * 10;
      LAPACK_INT iwork[liwork];
      double work[lwork];
      LAPACK_INT isuppz[2 * NN];
      char jobz = 'N';		// calculate evals only
      char range = 'I';		// calculate il-th to iu-th evals
      //    char range = 'A'; // calculate all evals
      char uplo = 'U';		// refer to upper half of original matrix
      char compz = 'I';		// Compute eigenvectors of tridiagonal matrix
      int ifail[NN];
      LAPACK_INT info;
 //  int total = QMP_get_number_of_nodes();
 //  int node = QMP_get_node_number();
 //  GridBase *grid = evec[0]._grid;
      int total = grid->_Nprocessors;
      int node = grid->_processor;
      // Each processor handles a contiguous slice [il, iu] (1-based) of the spectrum.
      int interval = (NN / total) + 1;
      double vl = 0.0, vu = 0.0;
      LAPACK_INT il = interval * node + 1, iu = interval * (node + 1);
      if (iu > NN)
 	iu = NN;
      double tol = 0.0;
      if (1)
 	{
 	  memset (evals_tmp, 0, sizeof (double) * NN);
 	  if (il <= NN)
 	    {
 	      printf ("total=%d node=%d il=%d iu=%d\n", total, node, il, iu);
 #ifdef USE_MKL
 	      dstegr (&jobz, &range, &NN,
 #else
 	      LAPACK_dstegr (&jobz, &range, &NN,
 #endif
 			     (double *) DD, (double *) EE, &vl, &vu, &il, &iu,	// these four are ignored if second parameteris 'A'
 			     &tol,	// tolerance
 			     &evals_found, evals_tmp, (double *) NULL, &NN,
 			     isuppz, work, &lwork, iwork, &liwork, &info);
 	      // Move this processor's results from the front of evals_tmp to
 	      // their global positions il-1 .. iu-1, zeroing the vacated slots.
 	      for (int i = iu - 1; i >= il - 1; i--)
 		{
 		  printf ("node=%d evals_found=%d evals_tmp[%d] = %g\n", node,
 			  evals_found, i - (il - 1), evals_tmp[i - (il - 1)]);
 		  evals_tmp[i] = evals_tmp[i - (il - 1)];
 		  if (il > 1)
 		    evals_tmp[i - (il - 1)] = 0.;
 		}
 	    }
 	  // Sum the per-processor slices so every processor holds all values.
 	  {
 	    grid->GlobalSumVector (evals_tmp, NN);
 	  }
 	}
 // cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
    }
 #undef LAPACK_INT
 #endif
    // Dispatcher for the tridiagonal eigenvalue solve.  With USE_LAPACK it
    // forwards straight to diagonalize_lapack; without it the body is empty
    // and the call is a no-op.
    //
    // NOTE(review): compiled out by the enclosing `#if 0`.  The parameters
    // are declared (N2, N1) and forwarded in that order, while
    // diagonalize_lapack declares them as (N1, N2) - confirm which value is
    // meant to be the matrix order before re-enabling.
    void diagonalize (std::vector  < RealD > &lmd,
 		      std::vector  < RealD > &lme,
 		      int N2, int N1, GridBase * grid)
    {
 #ifdef USE_LAPACK
      const int check_lapack = 0;	// just use lapack if 0, check against lapack if 1
      if (!check_lapack)
 	return diagonalize_lapack (lmd, lme, N2, N1, grid);
 //      diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
 #endif
    }
 #endif
    // Rescale v in place to unit 2-norm and return the norm it had before
    // rescaling.  No guard is applied for a zero-norm input.
    static RealD normalise (Field & v)
    {
      const RealD squared = norm2 (v);
      const RealD length = sqrt (squared);
      v = v * (1.0 / length);
      return length;
    }
    // Remove from w its components along evec[0] .. evec[k-1], one vector at
    // a time, then normalise w.  The wall-clock time spent here is added to
    // OrthoTime (in seconds).
    //
    // The projection coefficient is used without dividing by the norm of
    // evec[j], so the basis vectors are assumed to be normalised already.
    void orthogonalize (Field & w, std::vector < Field > &evec, int k)
    {
      typedef typename Field::scalar_type MyComplex;
      const double started = usecond () / 1e6;
      // Declared separately and assigned below: the inner product may come
      // back in a different precision than the field's scalar type.
      MyComplex ip;
      for (int j = 0; j < k; ++j)
 	{
 	  ip = innerProduct (evec[j], w);
 	  w = w - ip * evec[j];
 	}
      normalise (w);
      OrthoTime += usecond () / 1e6 - started;
    }
    void setUnit_Qt (int Nm, std::vector < RealD > &Qt)
    {
      for (int i = 0; i < Qt.size (); ++i)
 	Qt[i] = 0.0;
      for (int k = 0; k < Nm; ++k)
 	Qt[k + k * Nm] = 1.0;
    }
    // Driver: repeatedly extends the Lanczos recurrence by Nm steps starting
    // from the normalised `src`, recomputes the extremal eigenvalue estimates
    // of the accumulated tridiagonal matrix by bisection, and returns once no
    // tracked estimate moved by more than `eresid` (relative) between two
    // consecutive passes on any processor.
    //
    // NOTE(review): `eval` is only inspected for its size and is never
    // written (the copy loop near the end is commented out), and Nconv is set
    // to 0 and never updated.  `norm` and `beta_k` are unused.
    void calc (std::vector < RealD > &eval, const Field & src, int &Nconv)
    {
      GridBase *grid = src.Grid();
 //      assert(grid == src._grid);
      std::
 	cout << GridLogMessage << " -- Nk = " << Nk << " Np = " << Np << std::
 	endl;
      std::cout << GridLogMessage << " -- Nm = " << Nm << std::endl;
      std::cout << GridLogMessage << " -- size of eval   = " << eval.
 	size () << std::endl;
 //      assert(c.size() && Nm == eval.size());
      // Diagonal (lmd) and off-diagonal (lme) of the tridiagonal matrix,
      // passed to step() on every iteration.
      std::vector < RealD > lme (Nm);
      std::vector < RealD > lmd (Nm);
      // Three work vectors handed to step() and rotated after each call.
      Field current (grid);
      Field last (grid);
      Field next (grid);
      Nconv = 0;
      RealD beta_k;
      // Set initial vector
      // (uniform vector) Why not src??
      //      evec[0] = 1.0;
      current = src;
      std::cout << GridLogMessage << "norm2(src)= " << norm2 (src) << std::
 	endl;
      normalise (current);
      std::
 	cout << GridLogMessage << "norm2(evec[0])= " << norm2 (current) <<
 	std::endl;
      // Initial Nk steps
      OrthoTime = 0.;
      double t0 = usecond () / 1e6;
      RealD norm;		// sqrt norm of last vector
      // Total number of step() calls so far; keeps growing across passes.
      uint64_t iter = 0;
      // False during the first pass, when there is nothing to compare against.
      bool initted = false;
      // Previous-pass estimates: `low` is indexed by eigenvalue index,
      // `high` by offset from the top of this processor's window.
      std::vector < RealD > low (Nstop * 10);
      std::vector < RealD > high (Nstop * 10);
      // Set to 1 when any estimate is not yet converged; summed over ranks.
      RealD cont = 0.;
      while (1) {
 	  cont = 0.;
 	  std::vector < RealD > lme2 (Nm);
 	  std::vector < RealD > lmd2 (Nm);
 	  // Advance the recurrence by Nm steps.
 	  // NOTE(review): iter exceeds Nm from the second pass on while
 	  // lmd/lme were sized Nm above - presumably step() grows them; verify.
 	  for (uint64_t k = 0; k < Nm; ++k, iter++) {
 	      step (lmd, lme, last, current, next, iter);
 	      last = current;
 	      current = next;
 	    }
 	  double t1 = usecond () / 1e6;
 	  std::cout << GridLogMessage << "IRL::Initial steps: " << t1 -
 	    t0 << "seconds" << std::endl;
 	  t0 = t1;
 	  std::
 	    cout << GridLogMessage << "IRL::Initial steps:OrthoTime " <<
 	    OrthoTime << "seconds" << std::endl;
 	  // getting eigenvalues
 	  // Shifted copies (diagonal by +1, off-diagonal by +2) - presumably
 	  // the index layout Bisection::bisec expects; confirm against it.
 	  lmd2.resize (iter + 2);
 	  lme2.resize (iter + 2);
 	  for (uint64_t k = 0; k < iter; ++k) {
 	      lmd2[k + 1] = lmd[k];
 	      lme2[k + 2] = lme[k];
 	    }
 	  t1 = usecond () / 1e6;
 	  std::cout << GridLogMessage << "IRL:: copy: " << t1 -
 	    t0 << "seconds" << std::endl;
 	  t0 = t1;
 	  {
 	    // Split the Nstop tracked indices across processors.
 	    int total = grid->_Nprocessors;
 	    int node = grid->_processor;
 	    int interval = (Nstop / total) + 1;
 	    // Window at the upper end of the index range 1..iter.
 	    int iu = (iter + 1) - (interval * node + 1);
 	    int il = (iter + 1) - (interval * (node + 1));
 	    std::vector < RealD > eval2 (iter + 3);
 	    RealD eps2;
 	    Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
 			      eps2);
 //        diagonalize(eval2,lme2,iter,Nk,grid);
 	    RealD diff = 0.;
 	    // Compare each upper-end estimate with the previous pass using a
 	    // relative difference, then remember it for the next pass.
 	    for (int i = il; i <= iu; i++) {
 		if (initted)
 		  diff =
 		    fabs (eval2[i] - high[iu-i]) / (fabs (eval2[i]) +
 						      fabs (high[iu-i]));
 		if (initted && (diff > eresid))
 		  cont = 1.;
 		if (initted)
 		  printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
 			  high[iu-i], diff);
 		high[iu-i] = eval2[i];
 	      }
 	    // Same procedure for the window at the lower end of the range.
 	    il = (interval * node + 1);
 	    iu = (interval * (node + 1));
 	    Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
 			      eps2);
 	    for (int i = il; i <= iu; i++) {
 		if (initted)
 		  diff =
 		    fabs (eval2[i] - low[i]) / (fabs (eval2[i]) +
 						fabs (low[i]));
 		if (initted && (diff > eresid))
 		  cont = 1.;
 		if (initted)
 		  printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
 			  low[i], diff);
 		low[i] = eval2[i];
 	      }
 	    t1 = usecond () / 1e6;
 	    std::cout << GridLogMessage << "IRL:: diagonalize: " << t1 -
 	      t0 << "seconds" << std::endl;
 	    t0 = t1;
 	  }
 	  // Intentionally empty: the copy into eval is commented out.
 	  for (uint64_t k = 0; k < Nk; ++k) {
 //          eval[k] = eval2[k];
 	    }
 	  if (initted)
 	    {
 	      // Stop only when no processor flagged an unconverged estimate.
 	      grid->GlobalSumVector (&cont, 1);
 	      if (cont < 1.) return;
 	    }
 	  initted = true;
 	}
    }
 #if 0
 /**
   There is some matrix Q such that for any vector y
   Q.e_1 = y and Q is unitary.
 **/
    // Fills Q column by column from the vector y and returns the last
    // running norm `tau` of y.
    //
    // NOTE(review): compiled out by the enclosing `#if 0`.  `tau` is only
    // assigned inside the j-loop, so for y.size() < 2 the returned value is
    // uninitialised.  Column 0 receives y itself, without rescaling.
    template < class T >
      static T orthQ (DenseMatrix < T > &Q, std::vector < T > y)
    {
      int N = y.size ();	//Matrix Size
      Fill (Q, 0.0);
      T tau;
      // First column of Q is a copy of y.
      for (int i = 0; i < N; i++)
 	{
 	  Q[i][0] = y[i];
 	}
      // sig accumulates sum_k conj(y[k])*y[k]; tau0/tau are the norms of the
      // leading j and j+1 entries of y respectively.
      T sig = conj (y[0]) * y[0];
      T tau0 = fabs (sqrt (sig));
      for (int j = 1; j < N; j++)
 	{
 	  sig += conj (y[j]) * y[j];
 	  tau = abs (sqrt (sig));
 	  if (abs (tau0) > 0.0)
 	    {
 	      // Column j: entries above the diagonal are -gam*y[k], the
 	      // diagonal entry is tau0/tau.
 	      T gam = conj ((y[j] / tau) / tau0);
 	      for (int k = 0; k <= j - 1; k++)
 		{
 		  Q[k][j] = -gam * y[k];
 		}
 	      Q[j][j] = tau0 / tau;
 	    }
 	  else
 	    {
 	      // Leading part of y is zero: place a unit entry just above
 	      // the diagonal instead.
 	      Q[j - 1][j] = 1.0;
 	    }
 	  tau0 = tau;
 	}
      return tau;
    }
 /**
 	There is some matrix Q such that for any vector y
 	Q.e_k = y and Q is unitary.
 **/
    // Builds Q from y with orthQ, then applies SL to Q in place and returns
    // the value orthQ returned.
    // NOTE(review): compiled out by the enclosing `#if 0`; SL is defined
    // elsewhere - presumably a column shift, confirm before relying on it.
    template < class T >
      static T orthU (DenseMatrix < T > &Q, std::vector < T > y)
    {
      T tau = orthQ (Q, y);
      SL (Q);
      return tau;
    }
 /**
 	Wind up with a matrix with the first con rows untouched
 say con = 2
 	Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st column
 	and the matrix is upper Hessenberg
 	and with f and Q appropriately modified with Q is the Arnoldi factorization
 **/
    // Locks the eigenvalue of the trailing (M-con) x (M-con) block of H that
    // lies closest to `val`: the matching eigenvector is turned into a
    // transform QQ by orthQ, the block is replaced by QQ^dag * block * QQ,
    // and a sequence of orthU transforms is then applied to columns/rows
    // con+1 .. j-1 of H (and columns of Q) for j = M-1 down to con+3.
    //
    // NOTE(review): compiled out by the enclosing `#if 0`.  `dfg` and `herm`
    // are not used in the body.
    template < class T > static void Lock (DenseMatrix < T > &H,	///Hess mtx
 					   DenseMatrix < T > &Q,	///Lock Transform
 					   T val,	///value to be locked
 					   int con,	///number already locked
 					   RealD small, int dfg, bool herm)
    {
      //ForceTridiagonal(H);
      int M = H.dim;
      DenseVector < T > vec;
      Resize (vec, M - con);
      // AH: copy of the not-yet-locked trailing block of H.
      DenseMatrix < T > AH;
      Resize (AH, M - con, M - con);
      AH = GetSubMtx (H, con, M, con, M);
      DenseMatrix < T > QQ;
      Resize (QQ, M - con, M - con);
      Unity (Q);
      Unity (QQ);
      DenseVector < T > evals;
      Resize (evals, M - con);
      DenseMatrix < T > evecs;
      Resize (evecs, M - con, M - con);
      // Eigen-decomposition of the trailing block.
      Wilkinson < T > (AH, evals, evecs, small);
      // Pick the eigenvalue nearest to val.
      int k = 0;
      RealD cold = abs (val - evals[k]);
      for (int i = 1; i < M - con; i++)
 	{
 	  RealD cnew = abs (val - evals[i]);
 	  if (cnew < cold)
 	    {
 	      k = i;
 	      cold = cnew;
 	    }
 	}
      vec = evecs[k];
      ComplexD tau;
      orthQ (QQ, vec);
      //orthQM(QQ,AH,vec);
      // Similarity transform of the trailing block: AH <- QQ^dag AH QQ.
      AH = Hermitian (QQ) * AH;
      AH = AH * QQ;
      // Write the transformed block and QQ back into H and Q.
      for (int i = con; i < M; i++)
 	{
 	  for (int j = con; j < M; j++)
 	    {
 	      Q[i][j] = QQ[i - con][j - con];
 	      H[i][j] = AH[i - con][j - con];
 	    }
 	}
      for (int j = M - 1; j > con + 2; j--)
 	{
 	  DenseMatrix < T > U;
 	  Resize (U, j - 1 - con, j - 1 - con);
 	  DenseVector < T > z;
 	  Resize (z, j - 1 - con);
 	  // NOTE(review): nm is taken before z is filled and is never read.
 	  T nm = norm (z);
 	  // z holds the conjugated entries of row j of H in columns
 	  // con+1 .. j-1.
 	  for (int k = con + 0; k < j - 1; k++)
 	    {
 	      z[k - con] = conj (H (j, k + 1));
 	    }
 	  normalise (z);
 	  // Skip this row when all but the last entry of z are negligible.
 	  RealD tmp = 0;
 	  for (int i = 0; i < z.size () - 1; i++)
 	    {
 	      tmp = tmp + abs (z[i]);
 	    }
 	  if (tmp < small / ((RealD) z.size () - 1.0))
 	    {
 	      continue;
 	    }
 	  tau = orthU (U, z);
 	  // Hb = (H[:, con+1 .. j-1] * U), stored transposed.
 	  DenseMatrix < T > Hb;
 	  Resize (Hb, j - 1 - con, M);
 	  for (int a = 0; a < M; a++)
 	    {
 	      for (int b = 0; b < j - 1 - con; b++)
 		{
 		  T sum = 0;
 		  for (int c = 0; c < j - 1 - con; c++)
 		    {
 		      sum += H[a][con + 1 + c] * U[c][b];
 		    }		//sum += H(a,con+1+c)*U(c,b);}
 		  Hb[b][a] = sum;
 		}
 	    }
 	  // Copy the updated columns back into H.
 	  for (int k = con + 1; k < j; k++)
 	    {
 	      for (int l = 0; l < M; l++)
 		{
 		  H[l][k] = Hb[k - 1 - con][l];
 		}
 	    }			//H(Hb[k-1-con][l] , l,k);}}
 	  // Same column update applied to the accumulated transform Q.
 	  DenseMatrix < T > Qb;
 	  Resize (Qb, M, M);
 	  for (int a = 0; a < M; a++)
 	    {
 	      for (int b = 0; b < j - 1 - con; b++)
 		{
 		  T sum = 0;
 		  for (int c = 0; c < j - 1 - con; c++)
 		    {
 		      sum += Q[a][con + 1 + c] * U[c][b];
 		    }		//sum += Q(a,con+1+c)*U(c,b);}
 		  Qb[b][a] = sum;
 		}
 	    }
 	  for (int k = con + 1; k < j; k++)
 	    {
 	      for (int l = 0; l < M; l++)
 		{
 		  Q[l][k] = Qb[k - 1 - con][l];
 		}
 	    }			//Q(Qb[k-1-con][l] , l,k);}}
 	  // Row update: Hc = (U^dag * H[con+1 .. j-1, :]), stored transposed.
 	  DenseMatrix < T > Hc;
 	  Resize (Hc, M, M);
 	  for (int a = 0; a < j - 1 - con; a++)
 	    {
 	      for (int b = 0; b < M; b++)
 		{
 		  T sum = 0;
 		  for (int c = 0; c < j - 1 - con; c++)
 		    {
 		      sum += conj (U[c][a]) * H[con + 1 + c][b];
 		    }		//sum += conj( U(c,a) )*H(con+1+c,b);}
 		  Hc[b][a] = sum;
 		}
 	    }
 	  // Copy the updated rows back into H.
 	  for (int k = 0; k < M; k++)
 	    {
 	      for (int l = con + 1; l < j; l++)
 		{
 		  H[l][k] = Hc[k][l - 1 - con];
 		}
 	    }			//H(Hc[k][l-1-con] , l,k);}}
 	}
    }
 #endif
  };
 }
 #endif
@@ -97,7 +97,7 @@ public:
    RealD scale;
-    ConjugateGradient<FineField> CG(1.0e-3,400,false);
+    ConjugateGradient<FineField> CG(1.0e-4,2000,false);
    FineField noise(FineGrid);
    FineField Mn(FineGrid);
@@ -131,7 +131,10 @@ public:
    RealD scale;
    TrivialPrecon<FineField> simple_fine;
-    PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
+    //    PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,10,DiracOp,simple_fine,30,30);
    //    PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,10,DiracOp,simple_fine,12,12);
    //    PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
    PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,10,10);
    FineField noise(FineGrid);
    FineField src(FineGrid);
    FineField guess(FineGrid);
@@ -146,16 +149,16 @@ public:
      DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
-      for(int i=0;i<2;i++){
+      for(int i=0;i<3;i++){
 	//  void operator() (const Field &src, Field &psi){
 #if 1
-	std::cout << GridLogMessage << " inverting on noise "<<std::endl;
+	if (i==0)std::cout << GridLogMessage << " inverting on noise "<<std::endl;
 	src = noise;
 	guess=Zero();
 	GCR(src,guess);
 	subspace[b] = guess;
 #else
-	std::cout << GridLogMessage << " inverting on zero "<<std::endl;
+	if (i==0)std::cout << GridLogMessage << " inverting on zero "<<std::endl;
 	src=Zero();
 	guess = noise;
 	GCR(src,guess);
@@ -167,7 +170,7 @@ public:
      }
-      DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
+      DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<" <f|OpDagOp|f>"<<norm2(Mn)<<std::endl;
      subspace[b]   = noise;
    }
@@ -260,7 +260,8 @@ class GridLimeReader : public BinaryIO {
              << " / field= " << n2ck << " / rdiff= " << GRID_FIELD_NORM_CALC(FieldNormMetaData_,n2ck) << std::endl;
 	  GRID_FIELD_NORM_CHECK(FieldNormMetaData_,n2ck);
 	}
-	assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
+//	assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
 	scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
 	// find out if next field is a GridFieldNorm
 	return;
@@ -122,7 +122,7 @@ public:
    field.checksum = std::stoul(header["CHECKSUM"],0,16);
    field.ensemble_id      = header["ENSEMBLE_ID"];
    field.ensemble_label   = header["ENSEMBLE_LABEL"];
-    field.sequence_number  = std::stol(header["SEQUENCE_NUMBER"]);
+//    field.sequence_number  = std::stol(header["SEQUENCE_NUMBER"]);
    field.creator          = header["CREATOR"];
    field.creator_hardware = header["CREATOR_HARDWARE"];
    field.creation_date    = header["CREATION_DATE"];
@@ -596,16 +596,32 @@ template<int Index,class vobj> inline vobj transposeColour(const vobj &lhs){
 //////////////////////////////////////////
 // Trace lattice and non-lattice
 //////////////////////////////////////////
 #define GRID_UNOP(name)   name
 #define GRID_DEF_UNOP(op, name)						\
  template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
  inline auto op(const T1 &arg) ->decltype(LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg)) \
  {									\
    return     LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg); \
  }
 // Eager spin trace of a lattice object: forwards to traceIndex<SpinIndex>.
 // The Index template parameter is not used in the body and cannot be
 // deduced from the argument, so this overload is only selected when Index
 // is spelled out explicitly at the call site.
 template<int Index,class vobj>
 inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(vobj()))>
 {
  return traceIndex<SpinIndex>(lhs);
 }
 GridUnopClass(UnaryTraceSpin, traceIndex<SpinIndex>(a));
 GRID_DEF_UNOP(traceSpin, UnaryTraceSpin);
 // Eager colour trace of a lattice object: forwards to
 // traceIndex<ColourIndex>.  The Index template parameter is not used in the
 // body and cannot be deduced from the argument, so this overload is only
 // selected when Index is spelled out explicitly at the call site.
 template<int Index,class vobj>
 inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(vobj()))>
 {
  return traceIndex<ColourIndex>(lhs);
 }
 GridUnopClass(UnaryTraceColour, traceIndex<ColourIndex>(a));
 GRID_DEF_UNOP(traceColour, UnaryTraceColour);
 template<int Index,class vobj>
 inline auto traceSpin(const vobj &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs))>
 {
@@ -617,6 +633,8 @@ inline auto traceColour(const vobj &lhs) -> Lattice<decltype(traceIndex<ColourIn
  return traceIndex<ColourIndex>(lhs);
 }
 #undef GRID_UNOP
 #undef GRID_DEF_UNOP
 //////////////////////////////////////////
 // Current types
 //////////////////////////////////////////
@@ -103,6 +103,18 @@ class PolyakovMod: public ObservableModule<PolyakovLogger<Impl>, NoParameters>{
  PolyakovMod(): ObsBase(NoParameters()){}
 };
 // Observable module wrapping SpatialPolyakovLogger<Impl>; it takes no
 // parameters (NoParameters).
 template < class Impl >
 class SpatialPolyakovMod: public ObservableModule<SpatialPolyakovLogger<Impl>, NoParameters>{
  typedef ObservableModule<SpatialPolyakovLogger<Impl>, NoParameters> ObsBase;
  using ObsBase::ObsBase; // inherit the base-class constructors
  // acquire resource: allocate the logger and hand it to ObservablePtr
  virtual void initialize(){
    this->ObservablePtr.reset(new SpatialPolyakovLogger<Impl>());
  }
  public:
  // Default construction passes an empty parameter set to the base module.
  SpatialPolyakovMod(): ObsBase(NoParameters()){}
 };
 template < class Impl >
 class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
@@ -2,11 +2,12 @@
 Grid physics library, www.github.com/paboyle/Grid
-Source file: ./lib/qcd/modules/polyakov_line.h
+Source file: ./Grid/qcd/observables/polyakov_loop.h
-Copyright (C) 2017
+Copyright (C) 2025
 Author: David Preti <david.preti@csic.es>
 Author: Alexis Verney-Provatas <2414441@swansea.ac.uk>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -60,4 +61,43 @@ class PolyakovLogger : public HmcObservable<typename Impl::Field> {
  }
 };
 // HMC observable that, after each trajectory, logs the averaged Polyakov
 // loop along each of the three spatial directions (one line per direction).
 template <class Impl>
 class SpatialPolyakovLogger : public HmcObservable<typename Impl::Field> {
 public:
   // Pulls in the gauge-implementation types; a non-gauge Impl will not compile
   INHERIT_GIMPL_TYPES(Impl);

   // Field alias needed for HmcObservable compatibility
   typedef typename Impl::Field Field;

   void TrajectoryComplete(int traj,
                           Field &U,
                           GridSerialRNG &sRNG,
                           GridParallelRNG &pRNG) {
     // Remember the stream precision so it can be restored on exit
     const auto savedPrecision = std::cout.precision();

     // Directions 0..2 are treated as spatial (D = 3+1 is assumed)
     const int nSpatial = 3;

     for (int dir = 0; dir < nSpatial; ++dir) {
       const ComplexD loop = WilsonLoops<Impl>::avgPolyakovLoop(U, dir);
       std::cout << GridLogMessage
                 << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
                 << "Polyakov Loop in the " << dir << " spatial direction : [ " << traj << " ] " << loop << std::endl;
     }

     // Put the original output precision back
     std::cout.precision(savedPrecision);
   }
 };
 NAMESPACE_END(Grid);
@@ -254,9 +254,9 @@ static void testGenerators(GroupName::Sp) {
  }
 }
-template <int N>
+template <class vtype, int N>
-static Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > >
+static Lattice<iScalar<iScalar<iMatrix<vtype, N> > > >
-ProjectOnGeneralGroup(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu, GroupName::Sp) {
+ProjectOnGeneralGroup(const Lattice<iScalar<iScalar<iMatrix<vtype, N> > > > &Umu, GroupName::Sp) {
  return ProjectOnSpGroup(Umu);
 }
@@ -177,25 +177,43 @@ public:
  }
  //////////////////////////////////////////////////
-  // average over all x,y,z the temporal loop
+  // average Polyakov loop in mu direction over all directions != mu
  //////////////////////////////////////////////////
-  static ComplexD avgPolyakovLoop(const GaugeField &Umu) {  //assume Nd=4
+  static ComplexD avgPolyakovLoop(const GaugeField &Umu, const int mu) {  //assume Nd=4
-    GaugeMat Ut(Umu.Grid()), P(Umu.Grid());
+    
    // Protect against bad value of mu [0, 3]
    if ((mu < 0 ) || (mu > 3)) {
      std::cout << GridLogError << "Index is not an integer inclusively between 0 and 3." << std::endl;
      exit(1);
    }
    // U_loop is U_{mu}
    GaugeMat U_loop(Umu.Grid()), P(Umu.Grid());
    ComplexD out;
    int T = Umu.Grid()->GlobalDimensions()[3];
    int X = Umu.Grid()->GlobalDimensions()[0];
    int Y = Umu.Grid()->GlobalDimensions()[1];
    int Z = Umu.Grid()->GlobalDimensions()[2];
-    Ut = peekLorentz(Umu,3); //Select temporal direction
+    // Number of sites in mu direction
-    P = Ut;
+    int N_mu = Umu.Grid()->GlobalDimensions()[mu];
-    for (int t=1;t<T;t++){ 
+
-      P = Gimpl::CovShiftForward(Ut,3,P);
+    U_loop = peekLorentz(Umu, mu); //Select direction
    P = U_loop;
    for (int t=1;t<N_mu;t++){ 
      P = Gimpl::CovShiftForward(U_loop,mu,P);
    }
   RealD norm = 1.0/(Nc*X*Y*Z*T);
   out = sum(trace(P))*norm;
   return out;   
-}
+  }  
  /////////////////////////////////////////////////
  // overload for temporal Polyakov loop
  /////////////////////////////////////////////////
  // Single-argument form kept for existing callers: forwards to the
  // two-argument avgPolyakovLoop with direction index 3.
  static ComplexD avgPolyakovLoop(const GaugeField &Umu) { 
    return avgPolyakovLoop(Umu, 3);
  }
  //////////////////////////////////////////////////
  // average over traced single links
@@ -28,6 +28,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 #pragma once 
 #ifndef MIN
 #define MIN(x,y) ((x)>(y)?(y):(x))
 #endif
 // Introduce a class to gain deterministic bit reproducible reduction.
 // make static; perhaps just a namespace is required.
 NAMESPACE_BEGIN(Grid);
@@ -24,7 +24,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 #if Nc == 3
 #include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
@@ -230,3 +234,4 @@ int main(int argc, char **argv)
 #endif
 } // main
 #endif
@@ -25,7 +25,11 @@ directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 #if Nc == 3
 #include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
@@ -231,5 +235,4 @@ int main(int argc, char **argv)
 #endif
 } // main
-
+#endif
@@ -24,7 +24,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 #if Nc == 3
 #include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
@@ -230,5 +234,4 @@ int main(int argc, char **argv)
 #endif
 } // main
-
+#endif
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 int main(int argc, char **argv) {
  using namespace Grid;
@@ -195,5 +199,4 @@ int main(int argc, char **argv) {
  Grid_finalize();
 } // main
-
+#endif
@@ -28,7 +28,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 #define MIXED_PRECISION
@@ -449,5 +453,4 @@ int main(int argc, char **argv) {
  Grid_finalize();
 } // main
-
+#endif
@@ -28,7 +28,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 #ifdef GRID_DEFAULT_PRECISION_DOUBLE
 #define MIXED_PRECISION
@@ -442,5 +446,4 @@ int main(int argc, char **argv) {
  Grid_finalize();
 } // main
-
+#endif
@@ -28,7 +28,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 using namespace Grid;
@@ -918,3 +922,5 @@ int main(int argc, char **argv) {
  return 0;
 #endif
 } // main
 #endif
@@ -28,7 +28,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 using namespace Grid;
@@ -873,3 +877,5 @@ int main(int argc, char **argv) {
  return 0;
 #endif
 } // main
 #endif
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 int main(int argc, char **argv) {
  using namespace Grid;
@@ -193,5 +197,4 @@ int main(int argc, char **argv) {
  Grid_finalize();
 } // main
-
+#endif
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 NAMESPACE_BEGIN(Grid);
@@ -512,5 +516,4 @@ int main(int argc, char **argv) {
  Grid_finalize();
 } // main
-
+#endif
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 int main(int argc, char **argv) {
  using namespace Grid;
@@ -345,5 +349,4 @@ int main(int argc, char **argv) {
  Grid_finalize();
 } // main
-
+#endif
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 NAMESPACE_BEGIN(Grid);
@@ -516,5 +520,4 @@ int main(int argc, char **argv) {
  Grid_finalize();
 } // main
-
+#endif
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 NAMESPACE_BEGIN(Grid);
@@ -567,5 +571,4 @@ int main(int argc, char **argv) {
  Grid_finalize();
 } // main
-
+#endif
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 int main(int argc, char **argv) {
  using namespace Grid;
@@ -263,5 +267,4 @@ int main(int argc, char **argv) {
  Grid_finalize();
 } // main
-
+#endif
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 int main(int argc, char **argv) {
  using namespace Grid;
@@ -417,5 +421,4 @@ int main(int argc, char **argv) {
  Grid_finalize();
 } // main
-
+#endif
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 NAMESPACE_BEGIN(Grid);
@@ -452,5 +456,4 @@ int main(int argc, char **argv) {
  Grid_finalize();
 } // main
-
+#endif
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
 NAMESPACE_BEGIN(Grid);
@@ -462,5 +466,4 @@ int main(int argc, char **argv) {
  Grid_finalize();
 } // main
-
+#endif
@@ -27,7 +27,11 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/Grid.h>
+
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include<Grid/Grid.h>
@@ -264,5 +268,4 @@ int main(int argc, char **argv) {
  Grid_finalize();
 } // main
-
+#endif
@@ -0,0 +1,16 @@
 #include <Grid/Grid.h>
 #pragma once
 #ifndef ENABLE_FERMION_INSTANTIATIONS
 #include <iostream>
 // Stand-in entry point used when Grid was configured without fermion
 // instantiations: explains why the example cannot run and exits with a
 // non-zero status.
 int main(void) {
  std::cout << "This build of Grid was configured to exclude fermion instantiations, "
 	    << "which this example relies on. "
 	    // Trailing space keeps the option name apart from the next literal.
 	    << "Please reconfigure and rebuild Grid with --enable-fermion-instantiations "
 	    << "to run this example."
 	    << std::endl;
  return 1;
 }
 #endif
@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "disable_benchmarks_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 using namespace Grid;
@@ -731,3 +734,5 @@ int main (int argc, char ** argv)
  Grid_finalize();
 }
 #endif
@@ -20,6 +20,9 @@
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "disable_benchmarks_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 #ifdef GRID_CUDA
 #define CUDA_PROFILE
@@ -439,3 +442,4 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
  GRID_ASSERT(norm2(src_e)<1.0e-4);
  GRID_ASSERT(norm2(src_o)<1.0e-4);
 }
 #endif
@@ -20,6 +20,10 @@
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "disable_benchmarks_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 #ifdef GRID_CUDA
 #define CUDA_PROFILE
@@ -439,3 +443,5 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
  GRID_ASSERT(norm2(src_e)<1.0e-4);
  GRID_ASSERT(norm2(src_o)<1.0e-4);
 }
 #endif
@@ -20,6 +20,9 @@
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "disable_benchmarks_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 #ifdef GRID_CUDA
 #define CUDA_PROFILE
@@ -385,3 +388,5 @@ int main (int argc, char ** argv)
  Grid_finalize();
  exit(0);
 }
 #endif
@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "disable_benchmarks_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 using namespace std;
@@ -238,5 +241,4 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
  }
 }
-
+#endif
@@ -1,3 +1,7 @@
 #include "disable_benchmarks_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 #include <sstream>
 using namespace std;
@@ -155,3 +159,4 @@ int main (int argc, char ** argv)
  Grid_finalize();
 }
 #endif
@@ -20,6 +20,9 @@
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "disable_benchmarks_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 #ifdef GRID_CUDA
 #define CUDA_PROFILE
@@ -129,3 +132,5 @@ int main (int argc, char ** argv)
  Grid_finalize();
  exit(0);
 }
 #endif
@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "disable_benchmarks_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 using namespace std;
@@ -149,3 +152,5 @@ int main (int argc, char ** argv)
  Grid_finalize();
 }
 #endif
@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "disable_benchmarks_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 using namespace std;
@@ -172,5 +175,4 @@ void benchDw(std::vector<int> & latt4, int Ls)
  //  Dw.Report();
 }
-
+#endif
@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "disable_benchmarks_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 using namespace std;
@@ -110,3 +113,5 @@ int main (int argc, char ** argv)
  Grid_finalize();
 }
 #endif
@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "disable_benchmarks_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 using namespace std;
@@ -112,3 +115,5 @@ int main (int argc, char ** argv)
  Grid_finalize();
 }
 #endif
@@ -26,6 +26,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "disable_benchmarks_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 #include <Grid/algorithms/blas/BatchedBlas.h>
@@ -873,7 +877,7 @@ int main (int argc, char ** argv)
  int do_su4=0;
  int do_memory=1;
  int do_comms =1;
-  int do_blas  =1;
+  int do_blas  =0;
  int do_dslash=1;
  int sel=4;
@@ -978,3 +982,5 @@ int main (int argc, char ** argv)
  Grid_finalize();
  fclose(FP);
 }
 #endif
@@ -26,6 +26,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "disable_benchmarks_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 using namespace std;
@@ -258,3 +261,5 @@ int main (int argc, char ** argv)
  Grid_finalize();
 }
 #endif
@@ -19,6 +19,9 @@ Author: Richard Rollins <rprollins@users.noreply.github.com>
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include "disable_benchmarks_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 using namespace std;
@@ -161,3 +164,5 @@ void bench_wilson_eo (
  double flops = (single_site_flops * volume * ncall)/2.0;
  std::cout << flops/(t1-t0) << "\t\t";
 }
 #endif
@@ -0,0 +1,16 @@
 #include <Grid/Grid.h>
 #pragma once
 #ifndef ENABLE_FERMION_INSTANTIATIONS
 #include <iostream>
 // Fallback entry point compiled into a benchmark when Grid was configured
 // without fermion instantiations (ENABLE_FERMION_INSTANTIATIONS undefined).
 // It only explains why the benchmark cannot run and returns a non-zero
 // exit status.
 int main(void) {
   // Adjacent stream insertions are concatenated verbatim on output, so each
   // fragment carries its own trailing space where one is needed.
   std::cout << "This build of Grid was configured to exclude fermion instantiations, "
             << "which this benchmark relies on. "
             << "Please reconfigure and rebuild Grid with --enable-fermion-instantiations "
             << "to run this benchmark."
             << std::endl;
   return 1;
 }
 #endif
@@ -172,6 +172,12 @@ case ${ac_TRACING} in
 esac
 ############### fermions
 # --enable-fermion-instantiations=yes|no
 # Stores the user's choice in ac_FERMION_INSTANTIATIONS; when the option is not
 # given the value defaults to "yes". The automake conditional
 # BUILD_FERMION_INSTANTIATIONS is true only when the value is exactly "yes".
 # NOTE(review): the comparison uses "==", which POSIX test does not specify
 # ("=" is the portable spelling); confirm every configure shell in use accepts it.
 AC_ARG_ENABLE([fermion-instantiations],
     [AS_HELP_STRING([--enable-fermion-instantiations=yes|no],[enable fermion instantiations])],
     [ac_FERMION_INSTANTIATIONS=${enable_fermion_instantiations}], [ac_FERMION_INSTANTIATIONS=yes])
 AM_CONDITIONAL(BUILD_FERMION_INSTANTIATIONS, [ test "${ac_FERMION_INSTANTIATIONS}X" == "yesX" ])
 AC_ARG_ENABLE([fermion-reps],
     [AS_HELP_STRING([--enable-fermion-reps=yes|no],[enable extra fermion representation support])],
     [ac_FERMION_REPS=${enable_fermion_reps}], [ac_FERMION_REPS=yes])
@@ -194,6 +200,9 @@ AM_CONDITIONAL(BUILD_ZMOBIUS, [ test "${ac_ZMOBIUS}X" == "yesX" ])
 case ${ac_FERMION_REPS} in
   yes) AC_DEFINE([ENABLE_FERMION_REPS],[1],[non QCD fermion reps]);;
 esac
 # Define the preprocessor symbol ENABLE_FERMION_INSTANTIATIONS (value 1) when
 # ac_FERMION_INSTANTIATIONS is "yes"; any other value leaves it undefined.
 case ${ac_FERMION_INSTANTIATIONS} in
   yes) AC_DEFINE([ENABLE_FERMION_INSTANTIATIONS],[1],[enable fermions]);;
 esac
 case ${ac_GPARITY} in
   yes) AC_DEFINE([ENABLE_GPARITY],[1],[fermion actions with GPARITY BCs]);;
 esac
@@ -3,6 +3,9 @@
 * without regression / tests being applied
 */
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 using namespace std;
@@ -310,5 +313,4 @@ int main (int argc, char ** argv)
  Grid_finalize();
 }
-
+#endif
@@ -0,0 +1,430 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_padded_cell.cc
    Copyright (C) 2023
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 // copied here from Test_general_coarse_pvdagm.cc
 #include <cstdlib>
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
 using namespace std;
 using namespace Grid;
 // Hermitize a DWF operator by squaring it
 // Wraps a matrix M as the operator Mdag M so that it can be handed to
 // algorithms that expect a Hermitian LinearOperatorBase. Op, AdjOp and HermOp
 // all apply Mdag M; the bare M and Mdag remain reachable through _Op / _AdjOp.
 template<class Matrix,class Field>
 class SquaredLinearOperator : public LinearOperatorBase<Field> {
 public:
   Matrix &_Mat;   // wrapped matrix; held by reference, not owned

   SquaredLinearOperator(Matrix &Mat)
     : _Mat(Mat)
   {}

   // Stencil-style entry points are not provided by this wrapper.
   void OpDiag  (const Field &in, Field &out)                  { assert(0); }
   void OpDir   (const Field &in, Field &out,int dir,int disp) { assert(0); }
   void OpDirAll(const Field &in, std::vector<Field> &out)     { assert(0); }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) { assert(0); }

   // Un-squared building blocks: out = M in, and out = Mdag in.
   void _Op   (const Field &in, Field &out) { _Mat.M(in, out); }
   void _AdjOp(const Field &in, Field &out) { _Mat.Mdag(in, out); }

   // out = Mdag (M in), using one scratch field on in's grid.
   void HermOp(const Field &in, Field &out)
   {
     Field scratch(in.Grid());
     _Mat.M(in, scratch);
     _Mat.Mdag(scratch, out);
   }

   // The forward and adjoint applications both forward to HermOp.
   void Op   (const Field &in, Field &out) { HermOp(in, out); }
   void AdjOp(const Field &in, Field &out) { HermOp(in, out); }
 };
 // Linear operator built from two matrices Mat and PV:
 //   Op     : out = PV^dag Mat in
 //   AdjOp  : out = Mat^dag PV in
 //   HermOp : out = AdjOp(Op(in))
 // Every application writes a one-line trace to std::cout.
 template<class Matrix,class Field>
 class PVdagMLinearOperator : public LinearOperatorBase<Field> {
   Matrix &_Mat;
   Matrix &_PV;
 public:
   PVdagMLinearOperator(Matrix &Mat,Matrix &PV)
     : _Mat(Mat), _PV(PV)
   {}

   // Entry points this operator does not implement.
   void OpDiag  (const Field &in, Field &out)                  { assert(0); }
   void OpDir   (const Field &in, Field &out,int dir,int disp) { assert(0); }
   void OpDirAll(const Field &in, std::vector<Field> &out)     { assert(0); }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) { assert(0); }

   void Op(const Field &in, Field &out)
   {
     std::cout << "Op: PVdag M "<<std::endl;
     Field afterM(in.Grid());
     _Mat.M(in, afterM);        // afterM = Mat in
     _PV.Mdag(afterM, out);     // out    = PV^dag afterM
   }

   void AdjOp(const Field &in, Field &out)
   {
     std::cout << "AdjOp: Mdag PV "<<std::endl;
     Field afterPV(in.Grid());
     _PV.M(in, afterPV);        // afterPV = PV in
     _Mat.Mdag(afterPV, out);   // out     = Mat^dag afterPV
   }

   void HermOp(const Field &in, Field &out)
   {
     std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
     Field forward(in.Grid());
     Op(in, forward);
     AdjOp(forward, out);
   }
 };
 // Real-shifted variant of the PV^dag M operator:
 //   Op     : out = PV^dag Mat in + shift * in
 //   AdjOp  : out = Mat^dag PV in + shift * in   (shift is real, so it is its own conjugate)
 //   HermOp : out = AdjOp(Op(in))
 // Every application writes a one-line trace to std::cout.
 template<class Matrix,class Field>
 class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
   Matrix &_Mat;
   Matrix &_PV;
   RealD shift;
 public:
   // Initialisers are listed in member declaration order (_Mat, _PV, shift),
   // which is the order in which they actually run.
   ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
   void OpDiag (const Field &in, Field &out) {    assert(0);  }
   void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
   void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
   void Op     (const Field &in, Field &out){
     std::cout << "Op: PVdag M "<<std::endl;
     Field tmp(in.Grid());
     _Mat.M(in,tmp);
     _PV.Mdag(tmp,out);
     out = out + shift * in;
   }
   void AdjOp     (const Field &in, Field &out){
     std::cout << "AdjOp: Mdag PV "<<std::endl;
     Field tmp(in.Grid());
     // Apply PV to the input first, then Mat^dag to that result. The previous
     // code passed the uninitialised scratch field as the input of PV.M and
     // never wrote Mat^dag PV in into out.
     _PV.M(in,tmp);
     _Mat.Mdag(tmp,out);
     out = out + shift * in;
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
   void HermOp(const Field &in, Field &out){
     std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
     Field tmp(in.Grid());
     Op(in,tmp);
     AdjOp(tmp,out);
   }
 };
 // Complex-shifted variant of the PV^dag M operator:
 //   Op     : out = PV^dag Mat in + shift * in
 //   AdjOp  : out = Mat^dag PV in + conj(shift) * in
 //   HermOp : out = AdjOp(Op(in))
 // The shift can be changed after construction with resetShift().
 // Every application writes a one-line trace to std::cout.
 template<class Matrix, class Field>
 class ShiftedComplexPVdagMLinearOperator : public LinearOperatorBase<Field> {
   Matrix &_Mat;
   Matrix &_PV;
   ComplexD shift;
 public:
   // Initialisers are listed in member declaration order (_Mat, _PV, shift),
   // which is the order in which they actually run.
   ShiftedComplexPVdagMLinearOperator(ComplexD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
   void OpDiag (const Field &in, Field &out) {    assert(0);  }
   void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
   void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
   void Op     (const Field &in, Field &out){
     std::cout << "Op: PVdag M "<<std::endl;
     Field tmp(in.Grid());
     _Mat.M(in,tmp);
     _PV.Mdag(tmp,out);
     out = out + shift * in;
   }
   void AdjOp     (const Field &in, Field &out){
     std::cout << "AdjOp: Mdag PV "<<std::endl;
     Field tmp(in.Grid());
     // Apply PV to the input first, then Mat^dag to that result. The previous
     // code passed the uninitialised scratch field as the input of PV.M and
     // never wrote Mat^dag PV in into out.
     _PV.M(in,tmp);
     _Mat.Mdag(tmp,out);
     // The adjoint of (A + s) is (A^dag + conj(s)); using s itself is only
     // correct when the shift is real.
     out = out + conjugate(shift) * in;
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
   void HermOp(const Field &in, Field &out){
     std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
     Field tmp(in.Grid());
     Op(in,tmp);
     AdjOp(tmp,out);
   }
   // Replace the shift used by subsequent Op / AdjOp / HermOp calls.
   void resetShift(ComplexD newShift) {
     shift = newShift;
   }
 };
 // Two-level preconditioner expressed as a LinearFunction on fine fields.
 // One application performs, in order: pre-smoothing, residual evaluation,
 // projection to the coarse space, a coarse solve, promotion of the coarse
 // solution back to the fine space, a second residual evaluation and
 // post-smoothing. Each timed phase reports its duration through GridLogMessage.
 // All collaborators are held by reference and are not owned.
 template<class Fobj,class CComplex,int nbasis>
 class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
   using LinearFunction<Lattice<Fobj> >::operator();

   using Aggregates     = Aggregation<Fobj,CComplex,nbasis>;
   using FineField      = typename Aggregation<Fobj,CComplex,nbasis>::FineField;
   using CoarseVector   = typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector;
   using CoarseMatrix   = typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix;
   using FineOperator   = LinearOperatorBase<FineField>;
   using FineSmoother   = LinearFunction    <FineField>;
   using CoarseOperator = LinearOperatorBase<CoarseVector>;
   using CoarseSolver   = LinearFunction    <CoarseVector>;

   Aggregates     & _Aggregates;
   FineOperator   & _FineOperator;
   FineSmoother   & _PreSmoother;
   FineSmoother   & _PostSmoother;
   CoarseOperator & _CoarseOperator;
   CoarseSolver   & _CoarseSolve;

   int level;
   void Level(int lv) { level = lv; }

   MGPreconditioner(Aggregates &Agg,
                    FineOperator &Fine,
                    FineSmoother &PreSmoother,
                    FineSmoother &PostSmoother,
                    CoarseOperator &CoarseOperator_,
                    CoarseSolver &CoarseSolve_)
     : _Aggregates(Agg),
       _FineOperator(Fine),
       _PreSmoother(PreSmoother),
       _PostSmoother(PostSmoother),
       _CoarseOperator(CoarseOperator_),
       _CoarseSolve(CoarseSolve_),
       level(1)
   {}

   // Apply the preconditioner: on return `out` holds the pre-smoothed guess
   // plus the promoted coarse correction plus the post-smoother output.
   virtual void operator()(const FineField &in, FineField & out)
   {
     GridBase *coarseGrid = _Aggregates.CoarseGrid;

     CoarseVector coarseRhs(coarseGrid);
     CoarseVector coarseSol(coarseGrid);
     FineField residual(in.Grid());
     FineField postCorrection(in.Grid());

     std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;

     // Pre-smoothing, starting from a zero guess.
     out = Zero();
     double preStart = usecond();
     _PreSmoother(in,out);
     double preUs = usecond() - preStart;
     std::cout<<GridLogMessage << "PreSmoother took "<< preUs/1000.0<< "ms" <<std::endl;

     // residual = in - Op(out)
     _FineOperator.Op(out,residual);
     sub(residual, in, residual);

     // Restrict the residual to the coarse space.
     double projectStart = usecond();
     _Aggregates.ProjectToSubspace(coarseRhs,residual);
     double projectUs = usecond() - projectStart;
     std::cout<<GridLogMessage << "Project to coarse took "<< projectUs/1000.0<< "ms" <<std::endl;

     // Coarse solve from a zero guess (the zeroing is inside the timed region).
     double solveStart = usecond();
     coarseSol = Zero();
     _CoarseSolve(coarseRhs,coarseSol);
     double solveUs = usecond() - solveStart;
     std::cout<<GridLogMessage << "Coarse solve took "<< solveUs/1000.0<< "ms" <<std::endl;

     // Prolong the coarse solution and accumulate it into out.
     double promoteStart = usecond();
     _Aggregates.PromoteFromSubspace(coarseSol,residual);
     add(out,out,residual);
     double promoteUs = usecond() - promoteStart;
     std::cout<<GridLogMessage << "Promote to this level took "<< promoteUs/1000.0<< "ms" <<std::endl;

     // residual = in - Op(out), now including the coarse correction.
     _FineOperator.Op(out,residual);
     sub(residual, in, residual);

     // Post-smoothing on the new residual, again from a zero guess.
     double postStart = usecond();
     postCorrection = Zero();
     _PostSmoother(residual,postCorrection);
     double postUs = usecond() - postStart;
     std::cout<<GridLogMessage << "PostSmoother took "<< postUs/1000.0<< "ms" <<std::endl;

     add(out,out,postCorrection);
     std::cout<<GridLogMessage << "Done " <<std::endl;
   }
 };
 // Example driver: builds 4d/5d grids for a fixed 16x16x16x32 lattice, reads a
 // gauge configuration from disk, constructs domain-wall operators, and then
 // runs Arnoldi and implicitly restarted Lanczos on the squared operator Dsq,
 // printing the eigenvalues reported by each.
 // NOTE(review): several objects built below (coarse grids, RNG4/CRNG, result,
 // ref, tmp, err, prom, geom, PVdagM, ShiftedPVdagM, DLinOp, cb) are not
 // referenced again in this function; presumably left over from the test this
 // file was copied from.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  // First argument passed to makeFiveDimGrid / makeFiveDimRedBlackGrid below.
  const int Ls=16;
 //   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  // Lattice extents are fixed here rather than taken from GridDefaultLatt().
  std::vector<int> lat_size {16, 16, 16, 32};
  std::cout << "Lattice size: " << lat_size << std::endl;
  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(lat_size, 
 								          GridDefaultSimd(Nd,vComplex::Nsimd()),
 								          GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  // Construct a coarsened grid
  // poare TODO: replace this with the following line?
  Coordinate clatt = lat_size;
 //   Coordinate clatt = GridDefaultLatt();              // [PO] initial line before I edited it
  // Halve every extent of the fine lattice to obtain the coarse lattice.
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/2;
    //    clatt[d] = clatt[d]/4;
  }
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
  // Fixed integer seeds for the three parallel RNGs.
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  std::vector<int> cseeds({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
  // src is filled from RNG5; it is the start vector for both eigensolvers.
  LatticeFermion    src(FGrid); random(RNG5,src);
  LatticeFermion result(FGrid); result=Zero();
  LatticeFermion    ref(FGrid); ref=Zero();
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
 //   std::string file("ckpoint_lat.4000");
  // NOTE(review): absolute, user-specific path; the run depends on this file
  // existing on the machine. The commented-out line above uses a relative name.
  std::string file("/Users/patrickoare/libraries/PETSc-Grid/ckpoint_lat.4000");
  NerscIO::readConfiguration(Umu,header,file);
  RealD mass=0.01;
  RealD M5=1.8;
  // Ddwf and Dpv share every constructor argument except the mass slot
  // (mass for Ddwf, 1.0 for Dpv).
  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
  // const int nbasis = 20;            // size of approximate basis for low-mode space
  const int nbasis = 3;            // size of approximate basis for low-mode space
  const int cb = 0 ;
  LatticeFermion prom(FGrid);
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;
  NextToNearestStencilGeometry5D geom(Coarse5d);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
  typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
  typedef ShiftedComplexPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedComplexPVdagM_t;
  PVdagM_t PVdagM(Ddwf, Dpv);
  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);
  // Dsq applies Mdag M of Ddwf; it is the operator given to both eigensolvers.
  SquaredLinearOperator<DomainWallFermionD, LatticeFermionD> Dsq (Ddwf);
  NonHermitianLinearOperator<DomainWallFermionD, LatticeFermionD> DLinOp (Ddwf);
  // PowerMethod<LatticeFermion> PM; PM(PVdagM, src);
  // Solver parameters shared by the Arnoldi call and the Lanczos constructor.
  // NOTE(review): presumably Nm = maximum basis size, Nk = size kept at
  // restart, Nstop = number of eigenpairs wanted -- confirm against the solvers.
  int Nm = 10;
  int Nk = 6;
  // int Nm = 6;         // Nm = 6 case is acting really strangely... with Nm = 6 and Nm = 3 it zeros out the Hessenberg and also makes it imaginary?
  // int Nk = 2;
  // int Nk = Nm+1;     // if just running once
  // int maxIter = 5;
  // int maxIter = 1;
  int maxIter = 3;
  // int maxIter = 100;
  int Nstop = 6;
  // Debug print of the source value at the lattice origin.
  Coordinate origin ({0,0,0,0});
  auto tmpSrc = peekSite(src, origin);
  std::cout << "[DEBUG] Source at origin = " <<  tmpSrc << std::endl;
  // Copy of the source taken before Arnoldi runs; Lanczos is started from it.
  LatticeFermion src2 = src;
  // Run Lanczos and Arnoldi on a Hermitian matrix
  // Arnoldi Arn (Dsq, FGrid, 1e-8, false);
  // Arn(src, 1, Nm, -1);
  Arnoldi Arn (Dsq, FGrid, 1e-8, EvalNormLarge);      // for comparison to Lanczos
  // Arn(src, maxIter, Nm, Nk, Nstop);
  // auto tmpSrcDup = peekSite(src, origin);
  // std::cout << "[DEBUG] Source at origin = " <<  tmpSrcDup << std::endl;
  // auto tmpSrc2Dup = peekSite(src2, origin);
  // std::cout << "[DEBUG] Source2 at origin = " <<  tmpSrc2Dup << std::endl;
  Arn(src, maxIter, Nm, Nk, Nstop);
  std::cout << "Hessenberg mat for symmetric N = " << Nm << std::endl;
  std::cout << Arn.getHessenbergMat() << std::endl;
  // ImplicitlyRestartedLanczosHermOpTester<LatticeFermionD> SimpleTester (Dsq);
  // ImplicitlyRestartedLanczos<LatticeFermionD> Lanc (Dsq, Dsq, SimpleTester, Nm, Nm, Nm, 1e-8, Nm);
  // NOTE(review): Nconv is not initialised before being passed to Lanc.calc;
  // presumably calc treats it as an output -- confirm.
  int Nconv;
  PlainHermOp DsqHermOp (Dsq);
  // std::vector<RealD> levals (Nm+1); std::vector<LatticeFermionD> levecs (Nm+1, src);
  // ImplicitlyRestartedLanczos<LatticeFermionD> Lanc (DsqHermOp, DsqHermOp, Nm, Nm, Nm + 1, 1e-8, Nm);
  // levals has Nm+1 entries while levecs has Nm copies of src.
  std::vector<RealD> levals (Nm+1); std::vector<LatticeFermionD> levecs (Nm, src);
  ImplicitlyRestartedLanczos<LatticeFermionD> Lanc (DsqHermOp, DsqHermOp, Nstop, Nk, Nm, 1e-8, maxIter);
  std::cout << GridLogMessage << "Calculating with Lanczos" << std::endl;
  // auto tmpSrc1 = peekSite(src, origin);
  // std::cout << "[DEBUG] Source at origin = " <<  tmpSrc1 << std::endl;
  // auto tmpSrc2 = peekSite(src2, origin);
  // std::cout << "[DEBUG] Source2 at origin = " <<  tmpSrc2 << std::endl;
  // std::cout << "[DEBUG] Source norm2: " << norm2(src) << std::endl;
  std::cout << "running Lanczos now" << std::endl;
  Lanc.calc(levals, levecs, src2, Nconv);
  // Print the eigenvalues from both solvers one after the other.
  std::cout<<GridLogMessage << "*******************************************" << std::endl;
  std::cout<<GridLogMessage << "***************** RESULTS *****************" << std::endl;
  std::cout<<GridLogMessage << "*******************************************" << std::endl;
  std::cout << GridLogMessage << "Arnoldi eigenvalues: " << std::endl << Arn.getEvals() << std::endl;
  std::cout << GridLogMessage << "Lanczos eigenvalues: " << std::endl << levals << std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  Grid_finalize();
  return 0;
 }
@@ -3,6 +3,9 @@
 * without regression / tests being applied
 */
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 using namespace std;
@@ -432,5 +435,4 @@ int main (int argc, char ** argv)
  Grid_finalize();
 }
-
+#endif
@@ -0,0 +1,405 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_padded_cell.cc
    Copyright (C) 2023
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 // Tests code written to read off the Krylov coefficients
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 using namespace std;
 using namespace Grid;
 // Hermitize a DWF operator by squaring it
 // Presents Mdag M of the wrapped matrix through the LinearOperatorBase
 // interface. Op, AdjOp and HermOp are all the same map (Mdag M); _Op and
 // _AdjOp expose the single applications of M and Mdag.
 template<class Matrix,class Field>
 class SquaredLinearOperator : public LinearOperatorBase<Field> {
 public:
   Matrix &_Mat;   // referenced matrix, not owned by this wrapper

   SquaredLinearOperator(Matrix &Mat) : _Mat(Mat) {}

   // --- interface members that are intentionally unsupported ---
   void OpDiag (const Field &in, Field &out)
   { assert(0); }
   void OpDir (const Field &in, Field &out,int dir,int disp)
   { assert(0); }
   void OpDirAll (const Field &in, std::vector<Field> &out)
   { assert(0); }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)
   { assert(0); }

   // --- single applications of the wrapped matrix ---
   void _Op (const Field &in, Field &out)
   { _Mat.M(in, out); }
   void _AdjOp (const Field &in, Field &out)
   { _Mat.Mdag(in, out); }

   // --- squared operator: out = Mdag (M in) ---
   void HermOp(const Field &in, Field &out)
   {
     Field intermediate(in.Grid());
     _Op(in, intermediate);
     _AdjOp(intermediate, out);
   }
   void Op (const Field &in, Field &out)
   { HermOp(in, out); }
   void AdjOp (const Field &in, Field &out)
   { HermOp(in, out); }
 };
 /**
 * Computes the coefficients in the Krylov expansion for 1/D ~ \sum_{i=0}^N c_i D^i. 
 * 
 * Parameters
 * ----------
 * std::vector<double> &coeffs
 *    Polynomial coeffients to return, with indexing order (c_0, c_1, c_2, ..., c_n). 
 * LinearOperatorBase<FineField> &DiracOp
 *    Dirac operator D. 
 * FineField src
 *    Source field b. 
 * FineField psiStar
 *    Output approximation for D^{-1} b coming from a Krylov method. 
 * int N
 *    Dimension of the polynomial approximation (Krylov space K_{N-1} = {b, Db, D^2 b, ..., D^{N-1} b}).
 */
 void poly_coeffs(std::vector<ComplexD> &coeffs, LinearOperatorBase<LatticeFermion> &DiracOp, LatticeFermion src,
                  LatticeFermion psiStar, GridCartesian* FGrid, int N, bool use_herm = false)
 {
  // stdBasis = {b, Db, D^2 b, ..., D^N b}, kryBasis = {k0, k1, ..., kN}
  std::vector<LatticeFermion> kryBasis;
  Eigen::VectorXcd psiStarCoeffs (N);
  // Normalize by 1 / ||src||; does not change the polynomial coefficients
  double srcNorm   = 1 / std::sqrt(norm2(src));
  kryBasis.push_back(srcNorm * src);                // normalized source
  psiStar          = srcNorm * psiStar;
  psiStarCoeffs(0) = innerProduct(kryBasis[0], psiStar);
  // orthonormalize canonical Krylov basis {b, Db, D^2 b, ..., D^{N-1} b} <--> {k_i} and compute components <k_i | psi*>
  LatticeFermion tmp (FGrid);
  for (int i = 0; i < N - 1; i++) {               // construct ONB for {b, Db, ..., D^{i+1} b}
    if (use_herm) {
      DiracOp.HermOp(kryBasis.back(), tmp);         // tmp \in span{(D^\dag D)^{i+1} b} \oplus span{(D^\dag D)^i b, ..., D^\dag D b, b}
    } else {
      DiracOp.Op(kryBasis.back(), tmp);             // tmp \in span{D^{i+1} b} \oplus span{D^i b, ..., Db, b}
    }
    for (int j = 0; j < i+1; j++) {               // orthogonalize tmp with previous basis vectors
      ComplexD coeff = innerProduct(kryBasis[j], tmp);      // <k_j | tmp>
      tmp -= coeff * kryBasis[j];                           // subtract off |k_j><k_j | tmp>; now tmp is perp to |k_j>
    }
    double tmpNorm = 1 / std::sqrt(norm2(tmp));
    kryBasis.push_back(
      tmpNorm * tmp
    );                                                      // normalize |k_i> and add to kryBasis
    psiStarCoeffs(i+1) = innerProduct(kryBasis[i+1], psiStar);  // compute < k_i | psi* >
  }
  // To verify the basis is ONB
  // for (int i = 0; i < N; i++) {
  //   for (int j = 0; j < N; j++) {
  //     std::cout << "<ki|kj> for (i,j) = (" << i << ", " << j << ") = "  << innerProduct(kryBasis[i], kryBasis[j]) << std::endl;
  //   }
  // }
  // Compute change of basis matrix
  LatticeFermion tmp2 (FGrid);
  Eigen::MatrixXcd M = Eigen::MatrixXcd::Zero(N, N);
  tmp = kryBasis[0];       // current Krylov vector; starts with tmp = src (normalized)
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < i + 1; j++) {    // fill column with components of kryVec. Only need j <= i to get orthonormal components
      M(j, i) = innerProduct(kryBasis[j], tmp);
    }    
    if (use_herm) {     // tmp --> D^\dag D(tmp)
      DiracOp.HermOp(tmp, tmp2);
      tmp = tmp2;
    } else {      // tmp --> D(tmp). Note that DiracOp.Op(tmp, tmp) will cause a bug
      DiracOp.Op(tmp, tmp2);
      tmp = tmp2;
    }
  }
  // Compute M^{-1} @ psiStarCoeffs and copy to coeffs
  Eigen::VectorXcd res (N);
  res = M.inverse() * psiStarCoeffs;
  for (int i = 0; i < N; i++) {
    coeffs[i] = res(i);
  }
 }
 // out file for poly coefficients (should it be complex?)
 // class PolynomialFile: Serializable {
 // public:
 //   GRID_SERIALIZABLE_CLASS_MEMBERS(OutputFile, std::vector< Real >, data);
 // };
 // Evaluates p(x) = \sum_i coeffs[i] * x^i, with coeffs ordered (c_0, c_1, ..., c_n).
 // Returns 0 for an empty coefficient list.
 // Horner's scheme is used instead of std::pow(x, i): complex pow goes through
 // log/polar, which is inexact for simple inputs and can produce NaN at x == 0,
 // and it costs a transcendental evaluation per term.
 std::complex<double> poly_approx(std::complex<double> x, std::vector<std::complex<double>> coeffs) {
   std::complex<double> px(0.0, 0.0);
   // Walk from the highest-order coefficient down: px <- px * x + c_i
   for (auto it = coeffs.rbegin(); it != coeffs.rend(); ++it) {
     px = px * x + *it;
   }
   return px;
 }
 /**
 * Builds psi = \sum_i c_i D^i b from a list of polynomial coefficients.
 * 
 * Parameters
 * ----------
 * LatticeFermion &psi
 *    Output field; overwritten with \sum_i coeffs[i] * D^i src. Set to zero when coeffs is empty.
 *    Scratch fields are allocated on psi's grid.
 * LatticeFermion src
 *    Source b used to generate the Krylov space K_n(D, b). Taken by value.
 * LinearOperatorBase<LatticeFermion> &Linop
 *    Operator D; only Linop.Op is called.
 * std::vector<ComplexD> coeffs
 *    Polynomial coefficients in the order (c_0, c_1, ..., c_n).
 */
 void krylovApprox(LatticeFermion &psi, LatticeFermion src, LinearOperatorBase<LatticeFermion> &Linop, std::vector<ComplexD> coeffs) {
   psi = Zero();
   LatticeFermion power (psi.Grid());      // holds D^i src for the current i
   power = src;
   LatticeFermion next (psi.Grid());       // scratch output; Op needs distinct input and output fields
   const int nCoeffs = static_cast<int>(coeffs.size());
   for (int i = 0; i < nCoeffs; i++) {
     psi = psi + coeffs[i] * power;
     if (i + 1 < nCoeffs) {                // D^{n+1} src is never used, so skip the final application
       Linop.Op(power, next);              // next = D * power
       power = next;
     }
   }
 }
 // Test driver. Solves with the CG and GCR polynomial solvers on a squared (Hermitian)
 // domain-wall operator and then with GCR on the non-Hermitian operator itself, prints the
 // polynomial coefficients each solver reports next to coefficients recomputed by poly_coeffs,
 // and checks how well sum_i c_i D^i b (krylovApprox) reproduces each solution.
 // NOTE(review): the gauge configuration path below is hard-coded to a developer machine.
 int main (int argc, char ** argv)
 {
   Grid_init(&argc, &argv);
   const int Ls = 8;
   std::vector<int> lat_size {16, 16, 16, 32};
   std::cout << "Lattice size: " << lat_size << std::endl;
   GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(lat_size,
                                                          GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                          GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
   //////////////////////////////////////////////////////////////////////
   // You can manage seeds however you like.
   // Recommend SeedUniqueString.
   //////////////////////////////////////////////////////////////////////
   // std::vector<int> seeds4({1, 2, 3, 4});
   // GridParallelRNG RNG4(UGrid);
   // RNG4.SeedFixedIntegers(seeds4);
   // std::vector<int> seeds5({1, 2, 3, 4, 5});
   // GridParallelRNG RNG5(FGrid);
   // RNG5.SeedFixedIntegers(seeds5);
   // std::string outStrStem = "/Users/patrickoare/Dropbox (MIT)/research/multigrid/grid_out/";
   LatticeGaugeField Umu(UGrid);
   FieldMetaData header;
   std::string file("/Users/patrickoare/libraries/PETSc-Grid/ckpoint_lat.4000");
   NerscIO::readConfiguration(Umu, header, file);
   RealD mass=0.01;
   RealD M5=1.8;
   // RealD M5=1.0;
   // load in Dirac operators that we'll use; square it to Hermitize
   // Dsq just needs to be a Hermitian operator so we can use CG on it
   DomainWallFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
   SquaredLinearOperator<DomainWallFermionD, LatticeFermionD> Dsq (Ddwf);
   NonHermitianLinearOperator<DomainWallFermionD, LatticeFermionD> DLinOp (Ddwf);
   LatticeFermion src (FGrid); src = 1.0;                              // Source to use
   LatticeFermion psiCG (FGrid); psiCG = Zero();                       // Field to solve with for CG
   LatticeFermion psiGCR (FGrid); psiGCR = Zero();                     // Field to solve with for GCR
   std::cout << GridLogMessage << "*******************************************" << std::endl;
   std::cout << GridLogMessage << "********** TESTING CG POLY COEFFS *********" << std::endl;
   std::cout << GridLogMessage << "*******************************************" << std::endl << std::endl;
   double tol = 1.0e-8;
   int N = 5;           // max iterations (size of Krylov basis)
   // GCR variables
   int outer_iters = 1;                  // num restarts for GCR
   TrivialPrecon<LatticeFermionD> prec;  // trivial preconditioner
   ConjugateGradientPolynomial<LatticeFermion> CGP(tol, N, false);
   CGP(Dsq, src, psiCG);
   // Compute Krylov coeffs directly and compare
   std::vector<ComplexD> cg_coeffs (N);
   poly_coeffs(cg_coeffs, Dsq, src, psiCG, FGrid, N, true);
   PolynomialFile PF;
   // Use GCR solver, also get poly coeffs
   std::vector<ComplexD> gcr_sym_coeffs (N);     // Can try N --> N + 3 to test to see if the last 3 comps are 0
   PGCRPolynomial<LatticeFermionD> GCRPolySym(tol, outer_iters, Dsq, prec, N+1, N, PF);    // mmax sets the memory, note the last beta doesn't really matter for updating the polynomial
   GCRPolySym(src, psiGCR);
   // poly_coeffs(gcr_sym_coeffs, Dsq, src, psi, FGrid, N, true);
   poly_coeffs(gcr_sym_coeffs, Dsq, src, psiGCR, FGrid, N, true);
   std::cout << GridLogMessage << std::endl << "******** CG POLYNOMIAL COEFFICIENTS *******" << std::endl;
   std::cout << GridLogMessage << CGP.polynomial << std::endl << std::endl;
   std::cout << GridLogMessage << "****** DIRECT POLYNOMIAL COEFFICIENTS *****" << std::endl;
   std::cout << GridLogMessage << cg_coeffs << std::endl << std::endl;
   // TODO: try GCR with a Hermitian operator (Dsq)
   std::cout << GridLogMessage << "****** GCR COEFFICIENTS *****" << std::endl;
   std::cout << GridLogMessage << GCRPolySym.polynomial << std::endl << std::endl;
   std::cout << GridLogMessage << "****** DIRECT GCR COEFFICIENTS *****" << std::endl;
   std::cout << GridLogMessage << gcr_sym_coeffs << std::endl << std::endl;
   // test how good the decomposition is
   std::cout << "Testing fidelity of decomposition by computing ||psi* - sum_i c_i D^i b||^2!" << std::endl;
   LatticeFermion psiPrime (FGrid);
   // for CG
   krylovApprox(psiPrime, src, Dsq, cg_coeffs);
   std::cout << "CG with Dsq, ||psi - psiPrime||^2 = " << norm2(psiCG - psiPrime) << std::endl;
   // for GCR with alpha / beta computation
   krylovApprox(psiPrime, src, Dsq, GCRPolySym.polynomial);
   std::cout << "GCR with Dsq, ||psi - psiPrime||^2 = " << norm2(psiGCR - psiPrime) << std::endl;
   // for GCR with the directly computed coefficients
   krylovApprox(psiPrime, src, Dsq, gcr_sym_coeffs);
   std::cout << "GCR direct with Dsq, ||psi - psiPrime||^2 = " << norm2(psiGCR - psiPrime) << std::endl;
   // std::vector<double> real_cg_diff (N);
   // for (int i = 0; i < N; i++) { real_cg_diff[i] = std::abs(cg_coeffs[i].real() - CGP.polynomial[i]); }
   // std::cout << GridLogMessage << "************* COEFF DIFFERENCE ************" << std::endl;
   // std::cout << GridLogMessage << real_cg_diff << std::endl << std::endl;
   // GCR polynomial reconstruction with Ddwf!
   std::cout << GridLogMessage << "*******************************************" << std::endl;
   std::cout << GridLogMessage << "********* TESTING GCR POLY COEFFS *********" << std::endl;
   std::cout << GridLogMessage << "*******************************************" << std::endl << std::endl;
   // re-init variables
   src = 1.0;
   src = (1 / std::sqrt(norm2(src))) * src;
   psiGCR = Zero(); psiPrime = Zero();
   // test GCR poly
   PGCRPolynomial<LatticeFermionD> GCRPoly(tol, outer_iters, DLinOp, prec, N+1, N, PF);    // mmax sets the memory, note the last beta doesn't really matter for updating the polynomial
   GCRPoly(src, psiGCR);
   // Compute Krylov coeffs directly and compare
   // N = 1;    // compare the N > 1 decomposition with the psi* resulting from N = 1
   std::vector<ComplexD> gcr_coeffs (N);   // note N --> N + k should just give k coeffs that are 0; this works as intended
   poly_coeffs(gcr_coeffs, DLinOp, src, psiGCR, FGrid, N, false);
   std::cout << GridLogMessage << "******* GCR POLYNOMIAL COEFFICIENTS *******" << std::endl;
   std::cout << GridLogMessage << GCRPoly.polynomial << std::endl << std::endl;
   std::cout << GridLogMessage << "****** DIRECT POLYNOMIAL COEFFICIENTS *****" << std::endl;
   std::cout << GridLogMessage << gcr_coeffs << std::endl << std::endl;
   // test how good the decomposition is
   std::cout << "Testing fidelity of decomposition by computing ||psi* - sum_i c_i D^i b||^2!" << std::endl;
   // for GCR with alpha / beta computation
   // (labels say Ddwf: this section runs on DLinOp, the non-Hermitian wrapper of Ddwf, not on Dsq)
   krylovApprox(psiPrime, src, DLinOp, GCRPoly.polynomial);
   std::cout << "GCR with Ddwf, ||psi - psiPrime||^2 = " << norm2(psiGCR - psiPrime) << std::endl;
   // for GCR with the directly computed coefficients
   krylovApprox(psiPrime, src, DLinOp, gcr_coeffs);
   std::cout << "GCR direct with Ddwf, ||psi - psiPrime||^2 = " << norm2(psiGCR - psiPrime) << std::endl;
   // TESTS TO DO THE N = 2 CASE DIRECTLY
   /*
   std::vector<std::complex<double>> alphas {
     std::complex(0.244300601, 0.00013007545),
     std::complex(0.285370971, -0.000160704481)
   };
   std::complex<double> beta00 (-0.184661284, -6.52153945e-05);
   LatticeFermion psi2 (FGrid);
   LatticeFermion Dsrc (FGrid);
   DLinOp.Op(src, Dsrc);
   std::complex<double> c1 = alphas[0] + alphas[1] * (1. + beta00);
   std::complex<double> c2 = -alphas[0] * alphas[1];
   psi2 = c1 * src + c2 * Dsrc;
   std::cout << "||b|| = " << norm2(src) << std::endl;
   std::cout << "||Db|| = " << norm2(Dsrc) << std::endl;
   // fail; so far this is giving something different than what's being computed in krylovApprox (idk how?)
   std::cout << "c1 and c2 are: " << c1 << " and " << c2 << std::endl;
   std::cout << "GCRPoly polynomial coeffs are (should equal c1 and c2): " << GCRPoly.polynomial << std::endl;
   std::cout << "||GCRpsi - psi2||_2^2 = " << norm2(psiGCR - psi2) << std::endl;
   // pass
   LatticeFermion src2 (FGrid);
   src2 = 1.0;
   src2 = (1 / std::sqrt(norm2(src2))) * src2;
   std::cout << "||ones - src|| (to verify that src is the same throughout, should be 0) = " << norm2(src2 - src) << std::endl;
   // pass
   krylovApprox(psiPrime, src, DLinOp, GCRPoly.polynomial);
   std::cout << "GCR with Dsq, ||psi2 - psiPrime||^2 = " << norm2(psi2 - psiPrime) << std::endl;
   std::vector<ComplexD> psi2_coeffs (N);   // note N --> N + k should just give k coeffs that are 0; this works as intended
   poly_coeffs(psi2_coeffs, DLinOp, src, psi2, FGrid, N, false);
   krylovApprox(psiPrime, src, DLinOp, psi2_coeffs);
   std::cout << "GCR direct with Dsq, ||psi - psiPrime||^2 = " << norm2(psi2 - psiPrime) << std::endl;
   */
   // std::complex z (10.0, 0.0);     // z = 10
   // std::cout << GridLogMessage << "************* GCR POLY(z = 10) *************" << std::endl;
   // std::cout << GridLogMessage << poly_approx(z, GCRPoly.polynomial) << std::endl;
   // std::cout << GridLogMessage << "************ DIRECT POLY(z = 10) ***********" << std::endl;
   // std::cout << GridLogMessage << poly_approx(z, gcr_coeffs) << std::endl;
   // std::vector<std::complex<double>> gcr_diff (N);
   // for (int i = 0; i < N; i++) { gcr_diff[i] = gcr_coeffs[i] - GCRPoly.polynomial[i]; }
   // std::cout << GridLogMessage << "*********** GCR COEFF DIFFERENCE **********" << std::endl;
   // std::cout << GridLogMessage << gcr_diff << std::endl << std::endl;
   std::cout<<GridLogMessage<<std::endl;
   std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
   std::cout<<GridLogMessage<<std::endl;
   std::cout<<GridLogMessage << "Done "<< std::endl;
   Grid_finalize();
   return 0;
 }
@@ -0,0 +1,380 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_padded_cell.cc
    Copyright (C) 2023
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 // copied here from Test_general_coarse_pvdagm.cc
 #include <cstdlib>
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
 using namespace std;
 using namespace Grid;
 namespace Grid {
 struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 		  		RealD, mass , 
 		  		RealD, mstep , 
 				Integer, Nstop,
                                Integer, Nk,
                                Integer, Np,
                                Integer, ReadEvec,
                                Integer, maxIter,
 	  			RealD, resid,
 	  			RealD, ChebyLow,
 	  			RealD, ChebyHigh,
 	  			Integer, ChebyOrder)
  LanczosParameters() {
    ////////////////////////////// Default values
      mass = 0;
    /////////////////////////////////
  }
  template <class ReaderClass >
  LanczosParameters(Reader<ReaderClass> & TheReader){
    initialize(TheReader);
  }
  template < class ReaderClass > 
  void initialize(Reader<ReaderClass> &TheReader){
 //    std::cout << GridLogMessage << "Reading HMC\n";
    read(TheReader, "HMC", *this);
  }
  void print_parameters() const {
 //    std::cout << GridLogMessage << "[HMC parameters] Trajectories            : " << Trajectories << "\n";
 //    std::cout << GridLogMessage << "[HMC parameters] Start trajectory        : " << StartTrajectory << "\n";
 //    std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
 //    std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs    : " << NoMetropolisUntil << "\n";
 //    std::cout << GridLogMessage << "[HMC parameters] Starting type           : " << StartingType << "\n";
 //    MD.print_parameters();
  }
 };
 }
 template <class T> void writeFile(T& in, std::string const fname){
 #if 1
  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
  Grid::emptyUserRecord record;
  Grid::ScidacWriter WR(in.Grid()->IsBoss());
  WR.open(fname);
  WR.writeScidacFieldRecord(in,record,0);
  WR.close();
 #endif
  // What is the appropriate way to throw error?
 }
 // Alias for the Wilson fermion operator type (WilsonFermionD) used by this test.
 typedef WilsonFermionD WilsonOp;
 // Field type that WilsonFermionD declares as its FermionField.
 typedef typename WilsonFermionD::FermionField FermionField;
 // Linear operator whose Op() applies the inverse of a (non-Hermitian) matrix by running
 // conjugate gradient on the normal equations: it forms rhs = Mdag * in and then calls CG
 // with the MdagM operator, right-hand side rhs and solution field out.
 //   Mat : matrix providing M / Mdag (held by reference; must outlive this object)
 //   stp : CG stopping tolerance (default 1e-8); the iteration cap is fixed at 10000
 // The coarsening hooks (OpDiag/OpDir/OpDirAll) and the Hermitian hooks are not supported
 // and assert.
 template<class Matrix,class Field>
 class InvertNonHermitianLinearOperator : public LinearOperatorBase<Field> {
   Matrix &_Mat;
   RealD _stp;
 public:
   InvertNonHermitianLinearOperator(Matrix &Mat,RealD stp=1e-8): _Mat(Mat),_stp(stp){}
   // Support for coarsening to a multigrid: not implemented.
   void OpDiag (const Field &in, Field &out) {
     assert(0);
   }
   void OpDir  (const Field &in, Field &out,int dir,int disp) {
     assert(0);
   }
   void OpDirAll  (const Field &in, std::vector<Field> &out){
     assert(0);
   }
   void Op     (const Field &in, Field &out){
     Field rhs(in.Grid());
     _Mat.Mdag(in,rhs);                               // rhs = Mdag * in
     MdagMLinearOperator<Matrix,Field> MdagM(_Mat);
     ConjugateGradient<Field> CG(_stp,10000);
     // CG takes its starting guess from the solution field, and `out` is a pure output of
     // Op() that callers may hand over uninitialised, so start from a well-defined zero.
     out = Zero();
     CG(MdagM,rhs,out);
   }
   // NOTE(review): this applies Mdag, not the adjoint of the inverse applied by Op();
   // confirm that is what callers of AdjOp expect.
   void AdjOp     (const Field &in, Field &out){
     _Mat.Mdag(in,out);
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
     assert(0);
   }
   void HermOp(const Field &in, Field &out){
     assert(0);
   }
 };
 // Exercises ComplexSchurDecomposition on the Hessenberg matrix produced by one Arnoldi
 // iteration: checks the decomposition, swaps a fixed pair of diagonal entries, walks
 // swaps from index Nk-2 down to 0, and finally calls schurReorder(Nk), printing the S
 // matrix and re-checking the decomposition after each step.
 template<class Field>
 void testSchurFromHess(Arnoldi<Field>& Arn, Field& src, int Nlarge, int Nm, int Nk) {
   std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
   std::cout << GridLogMessage << "Testing Schur reordering, Nm = " << Nm << ", Nk = " << Nk << std::endl;
   std::cout<<GridLogMessage<<"*******************************************"<<std::endl;

   // Step 1: build the Hessenberg matrix.
   std::cout << GridLogMessage << "Running Arnoldi for 1 iteration to get a Hessenberg." << std::endl;
   Arn(src, 1, Nlarge, Nm, Nlarge);
   Eigen::MatrixXcd hessenberg = Arn.getHessenbergMat();
   std::cout << GridLogMessage << "Hessenberg for use: " << std::endl << hessenberg << std::endl;

   // Step 2: decompose and verify.
   ComplexSchurDecomposition schur (hessenberg, true);
   bool decompOk = schur.checkDecomposition();
   std::cout << "Schur decomp holds? " << decompOk << std::endl;
   std::cout << GridLogMessage << "S = " << std::endl << schur.getMatrixS() << std::endl;

   // Step 3: a single swap at fixed index 3.
   std::cout << GridLogMessage << "Swapping S(3, 3) with S(4, 4)" << std::endl;
   schur.swapEvals(3);
   std::cout << GridLogMessage << "S after swap = " << std::endl << schur.getMatrixS() << std::endl;
   std::cout << "Schur decomp still holds? " << schur.checkDecomposition() << std::endl;

   // Step 4: move the last diagonal element all the way to the front.
   std::cout << GridLogMessage << "Moving last eval to front. S at start = " << std::endl << schur.getMatrixS() << std::endl;
   for (int swapIdx = Nk - 2; swapIdx >= 0; swapIdx--) {
     schur.swapEvals(swapIdx);
     std::cout << GridLogMessage << "S after swap of index " << swapIdx << " = " << std::endl << schur.getMatrixS() << std::endl;
     std::cout << "Schur decomp still holds? " << schur.checkDecomposition() << std::endl;
   }

   // Step 5: full reorder.
   std::cout << GridLogMessage << "Testing Schur reorder" << std::endl;
   schur.schurReorder(Nk);
   std::cout << GridLogMessage << "S after reorder = " << std::endl << schur.getMatrixS() << std::endl;
   std::cout << "Schur decomp still holds? " << schur.checkDecomposition() << std::endl;
 }
 // Test driver. Reads a gauge configuration ("config") and run parameters ("LanParams.xml"),
 // runs Krylov-Schur on the Wilson operator, then writes the local density of each of the
 // first Nstop eigenvectors and the sum of those eigenvectors to ./evec_<mass>_* files.
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
   const int Ls=16;
 //   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 //  std::vector<int> lat_size {32, 32, 32, 32};
 //  std::cout << "Lattice size: " << lat_size << std::endl;
   GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                                          GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                          GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 //  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 //  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
   // The fermion grids alias the 4d gauge grids in this test.
   GridCartesian         * FGrid   = UGrid;
   GridRedBlackCartesian * FrbGrid = UrbGrid;
   // Construct a coarsened grid
   // poare TODO: replace this with the following line?
   Coordinate clatt = GridDefaultLatt();
 //   Coordinate clatt = GridDefaultLatt();              // [PO] initial line before I edited it
   for(int d=0;d<clatt.size();d++){
     std::cout << GridLogMessage<< clatt[d] <<std::endl;
     clatt[d] = clatt[d]/2;
     //    clatt[d] = clatt[d]/4;
   }
   GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
   std::vector<int> seeds4({1,2,3,4});
   std::vector<int> seeds5({5,6,7,8});
   std::vector<int> cseeds({5,6,7,8});
   GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
   GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
   GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
   LatticeFermion result(FGrid); result=Zero();
   LatticeFermion    ref(FGrid); ref=Zero();
   LatticeFermion    tmp(FGrid);
   LatticeFermion    err(FGrid);
   LatticeGaugeField Umu(UGrid);
   FieldMetaData header;
   std::string file("config");
 //  std::string file("Users/patrickoare/libraries/PETSc-Grid/ckpoint_lat.4000");
   NerscIO::readConfiguration(Umu,header,file);
   // Read the run parameters, echo them to the log, and write them back out.
   LanczosParameters LanParams;
   {
     XmlReader  HMCrd("LanParams.xml");
     read(HMCrd,"LanczosParameters",LanParams);
   }
   std::cout << GridLogMessage<< LanParams <<std::endl;
   {
     XmlWriter HMCwr("LanParams.xml.out");
     write(HMCwr,"LanczosParameters",LanParams);
   }
   // Defaults; mass, resid, Nstop, Nk, Np and maxIter are overwritten from LanParams below.
   RealD mass=0.01;
   RealD M5=1.8;
   // PowerMethod<LatticeFermion> PM; PM(PVdagM, src);
   int Nm = 50;
   int Nk = 12;
   int Np = 38;
   // int Nk = Nm+1;     // if just running once
   int maxIter = 10000;
   int Nstop = 10;
   RealD resid = 1.0e-5;
   std::vector<Complex> boundary = {1,1,1,-1};
   WilsonOp::ImplParams Params(boundary);
 //  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 //  DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
   mass=LanParams.mass;
   std::cout << GridLogIRL<< "mass "<<mass<<std::endl;
   WilsonOp WilsonOperator(Umu,*UGrid,*UrbGrid,mass,Params);
   // const int nbasis = 20;            // size of approximate basis for low-mode space
   const int nbasis = 3;            // size of approximate basis for low-mode space
   const int cb = 0 ;
   LatticeFermion prom(FGrid);
   typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
   typedef LittleDiracOperator::CoarseVector CoarseVector;
   NextToNearestStencilGeometry5D geom(Coarse5d);
   std::cout<<GridLogMessage<<std::endl;
   std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
   std::cout<<GridLogMessage<<std::endl;
 //  typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
 //  typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
 //  typedef ShiftedComplexPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedComplexPVdagM_t;
 //  PVdagM_t PVdagM(Ddwf, Dpv);
 //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);
 //  SquaredLinearOperator<DomainWallFermionD, LatticeFermionD> Dsq (Ddwf);
 //  NonHermitianLinearOperator<DomainWallFermionD, LatticeFermionD> DLinOp (Ddwf);
   NonHermitianLinearOperator<WilsonOp,FermionField> Dwilson(WilsonOperator); /// <-----
   InvertNonHermitianLinearOperator<WilsonOp,FermionField> Iwilson(WilsonOperator); /// <-----
   MdagMLinearOperator<WilsonOp,FermionField> HermOp(WilsonOperator); /// <-----
   Gamma5HermitianLinearOperator <WilsonOp,LatticeFermion> HermOp2(WilsonOperator); /// <----
   // PowerMethod<LatticeFermion> PM; PM(PVdagM, src);
   resid=LanParams.resid;
   Nstop=LanParams.Nstop;
   Nk=LanParams.Nk;
   Np=LanParams.Np;
   maxIter=LanParams.maxIter;
   Nm = Nk + Np;
   int Nu=16;
   std::vector<LatticeFermion> src(Nu,FGrid);
   for(int i=0;i<Nu;i++) random(RNG5,src[i]);
   // Optionally replace the random starting vector src[0] by one read from "evec_in".
   if(LanParams.ReadEvec) {
     std::string evecs_file="evec_in";
     std::cout << GridLogIRL<< "Reading evecs from "<<evecs_file<<std::endl;
     emptyUserRecord record;
     Grid::ScidacReader RD;
     RD.open(evecs_file);
     RD.readScidacFieldRecord(src[0],record);
     RD.close();
   }
   Coordinate origin ({0,0,0,0});
   auto tmpSrc = peekSite(src[0], origin);
   std::cout << "[DEBUG] Source at origin = " <<  tmpSrc << std::endl;
   LatticeFermion src2 = src[0];
   // Run KrylovSchur and Arnoldi on a Hermitian matrix
   std::cout << GridLogMessage << "Running Krylov Schur" << std::endl;
   // KrylovSchur KrySchur (Dsq, FGrid, 1e-8, EvalNormLarge);
 //  KrylovSchur KrySchur (Dsq, FGrid, 1e-8,EvalImNormSmall);
 //  KrySchur(src, maxIter, Nm, Nk, Nstop);
 //  KrylovSchur KrySchur (HermOp2, UGrid, resid,EvalNormSmall);
 //  Hacked, really EvalImagSmall
 #if 1
   RealD shift=1.5;
   KrylovSchur KrySchur (Dwilson, UGrid, resid,EvalImNormSmall);
   KrySchur(src[0], maxIter, Nm, Nk, Nstop,&shift);
 #else
   KrylovSchur KrySchur (Iwilson, UGrid, resid,EvalImNormSmall);
   KrySchur(src[0], maxIter, Nm, Nk, Nstop);
 #endif
   std::cout << GridLogMessage << "evec.size= " << KrySchur.evecs.size()<< std::endl;
   // Never index past the eigenvectors the solver actually returned: only the first
   // min(Nstop, evecs.size()) of them are summed and written.
   const int nAvail = static_cast<int>(KrySchur.evecs.size());
   const int nWrite = (Nstop < nAvail) ? Nstop : nAvail;
   if (nWrite < Nstop) {
     std::cout << GridLogMessage << "Warning: requested " << Nstop << " eigenvectors but only "
               << nAvail << " are available" << std::endl;
   }
   if (nAvail > 0) {
     src[0]=KrySchur.evecs[0];
     for (int i=1;i<nWrite;i++) src[0]+=KrySchur.evecs[i];
     for (int i=0;i<nWrite;i++)
     {
       std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
       auto evdensity = localInnerProduct(KrySchur.evecs[i],KrySchur.evecs[i] );
       writeFile(evdensity,evfile);
     }
     {
       std::string evfile ("./evec_"+std::to_string(mass)+"_sum");
 //        auto evdensity = localInnerProduct(evec[i],evec[i] );
       writeFile(src[0],evfile);
     }
   } else {
     std::cout << GridLogMessage << "No eigenvectors returned; nothing written" << std::endl;
   }
   /*
   std::cout << GridLogMessage << "Running Arnoldi" << std::endl;
   // Arnoldi Arn (Dsq, FGrid, 1e-8);
   Arnoldi Arn (DLinOp, FGrid, 1e-8);
   testSchurFromHess<LatticeFermion>(Arn, src, 10, 6, 4);
   Arnoldi Arn2 (DLinOp, FGrid, 1e-8);
   testSchurFromHess<LatticeFermion>(Arn2, src, 16, 12, 8);
   */
   std::cout<<GridLogMessage<<std::endl;
   std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
   std::cout<<GridLogMessage<<std::endl;
   std::cout<<GridLogMessage << "Done "<< std::endl;
   Grid_finalize();
   return 0;
 }
@@ -0,0 +1,498 @@
 /*************************************************************************************
    Runs the Krylov-Schur algorithm on a (pre-conditioned) domain-wall fermion operator 
    to determine part of its spectrum. 
    Usage : 
      $ ./Example_spec_kryschur <Nm> <Nk> <maxiter> <Nstop> <inFile> <outDir> <?rf>
      Nm = Maximum size of approximation subspace.
      Nk = Size of truncation subspace
      maxiter = Maximum number of iterations.
      Nstop   = Stop when Nstop eigenvalues have converged. 
      inFile  = Gauge configuration to read in.
      outDir  = Directory to write output to.
      rf      = (Optional) RitzFilter to sort with. Takes in any string in 
                  {EvalNormSmall, EvalNormLarge, EvalReSmall, EvalReLarge, EvalImSmall, EvalImLarge}
    Output:
      ${outDir}/evals.txt  = Contains all eigenvalues. Each line is formatted as `$idx $eval $ritz`, where:
                              - $idx is the index of the eigenvalue.
                              - $eval is the eigenvalue, formatted as "(re,im)".
                              - $ritz is the Ritz estimate of the eigenvalue (deviation from being a true eigenvalue)
      ${outDir}/evec${idx} = Eigenvector $idx written out in SCIDAC format (if LIME is enabled).
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_padded_cell.cc
    Copyright (C) 2023
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: Patrick Oare <poare@bnl.edu>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <cstdlib>
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
 #include <Grid/parallelIO/IldgIOtypes.h>
 #include <Grid/parallelIO/IldgIO.h>
 using namespace std;
 using namespace Grid;
 namespace Grid {
 struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 		  		RealD, mass ,
 		  		RealD, mstep ,
 				Integer, Nstop,
                                Integer, Nk,
                                Integer, Np,
                                Integer, ReadEvec,
 	  			RealD, resid,
 	  			RealD, ChebyLow,
 	  			RealD, ChebyHigh,
 	  			Integer, ChebyOrder)
  LanczosParameters() {
    /////////////////////////////////
  }
  template <class ReaderClass >
  LanczosParameters(Reader<ReaderClass> & TheReader){
    initialize(TheReader);
  }
  template < class ReaderClass >
  void initialize(Reader<ReaderClass> &TheReader){
    read(TheReader, "HMC", *this);
  }
  void print_parameters() const {
 //    std::cout << GridLogMessage << "[HMC parameters] Trajectories            : " << Trajectories << "\n";
 //    std::cout << GridLogMessage << "[HMC parameters] Start trajectory        : " << StartTrajectory << "\n";
 //    std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
 //    std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs    : " << NoMetropolisUntil << "\n";
 //    std::cout << GridLogMessage << "[HMC parameters] Starting type           : " << StartingType << "\n";
 //    MD.print_parameters();
  }
 };
 }
 // Writes the lattice object `in` to `fname` as a single Scidac field record.
 // The whole body is compiled only when HAVE_LIME is defined; otherwise this is a no-op.
 // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
 template <class T> void writeFile(T& in, std::string const fname){
 #ifdef HAVE_LIME
   std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;

   Grid::emptyUserRecord userRecord;
   Grid::ScidacWriter scidacOut(in.Grid()->IsBoss());

   scidacOut.open(fname);
   scidacOut.writeScidacFieldRecord(in, userRecord, 0); // Lexico
   scidacOut.close();
 #endif
 }
 /**
 * Writes the eigenvalues and Ritz estimates of a Krylov Schur object to
 * `${outDir}/evals.txt`, one line per eigenvalue formatted as `$idx $eval $ritz`
 * (lines are newline-separated, with no newline after the last one).
 * Eigenvectors are currently NOT written (see the commented-out block below).
 *
 * Parameters
 * ----------
 * KrylovSchur<Field> KS
 *    Solver object queried through getNk(), getEvals() and getRitzEstimates().
 *    NOTE(review): passed by value, so the whole solver object is copied on each call.
 * std::string outDir
 *    Directory to write to. If `${outDir}/evals.txt` cannot be opened, an error is
 *    logged and nothing is written.
 */
 template <class Field>
 void writeEigensystem(KrylovSchur<Field> KS, std::string outDir) {
   int Nk = KS.getNk();
   std::cout << GridLogMessage << "Writing output to directory: " << outDir << std::endl;
   // Write evals
   std::string evalPath = outDir + "/evals.txt";
   std::ofstream fEval;
   fEval.open(evalPath);
   if (!fEval.is_open()) {
     // Without this check a missing or unwritable directory would silently drop all output.
     std::cout << GridLogMessage << "ERROR: could not open " << evalPath
               << " for writing; eigenvalues not saved" << std::endl;
     return;
   }
   Eigen::VectorXcd evals = KS.getEvals();
   std::vector<RealD> ritz  = KS.getRitzEstimates();
   for (int i = 0; i < Nk; i++) {
     // write eigenvalues and Ritz estimates
     fEval << i << " " << evals(i) << " " << ritz[i];
     if (i < Nk - 1) { fEval << "\n"; }
   }
   fEval.close();
   // Write evecs (TODO: very heavy on storage costs! Don't write them all out)
   // std::vector<Field> evecs = KS.getEvecs();
   // for (int i = 0; i < Nk; i++) {
   //   std::string fName = outDir + "/evec" + std::to_string(i);
   //   writeFile(evecs[i], fName);     // using method from Grid/HMC/ComputeWilsonFlow.cc
   // }
 }
 // Hermitian wrapper around a matrix: Op, AdjOp and HermOp all apply Mdag * M,
 // i.e. first _Mat.M and then _Mat.Mdag. The raw M / Mdag remain reachable through
 // _Op / _AdjOp. The coarsening hooks and HermOpAndNorm are unsupported and assert.
 template<class Matrix,class Field>
 class SquaredLinearOperator : public LinearOperatorBase<Field> {
 public:
   Matrix &_Mat;

   SquaredLinearOperator(Matrix &Mat): _Mat(Mat) {}

   // --- unsupported entry points ---
   void OpDiag (const Field &in, Field &out) { assert(0); }
   void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
   void OpDirAll  (const Field &in, std::vector<Field> &out) { assert(0); }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) { assert(0); }

   // --- raw matrix applications ---
   void _Op     (const Field &in, Field &out) { _Mat.M(in, out); }
   void _AdjOp  (const Field &in, Field &out) { _Mat.Mdag(in, out); }

   // --- squared operator: out = Mdag * (M * in) ---
   void HermOp(const Field &in, Field &out){
     Field Min(in.Grid());
     _Op(in, Min);
     _AdjOp(Min, out);
   }
   // Op and AdjOp are both routed to the Hermitian product.
   void Op     (const Field &in, Field &out) { HermOp(in, out); }
   void AdjOp  (const Field &in, Field &out) { HermOp(in, out); }
 };
 // Operator built from two matrices Mat and PV:
 //   Op     : out = PV^dag * (Mat * in)
 //   AdjOp  : out = Mat^dag * (PV * in)
 //   HermOp : AdjOp applied to the result of Op
 // Each of Op / AdjOp / HermOp prints a trace line to std::cout when called.
 // The coarsening hooks and HermOpAndNorm are unsupported and assert.
 template<class Matrix,class Field>
 class PVdagMLinearOperator : public LinearOperatorBase<Field> {
   Matrix &_Mat;
   Matrix &_PV;
 public:
   PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}

   void OpDiag (const Field &in, Field &out) { assert(0); }
   void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
   void OpDirAll  (const Field &in, std::vector<Field> &out) { assert(0); }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) { assert(0); }

   void Op     (const Field &in, Field &out){
     std::cout << "Op: PVdag M "<<std::endl;
     Field Min(in.Grid());
     _Mat.M(in,Min);
     _PV.Mdag(Min,out);
   }

   void AdjOp     (const Field &in, Field &out){
     std::cout << "AdjOp: Mdag PV "<<std::endl;
     Field PVin(in.Grid());
     _PV.M(in,PVin);
     _Mat.Mdag(PVin,out);
   }

   void HermOp(const Field &in, Field &out){
     std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
     Field opIn(in.Grid());
     Op(in,opIn);
     AdjOp(opIn,out);
   }
 };
 // PV^dag M operator with a real shift added:
 //   Op     : out = PV^dag * (Mat * in) + shift * in
 //   AdjOp  : out = Mat^dag * (PV * in) + shift * in   (adjoint of Op; shift is real)
 //   HermOp : AdjOp applied to the result of Op
 // Each of Op / AdjOp / HermOp prints a trace line to std::cout when called.
 // The coarsening hooks and HermOpAndNorm are unsupported and assert.
 template<class Matrix,class Field>
 class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
   Matrix &_Mat;
   Matrix &_PV;
   RealD shift;
 public:
   // Initialisers are listed in member declaration order.
   ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){}
   void OpDiag (const Field &in, Field &out) {    assert(0);  }
   void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
   void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  }
   void Op     (const Field &in, Field &out){
     std::cout << "Op: PVdag M "<<std::endl;
     Field tmp(in.Grid());
     _Mat.M(in,tmp);
     _PV.Mdag(tmp,out);
     out = out + shift * in;
   }
   void AdjOp     (const Field &in, Field &out){
     std::cout << "AdjOp: Mdag PV "<<std::endl;
     Field tmp(in.Grid());
     // Apply PV to the input first, then Mdag to that result (same order as the
     // unshifted PVdagMLinearOperator::AdjOp); previously PV was applied to the
     // uninitialised scratch field and the Mdag result was thrown away.
     _PV.M(in,tmp);
     _Mat.Mdag(tmp,out);
     out = out + shift * in;
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
   void HermOp(const Field &in, Field &out){
     std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
     Field tmp(in.Grid());
     Op(in,tmp);
     AdjOp(tmp,out);
   }
 };
 // PV^dag M operator with a complex shift added:
 //   Op     : out = PV^dag * (Mat * in) + shift * in
 //   AdjOp  : out = Mat^dag * (PV * in) + conj(shift) * in   (adjoint of Op)
 //   HermOp : AdjOp applied to the result of Op
 // Each of Op / AdjOp / HermOp prints a trace line to std::cout when called.
 // The coarsening hooks and HermOpAndNorm are unsupported and assert.
 template<class Matrix, class Field>
 class ShiftedComplexPVdagMLinearOperator : public LinearOperatorBase<Field> {
   Matrix &_Mat;
   Matrix &_PV;
   ComplexD shift;
 public:
   // Initialisers are listed in member declaration order.
   ShiftedComplexPVdagMLinearOperator(ComplexD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){}
   void OpDiag (const Field &in, Field &out) {    assert(0);  }
   void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
   void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  }
   void Op     (const Field &in, Field &out){
     std::cout << "Op: PVdag M "<<std::endl;
     Field tmp(in.Grid());
     _Mat.M(in,tmp);
     _PV.Mdag(tmp,out);
     out = out + shift * in;
   }
   void AdjOp     (const Field &in, Field &out){
     std::cout << "AdjOp: Mdag PV "<<std::endl;
     Field tmp(in.Grid());
     // Apply PV to the input first, then Mdag to that result; previously PV was applied
     // to the uninitialised scratch field and the Mdag result was thrown away.
     _PV.M(in,tmp);
     _Mat.Mdag(tmp,out);
     // The adjoint of (A + shift) is (A^dag + conj(shift)).
     out = out + conjugate(shift) * in;
   }
   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
   void HermOp(const Field &in, Field &out){
     std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
     Field tmp(in.Grid());
     Op(in,tmp);
     AdjOp(tmp,out);
   }
   // Replaces the shift used by subsequent Op / AdjOp calls.
   void resetShift(ComplexD newShift) {
     shift = newShift;
   }
 };
 #if 0
 // Two-level preconditioner functor (currently compiled out by the enclosing #if 0).
 // operator() performs, in order: pre-smoothing, residual update, projection of the residual
 // onto the coarse space, a coarse solve, promotion of the coarse correction back to the fine
 // space, a second residual update and post-smoothing. Each timed stage logs its duration.
 // All collaborators are held by reference and must outlive this object.
 template<class Fobj,class CComplex,int nbasis>
 class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  using LinearFunction<Lattice<Fobj> >::operator();
  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField    FineField;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
  typedef LinearOperatorBase<FineField>                            FineOperator;
  typedef LinearFunction    <FineField>                            FineSmoother;
  typedef LinearOperatorBase<CoarseVector>                         CoarseOperator;
  typedef LinearFunction    <CoarseVector>                         CoarseSolver;
  Aggregates     & _Aggregates;
  FineOperator   & _FineOperator;
  FineSmoother   & _PreSmoother;
  FineSmoother   & _PostSmoother;
  CoarseOperator & _CoarseOperator;
  CoarseSolver   & _CoarseSolve;
  // level is set to 1 by the constructor and can be changed with Level(); nothing in this class reads it.
  int    level;  void Level(int lv) {level = lv; };
  // Stores references to every collaborator; no work is done at construction time.
  MGPreconditioner(Aggregates &Agg,
 		   FineOperator &Fine,
 		   FineSmoother &PreSmoother,
 		   FineSmoother &PostSmoother,
 		   CoarseOperator &CoarseOperator_,
 		   CoarseSolver &CoarseSolve_)
    : _Aggregates(Agg),
      _FineOperator(Fine),
      _PreSmoother(PreSmoother),
      _PostSmoother(PostSmoother),
      _CoarseOperator(CoarseOperator_),
      _CoarseSolve(CoarseSolve_),
      level(1)  {  }
  // Applies the preconditioner to `in`, writing the result to `out` (overwritten, starting from Zero()).
  virtual void operator()(const FineField &in, FineField & out) 
  {
    GridBase *CoarseGrid = _Aggregates.CoarseGrid;
    //    auto CoarseGrid = _CoarseOperator.Grid();
    CoarseVector Csrc(CoarseGrid);
    CoarseVector Csol(CoarseGrid);
    FineField vec1(in.Grid());
    FineField vec2(in.Grid());
    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
    //    std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
    // t is used as a stopwatch: set to -usecond() before a stage and incremented by usecond()
    // after it, then printed as t/1000.0 "ms" (presumably usecond() returns microseconds).
    double t;
    // Fine Smoother
    //    out = in;
    out = Zero();
    t=-usecond();
    _PreSmoother(in,out);
    t+=usecond();
    std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;
    // Update the residual
    // NOTE(review): sub(vec1, in, vec1) presumably stores in - vec1, i.e. the residual in - Op(out); confirm argument order.
    _FineOperator.Op(out,vec1);  sub(vec1, in ,vec1);   
    //    std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
    // Fine to Coarse 
    t=-usecond();
    _Aggregates.ProjectToSubspace  (Csrc,vec1);
    t+=usecond();
    std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
    // Coarse correction
    t=-usecond();
    Csol = Zero();
    _CoarseSolve(Csrc,Csol);
    //Csol=Zero();
    t+=usecond();
    std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
    // Coarse to Fine
    t=-usecond();  
    //    _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
    // vec1 is reused here to hold the promoted coarse correction, which is then added to out.
    _Aggregates.PromoteFromSubspace(Csol,vec1); 
    add(out,out,vec1);
    t+=usecond();
    std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
    // Residual
    _FineOperator.Op(out,vec1);  sub(vec1 ,in , vec1);  
    //    std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
    // Fine Smoother
    t=-usecond();
    //    vec2=vec1;
    vec2=Zero();
    _PostSmoother(vec1,vec2);
    t+=usecond();
    std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
    // The post-smoother output is a correction computed from the latest residual; accumulate it into out.
    add( out,out,vec2);
    std::cout<<GridLogMessage << "Done " <<std::endl;
  }
 };
 #endif
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  // Usage : $ ./Example_spec_kryschur <Nm> <Nk> <maaxiter> <Nstop> <inFile> <outDir>
  std::string NmStr      = argv[1];
  std::string NkStr      = argv[2];
  std::string maxIterStr = argv[3];
  std::string NstopStr   = argv[4];
  std::string file       = argv[5];
  std::string outDir     = argv[6];
  RitzFilter RF;
  if (argc == 8) {
    std::string rf       = argv[7];
    RF = selectRitzFilter(rf);
  } else {
    RF = EvalReSmall;
  }
  std::cout << "Sorting eigenvalues using " << rfToString(RF) << std::endl;
  //const int Ls=16;
  const int Ls = 8;
 //   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  //std::vector<int> lat_size {16, 16, 16, 32};
  std::vector<int> lat_size {8, 8, 8, 8};
  std::cout << "Lattice size: " << lat_size << std::endl;
  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(lat_size, 
 								          GridDefaultSimd(Nd,vComplex::Nsimd()),
 								          GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  LatticeFermion    src(FGrid); random(RNG5,src);
  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  NerscIO::readConfiguration(Umu,header,file);
  // RealD mass=0.01;
  RealD mass=0.001;
  RealD M5=1.8;
  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
  typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
  typedef ShiftedComplexPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedComplexPVdagM_t;
  PVdagM_t PVdagM(Ddwf, Dpv);
  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);
  SquaredLinearOperator<DomainWallFermionD, LatticeFermionD> Dsq (Ddwf);
  NonHermitianLinearOperator<DomainWallFermionD, LatticeFermionD> DLinOp (Ddwf);
  int Nm = std::stoi(NmStr);
  int Nk = std::stoi(NkStr);
  int maxIter = std::stoi(maxIterStr);
  int Nstop = std::stoi(NstopStr);
  std::cout << GridLogMessage << "Runnning Krylov Schur. Nm = " << Nm << ", Nk = " << Nk << ", maxIter = " << maxIter 
                  << ", Nstop = " << Nstop << std::endl;
  KrylovSchur KrySchur (PVdagM, FGrid, 1e-8, RF);      // use preconditioned PV^\dag D_{dwf}
  // KrylovSchur KrySchur (DLinOp, FGrid, 1e-8, RF);         // use D_{dwf}
  KrySchur(src, maxIter, Nm, Nk, Nstop);
  std::cout<<GridLogMessage << "*******************************************" << std::endl;
  std::cout<<GridLogMessage << "***************** RESULTS *****************" << std::endl;
  std::cout<<GridLogMessage << "*******************************************" << std::endl;
  std::cout << GridLogMessage << "Krylov Schur eigenvalues: " << std::endl << KrySchur.getEvals() << std::endl;
  writeEigensystem(KrySchur, outDir);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  Grid_finalize();
  return 0;
 }
@@ -3,6 +3,9 @@
 * without regression / tests being applied
 */
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 using namespace std;
@@ -535,5 +538,4 @@ int main (int argc, char ** argv)
  Grid_finalize();
 }
-
+#endif
@@ -3,6 +3,9 @@
 * without regression / tests being applied
 */
 #include "disable_examples_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 using namespace std;
@@ -429,5 +432,4 @@ int main (int argc, char ** argv)
  Grid_finalize();
 }
-
+#endif
@@ -0,0 +1,383 @@
 /*************************************************************************************
    Script for studying the Wilson eigenvectors resulting from the Krylov-Schur process. 
    Usage : 
      $ ./Example_spec_kryschur <Nm> <Nk> <maxiter> <Nstop> <inFile> <outDir> <?rf>
      Nm = Maximum size of approximation subspace.
      Nk = Size of truncation subspace
      maxiter = Maximum number of iterations.
      Nstop   = Stop when Nstop eigenvalues have converged. 
      inFile  = Gauge configuration to read in.
      outDir  = Directory to write output to.
      rf      = (Optional) RitzFilter to sort with. Takes in any string in 
                  {EvalNormSmall, EvalNormLarge, EvalReSmall, EvalReLarge, EvalImSmall, EvalImLarge}
    Output:
      ${outDir}/evals.txt  = Contains all eigenvalues. Each line is formatted as `$idx $eval $ritz`, where:
                              - $idx is the index of the eigenvalue.
                              - $eval is the eigenvalue, formatted as "(re,im)".
                              - $ritz is the Ritz estimate of the eigenvalue (deviation from being a true eigenvalue)
      ${outDir}/evec${idx} = Eigenvector $idx written out in SCIDAC format (if LIME is enabled).
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_padded_cell.cc
    Copyright (C) 2023
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: Patrick Oare <poare@bnl.edu>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <cstdlib>
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
 #include <Grid/parallelIO/IldgIOtypes.h>
 #include <Grid/parallelIO/IldgIO.h>
 using namespace std;
 using namespace Grid;
 // Saves the lattice object `in` as a single SciDAC field record in the file `fname`.
 // The whole body is guarded by HAVE_LIME: when that macro is not defined the call does nothing.
 template <class T> void writeFile(T& in, std::string const fname){
 #ifdef HAVE_LIME
   // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
   std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
   Grid::emptyUserRecord userRecord;
   Grid::ScidacWriter scidacOut(in.Grid()->IsBoss());
   scidacOut.open(fname);
   scidacOut.writeScidacFieldRecord(in, userRecord, 0); // Lexico
   scidacOut.close();
 #endif
 }
 // Fills the lattice object `out` from a SciDAC field record stored in the file `fname`.
 // The whole body is guarded by HAVE_LIME: when that macro is not defined `out` is left untouched.
 template <class T> void readFile(T& out, std::string const fname){
 #ifdef HAVE_LIME
   // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
   std::cout << Grid::GridLogMessage << "Reads at: " << fname << std::endl;
   Grid::emptyUserRecord userRecord;
   // Grid::ScidacReader SR(out.Grid()->IsBoss());
   Grid::ScidacReader scidacIn;
   scidacIn.open(fname);
   scidacIn.readScidacFieldRecord(out, userRecord);
   scidacIn.close();
 #endif
 }
 /**
 * Writes the eigensystem of a Krylov Schur object to a directory. 
 * 
 * Produces `${outDir}/evals.txt`, one line per eigenvalue formatted as
 * "<index> <eigenvalue> <ritz estimate>" with no trailing newline, followed by one file
 * `${outDir}/evec<i>` per eigenvector written through writeFile().
 * If evals.txt cannot be opened a warning is logged and the eigenvector files are still attempted.
 * 
 * Parameters
 * ----------
 * KrylovSchur<Field> KS
 *    Solver whose getNk(), getEvals(), getRitzEstimates() and getEvecs() results are written.
 * std::string outDir
 *    Directory to write to. It is not created here.
 */
 template <class Field>
 void writeEigensystem(KrylovSchur<Field> KS, std::string outDir) {
  int Nk = KS.getNk();
  std::cout << GridLogMessage << "Writing output to directory: " << outDir << std::endl;
  // Write evals
  std::string evalPath = outDir + "/evals.txt";
  std::ofstream fEval;
  fEval.open(evalPath);
  if (!fEval.is_open()) {
    // Without this check a missing or unwritable directory silently loses every eigenvalue.
    std::cout << GridLogMessage << "WARNING: could not open " << evalPath
              << " for writing; eigenvalues were not saved" << std::endl;
  } else {
    Eigen::VectorXcd evals = KS.getEvals();
    std::vector<RealD> ritz  = KS.getRitzEstimates();
    for (int i = 0; i < Nk; i++) {
      // write eigenvalues and Ritz estimates
      fEval << i << " " << evals(i) << " " << ritz[i];
      if (i < Nk - 1) { fEval << "\n"; }
    }
    fEval.close();
  }
  // Write evecs
  int Nevecs = Nk;          // one file per eigenvector, Nk in total
  std::vector<Field> evecs = KS.getEvecs();
  for (int i = 0; i < Nevecs; i++) {
    std::string fName = outDir + "/evec" + std::to_string(i);
    writeFile(evecs[i], fName);     // using method from Grid/HMC/ComputeWilsonFlow.cc
  }
 }
 // Presents a matrix M through the product Mdag M.
 // Op and AdjOp both forward to HermOp; the bare M and Mdag applications remain reachable
 // as _Op and _AdjOp. OpDiag, OpDir, OpDirAll and HermOpAndNorm trip assert(0).
 template<class Matrix,class Field>
 class SquaredLinearOperator : public LinearOperatorBase<Field> {
 public:
  Matrix &_Mat;

  SquaredLinearOperator(Matrix &Mat): _Mat(Mat) {}

  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll  (const Field &in, std::vector<Field> &out) { assert(0); }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) { assert(0); }

  // out = M in
  void _Op(const Field &in, Field &out)
  {
    _Mat.M(in, out);
  }

  // out = Mdag in
  void _AdjOp(const Field &in, Field &out)
  {
    _Mat.Mdag(in, out);
  }

  // out = Mdag M in
  void HermOp(const Field &in, Field &out)
  {
    Field MIn(in.Grid());
    _Op(in, MIn);
    _AdjOp(MIn, out);
  }

  // The product is its own adjoint, so both directions reuse HermOp.
  void Op(const Field &in, Field &out)
  {
    HermOp(in, out);
  }

  void AdjOp(const Field &in, Field &out)
  {
    HermOp(in, out);
  }
 };
 // Linear operator for the product PVdag M, built on two Matrix references
 // (_Mat supplies M / Mdag, _PV supplies the PV factor).
 //   Op     : out = _PV.Mdag(_Mat.M(in))
 //   AdjOp  : out = _Mat.Mdag(_PV.M(in))
 //   HermOp : out = AdjOp(Op(in))
 // Each of these prints a trace line. The remaining interface functions trip assert(0).
 template<class Matrix,class Field>
 class PVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  Matrix &_PV;
 public:
  PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV) {}

  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll  (const Field &in, std::vector<Field> &out) { assert(0); }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) { assert(0); }

  void Op(const Field &in, Field &out)
  {
    std::cout << "Op: PVdag M "<<std::endl;
    Field MIn(in.Grid());
    _Mat.M(in, MIn);
    _PV.Mdag(MIn, out);
  }

  void AdjOp(const Field &in, Field &out)
  {
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field PVIn(in.Grid());
    _PV.M(in, PVIn);
    _Mat.Mdag(PVIn, out);
  }

  void HermOp(const Field &in, Field &out)
  {
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field OpIn(in.Grid());
    Op(in, OpIn);
    AdjOp(OpIn, out);
  }
 };
 // Linear operator for  PVdag M + shift  with a real shift, built on two Matrix references
 // (_Mat supplies M / Mdag, _PV supplies the PV factor).
 //   Op     : out = _PV.Mdag(_Mat.M(in)) + shift * in
 //   AdjOp  : out = _Mat.Mdag(_PV.M(in)) + shift * in
 //   HermOp : out = AdjOp(Op(in))
 // OpDiag, OpDir, OpDirAll and HermOpAndNorm are not implemented and trip assert(0).
 template<class Matrix,class Field>
 class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  Matrix &_PV;
  RealD shift;
 public:
  // Initialisers follow member declaration order (_Mat, _PV, shift), the order in which they run.
  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
  void OpDiag (const Field &in, Field &out) {    assert(0);  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
  // out = PVdag M in + shift * in
  void Op     (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  // out = Mdag PV in + shift * in
  // PV acts on `in` first and Mdag on that intermediate; a real shift is unchanged by the adjoint.
  void AdjOp     (const Field &in, Field &out){
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
    out = out + shift * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
  // out = AdjOp(Op(in)); both stages include the shift.
  void HermOp(const Field &in, Field &out){
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
 };
 // Linear operator for  PVdag M + shift  with a complex shift, built on two Matrix references
 // (_Mat supplies M / Mdag, _PV supplies the PV factor).
 //   Op     : out = _PV.Mdag(_Mat.M(in)) + shift * in
 //   AdjOp  : out = _Mat.Mdag(_PV.M(in)) + conj(shift) * in
 //   HermOp : out = AdjOp(Op(in))
 // OpDiag, OpDir, OpDirAll and HermOpAndNorm are not implemented and trip assert(0).
 // The shift can be changed after construction with resetShift().
 template<class Matrix, class Field>
 class ShiftedComplexPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  Matrix &_PV;
  ComplexD shift;
 public:
  // Initialisers follow member declaration order (_Mat, _PV, shift), the order in which they run.
  ShiftedComplexPVdagMLinearOperator(ComplexD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
  void OpDiag (const Field &in, Field &out) {    assert(0);  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
  // out = PVdag M in + shift * in
  void Op     (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  // out = Mdag PV in + conj(shift) * in
  // PV acts on `in` first and Mdag on that intermediate. The adjoint of a scalar shift is its
  // complex conjugate, which is what makes HermOp = AdjOp(Op) a Hermitian product.
  void AdjOp     (const Field &in, Field &out){
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
    out = out + conjugate(shift) * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
  // out = AdjOp(Op(in)); both stages include the shift.
  void HermOp(const Field &in, Field &out){
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
  // Replaces the stored shift; subsequent Op/AdjOp/HermOp calls use the new value.
  void resetShift(ComplexD newShift) {
    shift = newShift;
  }
 };
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  // Usage : $ ./Example_wilson_evecs ${inFile}
  std::string file       = argv[1];
  const int Ls=16;
 //   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  //std::vector<int> lat_size {16, 16, 16, 32};
  std::vector<int> lat_size {32, 32, 32, 32};
  std::cout << "Lattice size: " << lat_size << std::endl;
  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(lat_size, 
 								          GridDefaultSimd(Nd,vComplex::Nsimd()),
 								          GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  // GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  // GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  GridCartesian * FGrid = UGrid;
  GridRedBlackCartesian * FrbGrid = UrbGrid;
  std::vector<int> seeds4({1,2,3,4});
  GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(seeds4);
  LatticeFermion    src(FGrid); random(RNG4, src);
  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  NerscIO::readConfiguration(Umu, header, file);
  std::cout << GridLogMessage << "Loaded configuration" << std::endl;
  // RealD mass = 0.01;
  RealD M5 = 1.8;
  // Wilson mass
  RealD mass = -1.6;
  std::cout << GridLogMessage << "masses specified" << std::endl;
  std::vector<Complex> boundary = {1,1,1,-1};
  WilsonFermionD::ImplParams Params(boundary);
  // DomainWallFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
  // NonHermitianLinearOperator<DomainWallFermionD, LatticeFermionD> DLinOp (Ddwf);
  // WilsonFermionD Dwilson(Umu, *FGrid, *FrbGrid, mass);
  WilsonFermionD Dwilson(Umu, *UGrid, *UrbGrid, mass, Params);
  NonHermitianLinearOperator<WilsonFermionD, LatticeFermionD> DLinOp (Dwilson);
  std::cout << GridLogMessage << "Dirac operator defined" << std::endl;
  std::string eigenPath = "/home/poare/lqcd/multigrid/spectra/32cube-rho0.124-tau4/U_smr_3.000000/Nm72_Nk24_8111835.aurora-pbs-0001.hostmgmt.cm.aurora.alcf.anl.gov/";
  std::cout << GridLogMessage << "Loading eigenvalues" << std::endl;
  std::ifstream evalFile(eigenPath + "evals.txt");
  std::string str;
  std::vector<ComplexD> evals;
  while (std::getline(evalFile, str)) {
      std::cout << GridLogMessage << "Reading line: " << str << std::endl;
      int i1 = str.find("(") + 1;
      int i2 = str.find(",") + 1;
      int i3 = str.find(")");
      std::cout << "i1,i2,i3 = " << i1 << "," << i2 << "," << i3 << std::endl;
      std::string reStr = str.substr(i1, i2 - i1);
      std::string imStr = str.substr(i2, i3 - i2);
      std::cout << GridLogMessage << "Parsed re = " << reStr << " and im = " << imStr << std::endl;
      // ComplexD z (std::stof(reStr), std::stof(imStr));
      ComplexD z (std::stod(reStr), std::stod(imStr));
      evals.push_back(z);
  }
  std::cout << GridLogMessage << "Eigenvalues: " << evals << std::endl;
  int Nevecs = 24;
  std::vector<LatticeFermion> evecs;
  LatticeFermion evec (FGrid);
  for (int i = 0; i < Nevecs; i++) {
    std::string evecPath = eigenPath + "evec" + std::to_string(i);
    readFile(evec, evecPath);
    evecs.push_back(evec);
  }
  std::cout << GridLogMessage << "Evecs loaded" << std::endl;
  // Compute < evec | D - \lambda | evec >
  std::cout << GridLogMessage << "Testing eigenvectors" << std::endl;
  LatticeFermion Devec (FGrid);
  ComplexD ritz;
  for (int i = 0; i < Nevecs; i++) {
    Devec = Zero();
    DLinOp.Op(evecs[i], Devec);
    ritz = std::sqrt(norm2(Devec - evals[i] * evecs[i]));
    std::cout << GridLogMessage << "i = " << i << ", || (D - lambda) |vi> || = " << ritz << std::endl;
  }
  // Eigen::MatrixXcd Dw_evecs;
  // Dw_evecs = Eigen::MatrixXcd::Zero(Nevecs, Nevecs);
  // for (int i = 0; i < Nevecs; i++) {
  //   Linop.Op(evecs[i], Devec);
  //   for (int j = 0; j < Nevecs; j++) {
  //   }
  // }
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  Grid_finalize();
  return 0;
 }
@@ -0,0 +1,374 @@
 /*************************************************************************************
    Runs the Krylov-Schur algorithm on a Wilson fermion operator to determine part of its spectrum. 
    TODO rename this file: really is running the topology change jobs on Aurora. 
    Usage : 
      $ ./Example_spec_kryschur <Nm> <Nk> <maxiter> <Nstop> <inFile> <outDir> <?rf>
      Nm = Maximum size of approximation subspace.
      Nk = Size of truncation subspace
      maxiter = Maximum number of iterations.
      Nstop   = Stop when Nstop eigenvalues have converged. 
      inFile  = Gauge configuration to read in.
      outDir  = Directory to write output to.
      rf      = (Optional) RitzFilter to sort with. Takes in any string in 
                  {EvalNormSmall, EvalNormLarge, EvalReSmall, EvalReLarge, EvalImSmall, EvalImLarge}
    Output:
      ${outDir}/evals.txt  = Contains all eigenvalues. Each line is formatted as `$idx $eval $ritz`, where:
                              - $idx is the index of the eigenvalue.
                              - $eval is the eigenvalue, formatted as "(re,im)".
                              - $ritz is the Ritz estimate of the eigenvalue (deviation from being a true eigenvalue)
      ${outDir}/evec${idx} = Eigenvector $idx written out in SCIDAC format (if LIME is enabled).
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_padded_cell.cc
    Copyright (C) 2023
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: Patrick Oare <poare@bnl.edu>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <cstdlib>
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
 #include <Grid/parallelIO/IldgIOtypes.h>
 #include <Grid/parallelIO/IldgIO.h>
 using namespace std;
 using namespace Grid;
 // Stores the lattice object `in` as one SciDAC field record in the file named `fname`.
 // Everything inside is conditional on HAVE_LIME; without that macro the function is a no-op.
 template <class T> void writeFile(T& in, std::string const fname){
 #ifdef HAVE_LIME
   // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
   std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
   Grid::emptyUserRecord emptyRec;
   Grid::ScidacWriter fieldWriter(in.Grid()->IsBoss());
   fieldWriter.open(fname);
   fieldWriter.writeScidacFieldRecord(in, emptyRec, 0); // Lexico
   fieldWriter.close();
 #endif
 }
 /**
 * Writes the eigensystem of a Krylov Schur object to a directory. 
 * 
 * Produces `${outDir}/evals.txt`, one line per eigenvalue formatted as
 * "<index> <eigenvalue> <ritz estimate>" with no trailing newline, followed by one file
 * `${outDir}/evec<i>` per eigenvector written through writeFile().
 * If evals.txt cannot be opened a warning is logged and the eigenvector files are still attempted.
 * 
 * Parameters
 * ----------
 * KrylovSchur<Field> KS
 *    Solver whose getNk(), getEvals(), getRitzEstimates() and getEvecs() results are written.
 * std::string outDir
 *    Directory to write to. It is not created here.
 */
 template <class Field>
 void writeEigensystem(KrylovSchur<Field> KS, std::string outDir) {
  int Nk = KS.getNk();
  std::cout << GridLogMessage << "Writing output to directory: " << outDir << std::endl;
  // Write evals
  std::string evalPath = outDir + "/evals.txt";
  std::ofstream fEval;
  fEval.open(evalPath);
  if (!fEval.is_open()) {
    // Without this check a missing or unwritable directory silently loses every eigenvalue.
    std::cout << GridLogMessage << "WARNING: could not open " << evalPath
              << " for writing; eigenvalues were not saved" << std::endl;
  } else {
    Eigen::VectorXcd evals = KS.getEvals();
    std::vector<RealD> ritz  = KS.getRitzEstimates();
    for (int i = 0; i < Nk; i++) {
      // write eigenvalues and Ritz estimates
      fEval << i << " " << evals(i) << " " << ritz[i];
      if (i < Nk - 1) { fEval << "\n"; }
    }
    fEval.close();
  }
  // Write evecs
  int Nevecs = Nk;          // one file per eigenvector, Nk in total
  std::vector<Field> evecs = KS.getEvecs();
  for (int i = 0; i < Nevecs; i++) {
    std::string fName = outDir + "/evec" + std::to_string(i);
    writeFile(evecs[i], fName);     // using method from Grid/HMC/ComputeWilsonFlow.cc
  }
 }
 // Exposes a matrix M as the product Mdag M.
 // HermOp applies M then Mdag; Op and AdjOp are both aliases for HermOp, while _Op and _AdjOp
 // give access to the single factors. The other interface functions trip assert(0).
 template<class Matrix,class Field>
 class SquaredLinearOperator : public LinearOperatorBase<Field> {
 public:
  Matrix &_Mat;

  SquaredLinearOperator(Matrix &Mat): _Mat(Mat) {}

  // Unsupported entry points.
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll  (const Field &in, std::vector<Field> &out) { assert(0); }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) { assert(0); }

  // Single factors: out = M in and out = Mdag in.
  void _Op(const Field &in, Field &out)    { _Mat.M(in, out); }
  void _AdjOp(const Field &in, Field &out) { _Mat.Mdag(in, out); }

  // out = Mdag M in, via a scratch field on the input's grid.
  void HermOp(const Field &in, Field &out)
  {
    Field scratch(in.Grid());
    _Op(in, scratch);
    _AdjOp(scratch, out);
  }

  // Forward and adjoint applications coincide with HermOp.
  void Op(const Field &in, Field &out)    { HermOp(in, out); }
  void AdjOp(const Field &in, Field &out) { HermOp(in, out); }
 };
 // Linear operator for the product PVdag M over two Matrix references
 // (_Mat supplies M / Mdag, _PV supplies the PV factor).
 //   Op     : out = _PV.Mdag(_Mat.M(in))
 //   AdjOp  : out = _Mat.Mdag(_PV.M(in))
 //   HermOp : out = AdjOp(Op(in))
 // Each of the three prints a trace line; the other interface functions trip assert(0).
 template<class Matrix,class Field>
 class PVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  Matrix &_PV;
 public:
  PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV) {}

  // Unsupported entry points.
  void OpDiag (const Field &in, Field &out) { assert(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { assert(0); }
  void OpDirAll  (const Field &in, std::vector<Field> &out) { assert(0); }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) { assert(0); }

  void Op(const Field &in, Field &out)
  {
    std::cout << "Op: PVdag M "<<std::endl;
    Field scratch(in.Grid());
    _Mat.M(in, scratch);
    _PV.Mdag(scratch, out);
  }

  void AdjOp(const Field &in, Field &out)
  {
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field scratch(in.Grid());
    _PV.M(in, scratch);
    _Mat.Mdag(scratch, out);
  }

  void HermOp(const Field &in, Field &out)
  {
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field scratch(in.Grid());
    Op(in, scratch);
    AdjOp(scratch, out);
  }
 };
 // Linear operator for  PVdag M + shift  with a real shift, built on two Matrix references
 // (_Mat supplies M / Mdag, _PV supplies the PV factor).
 //   Op     : out = _PV.Mdag(_Mat.M(in)) + shift * in
 //   AdjOp  : out = _Mat.Mdag(_PV.M(in)) + shift * in
 //   HermOp : out = AdjOp(Op(in))
 // OpDiag, OpDir, OpDirAll and HermOpAndNorm are not implemented and trip assert(0).
 template<class Matrix,class Field>
 class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  Matrix &_PV;
  RealD shift;
 public:
  // Initialisers follow member declaration order (_Mat, _PV, shift), the order in which they run.
  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
  void OpDiag (const Field &in, Field &out) {    assert(0);  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
  // out = PVdag M in + shift * in
  void Op     (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  // out = Mdag PV in + shift * in
  // PV acts on `in` first and Mdag on that intermediate; a real shift is unchanged by the adjoint.
  void AdjOp     (const Field &in, Field &out){
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
    out = out + shift * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
  // out = AdjOp(Op(in)); both stages include the shift.
  void HermOp(const Field &in, Field &out){
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
 };
 // Linear operator for  PVdag M + shift  with a complex shift, built on two Matrix references
 // (_Mat supplies M / Mdag, _PV supplies the PV factor).
 //   Op     : out = _PV.Mdag(_Mat.M(in)) + shift * in
 //   AdjOp  : out = _Mat.Mdag(_PV.M(in)) + conj(shift) * in
 //   HermOp : out = AdjOp(Op(in))
 // OpDiag, OpDir, OpDirAll and HermOpAndNorm are not implemented and trip assert(0).
 // The shift can be changed after construction with resetShift().
 template<class Matrix, class Field>
 class ShiftedComplexPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  Matrix &_PV;
  ComplexD shift;
 public:
  // Initialisers follow member declaration order (_Mat, _PV, shift), the order in which they run.
  ShiftedComplexPVdagMLinearOperator(ComplexD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
  void OpDiag (const Field &in, Field &out) {    assert(0);  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
  // out = PVdag M in + shift * in
  void Op     (const Field &in, Field &out){
    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  // out = Mdag PV in + conj(shift) * in
  // PV acts on `in` first and Mdag on that intermediate. The adjoint of a scalar shift is its
  // complex conjugate, which is what makes HermOp = AdjOp(Op) a Hermitian product.
  void AdjOp     (const Field &in, Field &out){
    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
    out = out + conjugate(shift) * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
  // out = AdjOp(Op(in)); both stages include the shift.
  void HermOp(const Field &in, Field &out){
    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
  // Replaces the stored shift; subsequent Op/AdjOp/HermOp calls use the new value.
  void resetShift(ComplexD newShift) {
    shift = newShift;
  }
 };
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  // Usage : $ ./Example_spec_kryschur <Nm> <Nk> <maaxiter> <Nstop> <inFile> <outDir>
  std::string NmStr      = argv[1];
  std::string NkStr      = argv[2];
  std::string maxIterStr = argv[3];
  std::string NstopStr   = argv[4];
  std::string file       = argv[5];
  std::string outDir     = argv[6];
  // RitzFilter RF;
  // if (argc == 8) {
  //   std::string rf       = argv[7];
  //   RF = selectRitzFilter(rf);
  // } else {
  //   RF = EvalReSmall;
  // }
  // RitzFilter RF;
  std::string rf       = argv[7];
  RitzFilter RF        = selectRitzFilter(rf);
  std::cout << "Sorting eigenvalues using " << rfToString(RF) << std::endl;
  const int Ls=16;
 //   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  //std::vector<int> lat_size {16, 16, 16, 32};
  std::vector<int> lat_size {32, 32, 32, 32};
  std::cout << "Lattice size: " << lat_size << std::endl;
  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(lat_size, 
 								          GridDefaultSimd(Nd,vComplex::Nsimd()),
 								          GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  // GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  // GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  GridCartesian * FGrid = UGrid;
  GridRedBlackCartesian * FrbGrid = UrbGrid;
  std::vector<int> seeds4({1,2,3,4});
  GridParallelRNG RNG4(UGrid);
  RNG4.SeedFixedIntegers(seeds4);
  LatticeFermion    src(FGrid); random(RNG4, src);
  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  NerscIO::readConfiguration(Umu, header, file);
  std::cout << GridLogMessage << "Loaded configuration" << std::endl;
  // RealD mass = 0.01;
  RealD M5 = 1.8;
  // Wilson mass
  RealD mass = -1.6;
  std::cout << GridLogMessage << "masses specified" << std::endl;
  std::vector<Complex> boundary = {1,1,1,-1};
  WilsonFermionD::ImplParams Params(boundary);
  // DomainWallFermionD Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
  // NonHermitianLinearOperator<DomainWallFermionD, LatticeFermionD> DLinOp (Ddwf);
  // WilsonFermionD Dwilson(Umu, *FGrid, *FrbGrid, mass);
  WilsonFermionD Dwilson(Umu, *UGrid, *UrbGrid, mass, Params);
  NonHermitianLinearOperator<WilsonFermionD, LatticeFermionD> DLinOp (Dwilson);
  std::cout << GridLogMessage << "Dirac operator defined" << std::endl;
  // Define PV^dag D (if we want)
  // DomainWallFermionD Dpv(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, 1.0, M5);
  // typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
  // PVdagM_t PVdagM(Ddwf, Dpv);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  // SquaredLinearOperator<WilsonFermionD, LatticeFermionD> Dsq (DWilson);
  // NonHermitianLinearOperator<WilsonFermionD, LatticeFermionD> DLinOp (DWilson);
  int Nm = std::stoi(NmStr);
  int Nk = std::stoi(NkStr);
  int maxIter = std::stoi(maxIterStr);
  int Nstop = std::stoi(NstopStr);
  std::cout << GridLogMessage << "Runnning Krylov Schur. Nm = " << Nm << ", Nk = " << Nk << ", maxIter = " << maxIter 
                  << ", Nstop = " << Nstop << std::endl;
  // KrylovSchur KrySchur (PVdagM, FGrid, 1e-8, RF);         // use PV^\dag M
  KrylovSchur KrySchur (DLinOp, FGrid, 1e-8, RF);         // use Ddwf
  KrySchur(src, maxIter, Nm, Nk, Nstop);
  std::cout << GridLogMessage << "Checking eigensystem." << std::endl;
  KrySchur.checkRitzEstimate();
  std::cout<<GridLogMessage << "*******************************************" << std::endl;
  std::cout<<GridLogMessage << "***************** RESULTS *****************" << std::endl;
  std::cout<<GridLogMessage << "*******************************************" << std::endl;
  std::cout << GridLogMessage << "Krylov Schur eigenvalues: " << std::endl << KrySchur.getEvals() << std::endl;
  writeEigensystem(KrySchur, outDir);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  Grid_finalize();
  return 0;
 }
@@ -0,0 +1,15 @@
 #include <Grid/Grid.h>
 #pragma once
 #ifndef ENABLE_FERMION_INSTANTIATIONS
 #include <iostream>
 // Fallback entry point compiled when ENABLE_FERMION_INSTANTIATIONS is not
 // defined: explains why the example cannot run and exits with status 1.
 int main(void) {
  // Adjacent pieces are streamed back to back, so each one carries its own
  // trailing space to keep the words of the message separated.
  std::cout << "This build of Grid was configured to exclude fermion instantiations, "
 	    << "which this example relies on. "
 	    << "Please reconfigure and rebuild Grid with --enable-fermion-instantiations "
 	    << "to run this example."
 	    << std::endl;
  return 1;
 }
 #endif
@@ -25,6 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "disable_tests_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 using namespace std;
@@ -273,8 +276,6 @@ void  TestWhat(What & Ddwf,
  err = phi-chi;
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<< std::endl;
 }
-
+#endif
@@ -30,6 +30,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 *  Reimplement the badly named "multigrid" lanczos as compressed Lanczos using the features 
 *  in Grid that were intended to be used to support blocked Aggregates, from
 */
 #include "disable_tests_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
 #include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
@@ -256,3 +259,4 @@ int main (int argc, char ** argv) {
  Grid_finalize();
 }
 #endif
@@ -25,6 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "disable_tests_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 using namespace std;
@@ -237,3 +240,5 @@ int main (int argc, char ** argv)
  Grid_finalize();
 }
 #endif
@@ -25,6 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "disable_tests_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 using namespace std;
@@ -222,3 +225,5 @@ int main (int argc, char ** argv)
  Grid_finalize();
 }
 #endif
@@ -25,6 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include "disable_tests_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 using namespace std;
@@ -118,3 +121,4 @@ int main (int argc, char ** argv)
  Grid_finalize();
 }
 #endif
 #endif
@@ -24,6 +24,8 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 #include "disable_tests_without_instantiations.h"
 #ifdef ENABLE_FERMION_INSTANTIATIONS
 #include <Grid/Grid.h>
 #include <Grid/qcd/utils/A2Autils.h>
@@ -157,3 +159,5 @@ int main(int argc, char *argv[])
  return EXIT_SUCCESS;
 }
 #endif
@@ -128,6 +128,10 @@ int main (int argc, char ** argv)
  typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
  HermFineMatrix FineHermOp(HermOpEO);
  LatticeFermionD src(FrbGrid); 
  src = ComplexD(1.0);
  PowerMethod<LatticeFermionD>       PM;   PM(HermOpEO,src);
  ////////////////////////////////////////////////////////////
  ///////////// Coarse basis and Little Dirac Operator ///////
  ////////////////////////////////////////////////////////////
@@ -150,7 +154,7 @@ int main (int argc, char ** argv)
  std::cout << "**************************************"<<std::endl;
  std::cout << "Create Subspace"<<std::endl;
  std::cout << "**************************************"<<std::endl;
-  Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.); 
+  Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,35.,0.01,500);// <== last run
  std::cout << "**************************************"<<std::endl;
  std::cout << "Refine Subspace"<<std::endl;
@@ -185,7 +189,7 @@ int main (int argc, char ** argv)
  std::cout << "**************************************"<<std::endl;
  typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
-  Chebyshev<CoarseVector>      IRLCheby(0.05,40.0,101);  // 1 iter
+  Chebyshev<CoarseVector>      IRLCheby(0.01,16.0,201);  // 1 iter
  MrhsHermMatrix MrhsCoarseOp     (mrhs);
  CoarseVector pm_src(CoarseMrhs);
@@ -193,10 +197,10 @@ int main (int argc, char ** argv)
  PowerMethod<CoarseVector>       cPM;
  cPM(MrhsCoarseOp,pm_src);
-  int Nk=nrhs;
+  //  int Nk=16;
-  int Nm=Nk*3;
+  //  int Nm=Nk*3;
-  //  int Nk=36;
+  int Nk=32;
-  //  int Nm=144;
+  int Nm=128;
  int Nstop=Nk;
  int Nconv_test_interval=1;
@@ -210,7 +214,7 @@ int main (int argc, char ** argv)
 							  nrhs,
 							  Nk,
 							  Nm,
-							  1e-4,10);
+							  1e-4,100);
  int Nconv;
  std::vector<RealD>            eval(Nm);
@@ -231,8 +235,6 @@ int main (int argc, char ** argv)
  std::cout << "**************************************"<<std::endl;
  std::cout << " Recompute coarse evecs  "<<std::endl;
  std::cout << "**************************************"<<std::endl;
  evec.resize(Nm,Coarse5d);
  eval.resize(Nm);
  for(int r=0;r<nrhs;r++){
    random(CRNG,c_src[r]);
  }
@@ -243,7 +245,7 @@ int main (int argc, char ** argv)
  // Deflation guesser object
  ///////////////////////
  std::cout << "**************************************"<<std::endl;
-  std::cout << " Reimport coarse evecs  "<<std::endl;
+  std::cout << " Reimport coarse evecs "<<evec.size()<<" "<<eval.size()<<std::endl;
  std::cout << "**************************************"<<std::endl;
  MultiRHSDeflation<CoarseVector> MrhsGuesser;
  MrhsGuesser.ImportEigenBasis(evec,eval);
@@ -252,9 +254,11 @@ int main (int argc, char ** argv)
  // Extra HDCG parameters
  //////////////////////////
  int maxit=3000;
-  ConjugateGradient<CoarseVector>  CG(2.0e-1,maxit,false);
+  //  ConjugateGradient<CoarseVector>  CG(2.0e-1,maxit,false);
-  RealD lo=2.0;
+  //  ConjugateGradient<CoarseVector>  CG(1.0e-2,maxit,false);
-  int ord = 9;
+  ConjugateGradient<CoarseVector>  CG(5.0e-2,maxit,false);
  RealD lo=0.2;
  int ord = 7;
  DoNothingGuesser<CoarseVector> DoNothing;
  HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing);
@@ -300,6 +304,19 @@ int main (int argc, char ** argv)
    ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false);
    CGfine(HermOpEO, src, result);
  }
  {
    std::cout << "**************************************"<<std::endl;
    std::cout << "Calling MdagM CG"<<std::endl;
    std::cout << "**************************************"<<std::endl;
    LatticeFermion result(FGrid); result=Zero();
    LatticeFermion    src(FGrid); random(RNG5,src);
    result=Zero();
    MdagMLinearOperator<MobiusFermionD, LatticeFermionD> HermOp(Ddwf);
    ConjugateGradient<LatticeFermionD>  CGfine(1.0e-8,30000,false);
    CGfine(HermOp, src, result);
  }
 #endif  
  Grid_finalize();
  return 0;
@@ -368,7 +368,10 @@ int main (int argc, char ** argv)
  TrivialPrecon<CoarseVector> simple;
  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
-  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(3.0e-2, 100, LinOpCoarse,simple,10,10); 
+  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(3.0e-2, 100, LinOpCoarse,simple,12,12);  // 35 outer
  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(5.0e-2, 100, LinOpCoarse,simple,12,12);  // 36 outer, 12s
  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-1, 100, LinOpCoarse,simple,12,12);  // 36 ; 11s   
  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(3.0e-1, 100, LinOpCoarse,simple,12,12);     
  L2PGCR.Level(3);
  c_res=Zero();
  L2PGCR(c_src,c_res);
@@ -400,7 +403,7 @@ int main (int argc, char ** argv)
 			    LinOpCoarse,
 			    L2PGCR);
-  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,16,16);
+  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,100,PVdagM,TwoLevelPrecon,10,10);
  L1PGCR.Level(1);
  f_res=Zero();
@@ -0,0 +1,493 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_padded_cell.cc
    Copyright (C) 2023
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
 using namespace std;
 using namespace Grid;
 // Presents two matrices of the same type as the composite operator PV^dag M.
 // Op, AdjOp and HermOp are provided; every other LinearOperatorBase entry
 // point asserts when called.
 template<class Matrix,class Field>
 class PVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  Matrix &_PV;
 public:
  PVdagMLinearOperator(Matrix &Mat,Matrix &PV)
    : _Mat(Mat),
      _PV(PV)
  {}
  // Unsupported entry points.
  void OpDiag (const Field &in, Field &out) {
    assert(0);
  }
  void OpDir (const Field &in, Field &out,int dir,int disp) {
    assert(0);
  }
  void OpDirAll (const Field &in, std::vector<Field> &out) {
    assert(0);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) {
    assert(0);
  }
  // out = PV^dag (M in)
  void Op (const Field &in, Field &out) {
    Field MatIn(in.Grid());
    _Mat.M(in,MatIn);
    _PV.Mdag(MatIn,out);
  }
  // out = M^dag (PV in)
  void AdjOp (const Field &in, Field &out) {
    Field PVIn(in.Grid());
    _PV.M(in,PVIn);
    _Mat.Mdag(PVIn,out);
  }
  // out = AdjOp(Op(in))
  void HermOp (const Field &in, Field &out) {
    Field OpIn(in.Grid());
    this->Op(in,OpIn);
    this->AdjOp(OpIn,out);
  }
 };
 // Presents two matrices of the same type as the composite operator M^dag PV,
 // i.e. the roles of Op and AdjOp are exchanged relative to PV^dag M.
 // Op, AdjOp and HermOp are provided; every other LinearOperatorBase entry
 // point asserts when called.
 template<class Matrix,class Field>
 class MdagPVLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  Matrix &_PV;
 public:
  MdagPVLinearOperator(Matrix &Mat,Matrix &PV)
    : _Mat(Mat),
      _PV(PV)
  {}
  // Unsupported entry points.
  void OpDiag (const Field &in, Field &out) {
    assert(0);
  }
  void OpDir (const Field &in, Field &out,int dir,int disp) {
    assert(0);
  }
  void OpDirAll (const Field &in, std::vector<Field> &out) {
    assert(0);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) {
    assert(0);
  }
  // out = M^dag (PV in)
  void Op (const Field &in, Field &out) {
    Field PVIn(in.Grid());
    _PV.M(in,PVIn);
    _Mat.Mdag(PVIn,out);
  }
  // out = PV^dag (M in)
  void AdjOp (const Field &in, Field &out) {
    Field MatIn(in.Grid());
    _Mat.M(in,MatIn);
    _PV.Mdag(MatIn,out);
  }
  // out = AdjOp(Op(in))
  void HermOp (const Field &in, Field &out) {
    Field OpIn(in.Grid());
    this->Op(in,OpIn);
    this->AdjOp(OpIn,out);
  }
 };
 // Operator A = PV^dag M + shift with a real shift fixed at construction.
 //   Op     : out = PV^dag M in + shift * in
 //   AdjOp  : out = M^dag PV in + shift * in
 //   HermOp : AdjOp applied to the result of Op
 // OpDiag, OpDir, OpDirAll and HermOpAndNorm are not implemented and assert.
 template<class Matrix,class Field>
 class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;
  Matrix &_PV;
  RealD shift;
 public:
  // Initialisers are listed in member declaration order (_Mat, _PV, shift).
  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
  void OpDiag (const Field &in, Field &out) {    assert(0);  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
  void Op     (const Field &in, Field &out){
    //    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  void AdjOp     (const Field &in, Field &out){
    //    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    // The input goes through PV first, then M^dag; the scratch field must
    // never be read before it has been written.
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
    out = out + shift * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
  void HermOp(const Field &in, Field &out){
    //    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
 };
 // Two-level preconditioner used as a LinearFunction on fine fields.
 // operator() runs, in order: pre-smoother, coarse-grid correction (project
 // with _FineToCoarse, solve with _CoarseSolve, promote with _CoarseToFine),
 // then post-smoother on the remaining residual. Separate aggregation objects
 // are held for projection and promotion so the two bases may differ.
 // Timing of each stage is logged through GridLogMessage.
 template<class Fobj,class CComplex,int nbasis>
 class MGPreconditionerSVD : public LinearFunction< Lattice<Fobj> > {
 public:
  using LinearFunction<Lattice<Fobj> >::operator();
  typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField    FineField;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
  typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
  typedef LinearOperatorBase<FineField>                            FineOperator;
  typedef LinearFunction    <FineField>                            FineSmoother;
  typedef LinearOperatorBase<CoarseVector>                         CoarseOperator;
  typedef LinearFunction    <CoarseVector>                         CoarseSolver;
  // All collaborators are held by reference; the caller must keep them alive.
  Aggregates     & _FineToCoarse;
  Aggregates     & _CoarseToFine;
  FineOperator   & _FineOperator;
  FineSmoother   & _PreSmoother;
  FineSmoother   & _PostSmoother;
  // Stored but only referenced from commented-out code in operator().
  CoarseOperator & _CoarseOperator;
  CoarseSolver   & _CoarseSolve;
  // Level tag; set here and in the constructor, not read inside this class.
  int    level;  void Level(int lv) {level = lv; };
  // Binds the projection basis (FtoC), promotion basis (CtoF), fine operator,
  // the two smoothers, the coarse operator and the coarse solver; level starts at 1.
  MGPreconditionerSVD(Aggregates &FtoC,
 		      Aggregates &CtoF,
 		      FineOperator &Fine,
 		      FineSmoother &PreSmoother,
 		      FineSmoother &PostSmoother,
 		      CoarseOperator &CoarseOperator_,
 		      CoarseSolver &CoarseSolve_)
    : _FineToCoarse(FtoC),
      _CoarseToFine(CtoF),
      _FineOperator(Fine),
      _PreSmoother(PreSmoother),
      _PostSmoother(PostSmoother),
      _CoarseOperator(CoarseOperator_),
      _CoarseSolve(CoarseSolve_),
      level(1)  {  }
  // Applies the preconditioner: `in` is the right-hand side, `out` receives the
  // accumulated correction (it is zeroed first, so its incoming value is ignored).
  virtual void operator()(const FineField &in, FineField & out) 
  {
    GridBase *CoarseGrid = _FineToCoarse.CoarseGrid;
    //    auto CoarseGrid = _CoarseOperator.Grid();
    CoarseVector Csrc(CoarseGrid);
    CoarseVector Csol(CoarseGrid);
    // vec1 is reused for the residual and for the promoted coarse correction.
    FineField vec1(in.Grid());
    FineField vec2(in.Grid());
    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
    //    std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
    // Stopwatch: set to -usecond() before a stage and incremented by usecond() after it.
    double t;
    // Fine Smoother
    //    out = in;
    out = Zero();
    t=-usecond();
    _PreSmoother(in,out);
    t+=usecond();
    std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;
    // Update the residual
    // presumably vec1 = in - A*out (sub(ret,a,b) taken as ret = a - b) -- TODO confirm
    _FineOperator.Op(out,vec1);  sub(vec1, in ,vec1);   
    //    std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
    // Fine to Coarse 
    t=-usecond();
    _FineToCoarse.ProjectToSubspace  (Csrc,vec1);
    t+=usecond();
    std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
    // Coarse correction
    t=-usecond();
    // The coarse solve starts from a zero initial guess.
    Csol = Zero();
    _CoarseSolve(Csrc,Csol);
    //Csol=Zero();
    t+=usecond();
    std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
    // Coarse to Fine
    t=-usecond();  
    //    _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
    // vec1 is overwritten with the promoted coarse solution and accumulated into out.
    _CoarseToFine.PromoteFromSubspace(Csol,vec1); 
    add(out,out,vec1);
    t+=usecond();
    std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
    // Residual
    _FineOperator.Op(out,vec1);  sub(vec1 ,in , vec1);  
    //    std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
    // Fine Smoother
    t=-usecond();
    //    vec2=vec1;
    // The post-smoother acts on the new residual from a zero start; its result is added to out.
    vec2=Zero();
    _PostSmoother(vec1,vec2);
    t+=usecond();
    std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
    add( out,out,vec2);
    std::cout<<GridLogMessage << "Done " <<std::endl;
  }
 };
 // Test driver for a two-level multigrid preconditioner built from two
 // Chebyshev-generated subspaces (V from PVdagM, U from MdagPV) that are
 // concatenated into one basis of size 2*nbasis.
 // Sequence: build grids and operators, create the subspaces, print their
 // cross matrix elements, coarsen PVdagM, compare the coarse operator with the
 // projected fine operator, then run a coarse solve, a fine smoother and the
 // preconditioned outer solve. Reads the gauge field from "ckpoint_lat.4000".
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  const int Ls=16;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  // Construct a coarsened grid
  // Every four-dimensional extent is halved; the coarse fifth dimension has extent 1.
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/2;
    //    clatt[d] = clatt[d]/4;
  }
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
  // Fixed seeds so that runs are reproducible.
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  std::vector<int> cseeds({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
  // src doubles as a scratch field for operator applications further down.
  LatticeFermion    src(FGrid); random(RNG5,src);
  LatticeFermion result(FGrid); result=Zero();
  LatticeFermion    ref(FGrid); ref=Zero();
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  std::string file("ckpoint_lat.4000");
  NerscIO::readConfiguration(Umu,header,file);
  RealD mass=0.01;
  RealD M5=1.8;
  // Ddwf uses `mass`; Dpv is the same construction with 1.0 in the mass slot.
  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
  // Number of vectors in each of the two subspaces; the combined basis has 2*nbasis.
  const int nbasis = 30;
  const int cb = 0 ;
  NextToNearestStencilGeometry5D geom(Coarse5d);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
  typedef MdagPVLinearOperator<DomainWallFermionD,LatticeFermionD> MdagPV_t;
  typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
  PVdagM_t PVdagM(Ddwf,Dpv);
  MdagPV_t MdagPV(Ddwf,Dpv);
  // Tuning log of earlier shift choices; the trailing numbers are the author's
  // recorded results for each setting.
  //  ShiftedPVdagM_t ShiftedPVdagM(2.0,Ddwf,Dpv); // 355
  //  ShiftedPVdagM_t ShiftedPVdagM(1.0,Ddwf,Dpv); // 246
  //  ShiftedPVdagM_t ShiftedPVdagM(0.5,Ddwf,Dpv); // 183
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 145
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 134
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 127 -- NULL space via inverse iteration
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 57 -- NULL space via inverse iteration; 3 iterations
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 57 , tighter inversion
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 49 iters
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 70 iters; asymmetric 
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 58; Loosen coarse, tighten fine
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 56 ... 
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 51 ...  with 24 vecs
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 31 ...  with 24 vecs and 2^4 blocking
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 43 ...  with 16 vecs and 2^4 blocking, sloppier
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35  ...  with 20 vecs and 2^4 blocking
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35  ...  with 20 vecs and 2^4 blocking, looser coarse
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 64  ...  with 20 vecs, Christoph setup, and 2^4 blocking, looser coarse
  ShiftedPVdagM_t ShiftedPVdagM(0.01,Ddwf,Dpv); // 
  // Run power method on HOA??
  // PM is constructed but its invocations below are commented out.
  PowerMethod<LatticeFermion>       PM;
  //  PM(PVdagM,src);
  //  PM(MdagPV,src);
  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace V(Coarse5d,FGrid,cb);
  Subspace U(Coarse5d,FGrid,cb);
  // Breeds right singular vectors with call to HermOp (V)
  V.CreateSubspaceChebyshev(RNG5,PVdagM,
 			    nbasis,
 			    4000.0,0.003,
 			    500);
  // Breeds left singular vectors with call to HermOp (U)
  //  U.CreateSubspaceChebyshev(RNG5,PVdagM,
  U.CreateSubspaceChebyshev(RNG5,MdagPV,
 			    nbasis,
 			    4000.0,0.003,
 			    500);
  // Concatenate the two bases: V fills slots [0,nbasis), U fills [nbasis,2*nbasis).
  typedef Aggregation<vSpinColourVector,vTComplex,2*nbasis> CombinedSubspace;
  CombinedSubspace CombinedUV(Coarse5d,FGrid,cb);
  for(int b=0;b<nbasis;b++){
    CombinedUV.subspace[b]        = V.subspace[b];
    CombinedUV.subspace[b+nbasis] = U.subspace[b];
  }
  // Diagnostic output: matrix elements of PVdagM between the basis vectors.
  // Each of the four blocks overwrites src with PVdagM applied to the right-hand vector.
  int bl, br;
  std::cout <<" <V| PVdagM| V> " <<std::endl;
  for(bl=0;bl<nbasis;bl++){
  for(br=0;br<nbasis;br++){
    PVdagM.Op(V.subspace[br],src);
    std::cout <<bl<<" "<<br<<"\t"<<innerProduct(V.subspace[bl],src)<<std::endl;
  }}
  std::cout <<" <V| PVdagM| U> " <<std::endl;
  for(bl=0;bl<nbasis;bl++){
  for(br=0;br<nbasis;br++){
    PVdagM.Op(U.subspace[br],src);
    std::cout <<bl<<" "<<br<<"\t"<<innerProduct(V.subspace[bl],src)<<std::endl;
  }}
  std::cout <<" <U| PVdagM| V> " <<std::endl;
  for(bl=0;bl<nbasis;bl++){
  for(br=0;br<nbasis;br++){
    PVdagM.Op(V.subspace[br],src);
    std::cout <<bl<<" "<<br<<"\t"<<innerProduct(U.subspace[bl],src)<<std::endl;
  }}
  std::cout <<" <U| PVdagM| U> " <<std::endl;
  for(bl=0;bl<nbasis;bl++){
  for(br=0;br<nbasis;br++){
    PVdagM.Op(U.subspace[br],src);
    std::cout <<bl<<" "<<br<<"\t"<<innerProduct(U.subspace[bl],src)<<std::endl;
  }}
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperatorV;
  typedef LittleDiracOperatorV::CoarseVector CoarseVectorV;
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,2*nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;
  // V is orthogonalised here, after its vectors were copied into CombinedUV above.
  V.Orthogonalise();
  // For each U vector: project onto V, promote back, and print the norm ratio.
  for(int b =0 ; b<nbasis;b++){
    CoarseVectorV c_src (Coarse5d);
    V.ProjectToSubspace  (c_src,U.subspace[b]);
    V.PromoteFromSubspace(c_src,src);
    std::cout << " Completeness of U in V ["<< b<<"] "<< std::sqrt(norm2(src)/norm2(U.subspace[b]))<<std::endl;
  }
  CoarseVector c_src (Coarse5d);
  CoarseVector c_res (Coarse5d);
  CoarseVector c_proj(Coarse5d);
  LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d);
  LittleDiracOpPV.CoarsenOperator(PVdagM,CombinedUV,CombinedUV);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
  // Consistency check: apply the fine operator to the sum of all basis vectors,
  // project the result, and compare with the coarse operator applied to c_src.
  Complex one(1.0);
  c_src = one;  // 1 in every element for vector 1.
  blockPromote(c_src,err,CombinedUV.subspace);
  LatticeFermion prom(FGrid);
  prom=Zero();
  for(int b=0;b<nbasis*2;b++){
    prom=prom+CombinedUV.subspace[b];
  }
  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;
  PVdagM.Op(prom,tmp);
  blockProject(c_proj,tmp,CombinedUV.subspace);
  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
  LittleDiracOpPV.M(c_src,c_res);
  std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << "  c_res "<< norm2(c_res) <<std::endl;
  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
  c_proj = c_proj - c_res;
  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
  /**********
   * Some solvers
   **********
   */
  ///////////////////////////////////////
  // Coarse grid solver test
  ///////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  TrivialPrecon<CoarseVector> simple;
  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-2, 10, LinOpCoarse,simple,20,20); 
  L2PGCR.Level(3);
  c_res=Zero();
  L2PGCR(c_src,c_res);
  ////////////////////////////////////////
  // Fine grid smoother
  ////////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  TrivialPrecon<LatticeFermionD> simple_fine;
  //  NonHermitianLinearOperator<PVdagM_t,LatticeFermionD> LinOpSmooth(PVdagM);
  // The smoother works on the shifted operator, not on PVdagM itself.
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedPVdagM,simple_fine,16,16);
  SmootherGCR.Level(2);
  LatticeFermionD f_src(FGrid);
  LatticeFermionD f_res(FGrid);
  f_src = one;  // 1 in every element for vector 1.
  f_res=Zero();
  SmootherGCR(f_src,f_res);
  // Two-level preconditioner: the same combined basis is used for projection
  // and promotion, simple_fine is the pre-smoother and SmootherGCR the post-smoother.
  typedef MGPreconditionerSVD<vSpinColourVector,  vTComplex,nbasis*2> TwoLevelMG;
  TwoLevelMG TwoLevelPrecon(CombinedUV,CombinedUV,
 			    PVdagM,
 			    simple_fine,
 			    SmootherGCR,
 			    LinOpCoarse,
 			    L2PGCR);
  // Outer solve on PVdagM with the two-level preconditioner.
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,20,20);
  L1PGCR.Level(1);
  f_res=Zero();
  L1PGCR(f_src,f_res);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  Grid_finalize();
  return 0;
 }
@@ -0,0 +1,492 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_padded_cell.cc
    Copyright (C) 2023
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
 using namespace std;
 using namespace Grid;
 // Presents two matrices of the same type as the single linear operator PV^dag M.
 //   Op     : out = PV.Mdag( Mat.M( in ) )
 //   AdjOp  : out = Mat.Mdag( PV.M( in ) )
 //   HermOp : out = AdjOp( Op( in ) )
 // The diagonal / directional entry points are not supported and assert.
 template<class Matrix,class Field>
 class PVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;   // supplies M and Mdag
  Matrix &_PV;    // supplies the PV factor
 public:
  PVdagMLinearOperator(Matrix &Mat,Matrix &PV)
    : _Mat(Mat), _PV(PV)
  {}

  // Unsupported pieces of the operator interface.
  void OpDiag (const Field &in, Field &out) {
    assert(0);
  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    assert(0);
  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){
    assert(0);
  }

  // out = PV^dag M in
  void Op     (const Field &in, Field &out){
    Field M_in(in.Grid());
    _Mat.M(in,M_in);
    _PV.Mdag(M_in,out);
  }

  // out = M^dag PV in
  void AdjOp     (const Field &in, Field &out){
    Field PV_in(in.Grid());
    _PV.M(in,PV_in);
    _Mat.Mdag(PV_in,out);
  }

  // out = M^dag PV PV^dag M in
  void HermOp(const Field &in, Field &out){
    Field Op_in(in.Grid());
    Op(in,Op_in);
    AdjOp(Op_in,out);
  }

  // Applies HermOp, then reports n1 = Re<in|out> and n2 = |out|^2.
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    HermOp(in,out);
    const ComplexD overlap = innerProduct(in,out);
    n1 = real(overlap);
    n2 = norm2(out);
  }
 };
 // Presents two matrices of the same type as the single linear operator M^dag PV,
 // i.e. the adjoint ordering of PVdagMLinearOperator.
 //   Op     : out = Mat.Mdag( PV.M( in ) )
 //   AdjOp  : out = PV.Mdag( Mat.M( in ) )
 //   HermOp : out = AdjOp( Op( in ) )
 // The diagonal / directional entry points are not supported and assert.
 template<class Matrix,class Field>
 class MdagPVLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;   // supplies M and Mdag
  Matrix &_PV;    // supplies the PV factor
 public:
  MdagPVLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){};
  void OpDiag (const Field &in, Field &out) {    assert(0);  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
  // out = M^dag PV in
  void Op     (const Field &in, Field &out){
    Field tmp(in.Grid());
    //    std::cout <<GridLogMessage<< "Op: Mdag PV "<<std::endl;
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
  }
  // out = PV^dag M in
  void AdjOp     (const Field &in, Field &out){
    //    std::cout <<GridLogMessage<< "AdjOp: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
  }
  // Applies HermOp, then reports n1 = Re<in|out> and n2 = |out|^2.
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    // The operator must be applied before measuring: without this call the
    // inner product and norm were taken of whatever `out` held on entry.
    HermOp(in,out);
    ComplexD dot = innerProduct(in,out);
    n1=real(dot);
    n2=norm2(out);
  }
  // out = PV^dag M M^dag PV in
  void HermOp(const Field &in, Field &out){
    //    std::cout << GridLogMessage<<"HermOp: PVdag M Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
    //    std::cout << "HermOp done "<<norm2(out)<<std::endl;
  }
 };
 // Linear operator PV^dag M + shift, with a real scalar shift.
 //   Op     : out = PV.Mdag( Mat.M( in ) ) + shift * in
 //   AdjOp  : out = Mat.Mdag( PV.M( in ) ) + shift * in
 //   HermOp : out = AdjOp( Op( in ) )
 // The diagonal / directional entry points and HermOpAndNorm are not supported and assert.
 template<class Matrix,class Field>
 class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;   // supplies M and Mdag
  Matrix &_PV;    // supplies the PV factor
  RealD shift;    // real shift added to the operator
 public:
  // Initialisers are listed in declaration order (members are always
  // initialised in that order regardless of how the list is written).
  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
  void OpDiag (const Field &in, Field &out) {    assert(0);  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
  // out = (PV^dag M + shift) in
  void Op     (const Field &in, Field &out){
    //    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  // out = (M^dag PV + shift) in
  void AdjOp     (const Field &in, Field &out){
    //    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    // Apply PV to the input first, then Mdag to that result. The previous code
    // passed the uninitialised temporary to PV.M and discarded the Mdag result.
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
    out = out + shift * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
  // out = AdjOp( Op( in ) )
  void HermOp(const Field &in, Field &out){
    //    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
 };
 // Two-level multigrid preconditioner with separate restriction and prolongation
 // aggregates. One application performs, in order:
 //   pre-smooth -> residual -> project to coarse -> coarse solve
 //   -> promote and add -> residual -> post-smooth and add,
 // logging the wall time of each stage.
 template<class Fobj,class CComplex,int nbasis>
 class MGPreconditionerSVD : public LinearFunction< Lattice<Fobj> > {
 public:
  using LinearFunction<Lattice<Fobj> >::operator();

  using Aggregates     = Aggregation<Fobj,CComplex,nbasis>;
  using FineField      = typename Aggregates::FineField;
  using CoarseVector   = typename Aggregates::CoarseVector;
  using CoarseMatrix   = typename Aggregates::CoarseMatrix;
  using FineOperator   = LinearOperatorBase<FineField>;
  using FineSmoother   = LinearFunction    <FineField>;
  using CoarseOperator = LinearOperatorBase<CoarseVector>;
  using CoarseSolver   = LinearFunction    <CoarseVector>;

  Aggregates     & _FineToCoarse;   // used for ProjectToSubspace
  Aggregates     & _CoarseToFine;   // used for PromoteFromSubspace
  FineOperator   & _FineOperator;
  FineSmoother   & _PreSmoother;
  FineSmoother   & _PostSmoother;
  CoarseOperator & _CoarseOperator;
  CoarseSolver   & _CoarseSolve;

  int    level;
  void Level(int lv) { level = lv; }

  MGPreconditionerSVD(Aggregates &FtoC,
 		      Aggregates &CtoF,
 		      FineOperator &Fine,
 		      FineSmoother &PreSmoother,
 		      FineSmoother &PostSmoother,
 		      CoarseOperator &CoarseOperator_,
 		      CoarseSolver &CoarseSolve_)
    : _FineToCoarse(FtoC),
      _CoarseToFine(CtoF),
      _FineOperator(Fine),
      _PreSmoother(PreSmoother),
      _PostSmoother(PostSmoother),
      _CoarseOperator(CoarseOperator_),
      _CoarseSolve(CoarseSolve_),
      level(1)
  {}

  // Apply the preconditioner: `in` is the source, `out` receives the approximate solution.
  virtual void operator()(const FineField &in, FineField & out)
  {
    GridBase *cgrid = _FineToCoarse.CoarseGrid;

    CoarseVector c_rhs(cgrid);
    CoarseVector c_sol(cgrid);
    FineField resid(in.Grid());
    FineField post (in.Grid());

    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;

    // --- Pre-smoothing, starting from a zero guess
    out = Zero();
    double elapsed = -usecond();
    _PreSmoother(in,out);
    elapsed += usecond();
    std::cout<<GridLogMessage << "PreSmoother took "<< elapsed/1000.0<< "ms" <<std::endl;

    // --- Residual after pre-smoothing (sub(r,a,b) presumably forms a - b)
    _FineOperator.Op(out,resid);
    sub(resid,in,resid);

    // --- Restrict the residual to the coarse space
    elapsed = -usecond();
    _FineToCoarse.ProjectToSubspace(c_rhs,resid);
    elapsed += usecond();
    std::cout<<GridLogMessage << "Project to coarse took "<< elapsed/1000.0<< "ms" <<std::endl;

    // --- Coarse-grid correction, zero initial guess
    elapsed = -usecond();
    c_sol = Zero();
    _CoarseSolve(c_rhs,c_sol);
    elapsed += usecond();
    std::cout<<GridLogMessage << "Coarse solve took "<< elapsed/1000.0<< "ms" <<std::endl;

    // --- Prolongate the correction and accumulate it into the solution
    elapsed = -usecond();
    _CoarseToFine.PromoteFromSubspace(c_sol,resid);
    add(out,out,resid);
    elapsed += usecond();
    std::cout<<GridLogMessage << "Promote to this level took "<< elapsed/1000.0<< "ms" <<std::endl;

    // --- Residual after the coarse correction
    _FineOperator.Op(out,resid);
    sub(resid,in,resid);

    // --- Post-smoothing on that residual, zero initial guess
    elapsed = -usecond();
    post = Zero();
    _PostSmoother(resid,post);
    elapsed += usecond();
    std::cout<<GridLogMessage << "PostSmoother took "<< elapsed/1000.0<< "ms" <<std::endl;

    add(out,out,post);
    std::cout<<GridLogMessage << "Done " <<std::endl;
  }
 };
 // Test driver for a two-level multigrid preconditioned solve of the PV^dag M operator.
 // Sequence: build fine and coarse grids, read a gauge configuration, construct the
 // domain wall operators, breed two nbasis-vector subspaces (V and U), merge them into a
 // 2*nbasis subspace, print matrix elements of PVdagM between the subspaces, coarsen the
 // operator and compare it with the fine operator in the subspace, then run a coarse
 // solver, a fine smoother and finally the outer solver using the two-level preconditioner.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  // Fifth-dimension extent passed to the fine 5d grid constructors
  const int Ls=16;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  // Construct a coarsened grid
  // Each 4d extent is halved; the coarse 5d grid is built with 1 as its first argument
  // (presumably a fifth-dimension extent of one -- confirm against makeFiveDimGrid).
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/2;
    //    clatt[d] = clatt[d]/4;
  }
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
  // Fixed seeds; note cseeds repeats the values of seeds5.
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  std::vector<int> cseeds({5,6,7,8});
  // NOTE(review): RNG4 and CRNG are seeded but not referenced again in this function.
  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
  // Work fields on the fine 5d grid. src is reused below as scratch output;
  // result and ref are not referenced again in this function.
  LatticeFermion    src(FGrid); random(RNG5,src);
  LatticeFermion result(FGrid); result=Zero();
  LatticeFermion    ref(FGrid); ref=Zero();
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  // Gauge field is read from a fixed file name relative to the working directory
  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  std::string file("ckpoint_lat.4000");
  NerscIO::readConfiguration(Umu,header,file);
  // Two domain wall operators on the same gauge field: Ddwf at `mass`, Dpv at mass 1.0
  RealD mass=0.01;
  RealD M5=1.8;
  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
  // Number of vectors in each of the V and U subspaces (combined space has 2*nbasis)
  const int nbasis = 20;
  const int cb = 0 ;
  NextToNearestStencilGeometry5D geom(Coarse5d);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
  typedef MdagPVLinearOperator<DomainWallFermionD,LatticeFermionD> MdagPV_t;
  typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
  PVdagM_t PVdagM(Ddwf,Dpv);
  // NOTE(review): MdagPV is constructed but only appears in commented-out code below.
  MdagPV_t MdagPV(Ddwf,Dpv);
  // History of shift values tried for the smoother operator; the trailing numbers are
  // the author's recorded results (apparently iteration counts -- see the "iters" entries).
  //  ShiftedPVdagM_t ShiftedPVdagM(2.0,Ddwf,Dpv); // 355
  //  ShiftedPVdagM_t ShiftedPVdagM(1.0,Ddwf,Dpv); // 246
  //  ShiftedPVdagM_t ShiftedPVdagM(0.5,Ddwf,Dpv); // 183
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 145
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 134
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 127 -- NULL space via inverse iteration
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 57 -- NULL space via inverse iteration; 3 iterations
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 57 , tighter inversion
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 49 iters
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 70 iters; asymmetric 
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 58; Loosen coarse, tighten fine
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 56 ... 
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 51 ...  with 24 vecs
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 31 ...  with 24 vecs and 2^4 blocking
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 43 ...  with 16 vecs and 2^4 blocking, sloppier
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35  ...  with 20 vecs and 2^4 blocking
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35  ...  with 20 vecs and 2^4 blocking, looser coarse
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 64  ...  with 20 vecs, Christoph setup, and 2^4 blocking, looser coarse
  ShiftedPVdagM_t ShiftedPVdagM(0.01,Ddwf,Dpv); // 
  // Run power method on HOA??
  // (the power method calls are disabled here; PM is otherwise unused)
  PowerMethod<LatticeFermion>       PM;
  //  PM(PVdagM,src);
  //  PM(MdagPV,src);
  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace V(Coarse5d,FGrid,cb);
  Subspace U(Coarse5d,FGrid,cb);
  // Breeds right singular vectors with call to HermOp (V)
  V.CreateSubspace(RNG5,PVdagM,nbasis);
  // Breeds left singular vectors with call to HermOp (U)
  // NOTE(review): U is bred from the same operator (PVdagM) as V, not from MdagPV;
  // only the RNG5 state differs between the two calls -- confirm this is intended.
  //  U.CreateSubspaceChebyshev(RNG5,MdagPV,
  U.CreateSubspace(RNG5,PVdagM,nbasis);
  // Concatenate V (first nbasis slots) and U (last nbasis slots) into one subspace
  typedef Aggregation<vSpinColourVector,vTComplex,2*nbasis> CombinedSubspace;
  CombinedSubspace CombinedUV(Coarse5d,FGrid,cb);
  for(int b=0;b<nbasis;b++){
    CombinedUV.subspace[b]        = V.subspace[b];
    CombinedUV.subspace[b+nbasis] = U.subspace[b];
  }
  // Diagnostic dump of the matrix elements <L[bl]| PVdagM |R[br]> for every pairing of
  // L,R in {V,U}. src is overwritten as scratch storage for PVdagM applied to R[br].
  int bl, br;
  std::cout <<" <V| PVdagM| V> " <<std::endl;
  for(bl=0;bl<nbasis;bl++){
  for(br=0;br<nbasis;br++){
    PVdagM.Op(V.subspace[br],src);
    std::cout <<bl<<" "<<br<<"\t"<<innerProduct(V.subspace[bl],src)<<std::endl;
  }}
  std::cout <<" <V| PVdagM| U> " <<std::endl;
  for(bl=0;bl<nbasis;bl++){
  for(br=0;br<nbasis;br++){
    PVdagM.Op(U.subspace[br],src);
    std::cout <<bl<<" "<<br<<"\t"<<innerProduct(V.subspace[bl],src)<<std::endl;
  }}
  std::cout <<" <U| PVdagM| V> " <<std::endl;
  for(bl=0;bl<nbasis;bl++){
  for(br=0;br<nbasis;br++){
    PVdagM.Op(V.subspace[br],src);
    std::cout <<bl<<" "<<br<<"\t"<<innerProduct(U.subspace[bl],src)<<std::endl;
  }}
  std::cout <<" <U| PVdagM| U> " <<std::endl;
  for(bl=0;bl<nbasis;bl++){
  for(br=0;br<nbasis;br++){
    PVdagM.Op(U.subspace[br],src);
    std::cout <<bl<<" "<<br<<"\t"<<innerProduct(U.subspace[bl],src)<<std::endl;
  }}
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperatorV;
  typedef LittleDiracOperatorV::CoarseVector CoarseVectorV;
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,2*nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;
  // NOTE(review): V is orthogonalised only here, after its vectors were already copied
  // into CombinedUV above, so CombinedUV holds the pre-orthogonalisation copies.
  V.Orthogonalise();
  // For each U vector: project onto V, promote back, and print the ratio of norms
  for(int b =0 ; b<nbasis;b++){
    CoarseVectorV c_src (Coarse5d);
    V.ProjectToSubspace  (c_src,U.subspace[b]);
    V.PromoteFromSubspace(c_src,src);
    std::cout << " Completeness of U in V ["<< b<<"] "<< std::sqrt(norm2(src)/norm2(U.subspace[b]))<<std::endl;
  }
  CoarseVector c_src (Coarse5d);
  CoarseVector c_res (Coarse5d);
  CoarseVector c_proj(Coarse5d);
  // Coarsen PVdagM using the combined subspace for both arguments
  LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d);
  LittleDiracOpPV.CoarsenOperator(PVdagM,CombinedUV,CombinedUV);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
  // Consistency check: apply the fine operator to the sum of all basis vectors and
  // project (c_proj), apply the coarse operator to the all-ones coarse vector (c_res),
  // then print the norm of the difference.
  Complex one(1.0);
  c_src = one;  // 1 in every element for vector 1.
  // err receives the promoted vector but is not referenced again in this function
  blockPromote(c_src,err,CombinedUV.subspace);
  LatticeFermion prom(FGrid);
  prom=Zero();
  for(int b=0;b<nbasis*2;b++){
    prom=prom+CombinedUV.subspace[b];
  }
  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;
  PVdagM.Op(prom,tmp);
  blockProject(c_proj,tmp,CombinedUV.subspace);
  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
  LittleDiracOpPV.M(c_src,c_res);
  std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << "  c_res "<< norm2(c_res) <<std::endl;
  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
  c_proj = c_proj - c_res;
  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
  /**********
   * Some solvers
   **********
   */
  ///////////////////////////////////////
  // Coarse grid solver test
  ///////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  // Unpreconditioned coarse solve of the little operator against the all-ones source.
  // (First two solver arguments are presumably tolerance and iteration limit -- confirm.)
  TrivialPrecon<CoarseVector> simple;
  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-2, 10, LinOpCoarse,simple,20,20); 
  L2PGCR.Level(3);
  c_res=Zero();
  L2PGCR(c_src,c_res);
  ////////////////////////////////////////
  // Fine grid smoother
  ////////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  // The smoother works on the shifted operator, not on PVdagM itself
  TrivialPrecon<LatticeFermionD> simple_fine;
  //  NonHermitianLinearOperator<PVdagM_t,LatticeFermionD> LinOpSmooth(PVdagM);
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedPVdagM,simple_fine,16,16);
  SmootherGCR.Level(2);
  LatticeFermionD f_src(FGrid);
  LatticeFermionD f_res(FGrid);
  f_src = one;  // 1 in every element for vector 1.
  f_res=Zero();
  SmootherGCR(f_src,f_res);
  // Two-level preconditioner: CombinedUV serves as both restriction and prolongation,
  // the pre-smoother is the trivial preconditioner and the post-smoother is SmootherGCR.
  typedef MGPreconditionerSVD<vSpinColourVector,  vTComplex,nbasis*2> TwoLevelMG;
  TwoLevelMG TwoLevelPrecon(CombinedUV,CombinedUV,
 			    PVdagM,
 			    simple_fine,
 			    SmootherGCR,
 			    LinOpCoarse,
 			    L2PGCR);
  // Outer solve of PVdagM against the all-ones source using the two-level preconditioner
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,20,20);
  L1PGCR.Level(1);
  f_res=Zero();
  L1PGCR(f_src,f_res);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  Grid_finalize();
  return 0;
 }
@@ -0,0 +1,479 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_padded_cell.cc
    Copyright (C) 2023
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
 using namespace std;
 using namespace Grid;
 // Linear operator PV^dag M assembled from two matrices of the same type.
 //   Op     : out = PV.Mdag( Mat.M( in ) )
 //   AdjOp  : out = Mat.Mdag( PV.M( in ) )
 //   HermOp : out = AdjOp( Op( in ) )
 // OpDiag, OpDir, OpDirAll and HermOpAndNorm are not supported and assert.
 template<class Matrix,class Field>
 class PVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;   // supplies M and Mdag
  Matrix &_PV;    // supplies the PV factor
 public:
  PVdagMLinearOperator(Matrix &Mat,Matrix &PV)
    : _Mat(Mat), _PV(PV)
  {}

  // Unsupported pieces of the operator interface.
  void OpDiag (const Field &in, Field &out) {
    assert(0);
  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    assert(0);
  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){
    assert(0);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    assert(0);
  }

  // out = PV^dag M in
  void Op     (const Field &in, Field &out){
    Field M_in(in.Grid());
    _Mat.M(in,M_in);
    _PV.Mdag(M_in,out);
  }

  // out = M^dag PV in
  void AdjOp     (const Field &in, Field &out){
    Field PV_in(in.Grid());
    _PV.M(in,PV_in);
    _Mat.Mdag(PV_in,out);
  }

  // out = M^dag PV PV^dag M in
  void HermOp(const Field &in, Field &out){
    Field Op_in(in.Grid());
    Op(in,Op_in);
    AdjOp(Op_in,out);
  }
 };
 // Linear operator M^dag PV assembled from two matrices of the same type,
 // i.e. the adjoint ordering of PVdagMLinearOperator.
 //   Op     : out = Mat.Mdag( PV.M( in ) )
 //   AdjOp  : out = PV.Mdag( Mat.M( in ) )
 //   HermOp : out = AdjOp( Op( in ) )
 // OpDiag, OpDir, OpDirAll and HermOpAndNorm are not supported and assert.
 template<class Matrix,class Field>
 class MdagPVLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;   // supplies M and Mdag
  Matrix &_PV;    // supplies the PV factor
 public:
  MdagPVLinearOperator(Matrix &Mat,Matrix &PV)
    : _Mat(Mat), _PV(PV)
  {}

  // Unsupported pieces of the operator interface.
  void OpDiag (const Field &in, Field &out) {
    assert(0);
  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    assert(0);
  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){
    assert(0);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    assert(0);
  }

  // out = M^dag PV in
  void Op     (const Field &in, Field &out){
    Field PV_in(in.Grid());
    _PV.M(in,PV_in);
    _Mat.Mdag(PV_in,out);
  }

  // out = PV^dag M in
  void AdjOp     (const Field &in, Field &out){
    Field M_in(in.Grid());
    _Mat.M(in,M_in);
    _PV.Mdag(M_in,out);
  }

  // out = PV^dag M M^dag PV in
  void HermOp(const Field &in, Field &out){
    Field Op_in(in.Grid());
    Op(in,Op_in);
    AdjOp(Op_in,out);
  }
 };
 // Linear operator PV^dag M + shift, with a real scalar shift.
 //   Op     : out = PV.Mdag( Mat.M( in ) ) + shift * in
 //   AdjOp  : out = Mat.Mdag( PV.M( in ) ) + shift * in
 //   HermOp : out = AdjOp( Op( in ) )
 // The diagonal / directional entry points and HermOpAndNorm are not supported and assert.
 template<class Matrix,class Field>
 class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;   // supplies M and Mdag
  Matrix &_PV;    // supplies the PV factor
  RealD shift;    // real shift added to the operator
 public:
  // Initialisers are listed in declaration order (members are always
  // initialised in that order regardless of how the list is written).
  ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV),shift(_shift){};
  void OpDiag (const Field &in, Field &out) {    assert(0);  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {    assert(0);  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){    assert(0);  };
  // out = (PV^dag M + shift) in
  void Op     (const Field &in, Field &out){
    //    std::cout << "Op: PVdag M "<<std::endl;
    Field tmp(in.Grid());
    _Mat.M(in,tmp);
    _PV.Mdag(tmp,out);
    out = out + shift * in;
  }
  // out = (M^dag PV + shift) in
  void AdjOp     (const Field &in, Field &out){
    //    std::cout << "AdjOp: Mdag PV "<<std::endl;
    Field tmp(in.Grid());
    // Apply PV to the input first, then Mdag to that result. The previous code
    // passed the uninitialised temporary to PV.M and discarded the Mdag result.
    _PV.M(in,tmp);
    _Mat.Mdag(tmp,out);
    out = out + shift * in;
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){    assert(0);  }
  // out = AdjOp( Op( in ) )
  void HermOp(const Field &in, Field &out){
    //    std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
    Field tmp(in.Grid());
    Op(in,tmp);
    AdjOp(tmp,out);
  }
 };
 // Two-level multigrid preconditioner that restricts with one aggregate set (_U) and
 // prolongates with another (_V). One application performs, in order:
 //   pre-smooth -> residual -> project with _U -> coarse solve
 //   -> promote with _V and add -> residual -> post-smooth and add,
 // logging the wall time of each stage.
 template<class Fobj,class CComplex,int nbasis>
 class MGPreconditionerSVD : public LinearFunction< Lattice<Fobj> > {
 public:
  using LinearFunction<Lattice<Fobj> >::operator();

  using Aggregates     = Aggregation<Fobj,CComplex,nbasis>;
  using FineField      = typename Aggregates::FineField;
  using CoarseVector   = typename Aggregates::CoarseVector;
  using CoarseMatrix   = typename Aggregates::CoarseMatrix;
  using FineOperator   = LinearOperatorBase<FineField>;
  using FineSmoother   = LinearFunction    <FineField>;
  using CoarseOperator = LinearOperatorBase<CoarseVector>;
  using CoarseSolver   = LinearFunction    <CoarseVector>;

  ///////////////////////////////
  // SVD is M = U S Vdag
  //
  // Uc and Vc are subsets of U and V stored as Complex_f,c matrices;
  // these are the non-square coarsening matrices.
  //
  // A coarse approximation to
  //
  //      M psi = eta
  //
  // is solved via
  //
  //  Uc^dag U S Vdag Vc Vc^dag psi = Uc^dag eta
  //
  //  M_coarse Vc^dag psi = M_coarse psi_c = eta_c
  //
  ///////////////////////////////
  Aggregates     & _U;   // used for ProjectToSubspace
  Aggregates     & _V;   // used for PromoteFromSubspace
  FineOperator   & _FineOperator;
  FineSmoother   & _PreSmoother;
  FineSmoother   & _PostSmoother;
  CoarseOperator & _CoarseOperator;
  CoarseSolver   & _CoarseSolve;

  int    level;
  void Level(int lv) { level = lv; }

  MGPreconditionerSVD(Aggregates &U,
 		      Aggregates &V,
 		      FineOperator &Fine,
 		      FineSmoother &PreSmoother,
 		      FineSmoother &PostSmoother,
 		      CoarseOperator &CoarseOperator_,
 		      CoarseSolver &CoarseSolve_)
    : _U(U),
      _V(V),
      _FineOperator(Fine),
      _PreSmoother(PreSmoother),
      _PostSmoother(PostSmoother),
      _CoarseOperator(CoarseOperator_),
      _CoarseSolve(CoarseSolve_),
      level(1)
  {}

  // Apply the preconditioner: `in` is the source, `out` receives the approximate solution.
  virtual void operator()(const FineField &in, FineField & out)
  {
    GridBase *cgrid = _U.CoarseGrid;

    CoarseVector c_rhs(cgrid);
    CoarseVector c_sol(cgrid);
    FineField resid(in.Grid());
    FineField post (in.Grid());

    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;

    // --- Pre-smoothing, starting from a zero guess
    out = Zero();
    double elapsed = -usecond();
    _PreSmoother(in,out);
    elapsed += usecond();
    std::cout<<GridLogMessage << "PreSmoother took "<< elapsed/1000.0<< "ms" <<std::endl;

    // --- Residual after pre-smoothing (sub(r,a,b) presumably forms a - b)
    _FineOperator.Op(out,resid);
    sub(resid,in,resid);

    // --- Restrict the residual with U:  eta_c = Uc^dag eta
    elapsed = -usecond();
    _U.ProjectToSubspace(c_rhs,resid);
    elapsed += usecond();
    std::cout<<GridLogMessage << "Project to coarse took "<< elapsed/1000.0<< "ms" <<std::endl;

    // --- Coarse-grid correction, zero initial guess
    elapsed = -usecond();
    c_sol = Zero();
    _CoarseSolve(c_rhs,c_sol);
    elapsed += usecond();
    std::cout<<GridLogMessage << "Coarse solve took "<< elapsed/1000.0<< "ms" <<std::endl;

    // --- Prolongate the correction with V and accumulate it into the solution
    elapsed = -usecond();
    _V.PromoteFromSubspace(c_sol,resid);
    add(out,out,resid);
    elapsed += usecond();
    std::cout<<GridLogMessage << "Promote to this level took "<< elapsed/1000.0<< "ms" <<std::endl;

    // --- Residual after the coarse correction
    _FineOperator.Op(out,resid);
    sub(resid,in,resid);

    // --- Post-smoothing on that residual, zero initial guess
    elapsed = -usecond();
    post = Zero();
    _PostSmoother(resid,post);
    elapsed += usecond();
    std::cout<<GridLogMessage << "PostSmoother took "<< elapsed/1000.0<< "ms" <<std::endl;

    add(out,out,post);
    std::cout<<GridLogMessage << "Done " <<std::endl;
  }
 };
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  const int Ls=16;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  // Construct a coarsened grid
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/2;
    //    clatt[d] = clatt[d]/4;
  }
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  std::vector<int> cseeds({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
  LatticeFermion    src(FGrid); random(RNG5,src);
  LatticeFermion result(FGrid); result=Zero();
  LatticeFermion    ref(FGrid); ref=Zero();
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  std::string file("ckpoint_lat.4000");
  NerscIO::readConfiguration(Umu,header,file);
  RealD mass=0.01;
  RealD M5=1.8;
  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
  const int nbasis = 60;
  const int cb = 0 ;
  NextToNearestStencilGeometry5D geom(Coarse5d);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
  typedef MdagPVLinearOperator<DomainWallFermionD,LatticeFermionD> MdagPV_t;
  typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
  PVdagM_t PVdagM(Ddwf,Dpv);
  MdagPV_t MdagPV(Ddwf,Dpv);
  //  ShiftedPVdagM_t ShiftedPVdagM(2.0,Ddwf,Dpv); // 355
  //  ShiftedPVdagM_t ShiftedPVdagM(1.0,Ddwf,Dpv); // 246
  //  ShiftedPVdagM_t ShiftedPVdagM(0.5,Ddwf,Dpv); // 183
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 145
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 134
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 127 -- NULL space via inverse iteration
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 57 -- NULL space via inverse iteration; 3 iterations
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 57 , tighter inversion
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 49 iters
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 70 iters; asymmetric 
  //  ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 58; Loosen coarse, tighten fine
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 56 ... 
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 51 ...  with 24 vecs
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 31 ...  with 24 vecs and 2^4 blocking
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 43 ...  with 16 vecs and 2^4 blocking, sloppier
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35  ...  with 20 vecs and 2^4 blocking
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35  ...  with 20 vecs and 2^4 blocking, looser coarse
  //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 64  ...  with 20 vecs, Christoph setup, and 2^4 blocking, looser coarse
  ShiftedPVdagM_t ShiftedPVdagM(0.01,Ddwf,Dpv); // 
  // Run power method on HOA??
  PowerMethod<LatticeFermion>       PM;
  PM(PVdagM,src);
  PM(MdagPV,src);
  // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace V(Coarse5d,FGrid,cb);
  //  Subspace U(Coarse5d,FGrid,cb);
  // Breeds right singular vectors with call to HermOp
  V.CreateSubspaceChebyshev(RNG5,PVdagM,
 			    nbasis,
 			    4000.0,0.003,
 			    300);
  // Breeds left singular vectors with call to HermOp
  //  U.CreateSubspaceChebyshev(RNG5,MdagPV,
  //			    nbasis,
  //			    4000.0,0.003,
  //			    300);
  //  U.subspace=V.subspace;
  //  typedef Aggregation<vSpinColourVector,vTComplex,2*nbasis> CombinedSubspace;
  //  CombinedSubspace CombinedUV(Coarse5d,FGrid,cb);
  //  for(int b=0;b<nbasis;b++){
  //    CombinedUV.subspace[b]        = V.subspace[b];
  //    CombinedUV.subspace[b+nbasis] = U.subspace[b];
  //  }
  //  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,2*nbasis> LittleDiracOperator;
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;
  LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d);
  LittleDiracOpPV.CoarsenOperator(PVdagM,V,V);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
  CoarseVector c_src (Coarse5d);
  CoarseVector c_res (Coarse5d);
  CoarseVector c_proj(Coarse5d);
  Complex one(1.0);
  c_src = one;  // 1 in every element for vector 1.
  //  blockPromote(c_src,err,CoarseToFine.subspace);
  LatticeFermion prom(FGrid);
  prom=Zero();
  for(int b=0;b<nbasis;b++){
    prom=prom+V.subspace[b];
  }
  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;
  PVdagM.Op(prom,tmp);
  blockProject(c_proj,tmp,V.subspace);
  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
  LittleDiracOpPV.M(c_src,c_res);
  std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << "  c_res "<< norm2(c_res) <<std::endl;
  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
  c_proj = c_proj - c_res;
  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
  /**********
   * Some solvers
   **********
   */
  ///////////////////////////////////////
  // Coarse grid solver test
  ///////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  TrivialPrecon<CoarseVector> simple;
  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L3PGCR(1.0e-4, 10, LinOpCoarse,simple,20,20); 
  L3PGCR.Level(3);
  c_res=Zero();
  L3PGCR(c_src,c_res);
  ////////////////////////////////////////
  // Fine grid smoother
  ////////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  TrivialPrecon<LatticeFermionD> simple_fine;
  //  NonHermitianLinearOperator<PVdagM_t,LatticeFermionD> LinOpSmooth(PVdagM);
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedPVdagM,simple_fine,16,16);
  SmootherGCR.Level(2);
  LatticeFermionD f_src(FGrid);
  LatticeFermionD f_res(FGrid);
  f_src = one;  // 1 in every element for vector 1.
  f_res=Zero();
  SmootherGCR(f_src,f_res);
  //  typedef MGPreconditionerSVD<vSpinColourVector,  vTComplex,nbasis*2> TwoLevelMG;
  typedef MGPreconditionerSVD<vSpinColourVector,  vTComplex,nbasis> TwoLevelMG;
  //  TwoLevelMG TwoLevelPrecon(CombinedUV,CombinedUV,
  TwoLevelMG TwoLevelPrecon(V,V,
 			    PVdagM,
 			    simple_fine,
 			    SmootherGCR,
 			    LinOpCoarse,
 			    L3PGCR);
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,16,16);
  L1PGCR.Level(1);
  f_res=Zero();
  L1PGCR(f_src,f_res);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  Grid_finalize();
  return 0;
 }
@@ -0,0 +1,333 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_padded_cell.cc
    Copyright (C) 2023
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
 using namespace std;
 using namespace Grid;
 /*
  * Two-grid preconditioner built on an aggregation subspace.
  *
  * operator() builds an approximate solution `out` for the right-hand side `in`:
  *   1. pre-smooth, starting from out = 0;
  *   2. project the residual onto the coarse space, run the coarse solver from
  *      a zero guess, promote the result and add it to `out`;
  *   3. post-smooth on the new residual (again from zero) and add that too.
  * The wall-clock time of each stage is reported through GridLogMessage.
  *
  * Every collaborator is held by reference, so the caller must keep them alive
  * for the lifetime of this object.  _CoarseOperator and level are stored but
  * are not consulted by operator().
  */
 template<class Fobj,class CComplex,int nbasis>
 class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  // Keep the base-class operator() overloads visible next to the one defined here.
  using LinearFunction<Lattice<Fobj> >::operator();

  using Aggregates     = Aggregation<Fobj,CComplex,nbasis>;
  using FineField      = typename Aggregates::FineField;
  using CoarseVector   = typename Aggregates::CoarseVector;
  using CoarseMatrix   = typename Aggregates::CoarseMatrix;
  using FineOperator   = LinearOperatorBase<FineField>;
  using FineSmoother   = LinearFunction<FineField>;
  using CoarseOperator = LinearOperatorBase<CoarseVector>;
  using CoarseSolver   = LinearFunction<CoarseVector>;

  Aggregates     & _Aggregates;     // subspace used to project / promote
  FineOperator   & _FineOperator;   // fine-grid operator used to form residuals
  FineSmoother   & _PreSmoother;    // applied to the incoming right-hand side
  FineSmoother   & _PostSmoother;   // applied to the residual after the coarse correction
  CoarseOperator & _CoarseOperator; // retained, not used by operator()
  CoarseSolver   & _CoarseSolve;    // solver applied to the projected residual
  int              level;           // set via Level(); not read inside this class

  // Record the multigrid level label.
  void Level(int lv) { level = lv; }

  MGPreconditioner(Aggregates &Agg,
                   FineOperator &Fine,
                   FineSmoother &PreSmoother,
                   FineSmoother &PostSmoother,
                   CoarseOperator &CoarseOperator_,
                   CoarseSolver &CoarseSolve_)
    : _Aggregates(Agg),
      _FineOperator(Fine),
      _PreSmoother(PreSmoother),
      _PostSmoother(PostSmoother),
      _CoarseOperator(CoarseOperator_),
      _CoarseSolve(CoarseSolve_),
      level(1)
  {
  }

  // Apply the preconditioner: `in` is the right-hand side, `out` is overwritten.
  virtual void operator()(const FineField &in, FineField & out)
  {
    GridBase *coarse_grid = _Aggregates.CoarseGrid;

    CoarseVector coarse_rhs(coarse_grid);
    CoarseVector coarse_sol(coarse_grid);
    FineField    work (in.Grid());   // residual, then promoted coarse correction
    FineField    delta(in.Grid());   // post-smoother correction

    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;

    // ---- Stage 1: pre-smoothing from a zero initial guess ----
    out = Zero();
    double pre_us = -usecond();
    _PreSmoother(in,out);
    pre_us += usecond();
    std::cout<<GridLogMessage << "PreSmoother took "<< pre_us/1000.0<< "ms" <<std::endl;

    // work = in - FineOperator * out
    _FineOperator.Op(out,work);
    sub(work, in, work);

    // ---- Stage 2a: restrict the residual to the coarse space ----
    double project_us = -usecond();
    _Aggregates.ProjectToSubspace(coarse_rhs,work);
    project_us += usecond();
    std::cout<<GridLogMessage << "Project to coarse took "<< project_us/1000.0<< "ms" <<std::endl;

    // ---- Stage 2b: coarse solve (zeroing the guess is part of the timed region) ----
    double coarse_us = -usecond();
    coarse_sol = Zero();
    _CoarseSolve(coarse_rhs,coarse_sol);
    coarse_us += usecond();
    std::cout<<GridLogMessage << "Coarse solve took "<< coarse_us/1000.0<< "ms" <<std::endl;

    // ---- Stage 2c: promote the coarse solution and accumulate it ----
    double promote_us = -usecond();
    _Aggregates.PromoteFromSubspace(coarse_sol,work);
    add(out,out,work);
    promote_us += usecond();
    std::cout<<GridLogMessage << "Promote to this level took "<< promote_us/1000.0<< "ms" <<std::endl;

    // work = in - FineOperator * out, using the corrected out
    _FineOperator.Op(out,work);
    sub(work, in, work);

    // ---- Stage 3: post-smoothing on the residual, from a zero initial guess ----
    double post_us = -usecond();
    delta = Zero();
    _PostSmoother(work,delta);
    post_us += usecond();
    std::cout<<GridLogMessage << "PostSmoother took "<< post_us/1000.0<< "ms" <<std::endl;

    add(out,out,delta);
    std::cout<<GridLogMessage << "Done " <<std::endl;
  }
 };
 // Test driver.  Builds a Wilson-clover operator on the default lattice, coarsens
 // it onto a lattice with every extent halved using a 2*nbasis-vector subspace
 // (the nbasis generated vectors plus Gamma5 times each of them), compares the
 // coarse operator with the fine one restricted to that subspace, and then runs
 // a CG solve, a coarse-grid PGCR solve, a fine-grid smoother and finally a
 // two-level preconditioned PGCR solve.  All results are reported through
 // GridLogMessage; the function always returns 0.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  // NOTE(review): Ls is not referenced again in this function.
  const int Ls=16;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  // The fermion grids are simply aliases of the gauge grids here.
  GridCartesian         * FGrid   = UGrid;
  // NOTE(review): FrbGrid is not referenced again in this function.
  GridRedBlackCartesian * FrbGrid = UrbGrid;
  // Construct a coarsened grid: every extent of the default lattice is halved
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/2;
    //clatt[d] = clatt[d]/4;
  }
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
  // Fixed seeds for both RNGs.
  // NOTE(review): CRNG is seeded but not used afterwards in this function.
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> cseeds({5,6,7,8});
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          CRNG(Coarse4d);CRNG.SeedFixedIntegers(cseeds);
  // Work fields on the fine grid; src is set to one in every element.
  // NOTE(review): ref is zeroed but not used afterwards in this function.
  Complex one(1.0);
  LatticeFermion    src(FGrid); src=one;
  LatticeFermion result(FGrid); result=Zero();
  LatticeFermion    ref(FGrid); ref=Zero();
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  LatticeFermion    precsrc(FGrid);
  // The gauge configuration is read from the file "ckpoint_lat" via NerscIO.
  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  std::string file("ckpoint_lat");
  NerscIO::readConfiguration(Umu,header,file);
  // Fine operator: csw is passed for both clover-coefficient arguments and is zero here.
  RealD csw =0.0;
  RealD mass=-0.92;
  WilsonCloverFermionD Dw(Umu,*UGrid,*UrbGrid,mass,csw,csw);
  // nbasis vectors are generated; the coarse operator is built on 2*nbasis vectors.
  const int nbasis = 20;
  const int cb = 0 ;
  LatticeFermion prom(FGrid);
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,2*nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;
  NearestStencilGeometry4D geom(Coarse4d);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  // Warning: This routine calls LinOp.Op, not LinOp.HermOp
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace Aggregates(Coarse4d,FGrid,cb);
  // Unshifted operator (used for coarsening and the outer solve) and a copy
  // shifted by 0.01 (used only by the fine-grid smoother below).
  NonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> LinOpDw(Dw);
  ShiftedNonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> ShiftedLinOpDw(Dw,0.01);
  Aggregates.CreateSubspaceGCR(RNG4,
 			       LinOpDw,
 			       nbasis);
  // Double the subspace: slot b holds vector b, slot b+nbasis holds Gamma5 * vector b.
  typedef Aggregation<vSpinColourVector,vTComplex,2*nbasis> CombinedSubspace;
  CombinedSubspace CombinedUV(Coarse4d,UGrid,cb);
  for(int b=0;b<nbasis;b++){
    Gamma G5(Gamma::Algebra::Gamma5);
    CombinedUV.subspace[b]        = Aggregates.subspace[b];
    CombinedUV.subspace[b+nbasis] = G5*Aggregates.subspace[b];
  }
  // Coarsen the unshifted fine operator onto the combined subspace.
  LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse4d);
  LittleDiracOp.CoarsenOperator(LinOpDw,CombinedUV);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
  CoarseVector c_src (Coarse4d);
  CoarseVector c_res (Coarse4d);
  CoarseVector c_proj(Coarse4d);
  // Local copy of the combined subspace vectors for the block promote/project calls.
  std::vector<LatticeFermion> subspace(2*nbasis,FGrid);
  subspace=CombinedUV.subspace;
  c_src = one;  // 1 in every element for vector 1.
  // Check 1: promoting the all-ones coarse vector is compared with the plain
  // sum of all subspace vectors; the norm of the difference is printed.
  blockPromote(c_src,err,subspace);
  prom=Zero();
  for(int b=0;b<2*nbasis;b++){
    prom=prom+subspace[b];
  }
  err=err-prom; 
  std::cout<<GridLogMessage<<"Promoted back from subspace: err "<<norm2(err)<<std::endl;
  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;
  // Check 2: apply the fine operator to prom and project the result (c_proj),
  // apply the coarse operator to c_src (c_res), and print the norm of the difference.
  LinOpDw.Op(prom,tmp);
  blockProject(c_proj,tmp,subspace);
  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
  LittleDiracOp.M(c_src,c_res);
  std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << "  c_res "<< norm2(c_res) <<std::endl;
  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
  //  std::cout<<GridLogMessage<<" Little "<< c_res<<std::endl;
  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
  //  std::cout<<GridLogMessage<<" Big "<< c_proj<<std::endl;
  c_proj = c_proj - c_res;
  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
  //  std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;
  /**********
   * Some solvers
   **********
   */
  // CG on the MdagM operator with source Mdag*src; the solution is zeroed
  // immediately afterwards, so only the solver's own output is kept.
  {
    MdagMLinearOperator<WilsonFermionD,LatticeFermion> HermOp(Dw);
    ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
    Dw.Mdag(src,precsrc);
    CG(HermOp,precsrc,result);
    result=Zero();
  }
  ///////////////////////////////////////
  // Coarse grid solver test
  ///////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  // The coarse solver runs on the coarse operator shifted by 0.001; the unshifted
  // LinOpCoarse is only handed to the two-level preconditioner further down.
  // NOTE(review): the numeric PGCR arguments look like tolerance, iteration cap
  // and two restart/history lengths -- confirm against the solver's constructor.
  TrivialPrecon<CoarseVector> simple;
  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOp);
  ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LittleDiracOp,0.001);
  //  ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LittleDiracOp,0.01);
  //  ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LinOpCoarse,0.001);
  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-1, 100, LinOpCoarse,simple,30,30); 
  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(2.0e-1, 50, ShiftedLinOpCoarse,simple,50,50); 
  L2PGCR.Level(3);
  // Standalone run of the coarse solver on the all-ones coarse source.
  c_res=Zero();
  L2PGCR(c_src,c_res);
  ////////////////////////////////////////
  // Fine grid smoother
  ////////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  // The smoother is a PGCR solver on the fine operator shifted by 0.01.
  TrivialPrecon<LatticeFermionD> simple_fine;
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.1,1,ShiftedLinOpDw,simple_fine,4,4);
  SmootherGCR.Level(2);
  LatticeFermionD f_src(FGrid);
  LatticeFermionD f_res(FGrid);
  f_src = one;  // 1 in every element for vector 1.
  // Standalone run of the smoother on the all-ones fine source.
  f_res=Zero();
  SmootherGCR(f_src,f_res);
  // Two-level preconditioner: simple_fine is passed as the pre-smoother and
  // SmootherGCR as the post-smoother; L2PGCR is the coarse solver.
  typedef MGPreconditioner<vSpinColourVector,  vTComplex,2*nbasis> TwoLevelMG;
  TwoLevelMG TwoLevelPrecon(CombinedUV,
 			    LinOpDw,
 			    simple_fine,
 			    SmootherGCR,
 			    LinOpCoarse,
 			    L2PGCR);
  // Outer solve on the unshifted fine operator, preconditioned by TwoLevelPrecon.
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,LinOpDw,TwoLevelPrecon,16,16);
  L1PGCR.Level(1);
  f_res=Zero();
  L1PGCR(f_src,f_res);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  Grid_finalize();
  return 0;
 }
@@ -0,0 +1,326 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_padded_cell.cc
    Copyright (C) 2023
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
 using namespace std;
 using namespace Grid;
 /*
  * Two-grid preconditioner built on an aggregation subspace.
  *
  * operator() builds an approximate solution `out` for the right-hand side `in`:
  *   1. pre-smooth, starting from out = 0;
  *   2. project the residual onto the coarse space, run the coarse solver from
  *      a zero guess, promote the result and add it to `out`;
  *   3. post-smooth on the new residual (again from zero) and add that too.
  * The wall-clock time of each stage is reported through GridLogMessage.
  *
  * Every collaborator is held by reference, so the caller must keep them alive
  * for the lifetime of this object.  _CoarseOperator and level are stored but
  * are not consulted by operator().
  */
 template<class Fobj,class CComplex,int nbasis>
 class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  // Keep the base-class operator() overloads visible next to the one defined here.
  using LinearFunction<Lattice<Fobj> >::operator();

  using Aggregates     = Aggregation<Fobj,CComplex,nbasis>;
  using FineField      = typename Aggregates::FineField;
  using CoarseVector   = typename Aggregates::CoarseVector;
  using CoarseMatrix   = typename Aggregates::CoarseMatrix;
  using FineOperator   = LinearOperatorBase<FineField>;
  using FineSmoother   = LinearFunction<FineField>;
  using CoarseOperator = LinearOperatorBase<CoarseVector>;
  using CoarseSolver   = LinearFunction<CoarseVector>;

  Aggregates     & _Aggregates;     // subspace used to project / promote
  FineOperator   & _FineOperator;   // fine-grid operator used to form residuals
  FineSmoother   & _PreSmoother;    // applied to the incoming right-hand side
  FineSmoother   & _PostSmoother;   // applied to the residual after the coarse correction
  CoarseOperator & _CoarseOperator; // retained, not used by operator()
  CoarseSolver   & _CoarseSolve;    // solver applied to the projected residual
  int              level;           // set via Level(); not read inside this class

  // Record the multigrid level label.
  void Level(int lv) { level = lv; }

  MGPreconditioner(Aggregates &Agg,
                   FineOperator &Fine,
                   FineSmoother &PreSmoother,
                   FineSmoother &PostSmoother,
                   CoarseOperator &CoarseOperator_,
                   CoarseSolver &CoarseSolve_)
    : _Aggregates(Agg),
      _FineOperator(Fine),
      _PreSmoother(PreSmoother),
      _PostSmoother(PostSmoother),
      _CoarseOperator(CoarseOperator_),
      _CoarseSolve(CoarseSolve_),
      level(1)
  {
  }

  // Apply the preconditioner: `in` is the right-hand side, `out` is overwritten.
  virtual void operator()(const FineField &in, FineField & out)
  {
    GridBase *coarse_grid = _Aggregates.CoarseGrid;

    CoarseVector coarse_rhs(coarse_grid);
    CoarseVector coarse_sol(coarse_grid);
    FineField    work (in.Grid());   // residual, then promoted coarse correction
    FineField    delta(in.Grid());   // post-smoother correction

    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;

    // ---- Stage 1: pre-smoothing from a zero initial guess ----
    out = Zero();
    double pre_us = -usecond();
    _PreSmoother(in,out);
    pre_us += usecond();
    std::cout<<GridLogMessage << "PreSmoother took "<< pre_us/1000.0<< "ms" <<std::endl;

    // work = in - FineOperator * out
    _FineOperator.Op(out,work);
    sub(work, in, work);

    // ---- Stage 2a: restrict the residual to the coarse space ----
    double project_us = -usecond();
    _Aggregates.ProjectToSubspace(coarse_rhs,work);
    project_us += usecond();
    std::cout<<GridLogMessage << "Project to coarse took "<< project_us/1000.0<< "ms" <<std::endl;

    // ---- Stage 2b: coarse solve (zeroing the guess is part of the timed region) ----
    double coarse_us = -usecond();
    coarse_sol = Zero();
    _CoarseSolve(coarse_rhs,coarse_sol);
    coarse_us += usecond();
    std::cout<<GridLogMessage << "Coarse solve took "<< coarse_us/1000.0<< "ms" <<std::endl;

    // ---- Stage 2c: promote the coarse solution and accumulate it ----
    double promote_us = -usecond();
    _Aggregates.PromoteFromSubspace(coarse_sol,work);
    add(out,out,work);
    promote_us += usecond();
    std::cout<<GridLogMessage << "Promote to this level took "<< promote_us/1000.0<< "ms" <<std::endl;

    // work = in - FineOperator * out, using the corrected out
    _FineOperator.Op(out,work);
    sub(work, in, work);

    // ---- Stage 3: post-smoothing on the residual, from a zero initial guess ----
    double post_us = -usecond();
    delta = Zero();
    _PostSmoother(work,delta);
    post_us += usecond();
    std::cout<<GridLogMessage << "PostSmoother took "<< post_us/1000.0<< "ms" <<std::endl;

    add(out,out,delta);
    std::cout<<GridLogMessage << "Done " <<std::endl;
  }
 };
 // Test driver.  Builds a Wilson-clover operator on the default lattice, coarsens
 // it onto a lattice with every extent halved using an nbasis-vector subspace,
 // compares the coarse operator with the fine one restricted to that subspace,
 // and then runs a CG solve, a coarse-grid PGCR solve, a fine-grid smoother and
 // finally a two-level preconditioned PGCR solve.  All results are reported
 // through GridLogMessage; the function always returns 0.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  // NOTE(review): Ls is not referenced again in this function.
  const int Ls=16;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  // The fermion grids are simply aliases of the gauge grids here.
  GridCartesian         * FGrid   = UGrid;
  // NOTE(review): FrbGrid is not referenced again in this function.
  GridRedBlackCartesian * FrbGrid = UrbGrid;
  // Construct a coarsened grid: every extent of the default lattice is halved
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/2;
    //    clatt[d] = clatt[d]/4;
  }
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
  // Fixed seeds for both RNGs.
  // NOTE(review): CRNG is seeded but not used afterwards in this function.
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> cseeds({5,6,7,8});
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          CRNG(Coarse4d);CRNG.SeedFixedIntegers(cseeds);
  // Work fields on the fine grid; src is set to one in every element.
  // NOTE(review): ref is zeroed but not used afterwards in this function.
  Complex one(1.0);
  LatticeFermion    src(FGrid); src=one;
  LatticeFermion result(FGrid); result=Zero();
  LatticeFermion    ref(FGrid); ref=Zero();
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  LatticeFermion    precsrc(FGrid);
  // The gauge configuration is read from the file "ckpoint_lat" via NerscIO.
  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  std::string file("ckpoint_lat");
  NerscIO::readConfiguration(Umu,header,file);
  // Fine operator: csw is passed for both clover-coefficient arguments and is zero here.
  RealD csw =0.0;
  RealD mass=-0.92;
  WilsonCloverFermionD Dw(Umu,*UGrid,*UrbGrid,mass,csw,csw);
  // Number of subspace vectors; also the coarse operator's basis size.
  const int nbasis = 40;
  const int cb = 0 ;
  LatticeFermion prom(FGrid);
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;
  NearestStencilGeometry4D geom(Coarse4d);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  // Warning: This routine calls LinOp.Op, not LinOp.HermOp
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace Aggregates(Coarse4d,FGrid,cb);
  // Unshifted operator (used for coarsening and the outer solve) and a copy
  // shifted by 0.01 (used only by the fine-grid smoother below).
  NonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> LinOpDw(Dw);
  ShiftedNonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> ShiftedLinOpDw(Dw,0.01);
  Aggregates.CreateSubspaceGCR(RNG4,
 			       LinOpDw,
 			       nbasis);
  // Coarsen the unshifted fine operator onto the generated subspace.
  LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse4d);
  LittleDiracOp.CoarsenOperator(LinOpDw,Aggregates);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
  CoarseVector c_src (Coarse4d);
  CoarseVector c_res (Coarse4d);
  CoarseVector c_proj(Coarse4d);
  // Local copy of the subspace vectors for the block promote/project calls.
  std::vector<LatticeFermion> subspace(nbasis,FGrid);
  subspace=Aggregates.subspace;
  c_src = one;  // 1 in every element for vector 1.
  // Check 1: promoting the all-ones coarse vector is compared with the plain
  // sum of all subspace vectors; the norm of the difference is printed.
  blockPromote(c_src,err,subspace);
  prom=Zero();
  for(int b=0;b<nbasis;b++){
    prom=prom+subspace[b];
  }
  err=err-prom; 
  std::cout<<GridLogMessage<<"Promoted back from subspace: err "<<norm2(err)<<std::endl;
  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;
  // Check 2: apply the fine operator to prom and project the result (c_proj),
  // apply the coarse operator to c_src (c_res), and print the norm of the difference.
  LinOpDw.Op(prom,tmp);
  blockProject(c_proj,tmp,subspace);
  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
  LittleDiracOp.M(c_src,c_res);
  std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << "  c_res "<< norm2(c_res) <<std::endl;
  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
  //  std::cout<<GridLogMessage<<" Little "<< c_res<<std::endl;
  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
  //  std::cout<<GridLogMessage<<" Big "<< c_proj<<std::endl;
  c_proj = c_proj - c_res;
  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
  //  std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;
  /**********
   * Some solvers
   **********
   */
  // CG on the MdagM operator with source Mdag*src; the solution is zeroed
  // immediately afterwards, so only the solver's own output is kept.
  {
    MdagMLinearOperator<WilsonFermionD,LatticeFermion> HermOp(Dw);
    ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
    Dw.Mdag(src,precsrc);
    CG(HermOp,precsrc,result);
    result=Zero();
  }
  ///////////////////////////////////////
  // Coarse grid solver test
  ///////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  // The coarse solver runs on the unshifted LinOpCoarse; ShiftedLinOpCoarse
  // (shift 0.001) is constructed but only referenced by the commented-out variant.
  // NOTE(review): the numeric PGCR arguments look like tolerance, iteration cap
  // and two restart/history lengths -- confirm against the solver's constructor.
  TrivialPrecon<CoarseVector> simple;
  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOp);
  ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LittleDiracOp,0.001);
  //  ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LittleDiracOp,0.01);
  //  ShiftedNonHermitianLinearOperator<LittleDiracOperator,CoarseVector> ShiftedLinOpCoarse(LinOpCoarse,0.001);
  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-1, 100, LinOpCoarse,simple,30,30); 
  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(2.0e-1, 50, ShiftedLinOpCoarse,simple,50,50); 
  L2PGCR.Level(3);
  // Standalone run of the coarse solver on the all-ones coarse source.
  c_res=Zero();
  L2PGCR(c_src,c_res);
  ////////////////////////////////////////
  // Fine grid smoother
  ////////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  // The smoother is a PGCR solver on the fine operator shifted by 0.01.
  TrivialPrecon<LatticeFermionD> simple_fine;
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.1,1,ShiftedLinOpDw,simple_fine,6,6);
  SmootherGCR.Level(2);
  LatticeFermionD f_src(FGrid);
  LatticeFermionD f_res(FGrid);
  f_src = one;  // 1 in every element for vector 1.
  // Standalone run of the smoother on the all-ones fine source.
  f_res=Zero();
  SmootherGCR(f_src,f_res);
  // Two-level preconditioner: simple_fine is passed as the pre-smoother and
  // SmootherGCR as the post-smoother; L2PGCR is the coarse solver.
  typedef MGPreconditioner<vSpinColourVector,  vTComplex,nbasis> TwoLevelMG;
  TwoLevelMG TwoLevelPrecon(Aggregates,
 			    LinOpDw,
 			    simple_fine,
 			    SmootherGCR,
 			    LinOpCoarse,
 			    L2PGCR);
  // Outer solve on the unshifted fine operator, preconditioned by TwoLevelPrecon.
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,LinOpDw,TwoLevelPrecon,16,16);
  L1PGCR.Level(1);
  f_res=Zero();
  L1PGCR(f_src,f_res);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  Grid_finalize();
  return 0;
 }
@@ -0,0 +1,320 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_padded_cell.cc
    Copyright (C) 2023
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
 using namespace std;
 using namespace Grid;
 // Two-grid preconditioner: pre-smooth, coarse-grid correction, post-smooth.
 // Every collaborator is stored by reference, so the objects handed to the
 // constructor must outlive this preconditioner.
 template<class Fobj,class CComplex,int nbasis>
 class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  // Re-expose the base-class operator() names that the overload below would hide.
  using LinearFunction<Lattice<Fobj> >::operator();

  // Field / operator vocabulary derived from the aggregation type.
  using Aggregates     = Aggregation<Fobj,CComplex,nbasis>;
  using FineField      = typename Aggregates::FineField;
  using CoarseVector   = typename Aggregates::CoarseVector;
  using CoarseMatrix   = typename Aggregates::CoarseMatrix;
  using FineOperator   = LinearOperatorBase<FineField>;
  using FineSmoother   = LinearFunction<FineField>;
  using CoarseOperator = LinearOperatorBase<CoarseVector>;
  using CoarseSolver   = LinearFunction<CoarseVector>;

  Aggregates     & _Aggregates;      // subspace used to project / promote
  FineOperator   & _FineOperator;    // fine-grid operator used for residuals
  FineSmoother   & _PreSmoother;     // applied before the coarse correction
  FineSmoother   & _PostSmoother;    // applied after the coarse correction
  CoarseOperator & _CoarseOperator;  // stored but not used by operator() below
  CoarseSolver   & _CoarseSolve;     // solver for the coarse-grid system

  int level;                         // stored label; not read inside this class
  void Level(int lv) { level = lv; }

  MGPreconditioner(Aggregates &Agg,
                   FineOperator &Fine,
                   FineSmoother &PreSmoother,
                   FineSmoother &PostSmoother,
                   CoarseOperator &CoarseOperator_,
                   CoarseSolver &CoarseSolve_)
    : _Aggregates(Agg),
      _FineOperator(Fine),
      _PreSmoother(PreSmoother),
      _PostSmoother(PostSmoother),
      _CoarseOperator(CoarseOperator_),
      _CoarseSolve(CoarseSolve_),
      level(1)
  {
  }

  // Apply one preconditioner cycle to `in`, writing the result to `out`.
  // `out` is overwritten (it is zeroed before the pre-smoother runs).
  virtual void operator()(const FineField &in, FineField & out)
  {
    // Prints "<label><milliseconds>ms" through the Grid message logger.
    auto reportMs = [](const char *label, double usecs) {
      std::cout<<GridLogMessage << label << usecs/1000.0 << "ms" <<std::endl;
    };

    GridBase *coarseGrid = _Aggregates.CoarseGrid;
    CoarseVector coarseRhs(coarseGrid);
    CoarseVector coarseSol(coarseGrid);
    FineField residual  (in.Grid());
    FineField correction(in.Grid());

    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;

    // --- Pre-smoothing, starting from a zero guess
    out = Zero();
    double elapsed = -usecond();
    _PreSmoother(in,out);
    elapsed += usecond();
    reportMs("PreSmoother took ", elapsed);

    // --- Residual of the smoothed guess (assuming sub(r,a,b) forms a-b: in - Op(out))
    _FineOperator.Op(out,residual);
    sub(residual, in, residual);

    // --- Restrict the residual to the coarse space
    elapsed = -usecond();
    _Aggregates.ProjectToSubspace(coarseRhs,residual);
    elapsed += usecond();
    reportMs("Project to coarse took ", elapsed);

    // --- Coarse-grid correction, zero initial guess
    elapsed = -usecond();
    coarseSol = Zero();
    _CoarseSolve(coarseRhs,coarseSol);
    elapsed += usecond();
    reportMs("Coarse solve took ", elapsed);

    // --- Prolongate the coarse solution and accumulate it into out
    elapsed = -usecond();
    _Aggregates.PromoteFromSubspace(coarseSol,residual);
    add(out,out,residual);
    elapsed += usecond();
    reportMs("Promote to this level took ", elapsed);

    // --- Residual after the coarse correction
    _FineOperator.Op(out,residual);
    sub(residual, in, residual);

    // --- Post-smoothing on that residual, zero initial guess
    elapsed = -usecond();
    correction = Zero();
    _PostSmoother(residual,correction);
    elapsed += usecond();
    reportMs("PostSmoother took ", elapsed);

    add(out,out,correction);
    std::cout<<GridLogMessage << "Done " <<std::endl;
  }
 };
 // Test driver. Builds a coarsened ("little") Dirac operator from a
 // WilsonCloverFermionD operator using a subspace of nbasis vectors doubled
 // with Gamma5, compares it against the fine operator projected onto the same
 // subspace, then runs a coarse GCR solve, a fine-grid smoother, and finally an
 // outer GCR solve preconditioned by the two-level MGPreconditioner.
 // Results are reported through GridLogMessage only; the function returns 0.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  // NOTE(review): Ls is not referenced anywhere else in this function.
  const int Ls=16;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  // The fermion grids are simply aliases of the gauge grids here.
  GridCartesian         * FGrid   = UGrid;
  // NOTE(review): FrbGrid is not referenced anywhere else in this function.
  GridRedBlackCartesian * FrbGrid = UrbGrid;
  // Construct a coarsened grid: every lattice extent is halved
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/2;
    //    clatt[d] = clatt[d]/4;
  }
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
  // Fixed seeds for the fine-grid and coarse-grid random number generators
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> cseeds({5,6,7,8});
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  // NOTE(review): CRNG is seeded but not used again in this function.
  GridParallelRNG          CRNG(Coarse4d);CRNG.SeedFixedIntegers(cseeds);
  // src, result and ref are not read later in this function; the random() call
  // still draws from RNG4 before CreateSubspace uses it below.
  LatticeFermion    src(FGrid); random(RNG4,src);
  LatticeFermion result(FGrid); result=Zero();
  LatticeFermion    ref(FGrid); ref=Zero();
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  // Gauge field is read from the file "ckpoint_lat" in the working directory
  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  std::string file("ckpoint_lat");
  NerscIO::readConfiguration(Umu,header,file);
  // Fine operator: Wilson-clover with both clover coefficients set to csw = 0.0
  RealD csw =0.0;
  RealD mass=-0.92;
  WilsonCloverFermionD Dw(Umu,*UGrid,*UrbGrid,mass,csw,csw);
  // nbasis vectors are generated; the coarse operator uses 2*nbasis (see the Gamma5 doubling below)
  const int nbasis = 20;
  const int cb = 0 ;
  LatticeFermion prom(FGrid);
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,2*nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;
  NearestStencilGeometry4D geom(Coarse4d);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  // Warning: This routine calls Linop.Op, not Linop.HermOp
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace Aggregates(Coarse4d,FGrid,cb);
  // Three views of the same fine operator Dw: MdagM, non-Hermitian, and a
  // variant constructed with the constant 0.5 (presumably a shift — confirm
  // against ShiftedNonHermitianLinearOperator).
  MdagMLinearOperator<WilsonCloverFermionD,LatticeFermion> MdagMOpDw(Dw);
  NonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> LinOpDw(Dw);
  ShiftedNonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> ShiftedLinOpDw(Dw,0.5);
  //  Aggregates.CreateSubspaceGCR(RNG4,
  //			       LinOpDw,
  //			       nbasis);
  // The subspace is generated with the MdagM operator
  Aggregates.CreateSubspace(RNG4,MdagMOpDw,nbasis);
  // Double the basis: vectors [0,nbasis) are copied, vectors [nbasis,2*nbasis)
  // are Gamma5 times the same vectors.
  typedef Aggregation<vSpinColourVector,vTComplex,2*nbasis> CombinedSubspace;
  CombinedSubspace CombinedUV(Coarse4d,UGrid,cb);
  for(int b=0;b<nbasis;b++){
    Gamma G5(Gamma::Algebra::Gamma5);
    CombinedUV.subspace[b]        = Aggregates.subspace[b];
    CombinedUV.subspace[b+nbasis] = G5*Aggregates.subspace[b];
  }
  // Coarsen the non-Hermitian fine operator over the doubled subspace
  LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse4d);
  LittleDiracOp.CoarsenOperator(LinOpDw,CombinedUV);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
  CoarseVector c_src (Coarse4d);
  CoarseVector c_res (Coarse4d);
  CoarseVector c_proj(Coarse4d);
  // Local copy of the doubled basis for the blockPromote / blockProject checks
  std::vector<LatticeFermion> subspace(2*nbasis,FGrid);
  subspace=CombinedUV.subspace;
  Complex one(1.0);
  c_src = one;  // 1 in every element for vector 1.
  // Check 1: promoting the all-ones coarse vector should equal the plain sum
  // of the basis vectors; the squared norm of the difference is printed.
  blockPromote(c_src,err,subspace);
  prom=Zero();
  for(int b=0;b<2*nbasis;b++){
    prom=prom+subspace[b];
  }
  err=err-prom; 
  std::cout<<GridLogMessage<<"Promoted back from subspace: err "<<norm2(err)<<std::endl;
  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;
  // Check 2: apply the fine operator to prom and project (c_proj), apply the
  // coarse operator to c_src (c_res), and print the norm of the difference.
  LinOpDw.Op(prom,tmp);
  blockProject(c_proj,tmp,subspace);
  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
  LittleDiracOp.M(c_src,c_res);
  std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << "  c_res "<< norm2(c_res) <<std::endl;
  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
  //  std::cout<<GridLogMessage<<" Little "<< c_res<<std::endl;
  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
  //  std::cout<<GridLogMessage<<" Big "<< c_proj<<std::endl;
  c_proj = c_proj - c_res;
  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
  //  std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;
  /**********
   * Some solvers
   **********
   */
  ///////////////////////////////////////
  // Coarse grid solver test
  ///////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  // GCR on the coarse operator with a trivial preconditioner.
  // NOTE(review): the numeric arguments are presumably (tolerance, max
  // iterations, ..., restart sizes) — confirm against the solver's constructor.
  TrivialPrecon<CoarseVector> simple;
  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOp);
  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-2, 100, LinOpCoarse,simple,30,30); 
  L2PGCR.Level(3);
  c_res=Zero();
  L2PGCR(c_src,c_res);
  ////////////////////////////////////////
  // Fine grid smoother
  ////////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  // The smoother is a GCR built on ShiftedLinOpDw with a trivial preconditioner
  TrivialPrecon<LatticeFermionD> simple_fine;
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedLinOpDw,simple_fine,4,4);
  SmootherGCR.Level(2);
  LatticeFermionD f_src(FGrid);
  LatticeFermionD f_res(FGrid);
  f_src = one;  // 1 in every element for vector 1.
  f_res=Zero();
  SmootherGCR(f_src,f_res);
  // Two-level preconditioner: simple_fine is passed in the pre-smoother slot,
  // SmootherGCR in the post-smoother slot, L2PGCR as the coarse solver.
  typedef MGPreconditioner<vSpinColourVector,  vTComplex,2*nbasis> TwoLevelMG;
  TwoLevelMG TwoLevelPrecon(CombinedUV,
 			    LinOpDw,
 			    simple_fine,
 			    SmootherGCR,
 			    LinOpCoarse,
 			    L2PGCR);
  // Outer solve on the unshifted fine operator, preconditioned by the two-level cycle.
  // NOTE(review): f_src/f_res are LatticeFermionD while L1PGCR is instantiated
  // on LatticeFermion; this relies on the two being the same type — confirm.
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,LinOpDw,TwoLevelPrecon,32,32);
  L1PGCR.Level(1);
  f_res=Zero();
  L1PGCR(f_src,f_res);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  Grid_finalize();
  return 0;
 }
@@ -0,0 +1,312 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/Test_padded_cell.cc
    Copyright (C) 2023
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
 using namespace std;
 using namespace Grid;
 // Two-grid preconditioner: pre-smooth, coarse-grid correction, post-smooth.
 // Every collaborator is stored by reference, so the objects handed to the
 // constructor must outlive this preconditioner.
 template<class Fobj,class CComplex,int nbasis>
 class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
 public:
  // Re-expose the base-class operator() names that the overload below would hide.
  using LinearFunction<Lattice<Fobj> >::operator();

  // Field / operator vocabulary derived from the aggregation type.
  using Aggregates     = Aggregation<Fobj,CComplex,nbasis>;
  using FineField      = typename Aggregates::FineField;
  using CoarseVector   = typename Aggregates::CoarseVector;
  using CoarseMatrix   = typename Aggregates::CoarseMatrix;
  using FineOperator   = LinearOperatorBase<FineField>;
  using FineSmoother   = LinearFunction<FineField>;
  using CoarseOperator = LinearOperatorBase<CoarseVector>;
  using CoarseSolver   = LinearFunction<CoarseVector>;

  Aggregates     & _Aggregates;      // subspace used to project / promote
  FineOperator   & _FineOperator;    // fine-grid operator used for residuals
  FineSmoother   & _PreSmoother;     // applied before the coarse correction
  FineSmoother   & _PostSmoother;    // applied after the coarse correction
  CoarseOperator & _CoarseOperator;  // stored but not used by operator() below
  CoarseSolver   & _CoarseSolve;     // solver for the coarse-grid system

  int level;                         // stored label; not read inside this class
  void Level(int lv) { level = lv; }

  MGPreconditioner(Aggregates &Agg,
                   FineOperator &Fine,
                   FineSmoother &PreSmoother,
                   FineSmoother &PostSmoother,
                   CoarseOperator &CoarseOperator_,
                   CoarseSolver &CoarseSolve_)
    : _Aggregates(Agg),
      _FineOperator(Fine),
      _PreSmoother(PreSmoother),
      _PostSmoother(PostSmoother),
      _CoarseOperator(CoarseOperator_),
      _CoarseSolve(CoarseSolve_),
      level(1)
  {
  }

  // Apply one preconditioner cycle to `in`, writing the result to `out`.
  // `out` is overwritten (it is zeroed before the pre-smoother runs).
  virtual void operator()(const FineField &in, FineField & out)
  {
    // Prints "<label><milliseconds>ms" through the Grid message logger.
    auto reportMs = [](const char *label, double usecs) {
      std::cout<<GridLogMessage << label << usecs/1000.0 << "ms" <<std::endl;
    };

    GridBase *coarseGrid = _Aggregates.CoarseGrid;
    CoarseVector coarseRhs(coarseGrid);
    CoarseVector coarseSol(coarseGrid);
    FineField residual  (in.Grid());
    FineField correction(in.Grid());

    std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;

    // --- Pre-smoothing, starting from a zero guess
    out = Zero();
    double elapsed = -usecond();
    _PreSmoother(in,out);
    elapsed += usecond();
    reportMs("PreSmoother took ", elapsed);

    // --- Residual of the smoothed guess (assuming sub(r,a,b) forms a-b: in - Op(out))
    _FineOperator.Op(out,residual);
    sub(residual, in, residual);

    // --- Restrict the residual to the coarse space
    elapsed = -usecond();
    _Aggregates.ProjectToSubspace(coarseRhs,residual);
    elapsed += usecond();
    reportMs("Project to coarse took ", elapsed);

    // --- Coarse-grid correction, zero initial guess
    elapsed = -usecond();
    coarseSol = Zero();
    _CoarseSolve(coarseRhs,coarseSol);
    elapsed += usecond();
    reportMs("Coarse solve took ", elapsed);

    // --- Prolongate the coarse solution and accumulate it into out
    elapsed = -usecond();
    _Aggregates.PromoteFromSubspace(coarseSol,residual);
    add(out,out,residual);
    elapsed += usecond();
    reportMs("Promote to this level took ", elapsed);

    // --- Residual after the coarse correction
    _FineOperator.Op(out,residual);
    sub(residual, in, residual);

    // --- Post-smoothing on that residual, zero initial guess
    elapsed = -usecond();
    correction = Zero();
    _PostSmoother(residual,correction);
    elapsed += usecond();
    reportMs("PostSmoother took ", elapsed);

    add(out,out,correction);
    std::cout<<GridLogMessage << "Done " <<std::endl;
  }
 };
 // Test driver. Builds a coarsened ("little") Dirac operator from a
 // WilsonCloverFermionD operator using a subspace of nbasis vectors, compares
 // it against the fine operator projected onto the same subspace, then runs a
 // coarse GCR solve, a fine-grid smoother, and finally an outer GCR solve
 // preconditioned by the two-level MGPreconditioner.
 // Results are reported through GridLogMessage only; the function returns 0.
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  // NOTE(review): Ls is not referenced anywhere else in this function.
  const int Ls=16;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  // The fermion grids are simply aliases of the gauge grids here.
  GridCartesian         * FGrid   = UGrid;
  // NOTE(review): FrbGrid is not referenced anywhere else in this function.
  GridRedBlackCartesian * FrbGrid = UrbGrid;
  // Construct a coarsened grid: every lattice extent is halved
  Coordinate clatt = GridDefaultLatt();
  for(int d=0;d<clatt.size();d++){
    clatt[d] = clatt[d]/2;
    //    clatt[d] = clatt[d]/4;
  }
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
  // Fixed seeds for the fine-grid and coarse-grid random number generators
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> cseeds({5,6,7,8});
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  // NOTE(review): CRNG is seeded but not used again in this function.
  GridParallelRNG          CRNG(Coarse4d);CRNG.SeedFixedIntegers(cseeds);
  // src, result and ref are not read later in this function; the random() call
  // still draws from RNG4 before CreateSubspace uses it below.
  LatticeFermion    src(FGrid); random(RNG4,src);
  LatticeFermion result(FGrid); result=Zero();
  LatticeFermion    ref(FGrid); ref=Zero();
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  // Gauge field is read from the file "ckpoint_lat" in the working directory
  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  std::string file("ckpoint_lat");
  NerscIO::readConfiguration(Umu,header,file);
  // Fine operator: Wilson-clover with both clover coefficients set to csw = 0.0
  RealD csw =0.0;
  RealD mass=-0.92;
  WilsonCloverFermionD Dw(Umu,*UGrid,*UrbGrid,mass,csw,csw);
  // Number of subspace vectors; also the basis size of the coarse operator
  const int nbasis = 40;
  const int cb = 0 ;
  LatticeFermion prom(FGrid);
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;
  NearestStencilGeometry4D geom(Coarse4d);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  // Warning: This routine calls Linop.Op, not Linop.HermOp
  typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
  Subspace Aggregates(Coarse4d,FGrid,cb);
  // Three views of the same fine operator Dw: MdagM, non-Hermitian, and a
  // variant constructed with the constant 0.5 (presumably a shift — confirm
  // against ShiftedNonHermitianLinearOperator).
  MdagMLinearOperator<WilsonCloverFermionD,LatticeFermion> MdagMOpDw(Dw);
  NonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> LinOpDw(Dw);
  ShiftedNonHermitianLinearOperator<WilsonCloverFermionD,LatticeFermion> ShiftedLinOpDw(Dw,0.5);
  //  Aggregates.CreateSubspaceGCR(RNG4,
  //			       LinOpDw,
  //			       nbasis);
  // The subspace is generated with the MdagM operator
  Aggregates.CreateSubspace(RNG4,MdagMOpDw,nbasis);
  // Coarsen the non-Hermitian fine operator over that subspace
  LittleDiracOperator LittleDiracOp(geom,FGrid,Coarse4d);
  LittleDiracOp.CoarsenOperator(LinOpDw,Aggregates);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"Testing coarsened operator "<<std::endl;
  CoarseVector c_src (Coarse4d);
  CoarseVector c_res (Coarse4d);
  CoarseVector c_proj(Coarse4d);
  // Local copy of the basis for the blockPromote / blockProject checks
  std::vector<LatticeFermion> subspace(nbasis,FGrid);
  subspace=Aggregates.subspace;
  Complex one(1.0);
  c_src = one;  // 1 in every element for vector 1.
  // Check 1: promoting the all-ones coarse vector should equal the plain sum
  // of the basis vectors; the squared norm of the difference is printed.
  blockPromote(c_src,err,subspace);
  prom=Zero();
  for(int b=0;b<nbasis;b++){
    prom=prom+subspace[b];
  }
  err=err-prom; 
  std::cout<<GridLogMessage<<"Promoted back from subspace: err "<<norm2(err)<<std::endl;
  std::cout<<GridLogMessage<<"c_src "<<norm2(c_src)<<std::endl;
  std::cout<<GridLogMessage<<"prom  "<<norm2(prom)<<std::endl;
  // Check 2: apply the fine operator to prom and project (c_proj), apply the
  // coarse operator to c_src (c_res), and print the norm of the difference.
  LinOpDw.Op(prom,tmp);
  blockProject(c_proj,tmp,subspace);
  std::cout<<GridLogMessage<<" Called Big Dirac Op "<<norm2(tmp)<<std::endl;
  LittleDiracOp.M(c_src,c_res);
  std::cout<<GridLogMessage<<" Called Little Dirac Op c_src "<< norm2(c_src) << "  c_res "<< norm2(c_res) <<std::endl;
  std::cout<<GridLogMessage<<"Little dop : "<<norm2(c_res)<<std::endl;
  //  std::cout<<GridLogMessage<<" Little "<< c_res<<std::endl;
  std::cout<<GridLogMessage<<"Big dop in subspace : "<<norm2(c_proj)<<std::endl;
  //  std::cout<<GridLogMessage<<" Big "<< c_proj<<std::endl;
  c_proj = c_proj - c_res;
  std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
  //  std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;
  /**********
   * Some solvers
   **********
   */
  ///////////////////////////////////////
  // Coarse grid solver test
  ///////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  // GCR on the coarse operator with a trivial preconditioner.
  // NOTE(review): the numeric arguments are presumably (tolerance, max
  // iterations, ..., restart sizes) — confirm against the solver's constructor.
  TrivialPrecon<CoarseVector> simple;
  NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOp);
  //  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10); 
  PrecGeneralisedConjugateResidualNonHermitian<CoarseVector>  L2PGCR(1.0e-2, 100, LinOpCoarse,simple,30,30); 
  L2PGCR.Level(3);
  c_res=Zero();
  L2PGCR(c_src,c_res);
  ////////////////////////////////////////
  // Fine grid smoother
  ////////////////////////////////////////
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
  std::cout<<GridLogMessage<<"******************* "<<std::endl;
  // The smoother is a GCR built on ShiftedLinOpDw with a trivial preconditioner
  TrivialPrecon<LatticeFermionD> simple_fine;
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedLinOpDw,simple_fine,6,6);
  SmootherGCR.Level(2);
  LatticeFermionD f_src(FGrid);
  LatticeFermionD f_res(FGrid);
  f_src = one;  // 1 in every element for vector 1.
  f_res=Zero();
  SmootherGCR(f_src,f_res);
  // Two-level preconditioner: simple_fine is passed in the pre-smoother slot,
  // SmootherGCR in the post-smoother slot, L2PGCR as the coarse solver.
  typedef MGPreconditioner<vSpinColourVector,  vTComplex,nbasis> TwoLevelMG;
  TwoLevelMG TwoLevelPrecon(Aggregates,
 			    LinOpDw,
 			    simple_fine,
 			    SmootherGCR,
 			    LinOpCoarse,
 			    L2PGCR);
  // Outer solve on the unshifted fine operator, preconditioned by the two-level cycle.
  // NOTE(review): f_src/f_res are LatticeFermionD while L1PGCR is instantiated
  // on LatticeFermion; this relies on the two being the same type — confirm.
  PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,LinOpDw,TwoLevelPrecon,32,32);
  L1PGCR.Level(1);
  f_res=Zero();
  L1PGCR(f_src,f_res);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  Grid_finalize();
  return 0;
 }
@@ -490,7 +490,7 @@ public:
 	    }
 	  }
-	  GRID_ASSERT(s==nshift);
+	  assert(s==nshift);
 	  coalescedWrite(gStaple_v[ss],stencil_ss);
 	}
 	);
@@ -0,0 +1,16 @@
 #include <Grid/Grid.h>
 #pragma once
 #ifndef ENABLE_FERMION_INSTANTIATIONS
 #include <iostream>
 // Fallback entry point, compiled only when ENABLE_FERMION_INSTANTIATIONS is
 // not defined: explains why the test cannot run and exits with status 1.
 int main(void) {
  // Adjacent literals are streamed back to back, so each fragment that is
  // followed by more text must end with a space.
  std::cout << "This build of Grid was configured to exclude fermion instantiations, "
 	    << "which this test relies on. "
 	    << "Please reconfigure and rebuild Grid with --enable-fermion-instantiations "
 	    << "to run this test."
 	    << std::endl;
  return 1;
 }
 #endif
@@ -1,14 +1,18 @@
 <?xml version="1.0"?>
 <grid>
  <LanczosParameters>
-    <mass>0.00107</mass>
+    <mass>0</mass>
    <mstep>-0.025</mstep>
    <M5>1.8</M5>
    <Ls>48</Ls>
-    <Nstop>10</Nstop>
+    <Nstop>5</Nstop>
-    <Nk>15</Nk>
+    <Nk>5</Nk>
-    <Np>85</Np>
+    <Np>5</Np>
-    <ChebyLow>0.003</ChebyLow>
+    <ReadEvec>0</ReadEvec>
-    <ChebyHigh>60</ChebyHigh>
+    <maxIter>10000</maxIter>
-    <ChebyOrder>201</ChebyOrder>
+    <resid>1e-10</resid>
    <ChebyLow>1</ChebyLow>
    <ChebyHigh>100</ChebyHigh>
    <ChebyOrder>51</ChebyOrder>
  </LanczosParameters>
 </grid>
@@ -32,9 +32,13 @@ directory
 using namespace std;
 using namespace Grid;
-//typedef WilsonFermionD FermionOp;
+#if 0
 typedef DomainWallFermionD FermionOp;
 typedef typename DomainWallFermionD::FermionField FermionField;
 #else
 typedef MobiusFermionD FermionOp;
 typedef typename MobiusFermionD::FermionField FermionField;
 #endif
 template <class T> void writeFile(T& in, std::string const fname){  
 #ifdef HAVE_LIME
@@ -179,12 +183,14 @@ int main(int argc, char** argv) {
  Np=LanParams.Np;
  int Nm = Nk + Np;
  int MaxIt = 100;
  RealD resid = 1.0e-4;
  int MaxIt = 10000;
  RealD resid = 1.0e-5;
  RealD mob_b=1.5;
 //while ( mass > - 5.0){
-  FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+//  FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.);
  MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <-----
 //  Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
  Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf);
@@ -113,6 +113,9 @@ struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 		  		RealD, mass , 
 				RealD, resid,
 				Integer, Nstop,
                                Integer, Nk,
                                Integer, Np,
 	  			RealD, ChebyLow,
 	  			RealD, ChebyHigh,
 	  			Integer, ChebyOrder)
@@ -204,7 +207,6 @@ int main(int argc, char** argv) {
  int Nstop = 5;
  int Nk = 10;
  int Np = 90;
  int Nm = Nk + Np;
  int MaxIt = 10000;
  RealD resid = 1.0e-5;
@@ -226,10 +228,14 @@ int main(int argc, char** argv) {
    XmlWriter HMCwr("LanParams.xml.out");
    write(HMCwr,"LanczosParameters",LanParams);
  }
-
+  Nstop=LanParams.Nstop;
  Nk=LanParams.Nk;
  Np=LanParams.Np;
  mass=LanParams.mass;
  resid=LanParams.resid;
  int Nm = Nk + Np;
 while ( mass > - 5.0){
  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass);
@@ -0,0 +1,377 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./tests/lanczos/Test_wilson_bilanczos.cc
    Copyright (C) 2025
 Author: Chulwoo Jung <chulwoo@bnl.gov>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <cstdlib>
 #include <Grid/Grid.h>
 #include <Grid/lattice/PaddedCell.h>
 #include <Grid/stencil/GeneralLocalStencil.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidual.h>
 #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
 #include <Grid/algorithms/iterative/BiCGSTAB.h>
 using namespace std;
 using namespace Grid;
 namespace Grid {
 struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 		  		RealD, mass , 
 		  		RealD, mstep , 
 				Integer, Nstop,
                                Integer, Nk,
                                Integer, Np,
                                Integer, ReadEvec,
                                Integer, maxIter,
 	  			RealD, resid,
 	  			RealD, ChebyLow,
 	  			RealD, ChebyHigh,
 	  			Integer, ChebyOrder)
  LanczosParameters() {
    ////////////////////////////// Default values
      mass = 0;
    /////////////////////////////////
  }
  template <class ReaderClass >
  LanczosParameters(Reader<ReaderClass> & TheReader){
    initialize(TheReader);
  }
  template < class ReaderClass > 
  void initialize(Reader<ReaderClass> &TheReader){
 //    std::cout << GridLogMessage << "Reading HMC\n";
    read(TheReader, "HMC", *this);
  }
  void print_parameters() const {
 //    std::cout << GridLogMessage << "[HMC parameters] Trajectories            : " << Trajectories << "\n";
 //    std::cout << GridLogMessage << "[HMC parameters] Start trajectory        : " << StartTrajectory << "\n";
 //    std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
 //    std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs    : " << NoMetropolisUntil << "\n";
 //    std::cout << GridLogMessage << "[HMC parameters] Starting type           : " << StartingType << "\n";
 //    MD.print_parameters();
  }
 };
 }
 template <class T> void writeFile(T& in, std::string const fname){
 #if 1
  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
  Grid::emptyUserRecord record;
  Grid::ScidacWriter WR(in.Grid()->IsBoss());
  WR.open(fname);
  WR.writeScidacFieldRecord(in,record,0);
  WR.close();
 #endif
  // What is the appropriate way to throw error?
 }
 typedef WilsonFermionD WilsonOp;
 typedef typename WilsonFermionD::FermionField FermionField;
 /// Wraps a matrix so that Op() applies its inverse: the normal equations
 /// MdagM x = Mdag b are solved with conjugate gradient.
 /// Only Op() and AdjOp() are implemented; every other LinearOperatorBase
 /// entry point aborts via assert(0).
 template<class Matrix,class Field>
 class InvertNonHermitianLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;  // wrapped matrix, held by reference (must outlive this object)
  RealD _stp;    // stopping tolerance passed to ConjugateGradient
 public:
  /// @param Mat matrix to invert
  /// @param stp CG stopping tolerance (default 1e-8)
  InvertNonHermitianLinearOperator(Matrix &Mat,RealD stp=1e-8): _Mat(Mat),_stp(stp){}
  // Support for coarsening to a multigrid -- not implemented for the inverse.
  void OpDiag (const Field &in, Field &out) {
    assert(0);
  }
  void OpDir  (const Field &in, Field &out,int dir,int disp) {
    assert(0);
  }
  void OpDirAll  (const Field &in, std::vector<Field> &out){
    assert(0);
  }
  /// out = M^{-1} in, obtained by solving MdagM out = Mdag in.
  void Op     (const Field &in, Field &out){
    Field rhs(in.Grid());
    _Mat.Mdag(in,rhs);
    MdagMLinearOperator<Matrix,Field> NormalOp(_Mat);
    ConjugateGradient<Field> CG(_stp,10000);
    // CG treats its solution argument as the starting guess, so start from a
    // defined (zero) vector instead of whatever the caller's buffer contains.
    out = Zero();
    CG(NormalOp,rhs,out);
  }
  /// Applies Mdag to `in`.
  /// NOTE(review): Op() applies the inverse of M, so its adjoint would be the
  /// inverse of Mdag rather than Mdag itself -- confirm what callers expect.
  void AdjOp     (const Field &in, Field &out){
    _Mat.Mdag(in,out);
  }
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    assert(0);
  }
  void HermOp(const Field &in, Field &out){
    assert(0);
  }
 };
 /// Diagnostic walk through ComplexSchurDecomposition: runs one Arnoldi
 /// iteration to obtain a Hessenberg matrix, decomposes it, then performs a
 /// fixed swap, a sequence of swaps moving the last of the first Nk diagonal
 /// entries to the front, and finally schurReorder(Nk), printing the matrix S
 /// and the result of checkDecomposition() after each step.
 template<class Field>
 void testSchurFromHess(Arnoldi<Field>& Arn, Field& src, int Nlarge, int Nm, int Nk) {
  const char *banner = "*******************************************";
  std::cout<<GridLogMessage<<banner<<std::endl;
  std::cout << GridLogMessage << "Testing Schur reordering, Nm = " << Nm << ", Nk = " << Nk << std::endl;
  std::cout<<GridLogMessage<<banner<<std::endl;

  std::cout << GridLogMessage << "Running Arnoldi for 1 iteration to get a Hessenberg." << std::endl;
  Arn(src, 1, Nlarge, Nm, Nlarge);
  Eigen::MatrixXcd Hess = Arn.getHessenbergMat();
  std::cout << GridLogMessage << "Hessenberg for use: " << std::endl << Hess << std::endl;

  ComplexSchurDecomposition schur (Hess, true);

  // Prints the current S matrix behind the given label.
  auto showS = [&schur](const std::string &label) {
    std::cout << GridLogMessage << label << std::endl << schur.getMatrixS() << std::endl;
  };
  // Re-validates the decomposition and prints the outcome.
  auto reportStillHolds = [&schur]() {
    std::cout << "Schur decomp still holds? " << schur.checkDecomposition() << std::endl;
  };

  bool isDecomposed = schur.checkDecomposition();
  std::cout << "Schur decomp holds? " << isDecomposed << std::endl;
  showS("S = ");

  std::cout << GridLogMessage << "Swapping S(3, 3) with S(4, 4)" << std::endl;
  schur.swapEvals(3);
  showS("S after swap = ");
  reportStillHolds();

  // Now move last diagonal element all the way to the front.
  showS("Moving last eval to front. S at start = ");
  for (int swapIdx = Nk - 2; swapIdx >= 0; --swapIdx) {
    schur.swapEvals(swapIdx);
    showS("S after swap of index " + std::to_string(swapIdx) + " = ");
    reportStillHolds();
  }

  std::cout << GridLogMessage << "Testing Schur reorder" << std::endl;
  schur.schurReorder(Nk);
  showS("S after reorder = ");
  reportStillHolds();
 }
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
  const int Ls=16;
 //   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 //  std::vector<int> lat_size {32, 32, 32, 32};
 //  std::cout << "Lattice size: " << lat_size << std::endl;
  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
 								          GridDefaultSimd(Nd,vComplex::Nsimd()),
 								          GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 //  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 //  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  GridCartesian         * FGrid   = UGrid;
  GridRedBlackCartesian * FrbGrid = UrbGrid;
  // Construct a coarsened grid
  // poare TODO: replace this with the following line?
  Coordinate clatt = GridDefaultLatt();
 //   Coordinate clatt = GridDefaultLatt();              // [PO] initial line before I edited it
  for(int d=0;d<clatt.size();d++){
  std::cout << GridLogMessage<< clatt[d] <<std::endl;
    clatt[d] = clatt[d]/2;
    //    clatt[d] = clatt[d]/4;
  }
  GridCartesian *Coarse4d =  SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
  GridCartesian *Coarse5d =  SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});
  std::vector<int> cseeds({5,6,7,8});
  GridParallelRNG          RNG5(FGrid);   RNG5.SeedFixedIntegers(seeds5);
  GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
  LatticeFermion result(FGrid); result=Zero();
  LatticeFermion    ref(FGrid); ref=Zero();
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);
  LatticeGaugeField Umu(UGrid);
  FieldMetaData header;
  std::string file("config");
 //  std::string file("Users/patrickoare/libraries/PETSc-Grid/ckpoint_lat.4000");
  NerscIO::readConfiguration(Umu,header,file);
  LanczosParameters LanParams;
  {
    XmlReader  HMCrd("LanParams.xml");
    read(HMCrd,"LanczosParameters",LanParams);
  }
  std::cout << GridLogMessage<< LanParams <<std::endl;
  {
    XmlWriter HMCwr("LanParams.xml.out");
    write(HMCwr,"LanczosParameters",LanParams);
  }
  RealD mass=0.01;
  RealD M5=1.8;
  // PowerMethod<LatticeFermion> PM; PM(PVdagM, src);
  int Nm = 50;
  int Nk = 12; 
  int Np = 38; 
  // int Nk = Nm+1;     // if just running once
  int maxIter = 10000;
  int Nstop = 10;
  RealD resid = 1.0e-5;
  std::vector<Complex> boundary = {1,1,1,-1};
  WilsonOp::ImplParams Params(boundary);
 //  DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 //  DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
  mass=LanParams.mass;
  std::cout << GridLogIRL<< "mass "<<mass<<std::endl;
  WilsonOp WilsonOperator(Umu,*UGrid,*UrbGrid,mass,Params);
  // const int nbasis = 20;            // size of approximate basis for low-mode space
  const int nbasis = 3;            // size of approximate basis for low-mode space
  const int cb = 0 ;
  LatticeFermion prom(FGrid);
  typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
  typedef LittleDiracOperator::CoarseVector CoarseVector;
  NextToNearestStencilGeometry5D geom(Coarse5d);
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
 //  typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
 //  typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
 //  typedef ShiftedComplexPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedComplexPVdagM_t;
 //  PVdagM_t PVdagM(Ddwf, Dpv);
 //  ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv);
 //  SquaredLinearOperator<DomainWallFermionD, LatticeFermionD> Dsq (Ddwf);
 //  NonHermitianLinearOperator<DomainWallFermionD, LatticeFermionD> DLinOp (Ddwf);
  NonHermitianLinearOperator<WilsonOp,FermionField> Dwilson(WilsonOperator); /// <-----
 //  InvertNonHermitianLinearOperator<WilsonOp,FermionField> Iwilson(WilsonOperator); /// <-----
  MdagMLinearOperator<WilsonOp,FermionField> HermOp(WilsonOperator); /// <-----
  Gamma5HermitianLinearOperator <WilsonOp,LatticeFermion> HermOp2(WilsonOperator); /// <----
  // PowerMethod<LatticeFermion> PM; PM(PVdagM, src);
  resid=LanParams.resid;
  Nstop=LanParams.Nstop;
  Nk=LanParams.Nk;
  Np=LanParams.Np;
  maxIter=LanParams.maxIter;
  Nm = Nk + Np;
  int Nu=16;
  std::vector<LatticeFermion> src(Nu,FGrid); 
  for(int i=0;i<Nu;i++) random(RNG5,src[i]);
  if(LanParams.ReadEvec) {
    std::string evecs_file="evec_in";
    std::cout << GridLogIRL<< "Reading evecs from "<<evecs_file<<std::endl;
    emptyUserRecord record;
    Grid::ScidacReader RD;
    RD.open(evecs_file);
    RD.readScidacFieldRecord(src[0],record);
    RD.close();
  }
  Coordinate origin ({0,0,0,0});
  auto tmpSrc = peekSite(src[0], origin);
  std::cout << "[DEBUG] Source at origin = " <<  tmpSrc << std::endl;
  LatticeFermion src2 = src[0];
  // Run KrylovSchur and Arnoldi on a Hermitian matrix
  std::cout << GridLogMessage << "Running Krylov Schur" << std::endl;
 #if 0
 #if 1
    RealD shift=1.5;
    KrylovSchur KrySchur (Dwilson, UGrid, resid,EvalImNormSmall);
    KrySchur(src[0], maxIter, Nm, Nk, Nstop,&shift);
 #else
    KrylovSchur KrySchur (Iwilson, UGrid, resid,EvalImNormSmall);
    KrySchur(src[0], maxIter, Nm, Nk, Nstop);
 #endif
    std::cout << GridLogMessage << "evec.size= " << KrySchur.evecs.size()<< std::endl;
 #else
  LanczosBidiagonalization<FermionField> LB(Dwilson, UGrid);
  LB.run(src[0], Nm, resid);
  RestartedLanczosBidiagonalization<FermionField> IRLBA(Dwilson, UGrid, Nstop, Nm, resid, maxIter,false);
  IRLBA.run(src[0]);
 #endif
 #if 0
  src[0]=KrySchur.evecs[0];
  for (int i=1;i<Nstop;i++) src[0]+=KrySchur.evecs[i];
  for (int i=0;i<Nstop;i++) 
  {
 	std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
        auto evdensity = localInnerProduct(KrySchur.evecs[i],KrySchur.evecs[i] );
        writeFile(evdensity,evfile);
  }
  {
        std::string evfile ("./evec_"+std::to_string(mass)+"_sum");
 //        auto evdensity = localInnerProduct(evec[i],evec[i] );
        writeFile(src[0],evfile);
  }
 #endif
  /*
  std::cout << GridLogMessage << "Running Arnoldi" << std::endl;
  // Arnoldi Arn (Dsq, FGrid, 1e-8);
  Arnoldi Arn (DLinOp, FGrid, 1e-8);
  testSchurFromHess<LatticeFermion>(Arn, src, 10, 6, 4);
  Arnoldi Arn2 (DLinOp, FGrid, 1e-8);
  testSchurFromHess<LatticeFermion>(Arn2, src, 16, 12, 8);
  */
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
  std::cout<<GridLogMessage<<std::endl;
  std::cout<<GridLogMessage << "Done "<< std::endl;
  Grid_finalize();
  return 0;
 }
@@ -6,7 +6,7 @@ Source file: ./tests/Test_dwf_lanczos.cc
 Copyright (C) 2015
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Chulwoo Jung <chulwoo@bnl.gov>
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -27,6 +27,9 @@ directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/parallelIO/IldgIOtypes.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
 using namespace std;
 using namespace Grid;
@@ -38,18 +41,111 @@ typedef typename WilsonFermionD::FermionField FermionField;
 RealD AllZero(RealD x) { return 0.; }
 template <class T> void writeFile(T& in, std::string const fname){
 #if 1
  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
  Grid::emptyUserRecord record;
  Grid::ScidacWriter WR(in.Grid()->IsBoss());
  WR.open(fname);
  WR.writeScidacFieldRecord(in,record,0);
  WR.close();
 #endif
  // What is the appropriate way to throw error?
 }
 namespace Grid {
 struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 		  		RealD, mass , 
 		  		RealD, mstep , 
 				Integer, Nstop,
                                Integer, Nk,
                                Integer, Np,
                                Integer, ReadEvec,
 	  			RealD, resid,
 	  			RealD, ChebyLow,
 	  			RealD, ChebyHigh,
 	  			Integer, ChebyOrder)
 //                                  Integer, StartTrajectory,
 //                                  Integer, Trajectories, /* @brief Number of sweeps in this run */
 //                                  bool, MetropolisTest,
 //                                  Integer, NoMetropolisUntil,
 //                                  std::string, StartingType,
 //                                  Integer, SW,
 //				  RealD, Kappa,
 //                                  IntegratorParameters, MD)
  LanczosParameters() {
    ////////////////////////////// Default values
      mass = 0;
 //    MetropolisTest    = true;
 //    NoMetropolisUntil = 10;
 //    StartTrajectory   = 0;
 //    SW                = 2;
 //    Trajectories      = 10;
 //    StartingType      = "HotStart";
    /////////////////////////////////
  }
  template <class ReaderClass >
  LanczosParameters(Reader<ReaderClass> & TheReader){
    initialize(TheReader);
  }
  template < class ReaderClass > 
  void initialize(Reader<ReaderClass> &TheReader){
 //    std::cout << GridLogMessage << "Reading HMC\n";
    read(TheReader, "HMC", *this);
  }
  void print_parameters() const {
 //    std::cout << GridLogMessage << "[HMC parameters] Trajectories            : " << Trajectories << "\n";
 //    std::cout << GridLogMessage << "[HMC parameters] Start trajectory        : " << StartTrajectory << "\n";
 //    std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
 //    std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs    : " << NoMetropolisUntil << "\n";
 //    std::cout << GridLogMessage << "[HMC parameters] Starting type           : " << StartingType << "\n";
 //    MD.print_parameters();
  }
 };
 }
 int main(int argc, char** argv) {
  Grid_init(&argc, &argv);
  int Ndir=4;
  auto mpi_layout  = GridDefaultMpi();
  std::vector<int> nblock(4,1);
  std::vector<int> mpi_split(4,1);
 //Interested in avoiding degeneracy only for now
  nblock[3]=2;
  int mrhs=1;
  for(int i =0;i<Ndir;i++){
      mpi_split[i] = mpi_layout[i] / nblock[i];
      mrhs *= nblock[i];
  }
  GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
      GridDefaultMpi());
-  GridRedBlackCartesian* UrbGrid =
+
-      SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
                                                    GridDefaultSimd(Nd,vComplex::Nsimd()),
                                                    mpi_split,
                                                    *UGrid);
  GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian* FGrid = UGrid;
  GridRedBlackCartesian* FrbGrid = UrbGrid;
-  printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid,
+//  printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
         FrbGrid);
  std::vector<int> seeds4({1, 2, 3, 4});
  std::vector<int> seeds5({5, 6, 7, 8});
@@ -61,7 +157,16 @@ int main(int argc, char** argv) {
  RNG5.SeedFixedIntegers(seeds5);
  LatticeGaugeField Umu(UGrid);
-  SU<Nc>::HotConfiguration(RNG4, Umu);
+//  SU<Nc>::HotConfiguration(RNG4, Umu);
 //  SU<Nc>::ColdConfiguration(Umu);
  FieldMetaData header;
  std::string file("./config");
 //  int precision32 = 0;
 //  int tworow      = 0;
 //  NerscIO::writeConfiguration(Umu,file,tworow,precision32);
  NerscIO::readConfiguration(Umu,header,file);
 /*
  std::vector<LatticeColourMatrix> U(4, UGrid);
@@ -70,30 +175,100 @@ int main(int argc, char** argv) {
  }
 */
-  RealD mass = -0.1;
+  int Nstop = 10;
-  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
+  int Nu = 1;
-  MdagMLinearOperator<FermionOp,LatticeFermion> HermOp(WilsonOperator); /// <-----
+  int Nk = 20;
-  //SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
+  int Np = 80;
  int Nm = Nk + Np;
  int MaxIt = 10000;
  RealD resid = 1.0e-5;
-  const int Nstop = 20;
+  RealD mass = -1.0;
-  const int Nk = 60;
+
-  const int Np = 60;
+  LanczosParameters LanParams;
-  const int Nm = Nk + Np;
+#if 1
-  const int MaxIt = 10000;
+  {
-  RealD resid = 1.0e-6;
+    XmlReader  HMCrd("LanParams.xml");
    read(HMCrd,"LanczosParameters",LanParams);
  }
 #else
  {
    LanParams.mass = mass;
  }
 #endif
  std::cout << GridLogMessage<< LanParams <<std::endl;
  { 
    XmlWriter HMCwr("LanParams.xml.out");
    write(HMCwr,"LanczosParameters",LanParams);
  }
  mass=LanParams.mass;
  resid=LanParams.resid;
  Nstop=LanParams.Nstop;
  Nu = mrhs;
  Nk=LanParams.Nk;
  Np=LanParams.Np;
  Nm = Nk + Np;
 //  FermionField src(FGrid);
  std::vector<FermionField> src(Nu,FGrid);
  for(int i =0;i<Nu;i++) gaussian(RNG5, src[i]);
  if(LanParams.ReadEvec) {
    std::string evecs_file="evec_in";
    std::cout << GridLogIRL<< "Reading evecs from "<<evecs_file<<std::endl;
    emptyUserRecord record;
    Grid::ScidacReader RD;
    RD.open(evecs_file);
    RD.readScidacFieldRecord(src[0],record);
    RD.close();
  }
  std::vector<Complex> boundary = {1,1,1,-1};
 //  std::vector<Complex> boundary = {1,1,1,1};
  FermionOp::ImplParams Params(boundary);
  GridCartesian         * SFGrid   = SGrid;
  GridRedBlackCartesian * SFrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(SFGrid);
 //  GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,SGrid);
  LatticeGaugeField s_Umu(SGrid);
  Grid_split  (Umu,s_Umu);
 while ( mass > - 2.0){
  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass,Params);
  MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
  FermionOp WilsonSplit(s_Umu,*SFGrid,*SFrbGrid,mass,Params);
  MdagMLinearOperator<FermionOp,FermionField> SHermOp(WilsonSplit); /// <-----
  //SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
  Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
  std::vector<double> Coeffs{0, 1.};
  Polynomial<FermionField> PolyX(Coeffs);
-  Chebyshev<FermionField> Cheby(0.0, 10., 12);
+//  Chebyshev<FermionField> Cheby(0.5, 60., 31);
 //                                  RealD, ChebyLow,
 //                                RealD, ChebyHigh,
 //                                Integer, ChebyOrder)
  Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
  FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
     PlainHermOp<FermionField> Op     (HermOp);
     PlainHermOp<FermionField> Op2     (HermOp2);
-  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
+//  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
-
+//  SimpleLanczos<FermionField> IRL(Op,Nstop, Nk, Nm, resid, MaxIt);
    ImplicitlyRestartedBlockLanczos<FermionField> IRBL(HermOp, SHermOp,
                                                     FrbGrid,SFrbGrid,mrhs,
                                                     Cheby,
                                                     Nstop, Nstop*2,
                                                     Nu, Nk, Nm,
                                                     resid, MaxIt,
                                                     IRBLdiagonaliseWithEigen);
  IRBL.split_test=1;
  std::vector<RealD> eval(Nm);
  FermionField src(FGrid);
  gaussian(RNG5, src);
  std::vector<FermionField> evec(Nm, FGrid);
  for (int i = 0; i < 1; i++) {
    std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
@@ -101,9 +276,40 @@ int main(int argc, char** argv) {
  };
  int Nconv;
-  IRL.calc(eval, evec, src, Nconv);
+//  IRL.calc(eval, evec, src, Nconv);
  IRBL.calc(eval, evec, src, Nconv,LanczosType::irbl);
-  std::cout << eval << std::endl;
+  std::cout << mass <<" : " << eval << std::endl;
  Gamma g5(Gamma::Algebra::Gamma5) ;
  ComplexD dot;
  FermionField tmp(FGrid);
  FermionField sav(FGrid);
  sav=evec[0];
  for (int i = 0; i < Nstop ; i++) {
    tmp = g5*evec[i];
    dot = innerProduct(tmp,evec[i]);
    std::cout << mass << " : " << eval[i]  << " " << real(dot) << " " << imag(dot)  << std::endl ;
 //    if ( i<1)
    {
 	std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
        auto evdensity = localInnerProduct(evec[i],evec[i] );
 	writeFile(evdensity,evfile);
    }
    if (i>0) sav += evec[i];
  }
  {
 	std::string evfile ("./evec_"+std::to_string(mass)+"_sum");
 //        auto evdensity = localInnerProduct(evec[i],evec[i] );
 	writeFile(sav,evfile);
  }
  for(int i =0;i<Nu;i++) src[i]=evec[i];
  for(int i=Nu;i<Nstop;i++) src[i%Nu] +=evec[i];
 //  src  = evec[0]+evec[1]+evec[2];
 //  src  += evec[3]+evec[4]+evec[5];
 //  src  += evec[6]+evec[7]+evec[8];
  mass += LanParams.mstep;
 }
  Grid_finalize();
 }
@@ -27,6 +27,7 @@ directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
 #include <Grid/parallelIO/IldgIOtypes.h>
 using namespace std;
 using namespace Grid;
@@ -38,11 +39,32 @@ typedef typename WilsonFermionD::FermionField FermionField;
 RealD AllZero(RealD x) { return 0.; }
 template <class T> void writeFile(T& in, std::string const fname){
 #if 1
  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
  Grid::emptyUserRecord record;
  Grid::ScidacWriter WR(in.Grid()->IsBoss());
  WR.open(fname);
  WR.writeScidacFieldRecord(in,record,0);
  WR.close();
 #endif
  // What is the appropriate way to throw error?
 }
 namespace Grid {
 struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 		  		RealD, mass , 
 		  		RealD, mstep , 
 				Integer, Nstop,
                                Integer, Nk,
                                Integer, Np,
                                Integer, ReadEvec,
                                Integer, maxIter,
 	  			RealD, resid,
 	  			RealD, ChebyLow,
 	  			RealD, ChebyHigh,
 	  			Integer, ChebyOrder)
@@ -115,12 +137,13 @@ int main(int argc, char** argv) {
  LatticeGaugeField Umu(UGrid);
 //  SU<Nc>::HotConfiguration(RNG4, Umu);
 //  SU<Nc>::ColdConfiguration(Umu);
  FieldMetaData header;
  std::string file("./config");
-  int precision32 = 0;
+//  int precision32 = 0;
-  int tworow      = 0;
+//  int tworow      = 0;
 //  NerscIO::writeConfiguration(Umu,file,tworow,precision32);
  NerscIO::readConfiguration(Umu,header,file);
@@ -158,10 +181,32 @@ int main(int argc, char** argv) {
  }
  mass=LanParams.mass;
  resid=LanParams.resid;
  Nstop=LanParams.Nstop;
  Nk=LanParams.Nk;
  Np=LanParams.Np;
  MaxIt=LanParams.maxIter;
  Nm = Nk + Np;
  FermionField src(FGrid);
  gaussian(RNG5, src);
  if(LanParams.ReadEvec) {
    std::string evecs_file="evec_in";
    std::cout << GridLogIRL<< "Reading evecs from "<<evecs_file<<std::endl;
    emptyUserRecord record;
    Grid::ScidacReader RD;
    RD.open(evecs_file);
    RD.readScidacFieldRecord(src,record);
    RD.close();
  }
  std::vector<Complex> boundary = {1,1,1,-1};
 //  std::vector<Complex> boundary = {1,1,1,1};
  FermionOp::ImplParams Params(boundary);
-while ( mass > - 5.0){
+while ( mass > - 2.0){
-  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
+  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass,Params);
  MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
  //SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
  Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
@@ -179,11 +224,10 @@ while ( mass > - 5.0){
     PlainHermOp<FermionField> Op     (HermOp);
     PlainHermOp<FermionField> Op2     (HermOp2);
-  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
+  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
 //  SimpleLanczos<FermionField> IRL(Op,Nstop, Nk, Nm, resid, MaxIt);
  std::vector<RealD> eval(Nm);
  FermionField src(FGrid);
  gaussian(RNG5, src);
  std::vector<FermionField> evec(Nm, FGrid);
  for (int i = 0; i < 1; i++) {
    std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
@@ -192,19 +236,46 @@ while ( mass > - 5.0){
  int Nconv;
  IRL.calc(eval, evec, src, Nconv);
 //  IRL.calc(eval,  src, Nconv);
  std::cout << mass <<" : " << eval << std::endl;
  Gamma g5(Gamma::Algebra::Gamma5) ;
  ComplexD dot;
  FermionField tmp(FGrid);
  FermionField sav(FGrid);
  sav=evec[0];
  for (int i = 0; i < Nstop ; i++) {
    tmp = g5*evec[i];
    dot = innerProduct(tmp,evec[i]);
    std::cout << mass << " : " << eval[i]  << " " << real(dot) << " " << imag(dot)  << std::endl ;
 //    if ( i<1)
    {
 	std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
        auto evdensity = localInnerProduct(evec[i],evec[i] );
 	writeFile(evdensity,evfile);
 //  if(LanParams.ReadEvec) {
 //    std::string evecs_file="evec_in";
  {
    std::cout << GridLogIRL<< "Reading evecs from "<<evfile<<std::endl;
    emptyUserRecord record;
    Grid::ScidacReader RD;
    RD.open(evfile);
    RD.readScidacFieldRecord(evdensity,record);
    RD.close();
  }
    }
    if (i>0) sav += evec[i];
  }
  {
 	std::string evfile ("./evec_"+std::to_string(mass)+"_sum");
 //        auto evdensity = localInnerProduct(evec[i],evec[i] );
 	writeFile(sav,evfile);
  }
  src  = evec[0]+evec[1]+evec[2];
-  mass += -0.1;
+  src  += evec[3]+evec[4]+evec[5];
  src  += evec[6]+evec[7]+evec[8];
  mass += LanParams.mstep;
 }
  Grid_finalize();
Author	SHA1	Message	Date
Chulwoo Jung	09aa843984	Changed batchedInnerProduct for portability	2026-03-17 18:54:18 -04:00
Chulwoo Jung	24752002fa	Verbosity reduction batched inner product for reorthogonalization	2026-03-17 13:02:16 -04:00
Chulwoo Jung	f3223021fd	RestartedLanczosBidiagonalization seems to have been fixed	2026-03-16 14:34:56 -04:00
Chulwoo Jung	4e1d95d3bb	Claude implementation of Thick Restarted Lanczos Bidiagonalization	2026-03-13 19:12:54 -04:00
Chulwoo Jung	2ed38f63ca	Merge branch 'develop' of https://github.com/paboyle/Grid into KS_shifted	2026-03-12 10:49:21 -04:00
Chulwoo Jung	80d2a8d88d	wqMerge branch 'develop' of https://github.com/paboyle/Grid into KS_shifted	2026-03-11 21:49:26 -04:00
Peter Boyle	595ceaac37	Include grid header and make the ENABLE correct	2026-03-11 17:24:44 -04:00
Peter Boyle	daf5834e8e	Fixing incorrect PR about disable fermion instantiations	2026-03-11 17:05:46 -04:00
Chulwoo Jung	2ac5431401	Turning of NERSC header checking	2026-03-06 14:16:20 -05:00
Peter Boyle	0d8658a039	Optimised	2026-03-05 06:06:32 -05:00
Peter Boyle	095e004d01	Setup change GCR	2026-03-05 06:06:32 -05:00
Peter Boyle	0acabee7f6	Modest change	2026-03-05 06:06:32 -05:00
Peter Boyle	76fbcffb60	Improvement to 16^3 hdcg	2026-03-05 06:06:32 -05:00
Peter Boyle	a0a62d7ead	Merge pull request #478 from vataspro/PolyakovUpstream Spatial Polyakov Loop implementation	2026-02-24 20:45:42 -05:00
Peter Boyle	c5038ea6a5	Merge pull request #483 from cmcknigh/bugfix/rocm7-rocblas-type-refactor Adding a version check to handle rocBlas type refactor	2026-02-24 20:45:03 -05:00
Peter Boyle	a5120903eb	Merge pull request #486 from RChrHill/fix/sp4-fp32 Define Sp4 ProjectOnGeneralGroup for generic vtype	2026-02-24 20:44:08 -05:00
Peter Boyle	00b286a08a	Merge pull request #488 from RChrHill/feature/additional-ET-traces Add ET support for Lattice spin- and colour-traces	2026-02-24 20:43:45 -05:00
Peter Boyle	24a9759353	Merge pull request #485 from edbennett/skip-fermion-instantiations Be able to skip compiling fermion instantiations altogether	2026-02-24 20:43:20 -05:00
edbennett	1b56f6f46d	be able to skip compiling fermion instantiations altogether	2026-02-24 23:52:18 +00:00
Peter Boyle	2a8084d569	Subspace setup	2026-02-13 17:26:11 -05:00
Peter Boyle	6ff29f9d4f	Alternate multigrids	2026-02-13 17:25:45 -05:00
RChHill	c4d3e79193	Add ET support for Lattice spin- and colour-traces	2026-01-29 14:46:52 +00:00
Chulwoo Jung	3e71cac7ae	Merge branch 'specflow2' of github.com:chulwoo1/Grid into KS_shifted	2026-01-15 03:40:52 +00:00
Chulwoo Jung	e8e7ef08fc	KrylovSchur and spectral flow updates	2026-01-15 03:20:01 +00:00
Chulwoo Jung	5c00fe6bef	Merge branch 'develop' of github.com:poare/Grid into KS_shifted	2026-01-12 06:26:15 +00:00
Chulwoo Jung	3175788f97	Added explicit shift before pulling	2026-01-12 06:25:09 +00:00
Patrick Oare	6f1788bb38	modified Givens rotation to implement a sparse multiplication	2026-01-06 16:19:48 -05:00
Peter Boyle	7cd3f21e6b	preserving a bunch of experiments on setup and g5 subspace doubling	2026-01-06 05:57:39 -05:00
Chulwoo Jung	dcda74f924	Timing info for schurReorder,etc	2025-12-18 18:23:50 +00:00
Chulwoo Jung	df4c2a082b	Reducing comments	2025-12-09 14:23:22 +00:00
Chulwoo Jung	88611659a3	Appear to be working	2025-12-08 21:08:14 -05:00
Chulwoo Jung	504b85dfc0	Restarting and adding codes back in	2025-12-08 13:27:06 -05:00
Chulwoo Jung	43ea83e5e1	Checking in to move back to genoa	2025-12-05 23:56:40 +00:00
Chulwoo Jung	376150c3df	Adding	2025-12-04 21:29:31 -05:00
Chulwoo Jung	842e0391e7	Checking in to move back to aurora	2025-12-04 20:13:44 -05:00
Chulwoo Jung	17e3799bcc	Necessary code for Harmonic KS added	2025-12-03 19:38:45 -05:00
Chulwoo Jung	985ab70f85	Checking in without adjusting Nk	2025-12-03 14:46:34 -05:00
Chulwoo Jung	1e85081986	Adding shift and debugging	2025-12-03 00:16:51 -05:00
Chulwoo Jung	3876fe5a29	Merge branch 'KrylovSchur' of github.com:chulwoo1/Grid into KS_shifted	2025-12-02 17:46:40 -05:00
Chulwoo Jung	6692425aa2	Checking in before pulling	2025-11-26 17:17:22 -05:00
Chulwoo Jung	d5ac4fc67f	Starting to modified KS	2025-11-26 22:13:27 +00:00
Chulwoo Jung	3538faf449	Starting Harmonic (shift and inverse)	2025-11-24 17:05:35 -05:00
RChHill	b650b89682	Define Sp4 ProjectOnGeneralGroup for generic vtype	2025-11-19 13:26:52 +00:00
Patrick Oare	0b457b9d52	fixed ritz estimate bug	2025-11-07 18:56:08 +00:00
Chulwoo Jung	fe0ab5f1a9	Merge branch 'develop' of github.com:poare/Grid into develop	2025-11-07 15:50:22 +00:00
Chulwoo Jung	caa66418bd	Checking in before pulling	2025-11-06 22:44:05 +00:00
Allen McKnight	4304245c1b	Merge branch 'develop' into bugfix/rocm7-rocblas-type-refactor	2025-11-04 08:50:11 -06:00
Chulwoo Jung	786496f22e	Checking in before pulling KrylovSchur	2025-11-03 21:18:56 +00:00
Patrick R Oare	68af1bba67	commented some slow code out	2025-10-31 11:47:29 -04:00
Patrick Oare	bf2a715ef7	bug in wilson eigenvectors: ritz estimates not equalling deviation from being an evec	2025-10-31 15:31:46 +00:00
Patrick Oare	4042ebf1bf	added ImNorm to sort	2025-10-20 19:01:53 +00:00
Your Name	1d1fd3bcaf	adding a version check to handle rocblas type change	2025-10-02 15:24:24 -05:00
Patrick R Oare	82f35001ff	small bug fix for wilson spectrum since we're actually running DWF	2025-09-25 15:36:42 -04:00
Patrick Oare	fa30c791aa	updated wilson spec	2025-09-23 15:24:50 +00:00
Patrick Oare	612049f1dd	commented out evec writer because it was taking up all the space on SDCC	2025-09-18 15:09:31 -04:00
Patrick Oare	0b92ef990c	found bug in unprec DWF: was using \|\cdot\| in comparison for the eigenvalue sorting	2025-09-12 13:31:39 -04:00
Patrick Oare	82d411ca7b	added inline to rf functions	2025-09-10 17:16:48 -04:00
Patrick Oare	597086a031	added wilson spectrum example	2025-09-10 15:41:00 -04:00
Patrick Oare	b210ddf9a7	added commented out line to run un-preconditioned DWF	2025-09-09 15:14:11 -04:00
Patrick Oare	c5d02e5799	updated RitzFilter enum and the input to run krylov schur	2025-09-09 13:02:11 -04:00
Patrick Oare	9dcd7ca761	added IO for evecs / evals	2025-09-08 12:59:48 -04:00
Chulwoo Jung	c1e5ef9476	Adding config input	2025-08-15 20:52:36 +00:00
Patrick Oare	6fd71aea9d	may have found bug	2025-08-15 12:13:01 -04:00
Patrick Oare	a18b0d496c	added more debug output	2025-08-15 11:51:15 -04:00
Patrick Oare	19f0737b98	trying one more thing	2025-08-14 14:47:38 -04:00
Patrick Oare	16d3c9cf75	added another debug feature	2025-08-14 14:37:49 -04:00
Patrick Oare	3b9fc72451	modified debug output slightly	2025-08-14 14:06:32 -04:00
Patrick Oare	99644f5d0a	commented out arg assert in kryschur spec example	2025-08-13 14:57:57 -04:00
Chulwoo Jung	7780d88d26	Adding simple lanczos, boundary to specflow(!)	2025-08-06 23:41:53 +00:00
Chulwoo Jung	2bf9179d2c	Adding mass step	2025-08-06 16:52:51 +00:00
Chulwoo Jung	c606f5dca0	Move out src initialization for re-use / Adding antiperiodic BC	2025-08-06 16:51:14 +00:00
Patrick Oare	632f5916c7	small log change for KS	2025-08-04 15:43:23 -04:00
Patrick Oare	9057694895	added double orthog to KS	2025-08-04 15:30:18 -04:00
Patrick Oare	5e85aef19d	added updates to GCR polynomial code	2025-07-31 16:42:35 -04:00
Patrick Oare	2b6d40c7e1	added example files	2025-07-31 16:41:28 -04:00
Patrick Oare	33b80c4e8e	added eigensolver code for arnoldi and krylov schur	2025-07-31 16:40:24 -04:00
Chulwoo Jung	8419cc5c64	specflow evec I/O added,	2025-07-11 15:57:23 -04:00
Alexis Provatas	c646d91527	Fix names, protect against bad index values, clean docstrings	2025-05-01 10:52:00 +01:00
Alexis Provatas	a2b98d82e1	remove obsolete spatial polyakov observable file	2025-05-01 10:52:00 +01:00
Alexis Provatas	7b9415c088	Move observable logger to Polyakov Loop file and fix docstring	2025-05-01 10:52:00 +01:00
Alexis Provatas	cb7110f492	Add Spatial Polyakov Loop observable	2025-05-01 10:52:00 +01:00
Alexis Provatas	0c7af66490	Create Spatial Polyakov Observable Module	2025-05-01 10:52:00 +01:00
Alexis Provatas	496d1b914a	Generalise Polyakov loop and overload for temporal direction	2025-05-01 10:52:00 +01:00
Chulwoo Jung	2cc6deb8e0	Merge branch 'develop' of https://github.com/paboyle/Grid into ic2	2025-04-25 10:48:41 -04:00
Chulwoo Jung	19d0590579	Checking in for merging	2025-04-25 10:48:22 -04:00