mirror of https://github.com/paboyle/Grid.git synced 2025-11-06 06:49:30 +00:00

Compare commits


6 Commits

Author SHA1 Message Date
Chulwoo Jung  7780d88d26  Adding simple lanczos, boundary to specflow(!)  2025-08-06 23:41:53 +00:00
Chulwoo Jung  2bf9179d2c  Adding mass step  2025-08-06 16:52:51 +00:00
Chulwoo Jung  c606f5dca0  Move out src initialization for re-use / Adding antiperiodic BC  2025-08-06 16:51:14 +00:00
Chulwoo Jung  8419cc5c64  specflow evec I/O added,  2025-07-11 15:57:23 -04:00
Chulwoo Jung  2cc6deb8e0  Merge branch 'develop' of https://github.com/paboyle/Grid into ic2  2025-04-25 10:48:41 -04:00
Chulwoo Jung  19d0590579  Checking in for merging  2025-04-25 10:48:22 -04:00
32 changed files with 1063 additions and 936 deletions

View File

@@ -73,6 +73,7 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/SimpleLanczos.h>
#include <Grid/algorithms/iterative/PowerMethod.h>
#include <Grid/algorithms/iterative/AdefGeneric.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>

View File

@@ -269,9 +269,7 @@ public:
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
grid->Barrier();
axpby(T1,xscale,mscale,y,in);
grid->Barrier();
// sum = .5 c[0] T0 + c[1] T1
// out = ()*T0 + Coeffs[1]*T1;
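For orientation, the rescaling in this hunk maps the operator spectrum from [lo,hi] onto [-1,1] before the Chebyshev recurrence is applied; a worked form of the first steps, written with the names used above (a summary, not part of the patch):

// Ahat    = (2*A - (hi+lo)) / (hi-lo)             spectral map [lo,hi] -> [-1,1]
// T0      = in
// T1      = xscale*(A*in) + mscale*in = Ahat*in   the HermOp + axpby above
// T_{k+1} = 2*Ahat*T_k - T_{k-1}                  standard three-term recurrence
// out     = 0.5*Coeffs[0]*T0 + Coeffs[1]*T1 + ... as in the trailing comments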

View File

@@ -0,0 +1,931 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/SimpleLanczos.h
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LANC_H
#define GRID_LANC_H
#include <string.h> //memset
#ifdef USE_LAPACK
#ifdef USE_MKL
#include<mkl_lapack.h>
#else
void LAPACK_dstegr (char *jobz, char *range, int *n, double *d, double *e,
double *vl, double *vu, int *il, int *iu, double *abstol,
int *m, double *w, double *z, int *ldz, int *isuppz,
double *work, int *lwork, int *iwork, int *liwork,
int *info);
//#include <lapacke/lapacke.h>
#endif
#endif
//#include <Grid/algorithms/densematrix/DenseMatrix.h>
// eliminate temporary vector in calc()
#define MEM_SAVE
namespace Grid
{
struct Bisection
{
#if 0
static void get_eig2 (int row_num, std::vector < RealD > &ALPHA,
std::vector < RealD > &BETA,
std::vector < RealD > &eig)
{
int i, j;
std::vector < RealD > evec1 (row_num + 3);
std::vector < RealD > evec2 (row_num + 3);
RealD eps2;
ALPHA[1] = 0.;
BETA[1] = 0.;
for (i = 0; i < row_num - 1; i++)
{
ALPHA[i + 1] = A[i * (row_num + 1)].real ();
BETA[i + 2] = A[i * (row_num + 1) + 1].real ();
}
ALPHA[row_num] = A[(row_num - 1) * (row_num + 1)].real ();
bisec (ALPHA, BETA, row_num, 1, row_num, 1e-10, 1e-10, evec1, eps2);
bisec (ALPHA, BETA, row_num, 1, row_num, 1e-16, 1e-16, evec2, eps2);
// Do we really need to sort here?
int begin = 1;
int end = row_num;
int swapped = 1;
while (swapped)
{
swapped = 0;
for (i = begin; i < end; i++)
{
if (mag (evec2[i]) > mag (evec2[i + 1]))
{
swap (evec2 + i, evec2 + i + 1);
swapped = 1;
}
}
end--;
for (i = end - 1; i >= begin; i--)
{
if (mag (evec2[i]) > mag (evec2[i + 1]))
{
swap (evec2 + i, evec2 + i + 1);
swapped = 1;
}
}
begin++;
}
for (i = 0; i < row_num; i++)
{
for (j = 0; j < row_num; j++)
{
if (i == j)
H[i * row_num + j] = evec2[i + 1];
else
H[i * row_num + j] = 0.;
}
}
}
#endif
static void bisec (std::vector < RealD > &c,
std::vector < RealD > &b,
int n,
int m1,
int m2,
RealD eps1,
RealD relfeh, std::vector < RealD > &x, RealD & eps2)
{
std::vector < RealD > wu (n + 2);
RealD h, q, x1, xu, x0, xmin, xmax;
int i, a, k;
b[1] = 0.0;
xmin = c[n] - fabs (b[n]);
xmax = c[n] + fabs (b[n]);
for (i = 1; i < n; i++)
{
h = fabs (b[i]) + fabs (b[i + 1]);
if (c[i] + h > xmax)
xmax = c[i] + h;
if (c[i] - h < xmin)
xmin = c[i] - h;
}
xmax *= 2.;
eps2 = relfeh * ((xmin + xmax) > 0.0 ? xmax : -xmin);
if (eps1 <= 0.0)
eps1 = eps2;
eps2 = 0.5 * eps1 + 7.0 * (eps2);
x0 = xmax;
for (i = m1; i <= m2; i++)
{
x[i] = xmax;
wu[i] = xmin;
}
for (k = m2; k >= m1; k--)
{
xu = xmin;
i = k;
do
{
if (xu < wu[i])
{
xu = wu[i];
i = m1 - 1;
}
i--;
}
while (i >= m1);
if (x0 > x[k])
x0 = x[k];
while ((x0 - xu) > 2 * relfeh * (fabs (xu) + fabs (x0)) + eps1)
{
x1 = (xu + x0) / 2;
a = 0;
q = 1.0;
for (i = 1; i <= n; i++)
{
q =
c[i] - x1 -
((q != 0.0) ? b[i] * b[i] / q : fabs (b[i]) / relfeh);
if (q < 0)
a++;
}
// printf("x1=%0.14e a=%d\n",x1,a);
if (a < k)
{
if (a < m1)
{
xu = x1;
wu[m1] = x1;
}
else
{
xu = x1;
wu[a + 1] = x1;
if (x[a] > x1)
x[a] = x1;
}
}
else
x0 = x1;
}
printf ("x0=%0.14e xu=%0.14e k=%d\n", x0, xu, k);
x[k] = (x0 + xu) / 2;
}
}
};
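The bisec routine above brackets each requested eigenvalue of the symmetric tridiagonal matrix by counting, via the Sturm sequence, how many eigenvalues lie below a trial point. A minimal standalone sketch of that counting step, mirroring the inner loop of bisec (illustrative names, not part of the patch):

#include <cmath>
#include <vector>

// Count eigenvalues strictly below x1 for the symmetric tridiagonal matrix with
// diagonal c[1..n] and off-diagonal b[2..n] (1-based storage as in Bisection::bisec).
static int sturm_count(const std::vector<double> &c, const std::vector<double> &b,
                       int n, double x1, double relfeh) {
  int a = 0;
  double q = 1.0;
  for (int i = 1; i <= n; i++) {
    q = c[i] - x1 - ((q != 0.0) ? b[i] * b[i] / q : std::fabs(b[i]) / relfeh);
    if (q < 0) a++;  // one sign change per eigenvalue below x1
  }
  return a;  // bisection narrows [xu,x0] until the count isolates the k-th eigenvalue
}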
/////////////////////////////////////////////////////////////
// Simple (non-restarted) Lanczos
/////////////////////////////////////////////////////////////
template < class Field > class SimpleLanczos
{
const RealD small = 1.0e-16;
public:
int lock;
int get;
int Niter;
int converged;
int Nstop; // Number of evecs checked for convergence
int Nk; // Number of converged eigenvalues sought
int Np; // Np -- Number of spare vecs in Krylov space
int Nm; // Nm -- total number of vectors
RealD OrthoTime;
RealD eresid;
// SortEigen < Field > _sort;
LinearFunction < Field > &_Linop;
// OperatorFunction < Field > &_poly;
/////////////////////////
// Constructor
/////////////////////////
void init (void)
{
};
// void Abort (int ff, std::vector < RealD > &evals, DenseVector < Denstd::vector < RealD > >&evecs);
SimpleLanczos (LinearFunction < Field > &Linop, // op
// OperatorFunction < Field > &poly, // polynomial
int _Nstop, // sought vecs
int _Nk, // sought vecs
int _Nm, // spare vecs
RealD _eresid, // convergence residual on the eigenvalues
int _Niter): // Max iterations
_Linop (Linop),
// _poly (poly),
Nstop (_Nstop), Nk (_Nk), Nm (_Nm), eresid (_eresid), Niter (_Niter)
{
Np = Nm - Nk;
assert (Np > 0);
};
/////////////////////////
// Sanity checked this routine (step) against Saad.
/////////////////////////
void RitzMatrix (std::vector < Field > &evec, int k)
{
if (1)
return;
GridBase *grid = evec[0].Grid();
Field w (grid);
std::cout << GridLogMessage << "RitzMatrix " << std::endl;
for (int i = 0; i < k; i++)
{
_Linop(evec[i], w);
// _poly(_Linop,evec[i],w);
std::cout << GridLogMessage << "[" << i << "] ";
for (int j = 0; j < k; j++)
{
ComplexD in = innerProduct (evec[j], w);
if (fabs ((double) i - j) > 1)
{
if (abs (in) > 1.0e-9)
{
std::cout << GridLogMessage << "oops" << std::endl;
abort ();
}
else
std::cout << GridLogMessage << " 0 ";
}
else
{
std::cout << GridLogMessage << " " << in << " ";
}
}
std::cout << GridLogMessage << std::endl;
}
}
void step (std::vector < RealD > &lmd,
std::vector < RealD > &lme,
Field & last, Field & current, Field & next, uint64_t k)
{
if (lmd.size () <= k)
lmd.resize (k + Nm);
if (lme.size () <= k)
lme.resize (k + Nm);
// _poly(_Linop,current,next ); // 3. w_k := A v_k - beta_k v_{k-1}
_Linop(current, next); // 3. w_k := A v_k - beta_k v_{k-1}
if (k > 0)
{
next -= lme[k - 1] * last;
}
// std::cout<<GridLogMessage << "<last|next>" << innerProduct(last,next) <<std::endl;
ComplexD zalph = innerProduct (current, next); // 4. alpha_k := (w_k, v_k)
RealD alph = real (zalph);
next = next - alph * current; // 5. w_k := w_k - alpha_k v_k
// std::cout<<GridLogMessage << "<current|next>" << innerProduct(current,next) <<std::endl;
RealD beta = normalise (next); // 6. beta_{k+1} := ||w_k||. If beta_{k+1} = 0 then Stop
// 7. v_{k+1} := w_k / beta_{k+1}
// norm=beta;
int interval = Nm / 100 + 1;
if ((k % interval) == 0)
std::cout << GridLogMessage << k << " : alpha = " << zalph << " beta " << beta << std::endl;
const RealD tiny = 1.0e-20;
if (beta < tiny)
{
std::cout << GridLogMessage << " beta is tiny " << beta << std::
endl;
}
lmd[k] = alph;
lme[k] = beta;
}
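Written out, step implements the standard three-term Lanczos recurrence; a summary in the notation of the numbered comments above (not additional patch content):

// w        = A v_k                          _Linop(current, next)
// w       -= beta_{k-1} v_{k-1}             next -= lme[k-1] * last
// alpha_k  = <v_k, w>                       zalph = innerProduct(current, next)
// w       -= alpha_k v_k                    next = next - alph * current
// beta_k   = ||w||,  v_{k+1} = w / beta_k   normalise(next)
// lmd[k] = alpha_k and lme[k] = beta_k fill the tridiagonal matrix whose
// eigenvalues approximate those of the operator.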
void qr_decomp (std::vector < RealD > &lmd,
std::vector < RealD > &lme,
int Nk,
int Nm,
std::vector < RealD > &Qt, RealD Dsh, int kmin, int kmax)
{
int k = kmin - 1;
RealD x;
RealD Fden = 1.0 / hypot (lmd[k] - Dsh, lme[k]);
RealD c = (lmd[k] - Dsh) * Fden;
RealD s = -lme[k] * Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k + 1];
RealD tmpb = lme[k];
lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
x = -s * lme[k + 1];
lme[k + 1] = c * lme[k + 1];
for (int i = 0; i < Nk; ++i)
{
RealD Qtmp1 = Qt[i + Nm * k];
RealD Qtmp2 = Qt[i + Nm * (k + 1)];
Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
}
// Givens transformations
for (int k = kmin; k < kmax - 1; ++k)
{
RealD Fden = 1.0 / hypot (x, lme[k - 1]);
RealD c = lme[k - 1] * Fden;
RealD s = -x * Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k + 1];
RealD tmpb = lme[k];
lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
lme[k - 1] = c * lme[k - 1] - s * x;
if (k != kmax - 2)
{
x = -s * lme[k + 1];
lme[k + 1] = c * lme[k + 1];
}
for (int i = 0; i < Nk; ++i)
{
RealD Qtmp1 = Qt[i + Nm * k];
RealD Qtmp2 = Qt[i + Nm * (k + 1)];
Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
}
}
}
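The shifted QR sweep above is composed of 2x2 Givens rotations; a minimal sketch of how the first rotation's cosine/sine are formed, consistent with the Fden, c, s used at the top of qr_decomp (hypothetical helper, not part of the patch):

#include <cmath>

// Rotation G(c,s) chosen to annihilate b when applied to the pair (a, b).
// With a = lmd[k]-Dsh and b = lme[k] this reproduces
//   Fden = 1/hypot(lmd[k]-Dsh, lme[k]),  c = (lmd[k]-Dsh)*Fden,  s = -lme[k]*Fden.
static void givens(double a, double b, double &c, double &s) {
  double Fden = 1.0 / std::hypot(a, b);  // robust 1/sqrt(a*a + b*b)
  c = a * Fden;
  s = -b * Fden;
}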
#if 0
#ifdef USE_LAPACK
#ifdef USE_MKL
#define LAPACK_INT MKL_INT
#else
#define LAPACK_INT long long
#endif
void diagonalize_lapack (std::vector < RealD > &lmd, std::vector < RealD > &lme, int N1, // all
int N2, // get
GridBase * grid)
{
const int size = Nm;
LAPACK_INT NN = N1;
double evals_tmp[NN];
double DD[NN];
double EE[NN];
for (int i = 0; i < NN; i++)
for (int j = i - 1; j <= i + 1; j++)
if (j < NN && j >= 0)
{
if (i == j)
DD[i] = lmd[i];
if (i == j)
evals_tmp[i] = lmd[i];
if (j == (i - 1))
EE[j] = lme[j];
}
LAPACK_INT evals_found;
LAPACK_INT lwork =
((18 * NN) >
(1 + 4 * NN + NN * NN) ? (18 * NN) : (1 + 4 * NN + NN * NN));
LAPACK_INT liwork = 3 + NN * 10;
LAPACK_INT iwork[liwork];
double work[lwork];
LAPACK_INT isuppz[2 * NN];
char jobz = 'N'; // calculate evals only
char range = 'I'; // calculate il-th to iu-th evals
// char range = 'A'; // calculate all evals
char uplo = 'U'; // refer to upper half of original matrix
char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
int ifail[NN];
LAPACK_INT info;
// int total = QMP_get_number_of_nodes();
// int node = QMP_get_node_number();
// GridBase *grid = evec[0]._grid;
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (NN / total) + 1;
double vl = 0.0, vu = 0.0;
LAPACK_INT il = interval * node + 1, iu = interval * (node + 1);
if (iu > NN)
iu = NN;
double tol = 0.0;
if (1)
{
memset (evals_tmp, 0, sizeof (double) * NN);
if (il <= NN)
{
printf ("total=%d node=%d il=%d iu=%d\n", total, node, il, iu);
#ifdef USE_MKL
dstegr (&jobz, &range, &NN,
#else
LAPACK_dstegr (&jobz, &range, &NN,
#endif
(double *) DD, (double *) EE, &vl, &vu, &il, &iu, // these four are ignored if the second parameter is 'A'
&tol, // tolerance
&evals_found, evals_tmp, (double *) NULL, &NN,
isuppz, work, &lwork, iwork, &liwork, &info);
for (int i = iu - 1; i >= il - 1; i--)
{
printf ("node=%d evals_found=%d evals_tmp[%d] = %g\n", node,
evals_found, i - (il - 1), evals_tmp[i - (il - 1)]);
evals_tmp[i] = evals_tmp[i - (il - 1)];
if (il > 1)
evals_tmp[i - (il - 1)] = 0.;
}
}
{
grid->GlobalSumVector (evals_tmp, NN);
}
}
// Cheating a bit: it would be better to sort rather than just reverse, but the documentation of the routine says the evals are returned in increasing order, while qr gives evals in decreasing order.
}
#undef LAPACK_INT
#endif
void diagonalize (std::vector < RealD > &lmd,
std::vector < RealD > &lme,
int N2, int N1, GridBase * grid)
{
#ifdef USE_LAPACK
const int check_lapack = 0; // just use lapack if 0, check against lapack if 1
if (!check_lapack)
return diagonalize_lapack (lmd, lme, N2, N1, grid);
// diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
#endif
}
#endif
static RealD normalise (Field & v)
{
RealD nn = norm2 (v);
nn = sqrt (nn);
v = v * (1.0 / nn);
return nn;
}
void orthogonalize (Field & w, std::vector < Field > &evec, int k)
{
double t0 = -usecond () / 1e6;
typedef typename Field::scalar_type MyComplex;
MyComplex ip;
if (0)
{
for (int j = 0; j < k; ++j)
{
normalise (evec[j]);
for (int i = 0; i < j; i++)
{
ip = innerProduct (evec[i], evec[j]); // assumes the evecs are already normalised
evec[j] = evec[j] - ip * evec[i];
}
}
}
for (int j = 0; j < k; ++j)
{
ip = innerProduct (evec[j], w); // assumes the evecs are already normalised
w = w - ip * evec[j];
}
normalise (w);
t0 += usecond () / 1e6;
OrthoTime += t0;
}
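orthogonalize is one pass of Gram-Schmidt projection of w against the stored vectors (the if(0) branch re-orthogonalises the basis itself and is disabled). A condensed sketch of the active branch, assuming unit-norm evec[j] and the same Grid operations used above (illustrative only):

template <class Field>
void project_out(Field &w, const std::vector<Field> &evec, int k) {
  for (int j = 0; j < k; ++j) {
    auto ip = innerProduct(evec[j], w);  // overlap with the j-th basis vector
    w = w - ip * evec[j];                // remove that component
  }
  w = w * (1.0 / sqrt(norm2(w)));        // renormalise the remainder
}

In finite precision a single classical Gram-Schmidt pass can leave non-negligible overlaps; a second identical pass is the usual remedy.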
void setUnit_Qt (int Nm, std::vector < RealD > &Qt)
{
for (int i = 0; i < Qt.size (); ++i)
Qt[i] = 0.0;
for (int k = 0; k < Nm; ++k)
Qt[k + k * Nm] = 1.0;
}
void calc (std::vector < RealD > &eval, const Field & src, int &Nconv)
{
GridBase *grid = src.Grid();
// assert(grid == src._grid);
std::cout << GridLogMessage << " -- Nk = " << Nk << " Np = " << Np << std::endl;
std::cout << GridLogMessage << " -- Nm = " << Nm << std::endl;
std::cout << GridLogMessage << " -- size of eval = " << eval.size () << std::endl;
// assert(c.size() && Nm == eval.size());
std::vector < RealD > lme (Nm);
std::vector < RealD > lmd (Nm);
Field current (grid);
Field last (grid);
Field next (grid);
Nconv = 0;
RealD beta_k;
// Set initial vector to the source
// (the commented-out alternative was a uniform vector: evec[0] = 1.0;)
current = src;
std::cout << GridLogMessage << "norm2(src)= " << norm2 (src) << std::
endl;
normalise (current);
std::
cout << GridLogMessage << "norm2(evec[0])= " << norm2 (current) <<
std::endl;
// Initial Nk steps
OrthoTime = 0.;
double t0 = usecond () / 1e6;
RealD norm; // sqrt norm of last vector
uint64_t iter = 0;
bool initted = false;
std::vector < RealD > low (Nstop * 10);
std::vector < RealD > high (Nstop * 10);
RealD cont = 0.;
while (1) {
cont = 0.;
std::vector < RealD > lme2 (Nm);
std::vector < RealD > lmd2 (Nm);
for (uint64_t k = 0; k < Nm; ++k, iter++) {
step (lmd, lme, last, current, next, iter);
last = current;
current = next;
}
double t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL::Initial steps: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
std::
cout << GridLogMessage << "IRL::Initial steps:OrthoTime " <<
OrthoTime << "seconds" << std::endl;
// getting eigenvalues
lmd2.resize (iter + 2);
lme2.resize (iter + 2);
for (uint64_t k = 0; k < iter; ++k) {
lmd2[k + 1] = lmd[k];
lme2[k + 2] = lme[k];
}
t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL:: copy: " << t1 -
t0 << "seconds" << std::endl;
t0 = t1;
{
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (Nstop / total) + 1;
int iu = (iter + 1) - (interval * node + 1);
int il = (iter + 1) - (interval * (node + 1));
std::vector < RealD > eval2 (iter + 3);
RealD eps2;
Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2, eps2);
// diagonalize(eval2,lme2,iter,Nk,grid);
RealD diff = 0.;
for (int i = il; i <= iu; i++) {
if (initted)
diff = fabs (eval2[i] - high[iu-i]) / (fabs (eval2[i]) + fabs (high[iu-i]));
if (initted && (diff > eresid))
cont = 1.;
if (initted)
printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i], high[iu-i], diff);
high[iu-i] = eval2[i];
}
il = (interval * node + 1);
iu = (interval * (node + 1));
Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2, eps2);
for (int i = il; i <= iu; i++) {
if (initted)
diff = fabs (eval2[i] - low[i]) / (fabs (eval2[i]) + fabs (low[i]));
if (initted && (diff > eresid))
cont = 1.;
if (initted)
printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i], low[i], diff);
low[i] = eval2[i];
}
t1 = usecond () / 1e6;
std::cout << GridLogMessage << "IRL:: diagonalize: " << t1 - t0 << "seconds" << std::endl;
t0 = t1;
}
for (uint64_t k = 0; k < Nk; ++k) {
// eval[k] = eval2[k];
}
if (initted)
{
grid->GlobalSumVector (&cont, 1);
if (cont < 1.) return;
}
initted = true;
}
}
#if 0
/**
There is some matrix Q such that for any vector y
Q.e_1 = y and Q is unitary.
**/
template < class T >
static T orthQ (DenseMatrix < T > &Q, std::vector < T > y)
{
int N = y.size (); //Matrix Size
Fill (Q, 0.0);
T tau;
for (int i = 0; i < N; i++)
{
Q[i][0] = y[i];
}
T sig = conj (y[0]) * y[0];
T tau0 = fabs (sqrt (sig));
for (int j = 1; j < N; j++)
{
sig += conj (y[j]) * y[j];
tau = abs (sqrt (sig));
if (abs (tau0) > 0.0)
{
T gam = conj ((y[j] / tau) / tau0);
for (int k = 0; k <= j - 1; k++)
{
Q[k][j] = -gam * y[k];
}
Q[j][j] = tau0 / tau;
}
else
{
Q[j - 1][j] = 1.0;
}
tau0 = tau;
}
return tau;
}
/**
There is some matrix Q such that for any vector y
Q.e_k = y and Q is unitary.
**/
template < class T >
static T orthU (DenseMatrix < T > &Q, std::vector < T > y)
{
T tau = orthQ (Q, y);
SL (Q);
return tau;
}
/**
Wind up with a matrix with the first con rows untouched
say con = 2
Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as its 1st column
and the matrix is upper Hessenberg,
and with f and Q appropriately modified, Q is the Arnoldi factorization
**/
template < class T > static void Lock (DenseMatrix < T > &H, ///Hess mtx
DenseMatrix < T > &Q, ///Lock Transform
T val, ///value to be locked
int con, ///number already locked
RealD small, int dfg, bool herm)
{
//ForceTridiagonal(H);
int M = H.dim;
DenseVector < T > vec;
Resize (vec, M - con);
DenseMatrix < T > AH;
Resize (AH, M - con, M - con);
AH = GetSubMtx (H, con, M, con, M);
DenseMatrix < T > QQ;
Resize (QQ, M - con, M - con);
Unity (Q);
Unity (QQ);
DenseVector < T > evals;
Resize (evals, M - con);
DenseMatrix < T > evecs;
Resize (evecs, M - con, M - con);
Wilkinson < T > (AH, evals, evecs, small);
int k = 0;
RealD cold = abs (val - evals[k]);
for (int i = 1; i < M - con; i++)
{
RealD cnew = abs (val - evals[i]);
if (cnew < cold)
{
k = i;
cold = cnew;
}
}
vec = evecs[k];
ComplexD tau;
orthQ (QQ, vec);
//orthQM(QQ,AH,vec);
AH = Hermitian (QQ) * AH;
AH = AH * QQ;
for (int i = con; i < M; i++)
{
for (int j = con; j < M; j++)
{
Q[i][j] = QQ[i - con][j - con];
H[i][j] = AH[i - con][j - con];
}
}
for (int j = M - 1; j > con + 2; j--)
{
DenseMatrix < T > U;
Resize (U, j - 1 - con, j - 1 - con);
DenseVector < T > z;
Resize (z, j - 1 - con);
T nm = norm (z);
for (int k = con + 0; k < j - 1; k++)
{
z[k - con] = conj (H (j, k + 1));
}
normalise (z);
RealD tmp = 0;
for (int i = 0; i < z.size () - 1; i++)
{
tmp = tmp + abs (z[i]);
}
if (tmp < small / ((RealD) z.size () - 1.0))
{
continue;
}
tau = orthU (U, z);
DenseMatrix < T > Hb;
Resize (Hb, j - 1 - con, M);
for (int a = 0; a < M; a++)
{
for (int b = 0; b < j - 1 - con; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += H[a][con + 1 + c] * U[c][b];
} //sum += H(a,con+1+c)*U(c,b);}
Hb[b][a] = sum;
}
}
for (int k = con + 1; k < j; k++)
{
for (int l = 0; l < M; l++)
{
H[l][k] = Hb[k - 1 - con][l];
}
} //H(Hb[k-1-con][l] , l,k);}}
DenseMatrix < T > Qb;
Resize (Qb, M, M);
for (int a = 0; a < M; a++)
{
for (int b = 0; b < j - 1 - con; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += Q[a][con + 1 + c] * U[c][b];
} //sum += Q(a,con+1+c)*U(c,b);}
Qb[b][a] = sum;
}
}
for (int k = con + 1; k < j; k++)
{
for (int l = 0; l < M; l++)
{
Q[l][k] = Qb[k - 1 - con][l];
}
} //Q(Qb[k-1-con][l] , l,k);}}
DenseMatrix < T > Hc;
Resize (Hc, M, M);
for (int a = 0; a < j - 1 - con; a++)
{
for (int b = 0; b < M; b++)
{
T sum = 0;
for (int c = 0; c < j - 1 - con; c++)
{
sum += conj (U[c][a]) * H[con + 1 + c][b];
} //sum += conj( U(c,a) )*H(con+1+c,b);}
Hc[b][a] = sum;
}
}
for (int k = 0; k < M; k++)
{
for (int l = con + 1; l < j; l++)
{
H[l][k] = Hc[k][l - 1 - con];
}
} //H(Hc[k][l-1-con] , l,k);}}
}
}
#endif
};
}
#endif

View File

@@ -260,39 +260,32 @@ CartesianCommunicator::~CartesianCommunicator()
}
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("AllReduce");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("AllReduce");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){
FlightRecorder::StepLog("AllReduce");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
FlightRecorder::StepLog("AllReduce");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
FlightRecorder::StepLog("AllReduceVector");
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
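For context, the GlobalSum overloads above all follow the same in-place reduction pattern; a minimal standalone example of that pattern in plain MPI (a sketch, not Grid code):

#include <mpi.h>
#include <cassert>

// In-place global sum of a double over a communicator, as used by
// CartesianCommunicator::GlobalSum above.
static void global_sum(double &d, MPI_Comm comm) {
  int ierr = MPI_Allreduce(MPI_IN_PLACE, &d, 1, MPI_DOUBLE, MPI_SUM, comm);
  assert(ierr == MPI_SUCCESS);
}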
@@ -801,7 +794,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
void CartesianCommunicator::StencilBarrier(void)
{
FlightRecorder::StepLog("NodeBarrier");
MPI_Barrier (ShmComm);
}
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -809,13 +801,11 @@ void CartesianCommunicator::StencilBarrier(void)
//}
void CartesianCommunicator::Barrier(void)
{
FlightRecorder::StepLog("GridBarrier");
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
FlightRecorder::StepLog("Broadcast");
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
@@ -834,7 +824,6 @@ void CartesianCommunicator::BarrierWorld(void){
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
FlightRecorder::StepLog("BroadcastWorld");
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
@@ -857,7 +846,6 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
FlightRecorder::StepLog("AllToAll");
// MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.

View File

@@ -990,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
}
#endif
// SharedMemoryTest();
SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier

View File

@@ -122,10 +122,10 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
acceleratorMemSet(dest,0,bytes);
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
// acceleratorCopyToDevice(src,dest,bytes);
//}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
acceleratorCopyToDevice(src,dest,bytes);
}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality

View File

@@ -236,7 +236,7 @@ public:
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
vobj vtmp;
vtmp = r;
#if 1
#if 0
deviceVector<vobj> vvtmp(1);
acceleratorPut(vvtmp[0],vtmp);
vobj *vvtmp_p = & vvtmp[0];

View File

@@ -252,11 +252,6 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
out = in;
RealD taus = 0.;
// Perform initial t=0 measurements
for(auto const &meas : this->functions)
meas.second(0,taus,out);
for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
auto start = std::chrono::high_resolution_clock::now();
evolve_step(out, taus);
@@ -341,11 +336,6 @@ void WilsonFlowAdaptive<Gimpl>::smear(GaugeField& out, const GaugeField& in) con
RealD taus = 0.;
RealD eps = init_epsilon;
unsigned int step = 0;
// Perform initial t=0 measurements
for(auto const &meas : this->functions)
meas.second(step,taus,out);
do{
int step_success = evolve_step_adaptive(out, taus, eps);
step += step_success; //step will not be incremented if the integration step fails

View File

@@ -396,7 +396,6 @@ public:
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
FlightRecorder::StepLog("Communicate begin has finished");
// Get comms started then run checksums
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
for(int i=0;i<Packets.size();i++){

View File

@@ -251,7 +251,7 @@ inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { c
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyToDevice(from,to,bytes);
acceleratorCopyToDevice(to,from,bytes, cudaMemcpyHostToDevice);
return 0;
}
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
@@ -337,7 +337,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
cgh.parallel_for( \
sycl::nd_range<3>(global,local), \
[=] (sycl::nd_item<3> item) /*mutable*/ \
[[sycl::reqd_sub_group_size(16)]] \
[[intel::reqd_sub_group_size(16)]] \
{ \
auto iter1 = item.get_global_id(0); \
auto iter2 = item.get_global_id(1); \
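Both parts of this hunk touch the accelerator helpers (the CUDA copy wrappers and the SYCL sub-group attribute). For reference, a minimal sketch of the asynchronous host-to-device copy such a wrapper is meant to perform, using the CUDA runtime API directly (illustrative only, not the patch's code):

#include <cuda_runtime.h>

// Queue a host-to-device copy on a user-supplied stream; the data is only valid on the
// device after the stream is synchronised (or an event recorded on it has completed).
inline void copy_to_device_async(void *dev_dst, const void *host_src,
                                 size_t bytes, cudaStream_t stream) {
  cudaMemcpyAsync(dev_dst, host_src, bytes, cudaMemcpyHostToDevice, stream);
}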

View File

@@ -28,6 +28,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#pragma once
#ifndef MIN
#define MIN(x,y) ((x)>(y)?(y):(x))
#endif
// Introduce a class to gain deterministic bit reproducible reduction.
// make static; perhaps just a namespace is required.
NAMESPACE_BEGIN(Grid);

View File

@@ -638,11 +638,12 @@ void Grid_debug_handler_init(void)
sa.sa_flags = SA_SIGINFO;
// sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL);
// sigaction(SIGBUS,&sa,NULL);
sigaction(SIGBUS,&sa,NULL);
// sigaction(SIGUSR2,&sa,NULL);
// feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
// sigaction(SIGFPE,&sa,NULL);
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
sigaction(SIGFPE,&sa,NULL);
sigaction(SIGKILL,&sa,NULL);
sigaction(SIGILL,&sa,NULL);
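Two notes on this hunk: feenableexcept (a glibc extension) turns the listed floating-point conditions into SIGFPE so the installed handler fires, and the sigaction(SIGKILL,...) call has no effect because SIGKILL cannot be caught. A minimal self-contained sketch of the trap-enabling pattern, assuming Linux/glibc (not part of the patch):

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <fenv.h>
#include <signal.h>

// Raise SIGFPE on invalid operations, overflow and divide-by-zero, then install a
// SA_SIGINFO handler for it, mirroring Grid_debug_handler_init above.
static void enable_fp_traps(void (*handler)(int, siginfo_t *, void *)) {
  feenableexcept(FE_INVALID | FE_OVERFLOW | FE_DIVBYZERO);
  struct sigaction sa;
  sigemptyset(&sa.sa_mask);
  sa.sa_sigaction = handler;
  sa.sa_flags = SA_SIGINFO;
  sigaction(SIGFPE, &sa, NULL);
}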

View File

@@ -66,7 +66,6 @@ namespace Grid{
};
}
template <class T> void writeFile(T& in, std::string const fname){
#ifdef HAVE_LIME
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
@@ -74,7 +73,7 @@ template <class T> void writeFile(T& in, std::string const fname){
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0); // Lexico
WR.writeScidacFieldRecord(in,record,0);
WR.close();
#endif
// What is the appropriate way to throw error?
@@ -108,18 +107,8 @@ int main(int argc, char **argv) {
for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){
#if 0
CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG);
#else
// Don't require Grid format RNGs
FieldMetaData header;
std::string file, filesmr;
file = CPar.conf_path + "/" + CPar.conf_prefix + "." + std::to_string(conf);
filesmr = CPar.conf_path + "/" + CPar.conf_smr_prefix + "." + std::to_string(conf);
NerscIO::readConfiguration(Umu,header,file);
#endif
std::cout << std::setprecision(15);
std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;
@@ -127,7 +116,6 @@ int main(int argc, char **argv) {
std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);
WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);
WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){
typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
@@ -177,48 +165,33 @@ int main(int argc, char **argv) {
//double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
//Plq = coeff * Plq;
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(U);
int tau = std::round(t);
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
// writeFile(R,efile);
writeFile(R,efile);
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
// writeFile(qfield,tfile);
writeFile(qfield,tfile);
std::string ufile = file_pre + "U_" + std::to_string(tau) + "_" + file_post;
{
// PeriodicGimplR::GaugeField Ucopy = U;
// NerscIO::writeConfiguration(Ucopy,ufile);
}
RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
RealD T = real( sum(qfield) );
Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
RealD E0 = real(peekSite(R,scoor));
RealD T0 = real(peekSite(qfield,scoor));
std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: " << conf << " " << step << " " << tau << " "
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << " Q5Li "<< WFlow_TC5Li << std::endl;
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl;
});
int t=WFPar.maxTau;
WF.smear(Uflow, Umu);
// NerscIO::writeConfiguration(Uflow,filesmr);
RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(Uflow);
RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow); // t
RealD WFlow_EC = WF.energyDensityCloverleaf(t,Uflow);
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge5Li "<< conf << " " << WFlow_TC5Li<< std::endl;
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
std::cout<< GridLogMessage << " Admissibility check:\n";
const double sp_adm = 0.067; // admissible threshold

View File

@@ -873,7 +873,7 @@ int main (int argc, char ** argv)
int do_su4=0;
int do_memory=1;
int do_comms =1;
int do_blas =1;
int do_blas =0;
int do_dslash=1;
int sel=4;

View File

@@ -1,273 +0,0 @@
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 4
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002c0000000 - 40033fffffff for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-119-30.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.303000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.309000 s : Testing with full communication
Grid : Message : 0.312000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.313000 s : Grid Layout
Grid : Message : 0.313000 s : Global lattice size : 32 32 64 64
Grid : Message : 0.319000 s : OpenMP threads : 4
Grid : Message : 0.320000 s : MPI tasks : 1 1 2 2
Grid : Message : 0.129590 s : Initialising 4d RNG
Grid : Message : 0.764790 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.764920 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.942440 s : Initialising 5d RNG
Grid : Message : 1.149388 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.149404 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 1 device 0 bus id: 0019:01:00.0
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
Grid : Message : 43.893114 s : Drawing gauge field
Grid : Message : 54.574150 s : Random gauge initialised
Grid : Message : 54.574170 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 54.574172 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 54.580032 s : Setting up Cshift based reference
Grid : Message : 60.407451 s : *****************************************************************
Grid : Message : 60.407469 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 60.407470 s : *****************************************************************
Grid : Message : 60.407471 s : *****************************************************************
Grid : Message : 60.407472 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 60.407473 s : * Vectorising space-time by 8
Grid : Message : 60.407475 s : * VComplex size is 64 B
Grid : Message : 60.407477 s : * Using Overlapped Comms/Compute
Grid : Message : 60.407479 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 60.407480 s : *****************************************************************
Grid : Message : 61.102178 s : Called warmup
Grid : Message : 62.177160 s : Called Dw 300 times in 1074958 us
Grid : Message : 62.177198 s : mflop/s = 24721998.6
Grid : Message : 62.177201 s : mflop/s per rank = 6180499.64
Grid : Message : 62.177204 s : mflop/s per node = 24721998.6
Grid : Message : 62.182696 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 71.328862 s : ----------------------------------------------------------------
Grid : Message : 71.328884 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 71.328885 s : ----------------------------------------------------------------
Grid : Message : 71.328886 s : Called DwDag
Grid : Message : 71.328887 s : norm dag result 4.12810493
Grid : Message : 71.329493 s : norm dag ref 4.12810493
Grid : Message : 71.331967 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 71.394727 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 71.803650 s : src_e0.500003185
Grid : Message : 71.819727 s : src_o0.499996882
Grid : Message : 71.821991 s : *********************************************************
Grid : Message : 71.821993 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 71.821995 s : * Vectorising space-time by 8
Grid : Message : 71.821998 s : * Using Overlapped Comms/Compute
Grid : Message : 71.822002 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 71.822003 s : *********************************************************
Grid : Message : 72.377054 s : Deo mflop/s = 24065467
Grid : Message : 72.377071 s : Deo mflop/s per rank 6016366.75
Grid : Message : 72.377074 s : Deo mflop/s per node 24065467
Grid : Message : 72.624877 s : r_e2.06377678
Grid : Message : 72.625198 s : r_o2.06381058
Grid : Message : 72.625507 s : res4.12758736
Grid : Message : 73.759140 s : norm diff 0
Grid : Message : 73.868204 s : norm diff even 0
Grid : Message : 73.907201 s : norm diff odd 0
Grid : Message : 74.414580 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414582 s : Testing without internode communication
Grid : Message : 74.414584 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414586 s : Grid Layout
Grid : Message : 74.414586 s : Global lattice size : 32 32 64 64
Grid : Message : 74.414594 s : OpenMP threads : 4
Grid : Message : 74.414595 s : MPI tasks : 1 1 2 2
Grid : Message : 74.679364 s : Initialising 4d RNG
Grid : Message : 74.742332 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 74.742343 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 74.759525 s : Initialising 5d RNG
Grid : Message : 75.812412 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 75.812429 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 119.252016 s : Drawing gauge field
Grid : Message : 129.919846 s : Random gauge initialised
Grid : Message : 129.919863 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 129.919865 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 129.923611 s : Setting up Cshift based reference
Grid : Message : 135.522878 s : *****************************************************************
Grid : Message : 135.522897 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522900 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 135.522901 s : * Vectorising space-time by 8
Grid : Message : 135.522903 s : * VComplex size is 64 B
Grid : Message : 135.522905 s : * Using Overlapped Comms/Compute
Grid : Message : 135.522907 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 135.522908 s : *****************************************************************
Grid : Message : 136.151202 s : Called warmup
Grid : Message : 137.224721 s : Called Dw 300 times in 1073490 us
Grid : Message : 137.224748 s : mflop/s = 24755806
Grid : Message : 137.224751 s : mflop/s per rank = 6188951.49
Grid : Message : 137.224753 s : mflop/s per node = 24755806
Grid : Message : 137.235239 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 146.451686 s : ----------------------------------------------------------------
Grid : Message : 146.451708 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 146.451710 s : ----------------------------------------------------------------
Grid : Message : 146.451712 s : Called DwDag
Grid : Message : 146.451714 s : norm dag result 4.12810493
Grid : Message : 146.452323 s : norm dag ref 4.12810493
Grid : Message : 146.454799 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 146.498557 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 146.940894 s : src_e0.500003185
Grid : Message : 146.953676 s : src_o0.499996882
Grid : Message : 146.955927 s : *********************************************************
Grid : Message : 146.955929 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 146.955932 s : * Vectorising space-time by 8
Grid : Message : 146.955936 s : * Using Overlapped Comms/Compute
Grid : Message : 146.955938 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 146.955941 s : *********************************************************
Grid : Message : 147.511975 s : Deo mflop/s = 24036256.5
Grid : Message : 147.511989 s : Deo mflop/s per rank 6009064.13
Grid : Message : 147.511991 s : Deo mflop/s per node 24036256.5
Grid : Message : 147.522100 s : r_e2.06377678
Grid : Message : 147.522433 s : r_o2.06381058
Grid : Message : 147.522745 s : res4.12758736
Grid : Message : 148.229848 s : norm diff 0
Grid : Message : 149.233474 s : norm diff even 0
Grid : Message : 149.235815 s : norm diff odd 0
Grid : Message : 149.960985 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960990 s : Testing without intranode communication
Grid : Message : 149.960991 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960995 s : Grid Layout
Grid : Message : 149.960995 s : Global lattice size : 32 32 64 64
Grid : Message : 149.961003 s : OpenMP threads : 4
Grid : Message : 149.961004 s : MPI tasks : 1 1 2 2
Grid : Message : 150.155810 s : Initialising 4d RNG
Grid : Message : 150.800200 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 150.800340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 150.973420 s : Initialising 5d RNG
Grid : Message : 151.131117 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 151.131136 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 193.933765 s : Drawing gauge field
Grid : Message : 204.611551 s : Random gauge initialised
Grid : Message : 204.611574 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 204.611576 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 204.615265 s : Setting up Cshift based reference
Grid : Message : 210.117788 s : *****************************************************************
Grid : Message : 210.117807 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 210.117809 s : *****************************************************************
Grid : Message : 210.117810 s : *****************************************************************
Grid : Message : 210.117812 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 210.117813 s : * Vectorising space-time by 8
Grid : Message : 210.117814 s : * VComplex size is 64 B
Grid : Message : 210.117817 s : * Using Overlapped Comms/Compute
Grid : Message : 210.117818 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 210.117819 s : *****************************************************************
Grid : Message : 210.714641 s : Called warmup
Grid : Message : 211.892227 s : Called Dw 300 times in 1177557 us
Grid : Message : 211.892252 s : mflop/s = 22568003.2
Grid : Message : 211.892255 s : mflop/s per rank = 5642000.8
Grid : Message : 211.892257 s : mflop/s per node = 22568003.2
Grid : Message : 211.896037 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 220.751375 s : ----------------------------------------------------------------
Grid : Message : 220.751406 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 220.751409 s : ----------------------------------------------------------------
Grid : Message : 220.751411 s : Called DwDag
Grid : Message : 220.751412 s : norm dag result 4.12810493
Grid : Message : 220.753307 s : norm dag ref 4.12810493
Grid : Message : 220.755796 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 220.813226 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 221.697800 s : src_e0.500003185
Grid : Message : 221.890920 s : src_o0.499996882
Grid : Message : 221.913430 s : *********************************************************
Grid : Message : 221.913450 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 221.913480 s : * Vectorising space-time by 8
Grid : Message : 221.913500 s : * Using Overlapped Comms/Compute
Grid : Message : 221.913530 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 221.913550 s : *********************************************************
Grid : Message : 221.645213 s : Deo mflop/s = 24114032
Grid : Message : 221.645228 s : Deo mflop/s per rank 6028508.01
Grid : Message : 221.645231 s : Deo mflop/s per node 24114032
Grid : Message : 221.656021 s : r_e2.06377678
Grid : Message : 221.656389 s : r_o2.06381058
Grid : Message : 221.656698 s : res4.12758736
Grid : Message : 222.110075 s : norm diff 0
Grid : Message : 222.857692 s : norm diff even 0
Grid : Message : 222.875763 s : norm diff odd 0
Grid : Message : 223.598127 s : *******************************************
Grid : Message : 223.598145 s : ******* Grid Finalize ******
Grid : Message : 223.598146 s : *******************************************

View File

@@ -1,286 +0,0 @@
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002a0000000 - 40031fffffff for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-012-11.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.834000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.838000 s : Testing with full communication
Grid : Message : 0.839000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.840000 s : Grid Layout
Grid : Message : 0.840000 s : Global lattice size : 64 64 64 64
Grid : Message : 0.846000 s : OpenMP threads : 4
Grid : Message : 0.846000 s : MPI tasks : 2 2 2 2
Grid : Message : 0.165970 s : Initialising 4d RNG
Grid : Message : 0.787270 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.787340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.960410 s : Initialising 5d RNG
Grid : Message : 1.142344 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.142352 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
local rank 1 device 0 bus id: 0019:01:00.0
Grid : Message : 44.657270 s : Drawing gauge field
Grid : Message : 55.247733 s : Random gauge initialised
Grid : Message : 55.247745 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 55.247747 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 55.253053 s : Setting up Cshift based reference
Grid : Message : 62.191747 s : *****************************************************************
Grid : Message : 62.191767 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 62.191768 s : *****************************************************************
Grid : Message : 62.191769 s : *****************************************************************
Grid : Message : 62.191769 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 62.191769 s : * Vectorising space-time by 8
Grid : Message : 62.191770 s : * VComplex size is 64 B
Grid : Message : 62.191771 s : * Using Overlapped Comms/Compute
Grid : Message : 62.191771 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 62.191772 s : *****************************************************************
Grid : Message : 62.857568 s : Called warmup
Grid : Message : 65.581790 s : Called Dw 300 times in 2200540 us
Grid : Message : 65.582120 s : mflop/s = 48306525
Grid : Message : 65.582140 s : mflop/s per rank = 3019157.81
Grid : Message : 65.582150 s : mflop/s per node = 12076631.3
Grid : Message : 65.637550 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 75.122153 s : ----------------------------------------------------------------
Grid : Message : 75.122166 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 75.122167 s : ----------------------------------------------------------------
Grid : Message : 75.122167 s : Called DwDag
Grid : Message : 75.122167 s : norm dag result 4.12801829
Grid : Message : 75.123295 s : norm dag ref 4.12801829
Grid : Message : 75.125890 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 75.188462 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 75.605683 s : src_e0.500004005
Grid : Message : 75.617824 s : src_o0.499996067
Grid : Message : 75.620089 s : *********************************************************
Grid : Message : 75.620091 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 75.620093 s : * Vectorising space-time by 8
Grid : Message : 75.620094 s : * Using Overlapped Comms/Compute
Grid : Message : 75.620095 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 75.620096 s : *********************************************************
Grid : Message : 76.732272 s : Deo mflop/s = 48068252.4
Grid : Message : 76.732283 s : Deo mflop/s per rank 3004265.77
Grid : Message : 76.732285 s : Deo mflop/s per node 12017063.1
Grid : Message : 76.749317 s : r_e2.06443136
Grid : Message : 76.749652 s : r_o2.06378451
Grid : Message : 76.749955 s : res4.12821587
Grid : Message : 77.198827 s : norm diff 0
Grid : Message : 77.981760 s : norm diff even 0
Grid : Message : 78.455900 s : norm diff odd 0
Grid : Message : 78.539333 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539337 s : Testing without internode communication
Grid : Message : 78.539338 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539339 s : Grid Layout
Grid : Message : 78.539339 s : Global lattice size : 64 64 64 64
Grid : Message : 78.539347 s : OpenMP threads : 4
Grid : Message : 78.539348 s : MPI tasks : 2 2 2 2
Grid : Message : 78.798501 s : Initialising 4d RNG
Grid : Message : 78.862916 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 78.862925 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 78.879916 s : Initialising 5d RNG
Grid : Message : 79.941271 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 79.941280 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 124.586264 s : Drawing gauge field
Grid : Message : 135.338090 s : Random gauge initialised
Grid : Message : 135.338102 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 135.338103 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 135.341266 s : Setting up Cshift based reference
Grid : Message : 142.604280 s : *****************************************************************
Grid : Message : 142.604450 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 142.604460 s : *****************************************************************
Grid : Message : 142.604470 s : *****************************************************************
Grid : Message : 142.604480 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 142.604480 s : * Vectorising space-time by 8
Grid : Message : 142.604500 s : * VComplex size is 64 B
Grid : Message : 142.604510 s : * Using Overlapped Comms/Compute
Grid : Message : 142.604510 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 142.604520 s : *****************************************************************
Grid : Message : 142.686034 s : Called warmup
Grid : Message : 144.868543 s : Called Dw 300 times in 2182483 us
Grid : Message : 144.868559 s : mflop/s = 48706194.1
Grid : Message : 144.868561 s : mflop/s per rank = 3044137.13
Grid : Message : 144.868562 s : mflop/s per node = 12176548.5
Grid : Message : 144.887595 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 153.622978 s : ----------------------------------------------------------------
Grid : Message : 153.622994 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 153.622995 s : ----------------------------------------------------------------
Grid : Message : 153.622995 s : Called DwDag
Grid : Message : 153.622996 s : norm dag result 4.12801829
Grid : Message : 153.623604 s : norm dag ref 4.12801829
Grid : Message : 153.626098 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 153.691426 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 154.148319 s : src_e0.500004005
Grid : Message : 154.151454 s : src_o0.499996067
Grid : Message : 154.153722 s : *********************************************************
Grid : Message : 154.153724 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 154.153725 s : * Vectorising space-time by 8
Grid : Message : 154.153726 s : * Using Overlapped Comms/Compute
Grid : Message : 154.153727 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 154.153728 s : *********************************************************
Grid : Message : 155.200671 s : Deo mflop/s = 51121022.4
Grid : Message : 155.200682 s : Deo mflop/s per rank 3195063.9
Grid : Message : 155.200684 s : Deo mflop/s per node 12780255.6
Grid : Message : 155.217204 s : r_e2.06443136
Grid : Message : 155.217550 s : r_o2.06378451
Grid : Message : 155.217869 s : res4.12821587
Grid : Message : 155.673744 s : norm diff 0
Grid : Message : 156.463329 s : norm diff even 0
Grid : Message : 156.878866 s : norm diff odd 0
Grid : Message : 157.620761 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620764 s : Testing without intranode communication
Grid : Message : 157.620765 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620766 s : Grid Layout
Grid : Message : 157.620766 s : Global lattice size : 64 64 64 64
Grid : Message : 157.620773 s : OpenMP threads : 4
Grid : Message : 157.620774 s : MPI tasks : 2 2 2 2
Grid : Message : 157.671479 s : Initialising 4d RNG
Grid : Message : 157.738691 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 157.738698 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 157.755651 s : Initialising 5d RNG
Grid : Message : 158.848676 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 158.848685 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 202.465158 s : Drawing gauge field
Grid : Message : 213.214546 s : Random gauge initialised
Grid : Message : 213.214561 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 213.214563 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 213.217711 s : Setting up Cshift based reference
Grid : Message : 219.662772 s : *****************************************************************
Grid : Message : 219.662786 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 219.662787 s : *****************************************************************
Grid : Message : 219.662788 s : *****************************************************************
Grid : Message : 219.662788 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 219.662789 s : * Vectorising space-time by 8
Grid : Message : 219.662790 s : * VComplex size is 64 B
Grid : Message : 219.662791 s : * Using Overlapped Comms/Compute
Grid : Message : 219.662791 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 219.662791 s : *****************************************************************
Grid : Message : 220.425592 s : Called warmup
Grid : Message : 222.536249 s : Called Dw 300 times in 2110597 us
Grid : Message : 222.536267 s : mflop/s = 50365105.5
Grid : Message : 222.536269 s : mflop/s per rank = 3147819.09
Grid : Message : 222.536270 s : mflop/s per node = 12591276.4
Grid : Message : 222.541053 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 232.135901 s : ----------------------------------------------------------------
Grid : Message : 232.135915 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 232.135916 s : ----------------------------------------------------------------
Grid : Message : 232.135917 s : Called DwDag
Grid : Message : 232.135918 s : norm dag result 4.12801829
Grid : Message : 232.151938 s : norm dag ref 4.12801829
Grid : Message : 232.154451 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 232.216117 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 232.630529 s : src_e0.500004005
Grid : Message : 232.643197 s : src_o0.499996067
Grid : Message : 232.645527 s : *********************************************************
Grid : Message : 232.645529 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 232.645532 s : * Vectorising space-time by 8
Grid : Message : 232.645533 s : * Using Overlapped Comms/Compute
Grid : Message : 232.645534 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 232.645535 s : *********************************************************
Grid : Message : 233.774184 s : Deo mflop/s = 47432091.9
Grid : Message : 233.774194 s : Deo mflop/s per rank 2964505.74
Grid : Message : 233.774196 s : Deo mflop/s per node 11858023
Grid : Message : 233.791552 s : r_e2.06443136
Grid : Message : 233.791899 s : r_o2.06378451
Grid : Message : 233.792204 s : res4.12821587
Grid : Message : 234.230783 s : norm diff 0
Grid : Message : 235.162780 s : norm diff even 0
Grid : Message : 235.291950 s : norm diff odd 0
Grid : Message : 235.765411 s : *******************************************
Grid : Message : 235.765424 s : ******* Grid Finalize ******
Grid : Message : 235.765425 s : *******************************************
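
As a cross-check on the log above: the mflop/s figures follow directly from the printed call count and wall time once one assumes the standard 1320-flop-per-site Wilson dslash count and Ls = 16 (Ls is not printed in this excerpt; the arithmetic is consistent with it), with the 2.2.2.2 MPI decomposition giving 16 ranks on 4 nodes. A standalone arithmetic sketch, not part of the benchmark source:

// Standalone check of the throughput figures quoted above.
// Assumes 1320 flops per 4d site per Wilson dslash application and Ls = 16
// (assumption, not printed here); 64^4 global volume, 300 calls, and the timing
// of the final run ("Called Dw 300 times in 2110597 us"); 16 ranks on 4 nodes.
#include <cstdio>

int main() {
  const double flops_per_site = 1320.0;                  // Wilson dslash, Nc = 3
  const double volume4d = 64.0 * 64.0 * 64.0 * 64.0;
  const double Ls       = 16.0;                          // assumption
  const double ncall    = 300.0;
  const double t_usec   = 2110597.0;
  const int ranks = 16, nodes = 4;

  // flops per microsecond == Mflop/s
  const double mflops = flops_per_site * volume4d * Ls * ncall / t_usec;
  std::printf("mflop/s          = %.1f\n", mflops);          // ~50365106, cf. the log
  std::printf("mflop/s per rank = %.1f\n", mflops / ranks);  // ~3147819
  std::printf("mflop/s per node = %.1f\n", mflops / nodes);  // ~12591276
  return 0;
}

Substituting the other two timings (2182483 us and the warm run above it) reproduces the 48.7 and 50.4 Tflop/s-scale figures in the same way.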

View File

@@ -1,57 +0,0 @@
#!/bin/sh
#SBATCH --account=jureap14
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=64
#SBATCH --time=2:00:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap"
source ../sourceme.sh
cat << EOF > bind_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export NUMA_MAP=(0 1 2 3)
export NIC_MAP=(0 1 2 3)
export GPU=\$SLURM_LOCALID
export NUMA=\$SLURM_LOCALID
export NIC=\$SLURM_LOCALID
export CUDA_VISIBLE_DEVICES=\$GPU
export UCX_NET_DEVICES=mlx5_\${NIC}:1
echo RANK \$SLURM_LOCALID using NUMA \$NUMA GPU \$GPU NIC \$UCX_NET_DEVICES
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./bind_gpu
srun --cpu-bind=no -N 1 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_dwf_fp32 \
$OPT \
--mpi 1.1.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > dwf.1node.perf
srun --cpu-bind=no -N 1 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_comms_host_device \
--mpi 1.1.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > comms.1node.perf

View File

@@ -1,57 +0,0 @@
#!/bin/sh
#SBATCH --account=jureap14
#SBATCH --nodes=4
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=64
#SBATCH --time=2:00:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap"
source ../sourceme.sh
cat << EOF > bind_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export NUMA_MAP=(0 1 2 3)
export NIC_MAP=(0 1 2 3)
export GPU=\$SLURM_LOCALID
export NUMA=\$SLURM_LOCALID
export NIC=\$SLURM_LOCALID
export CUDA_VISIBLE_DEVICES=\$GPU
export UCX_NET_DEVICES=mlx5_\${NIC}:1
echo RANK \$SLURM_LOCALID using NUMA \$NUMA GPU \$GPU NIC \$UCX_NET_DEVICES
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./bind_gpu
srun --cpu-bind=no -N 4 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_dwf_fp32 \
$OPT \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 64.64.64.64 \
--shm 2048 > dwf.4node.perf
srun --cpu-bind=no -N 4 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_comms_host_device \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > comms.4node.perf

View File

@@ -1,16 +0,0 @@
export CXX=nvcc
export OPENMPI=/p/software/default/stages/2025/software/OpenMPI/5.0.5-NVHPC-24.9-CUDA-12/
export LDFLAGS="-cudart shared -L${OPENMPI}/lib"
export CXXFLAGS="-ccbin clang++ -gencode arch=compute_90,code=sm_90 -std=c++17 -cudart shared -lcublas -lmpi -I${OPENMPI}/include"
../../configure \
--enable-comms=mpi \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-shm=nvlink \
--enable-accelerator=cuda \
--with-lime=$CLIME \
--disable-gparity \
--disable-fermion-reps \
--disable-unified

View File

@@ -1,9 +0,0 @@
CLIME=$HOME/install/
module load Clang
module load CUDA
module load FFTW
module load OpenSSL
module load MPFR
module load NVHPC
module load UCX
module load OpenMPI

View File

@@ -7,6 +7,8 @@ CXX=mpicxx ../../configure \
--enable-unified=yes \
--prefix /Users/peterboyle/QCD/vtk/Grid/install \
--with-lime=$CLIME \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--with-openssl=$OPENSSL \
--with-gmp=$GMP \
--with-mpfr=$MPFR \

View File

@@ -1,12 +1,3 @@
spack load c-lime
spack load fftw
spack load hdf5+cxx
export FFTW=`spack find --paths fftw | grep ^fftw | awk '{print $2}' `
export HDF5=`spack find --paths hdf5+cxx | grep ^hdf5 | awk '{print $2}' `
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
../../configure \
--enable-comms=mpi-auto \
--enable-unified=yes \
@@ -14,16 +5,12 @@ export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
--enable-shm-fast-path=shmopen \
--enable-accelerator=none \
--enable-simd=AVX512 \
--with-lime=$CLIME \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--disable-accelerator-cshift \
--disable-fermion-reps \
--disable-gparity \
CXX=clang++ \
MPICXX=mpicxx \
LIBS=-llime \
LDFLAGS=-L$CLIME/lib/ \
CXXFLAGS="-std=c++17 -fPIE"
CXXFLAGS="-std=c++17"

View File

@@ -1,5 +1,4 @@
source $HOME/spack/share/spack/setup-env.sh
spack load llvm@17.0.4
export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
module load openmpi/4.1.8
spack load c-lime
module load openmpi

View File

@@ -1,14 +1,15 @@
<?xml version="1.0"?>
<grid>
<LanczosParameters>
<mass>0.00107</mass>
<mass>-1.025</mass>
<mstep>-0.025</mstep>
<M5>1.8</M5>
<Ls>48</Ls>
<Nstop>10</Nstop>
<Nk>15</Nk>
<Np>85</Np>
<ChebyLow>0.003</ChebyLow>
<ChebyHigh>60</ChebyHigh>
<ChebyOrder>201</ChebyOrder>
<Nk>12</Nk>
<Np>30</Np>
<ChebyLow>0.1</ChebyLow>
<ChebyHigh>50</ChebyHigh>
<ChebyOrder>51</ChebyOrder>
</LanczosParameters>
</grid>
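
For orientation, the tags in this LanParams.xml map one-to-one onto the LanczosParameters serializable extended in the hunks further down (mass, mstep, Nstop, Nk, Np, Cheby*). A minimal read-back sketch using Grid's XmlReader; M5 and Ls are included on the assumption that the domain-wall test's struct carries them, since that struct is not fully visible in this compare:

// Sketch only: read LanParams.xml into a serializable struct, as the tests below do.
#include <Grid/Grid.h>
using namespace Grid;

namespace Grid {
struct LanczosParameters : Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
                                  RealD,   mass,
                                  RealD,   mstep,
                                  RealD,   M5,        // assumed member
                                  Integer, Ls,        // assumed member
                                  Integer, Nstop,
                                  Integer, Nk,
                                  Integer, Np,
                                  RealD,   ChebyLow,
                                  RealD,   ChebyHigh,
                                  Integer, ChebyOrder);
};
}

int main(int argc, char** argv) {
  Grid_init(&argc, &argv);
  LanczosParameters LanParams;
  {
    XmlReader HMCrd("LanParams.xml");            // file name as in this diff
    read(HMCrd, "LanczosParameters", LanParams);
  }
  std::cout << GridLogMessage << "mass "  << LanParams.mass
            << " mstep " << LanParams.mstep
            << " Nk "    << LanParams.Nk
            << " Np "    << LanParams.Np << std::endl;
  Grid_finalize();
  return 0;
}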

View File

@@ -31,23 +31,16 @@ directory
using namespace std;
using namespace Grid;
;
//typedef WilsonFermionD FermionOp;
#if 0
typedef DomainWallFermionD FermionOp;
typedef typename DomainWallFermionD::FermionField FermionField;
template <class T> void writeFile(T& in, std::string const fname){
#ifdef HAVE_LIME
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.close();
#else
typedef MobiusFermionD FermionOp;
typedef typename MobiusFermionD::FermionField FermionField;
#endif
// What is the appropriate way to throw error?
}
RealD AllZero(RealD x) { return 0.; }
@@ -132,7 +125,7 @@ int main(int argc, char** argv) {
int Ls=16;
RealD M5=1.8;
RealD mass = 0.01;
RealD mass = -1.0;
mass=LanParams.mass;
Ls=LanParams.Ls;
@@ -170,10 +163,10 @@ int main(int argc, char** argv) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
*/
int Nk = 20;
int Nstop = Nk;
int Np = 80;
int Nstop = 10;
int Nk = 20;
int Np = 80;
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
@@ -181,10 +174,11 @@ int main(int argc, char** argv) {
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
RealD mob_b=1.5;
//while ( mass > - 5.0){
FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
// FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.);
MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <-----
// Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf);
@@ -212,12 +206,8 @@ int main(int argc, char** argv) {
int Nconv;
IRL.calc(eval, evec, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
std::cout << " #evecs " << evec.size() << std::endl;
std::cout << " Nconv " << Nconv << std::endl;
std::cout << " Nm " << Nm << std::endl;
if ( Nconv > evec.size() ) Nconv = evec.size();
std::cout << mass <<" : " << eval << std::endl;
#if 0
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
@@ -247,7 +237,6 @@ int main(int argc, char** argv) {
vector<LatticeFermion> finalevec(Nconv, FGrid);
vector<RealD> eMe(Nconv), eMMe(Nconv);
for(int i = 0; i < Nconv; i++){
cout << "calculate the matrix element["<<i<<"]" << endl;
G5R5Herm.HermOpAndNorm(evec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
}
cout << "Re<evec, G5R5M(evec)>: " << endl;
@@ -314,7 +303,7 @@ int main(int argc, char** argv) {
}
}
}
for(int i = 0; i < Nconv; i++){
for(int i = 0; i < Nconv; i++){
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
}
cout << "Re<evec, G5R5M(evec)>: " << endl;
@@ -322,7 +311,6 @@ int main(int argc, char** argv) {
cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
cout << eMMe << endl;
// vector<LatticeFermion> finalevec(Nconv, FGrid);
// temporary, until doing rotation
@@ -343,41 +331,13 @@ int main(int argc, char** argv) {
axpby_ssp(G5evec[i], -1., finalevec[i], 0., G5evec[i], j, j);
}
}
for(int i = 0; i < Nconv; i++){
Ddwf.M(finalevec[i], G5R5Mevec[i]);
for(int j = 0; j < Nconv; j++){
std::cout << "<"<<j<<"|Ddwf|"<<i<<"> = "<<innerProduct(finalevec[j],G5R5Mevec[i])<<std::endl;
}
}
for(int i = 0; i < Nconv; i++){
RealD t1,t2;
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], t1, t2);
for(int j = 0; j < Nconv; j++){
std::cout << "<"<<j<<"|G5R5 M|"<<i<<"> = "<<innerProduct(finalevec[j],G5R5Mevec[i])<<std::endl;
}
}
for(int i = 0; i < Nconv; i++){
chiral_matrix_real[i].resize(Nconv);
chiral_matrix[i].resize(Nconv);
std::string evfile("./evec_density");
evfile = evfile+"_"+std::to_string(i);
auto evdensity = localInnerProduct(finalevec[i],finalevec[i] );
writeFile(evdensity,evfile);
for(int j = 0; j < Nconv; j++){
chiral_matrix[i][j] = innerProduct(finalevec[i], G5evec[j]);
std::cout <<" chiral_matrix_real signed "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
chiral_matrix_real[i][j] = abs(chiral_matrix[i][j]);
std::cout <<" chiral_matrix_real "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
if ( chiral_matrix_real[i][j] > 0.8 ) {
auto g5density = localInnerProduct(finalevec[i], G5evec[j]);
std::string chfile("./chiral_density_");
chfile = chfile +std::to_string(i)+"_"+std::to_string(j);
writeFile(g5density,chfile);
}
}
}
for(int i = 0; i < Nconv; i++){
@@ -386,43 +346,6 @@ int main(int argc, char** argv) {
}
}
FILE *fp = fopen("lego-plot.py","w"); assert(fp!=NULL);
#define PYTHON_LINE(A) fprintf(fp,A"\n");
PYTHON_LINE("import matplotlib.pyplot as plt");
PYTHON_LINE("import numpy as np");
PYTHON_LINE("");
PYTHON_LINE("fig = plt.figure()");
PYTHON_LINE("ax = fig.add_subplot(projection='3d')");
PYTHON_LINE("");
PYTHON_LINE("x, y = np.random.rand(2, 100) * 4");
fprintf(fp,"hist, xedges, yedges = np.histogram2d(x, y, bins=%d, range=[[0, %d], [0, %d]])\n",Nconv,Nconv-1,Nconv-1);
PYTHON_LINE("");
PYTHON_LINE("# Construct arrays for the anchor positions of the 16 bars");
PYTHON_LINE("xpos, ypos = np.meshgrid(xedges[:-1] + 0.25, yedges[:-1] + 0.25, indexing=\"ij\")");
PYTHON_LINE("xpos = xpos.ravel()");
PYTHON_LINE("ypos = ypos.ravel()");
PYTHON_LINE("zpos = 0");
PYTHON_LINE("");
PYTHON_LINE("# Construct arrays with the dimensions for the 16 bars.");
PYTHON_LINE("dx = dy = 0.5 * np.ones_like(zpos)");
PYTHON_LINE("dz = np.array([");
for(int i = 0; i < Nconv; i++){
fprintf(fp,"\t[ ");
for(int j = 0; j < Nconv; j++){
fprintf(fp,"%lf ",chiral_matrix_real[i][j]);
if(j<Nconv-1) fprintf(fp,",");
else fprintf(fp," ");
}
fprintf(fp,"]");
if(i<Nconv-1) fprintf(fp,",\n");
else fprintf(fp,"\n");
}
PYTHON_LINE("\t])");
PYTHON_LINE("dz = dz.ravel()");
PYTHON_LINE("ax.bar3d(xpos, ypos, zpos, dx, dy, dz, zsort='average')");
PYTHON_LINE("plt.show()");
fclose(fp);
Grid_finalize();
}
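
The two shrinking hunks above interleave old and new lines because the scraped diff has lost its +/- markers. The per-eigenvector I/O pattern that appears in them (and again, per mass, in the Wilson spectral-flow test further down) condenses to the sketch below; finalevec, G5evec, chiral_matrix, chiral_matrix_real, Nconv and writeFile are taken as defined in the test source:

// Condensed per-eigenvector I/O from the hunks above (sketch, names assumed from the test).
for (int i = 0; i < Nconv; i++) {
  chiral_matrix[i].resize(Nconv);
  chiral_matrix_real[i].resize(Nconv);

  // site-local eigenvector density |v_i(x)|^2, written as a SciDAC field
  auto evdensity = localInnerProduct(finalevec[i], finalevec[i]);
  writeFile(evdensity, "./evec_density_" + std::to_string(i));

  for (int j = 0; j < Nconv; j++) {
    chiral_matrix[i][j]      = innerProduct(finalevec[i], G5evec[j]);
    chiral_matrix_real[i][j] = abs(chiral_matrix[i][j]);
    // strongly chirally-paired modes: also dump the local gamma5 density
    if (chiral_matrix_real[i][j] > 0.8) {
      auto g5density = localInnerProduct(finalevec[i], G5evec[j]);
      writeFile(g5density, "./chiral_density_" + std::to_string(i) + "_" + std::to_string(j));
    }
  }
}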

View File

@@ -113,6 +113,9 @@ struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, resid,
Integer, Nstop,
Integer, Nk,
Integer, Np,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
@@ -204,7 +207,6 @@ int main(int argc, char** argv) {
int Nstop = 5;
int Nk = 10;
int Np = 90;
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
@@ -226,10 +228,14 @@ int main(int argc, char** argv) {
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
mass=LanParams.mass;
resid=LanParams.resid;
int Nm = Nk + Np;
while ( mass > - 5.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass);
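
The point of the reshuffle above is ordering: Nm sizes the Lanczos workspace, so the line int Nm = Nk + Np; has to be computed after Nk and Np have been copied out of LanParams.xml rather than from the compiled-in defaults. A condensed sketch of the intended order (names as in the test; eval/evec declarations assumed from the sibling tests):

// Ordering sketch: Nm must reflect the run-time Nk/Np read from the XML.
Nstop = LanParams.Nstop;
Nk    = LanParams.Nk;
Np    = LanParams.Np;
mass  = LanParams.mass;
resid = LanParams.resid;

int Nm = Nk + Np;                        // basis size actually requested in the XML
std::vector<RealD>        eval(Nm);      // sized from the run-time Nk+Np, not the defaults
std::vector<FermionField> evec(Nm, FGrid);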

View File

@@ -61,7 +61,8 @@ int main(int argc, char** argv) {
RNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
SU<Nc>::HotConfiguration(RNG4, Umu);
// SU<Nc>::HotConfiguration(RNG4, Umu);
SU<Nc>::ColdConfiguration(Umu);
/*
std::vector<LatticeColourMatrix> U(4, UGrid);
@@ -69,9 +70,15 @@ int main(int argc, char** argv) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
*/
// std::vector<Complex> boundary = {1,1,1,-1};
std::vector<Complex> boundary = {1,1,1,1};
FermionOp::ImplParams Params(boundary);
RealD mass = -0.1;
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
RealD mass = 0.0;
// FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass,Params);
MdagMLinearOperator<FermionOp,LatticeFermion> HermOp(WilsonOperator); /// <-----
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
@@ -89,7 +96,8 @@ int main(int argc, char** argv) {
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
PlainHermOp<FermionField> Op (HermOp);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
// ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
SimpleLanczos<FermionField> IRL(Op,Nstop, Nk, Nm, resid, MaxIt);
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
@@ -101,7 +109,8 @@ int main(int argc, char** argv) {
};
int Nconv;
IRL.calc(eval, evec, src, Nconv);
// IRL.calc(eval, evec, src, Nconv);
IRL.calc(eval, src, Nconv);
std::cout << eval << std::endl;
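
Two interface changes are visible in this hunk: boundary phases are now passed to the Wilson operator through ImplParams (the commented {1,1,1,-1} line is the antiperiodic-in-time variant), and SimpleLanczos replaces ImplicitlyRestartedLanczos, taking a single PlainHermOp and a calc() without an eigenvector argument, i.e. eigenvalues only. A condensed sketch, assuming the surrounding grid/RNG setup and the Nstop/Nk/Nm/resid/MaxIt values of the test:

// Condensed form of this hunk: boundary phases via ImplParams, SimpleLanczos in place
// of ImplicitlyRestartedLanczos. Umu, FGrid/FrbGrid, RNG5 and Nstop/Nk/Nm/resid/MaxIt
// are assumed set up as in the test.
std::vector<Complex> boundary = {1, 1, 1, 1};    // {1,1,1,-1} = antiperiodic time BC
FermionOp::ImplParams Params(boundary);

RealD mass = 0.0;
FermionOp WilsonOperator(Umu, *FGrid, *FrbGrid, mass, Params);
MdagMLinearOperator<FermionOp, LatticeFermion> HermOp(WilsonOperator);
PlainHermOp<FermionField> Op(HermOp);

SimpleLanczos<FermionField> IRL(Op, Nstop, Nk, Nm, resid, MaxIt);  // no Chebyshev operator argument

std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);

int Nconv;
IRL.calc(eval, src, Nconv);                      // eigenvalues only, no eigenvector storage
std::cout << eval << std::endl;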

View File

@@ -27,6 +27,7 @@ directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/parallelIO/IldgIOtypes.h>
using namespace std;
using namespace Grid;
@@ -38,11 +39,29 @@ typedef typename WilsonFermionD::FermionField FermionField;
RealD AllZero(RealD x) { return 0.; }
template <class T> void writeFile(T& in, std::string const fname){
#if 1
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.close();
#endif
// What is the appropriate way to throw error?
}
namespace Grid {
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, mstep ,
Integer, Nstop,
Integer, Nk,
Integer, Np,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
@@ -115,6 +134,7 @@ int main(int argc, char** argv) {
LatticeGaugeField Umu(UGrid);
// SU<Nc>::HotConfiguration(RNG4, Umu);
// SU<Nc>::ColdConfiguration(Umu);
FieldMetaData header;
std::string file("./config");
@@ -158,10 +178,20 @@ int main(int argc, char** argv) {
}
mass=LanParams.mass;
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
Nm = Nk + Np;
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<Complex> boundary = {1,1,1,-1};
// std::vector<Complex> boundary = {1,1,1,1};
FermionOp::ImplParams Params(boundary);
while ( mass > - 5.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
while ( mass > - 2.5){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass,Params);
MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
@@ -180,10 +210,9 @@ while ( mass > - 5.0){
PlainHermOp<FermionField> Op2 (HermOp2);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
// SimpleLanczos<FermionField> IRL(Op,Nstop, Nk, Nm, resid, MaxIt);
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
@@ -192,6 +221,7 @@ while ( mass > - 5.0){
int Nconv;
IRL.calc(eval, evec, src, Nconv);
// IRL.calc(eval, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
@@ -202,9 +232,17 @@ while ( mass > - 5.0){
tmp = g5*evec[i];
dot = innerProduct(tmp,evec[i]);
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
// if ( i<1)
{
std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
auto evdensity = localInnerProduct(evec[i],evec[i] );
writeFile(evdensity,evfile);
}
}
src = evec[0]+evec[1]+evec[2];
mass += -0.1;
src += evec[3]+evec[4]+evec[5];
src += evec[6]+evec[7]+evec[8];
mass += LanParams.mstep;
}
Grid_finalize();
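
Condensed, the spectral-flow driver in this file now reads: build the Gaussian source once, outside the mass loop; impose antiperiodic time boundary conditions through ImplParams; diagonalise at each mass; write each eigenvector's site-local density; then seed the next mass's source with the sum of the lowest nine eigenvectors and step the mass by mstep from the XML. A sketch with the names used above; the Chebyshev/operator wiring marked "assumed" is inferred, not visible in this hunk:

// Spectral-flow loop, condensed from this hunk. Umu, FGrid/FrbGrid, RNG5, LanParams,
// Nstop/Nk/Np/Nm/resid/MaxIt and writeFile are assumed set up as in the test.
std::vector<Complex> boundary = {1, 1, 1, -1};           // antiperiodic in time
FermionOp::ImplParams Params(boundary);

FermionField src(FGrid);
gaussian(RNG5, src);                                     // created once, re-used below

Chebyshev<FermionField> Cheby(LanParams.ChebyLow, LanParams.ChebyHigh, LanParams.ChebyOrder);

while (mass > -2.5) {
  FermionOp WilsonOperator(Umu, *FGrid, *FrbGrid, mass, Params);
  MdagMLinearOperator<FermionOp, FermionField>             HermOp(WilsonOperator);   // H^2 = Ddag D
  Gamma5HermitianLinearOperator<FermionOp, LatticeFermion> HermOp2(WilsonOperator);  // H = g5 D
  FunctionHermOp<FermionField> OpCheby(Cheby, HermOp);    // assumed: filter on H^2, same evecs as H
  PlainHermOp<FermionField>    Op2(HermOp2);
  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);

  std::vector<RealD>        eval(Nm);
  std::vector<FermionField> evec(Nm, FGrid);
  int Nconv;
  IRL.calc(eval, evec, src, Nconv);

  for (int i = 0; i < Nconv; i++) {                       // density of each mode, per mass
    auto evdensity = localInnerProduct(evec[i], evec[i]);
    writeFile(evdensity, "./evec_" + std::to_string(mass) + "_" + std::to_string(i));
  }

  src  = evec[0] + evec[1] + evec[2];                     // low modes seed the next mass
  src += evec[3] + evec[4] + evec[5];
  src += evec[6] + evec[7] + evec[8];
  mass += LanParams.mstep;                                // e.g. -0.025 in LanParams.xml
}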

View File

@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
project(GridViewer)
list(APPEND CMAKE_PREFIX_PATH "/home/paboyle/Visualisation/install/")
list(APPEND CMAKE_PREFIX_PATH "/Users/peterboyle/QCD/vtk/VTK-9.4.2-install/")
find_package(VTK COMPONENTS
CommonColor

View File

@@ -48,14 +48,15 @@ typedef vtkMarchingCubes isosurface;
int mpeg = 0 ;
int xlate = 0 ;
int framerate = 10;
template <class T> void readFile(T& out, std::string const fname){
#ifdef HAVE_LIME
Grid::emptyUserRecord record;
Grid::ScidacReader RD;
RD.open(fname);
RD.readScidacFieldRecord(out,record);
RD.close();
#endif
}
using namespace Grid;
@@ -207,10 +208,6 @@ int main(int argc, char* argv[])
xlate = 1;
}
if( GridCmdOptionExists(argv,argv+argc,"--fps") ){
arg=GridCmdOptionPayload(argv,argv+argc,"--fps");
GridCmdOptionInt(arg,framerate);
}
if( GridCmdOptionExists(argv,argv+argc,"--isosurface") ){
arg=GridCmdOptionPayload(argv,argv+argc,"--isosurface");
GridCmdOptionFloat(arg,default_contour);
@@ -423,7 +420,7 @@ int main(int argc, char* argv[])
vtkFFMPEGWriter *writer = vtkFFMPEGWriter::New();
writer->SetFileName("movie.avi");
writer->SetRate(framerate);
writer->SetRate(1);
writer->SetInputConnection(imageFilter->GetOutputPort());
writer->Start();
@@ -480,7 +477,7 @@ int main(int argc, char* argv[])
slidercallback->fu_list = fu_list;
sliderWidget->AddObserver(vtkCommand::InteractionEvent, slidercallback);
int timerId = iren->CreateRepeatingTimer(1000/framerate);
int timerId = iren->CreateRepeatingTimer(300);
std::cout << "timerId: " << timerId << std::endl;
// Start the interaction and timer

View File

@@ -73,21 +73,6 @@ each to:
VTK really should make it easier to pick up the flags required for FFMPEG linkage, especially as they are very quirky on MacOS.
========================================
Aurora compilation:
========================================
module load ffmpeg
download & untar: VTK-7.0.2
mkdir build
cd build
ccmake ../
"t"
Enable: VTK_MODULE_ENABLE_VTK_IOFFMPEG YES
"configure" ; should "discover" the installed ffmpeg module
Still need an "X" connection to make the MPEG files.
========================================
Grid:

View File

@@ -1,17 +1,9 @@
export grid_config=/home/paboyle/GPT/install/bin/grid-config
libs=`$grid_config --libs`
ldflags=`$grid_config --ldflags`
cxxflags=`$grid_config --cxxflags`
cxx=`$grid_config --cxx`
cc=icx
libs=`grid-config --libs`
ldflags=`grid-config --ldflags`
cxxflags=`grid-config --cxxflags`
cxx=`grid-config --cxx`
mkdir build
cd build
echo CC $cc
echo CXX $cxx
echo CXXFLAGS $cxxflags
echo LDFLAGS $ldflags
echo LIBS $libs
LDFLAGS="$ldflags $libs " cmake .. -DCMAKE_C_COMPILER=$cc -DCMAKE_CXX_COMPILER="$cxx" -DCMAKE_CXX_FLAGS="$cxxflags "
LDFLAGS="$ldflags $libs " cmake .. -DCMAKE_CXX_COMPILER=$cxx -DCMAKE_CXX_FLAGS=$cxxflags