improved

std::cout<<GridLogMessage<<"Debug:"<<std::endl;
std::cout<<GridLogMessage<<" --dylib-map : print dynamic library map, useful for interpreting signal backtraces "<<std::endl;
std::cout<<GridLogMessage<<" --heartbeat : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl;
std::cout<<GridLogMessage<<" --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl;
std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl;
std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl;
std::cout<<GridLogMessage<<" --debug-heartbeat : periodically report backtrace "<<std::endl;

--dylib-map : Grid prints its dylib regions.
--heartbeat : itimer-based SIGALRM wakeup, which seems to make Aurora more stable.
--debug-heartbeat : periodically report to stderr where we are in the code.

A libunwind option is now available (configure: --with-unwind=<prefix>) to give an async-signal-safe backtrace. Avoid the glibc backtrace because of its mallocs.
2025-08-25 07:27:09 +01:00 · 2025-06-27 06:08:54 +00:00 · 2025-06-27 06:08:54 +00:00 · 2025-06-27 06:08:54 +00:00 · 2025-06-27 06:08:54 +00:00 · 2025-06-27 06:08:54 +00:00
53 changed files with 1847 additions and 2220 deletions
--- a/Grid/DisableWarnings.h
+++ b/Grid/DisableWarnings.h
@@ -51,11 +51,13 @@ directory
 #pragma nv_diag_suppress cast_to_qualified_type
 //disables nvcc specific warning in many files
 #pragma nv_diag_suppress esa_on_defaulted_function_ignored
+#pragma nv_diag_suppress declared_but_not_referenced
 #pragma nv_diag_suppress extra_semicolon
 #else
 //disables nvcc specific warning in json.hpp
 #pragma diag_suppress unsigned_compare_with_zero
 #pragma diag_suppress cast_to_qualified_type
+#pragma diag_suppress declared_but_not_referenced
 //disables nvcc specific warning in many files
 #pragma diag_suppress esa_on_defaulted_function_ignored
 #pragma diag_suppress extra_semicolon
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -73,7 +73,6 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
-#include <Grid/algorithms/iterative/SimpleLanczos.h>
 #include <Grid/algorithms/iterative/PowerMethod.h>
 #include <Grid/algorithms/iterative/AdefGeneric.h>
 #include <Grid/algorithms/iterative/AdefMrhs.h>
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -269,7 +269,9 @@ public:
    RealD xscale = 2.0/(hi-lo);
    RealD mscale = -(hi+lo)/(hi-lo);
    Linop.HermOp(T0,y);
+    grid->Barrier();
    axpby(T1,xscale,mscale,y,in);
+    grid->Barrier();

    // sum = .5 c[0] T0 + c[1] T1
    //    out = ()*T0 + Coeffs[1]*T1;
--- a/Grid/algorithms/iterative/SimpleLanczos.h
+++ b/Grid/algorithms/iterative/SimpleLanczos.h
@@ -1,931 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
-
-    Copyright (C) 2015
-
-Author: Chulwoo Jung <chulwoo@bnl.gov>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_LANC_H
-#define GRID_LANC_H
-
-#include <string.h>		//memset
-
-#ifdef USE_LAPACK
-#ifdef USE_MKL
-#include<mkl_lapack.h>
-#else
-void LAPACK_dstegr (char *jobz, char *range, int *n, double *d, double *e,
-		    double *vl, double *vu, int *il, int *iu, double *abstol,
-		    int *m, double *w, double *z, int *ldz, int *isuppz,
-		    double *work, int *lwork, int *iwork, int *liwork,
-		    int *info);
-//#include <lapacke/lapacke.h>
-#endif
-#endif
-
-//#include <Grid/algorithms/densematrix/DenseMatrix.h>
-
-// eliminate temorary vector in calc()
-#define MEM_SAVE
-
-namespace Grid
-{
-
-  struct Bisection
-  {
-
-#if 0
-    static void get_eig2 (int row_num, std::vector < RealD > &ALPHA,
-			  std::vector < RealD > &BETA,
-			  std::vector < RealD > &eig)
-    {
-      int i, j;
-        std::vector < RealD > evec1 (row_num + 3);
-        std::vector < RealD > evec2 (row_num + 3);
-      RealD eps2;
-        ALPHA[1] = 0.;
-        BETHA[1] = 0.;
-      for (i = 0; i < row_num - 1; i++)
-	{
-	  ALPHA[i + 1] = A[i * (row_num + 1)].real ();
-	  BETHA[i + 2] = A[i * (row_num + 1) + 1].real ();
-	}
-      ALPHA[row_num] = A[(row_num - 1) * (row_num + 1)].real ();
-        bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-10, 1e-10, evec1, eps2);
-        bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-16, 1e-16, evec2, eps2);
-
-      // Do we really need to sort here?
-      int begin = 1;
-      int end = row_num;
-      int swapped = 1;
-      while (swapped)
-	{
-	  swapped = 0;
-	  for (i = begin; i < end; i++)
-	    {
-	      if (mag (evec2[i]) > mag (evec2[i + 1]))
-		{
-		  swap (evec2 + i, evec2 + i + 1);
-		  swapped = 1;
-		}
-	    }
-	  end--;
-	  for (i = end - 1; i >= begin; i--)
-	    {
-	      if (mag (evec2[i]) > mag (evec2[i + 1]))
-		{
-		  swap (evec2 + i, evec2 + i + 1);
-		  swapped = 1;
-		}
-	    }
-	  begin++;
-	}
-
-      for (i = 0; i < row_num; i++)
-	{
-	  for (j = 0; j < row_num; j++)
-	    {
-	      if (i == j)
-		H[i * row_num + j] = evec2[i + 1];
-	      else
-		H[i * row_num + j] = 0.;
-	    }
-	}
-    }
-#endif
-
-    static void bisec (std::vector < RealD > &c,
-		       std::vector < RealD > &b,
-		       int n,
-		       int m1,
-		       int m2,
-		       RealD eps1,
-		       RealD relfeh, std::vector < RealD > &x, RealD & eps2)
-    {
-      std::vector < RealD > wu (n + 2);
-
-      RealD h, q, x1, xu, x0, xmin, xmax;
-      int i, a, k;
-
-      b[1] = 0.0;
-      xmin = c[n] - fabs (b[n]);
-      xmax = c[n] + fabs (b[n]);
-      for (i = 1; i < n; i++)
-	{
-	  h = fabs (b[i]) + fabs (b[i + 1]);
-	  if (c[i] + h > xmax)
-	    xmax = c[i] + h;
-	  if (c[i] - h < xmin)
-	    xmin = c[i] - h;
-	}
-      xmax *= 2.;
-
-      eps2 = relfeh * ((xmin + xmax) > 0.0 ? xmax : -xmin);
-      if (eps1 <= 0.0)
-	eps1 = eps2;
-      eps2 = 0.5 * eps1 + 7.0 * (eps2);
-      x0 = xmax;
-      for (i = m1; i <= m2; i++)
-	{
-	  x[i] = xmax;
-	  wu[i] = xmin;
-	}
-
-      for (k = m2; k >= m1; k--)
-	{
-	  xu = xmin;
-	  i = k;
-	  do
-	    {
-	      if (xu < wu[i])
-		{
-		  xu = wu[i];
-		  i = m1 - 1;
-		}
-	      i--;
-	    }
-	  while (i >= m1);
-	  if (x0 > x[k])
-	    x0 = x[k];
-	  while ((x0 - xu) > 2 * relfeh * (fabs (xu) + fabs (x0)) + eps1)
-	    {
-	      x1 = (xu + x0) / 2;
-
-	      a = 0;
-	      q = 1.0;
-	      for (i = 1; i <= n; i++)
-		{
-		  q =
-		    c[i] - x1 -
-		    ((q != 0.0) ? b[i] * b[i] / q : fabs (b[i]) / relfeh);
-		  if (q < 0)
-		    a++;
-		}
-//      printf("x1=%0.14e a=%d\n",x1,a);
-	      if (a < k)
-		{
-		  if (a < m1)
-		    {
-		      xu = x1;
-		      wu[m1] = x1;
-		    }
-		  else
-		    {
-		      xu = x1;
-		      wu[a + 1] = x1;
-		      if (x[a] > x1)
-			x[a] = x1;
-		    }
-		}
-	      else
-		x0 = x1;
-	    }
-	  printf ("x0=%0.14e xu=%0.14e k=%d\n", x0, xu, k);
-	  x[k] = (x0 + xu) / 2;
-	}
-    }
-  };
-
-/////////////////////////////////////////////////////////////
-// Implicitly restarted lanczos
-/////////////////////////////////////////////////////////////
-
-
-  template < class Field > class SimpleLanczos
-  {
-
-    const RealD small = 1.0e-16;
-  public:
-    int lock;
-    int get;
-    int Niter;
-    int converged;
-
-    int Nstop;			// Number of evecs checked for convergence
-    int Nk;			// Number of converged sought
-    int Np;			// Np -- Number of spare vecs in kryloc space
-    int Nm;			// Nm -- total number of vectors
-
-
-    RealD OrthoTime;
-
-    RealD eresid;
-
-//    SortEigen < Field > _sort;
-
-    LinearFunction < Field > &_Linop;
-
-//    OperatorFunction < Field > &_poly;
-
-    /////////////////////////
-    // Constructor
-    /////////////////////////
-    void init (void)
-    {
-    };
-//    void Abort (int ff, std::vector < RealD > &evals, DenseVector < Denstd::vector  < RealD > >&evecs);
-
-    SimpleLanczos (LinearFunction < Field > &Linop,	// op
-//		   OperatorFunction < Field > &poly,	// polynmial
-		   int _Nstop,	// sought vecs
-		   int _Nk,	// sought vecs
-		   int _Nm,	// spare vecs
-		   RealD _eresid,	// resid in lmdue deficit 
-		   int _Niter):	// Max iterations
-     
-      _Linop (Linop),
- //     _poly (poly),
-      Nstop (_Nstop), Nk (_Nk), Nm (_Nm), eresid (_eresid), Niter (_Niter)
-    {
-      Np = Nm - Nk;
-      assert (Np > 0);
-    };
-
-    /////////////////////////
-    // Sanity checked this routine (step) against Saad.
-    /////////////////////////
-    void RitzMatrix (std::vector < Field > &evec, int k)
-    {
-
-      if (1)
-	return;
-
-      GridBase *grid = evec[0].Grid();
-      Field w (grid);
-      std::cout << GridLogMessage << "RitzMatrix " << std::endl;
-      for (int i = 0; i < k; i++)
-	{
-	  _Linop(evec[i], w);
-//      _poly(_Linop,evec[i],w);
-	  std::cout << GridLogMessage << "[" << i << "] ";
-	  for (int j = 0; j < k; j++)
-	    {
-	      ComplexD in = innerProduct (evec[j], w);
-	      if (fabs ((double) i - j) > 1)
-		{
-		  if (abs (in) > 1.0e-9)
-		    {
-		      std::cout << GridLogMessage << "oops" << std::endl;
-		      abort ();
-		    }
-		  else
-		    std::cout << GridLogMessage << " 0 ";
-		}
-	      else
-		{
-		  std::cout << GridLogMessage << " " << in << " ";
-		}
-	    }
-	  std::cout << GridLogMessage << std::endl;
-	}
-    }
-
-    void step (std::vector < RealD > &lmd,
-	       std::vector < RealD > &lme,
-	       Field & last, Field & current, Field & next, uint64_t k)
-    {
-      if (lmd.size () <= k)
-	lmd.resize (k + Nm);
-      if (lme.size () <= k)
-	lme.resize (k + Nm);
-
-
-//      _poly(_Linop,current,next );   // 3. wk:=Avk−βkv_{k−1}
-      _Linop(current, next);	// 3. wk:=Avk−βkv_{k−1}
-      if (k > 0)
-	{
-	  next -= lme[k - 1] * last;
-	}
-//      std::cout<<GridLogMessage << "<last|next>" << innerProduct(last,next) <<std::endl;
-
-      ComplexD zalph = innerProduct (current, next);	// 4. αk:=(wk,vk)
-      RealD alph = real (zalph);
-
-      next = next - alph * current;	// 5. wk:=wk−αkvk
-//      std::cout<<GridLogMessage << "<current|next>" << innerProduct(current,next) <<std::endl;
-
-      RealD beta = normalise (next);	// 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
-      // 7. vk+1 := wk/βk+1
-//       norm=beta;
-
-      int interval = Nm / 100 + 1;
-      if ((k % interval) == 0)
-	std::
-	  cout << GridLogMessage << k << " : alpha = " << zalph << " beta " <<
-	  beta << std::endl;
-      const RealD tiny = 1.0e-20;
-      if (beta < tiny)
-	{
-	  std::cout << GridLogMessage << " beta is tiny " << beta << std::
-	    endl;
-	}
-      lmd[k] = alph;
-      lme[k] = beta;
-
-    }
-
-    void qr_decomp (std::vector < RealD > &lmd,
-		    std::vector  < RealD > &lme,
-		    int Nk,
-		    int Nm,
-		    std::vector  < RealD > &Qt, RealD Dsh, int kmin, int kmax)
-    {
-      int k = kmin - 1;
-      RealD x;
-
-      RealD Fden = 1.0 / hypot (lmd[k] - Dsh, lme[k]);
-      RealD c = (lmd[k] - Dsh) * Fden;
-      RealD s = -lme[k] * Fden;
-
-      RealD tmpa1 = lmd[k];
-      RealD tmpa2 = lmd[k + 1];
-      RealD tmpb = lme[k];
-
-      lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
-      lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
-      lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
-      x = -s * lme[k + 1];
-      lme[k + 1] = c * lme[k + 1];
-
-      for (int i = 0; i < Nk; ++i)
-	{
-	  RealD Qtmp1 = Qt[i + Nm * k];
-	  RealD Qtmp2 = Qt[i + Nm * (k + 1)];
-	  Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
-	  Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
-	}
-
-      // Givens transformations
-      for (int k = kmin; k < kmax - 1; ++k)
-	{
-
-	  RealD Fden = 1.0 / hypot (x, lme[k - 1]);
-	  RealD c = lme[k - 1] * Fden;
-	  RealD s = -x * Fden;
-
-	  RealD tmpa1 = lmd[k];
-	  RealD tmpa2 = lmd[k + 1];
-	  RealD tmpb = lme[k];
-
-	  lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
-	  lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
-	  lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
-	  lme[k - 1] = c * lme[k - 1] - s * x;
-
-	  if (k != kmax - 2)
-	    {
-	      x = -s * lme[k + 1];
-	      lme[k + 1] = c * lme[k + 1];
-	    }
-
-	  for (int i = 0; i < Nk; ++i)
-	    {
-	      RealD Qtmp1 = Qt[i + Nm * k];
-	      RealD Qtmp2 = Qt[i + Nm * (k + 1)];
-	      Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
-	      Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
-	    }
-	}
-    }
-
-#if 0
-#ifdef USE_LAPACK
-#ifdef USE_MKL
-#define LAPACK_INT MKL_INT
-#else
-#define LAPACK_INT long long
-#endif
-    void diagonalize_lapack (std::vector  < RealD > &lmd, std::vector  < RealD > &lme, int N1,	// all
-			     int N2,	// get
-			     GridBase * grid)
-    {
-      const int size = Nm;
-      LAPACK_INT NN = N1;
-      double evals_tmp[NN];
-      double DD[NN];
-      double EE[NN];
-      for (int i = 0; i < NN; i++)
-	for (int j = i - 1; j <= i + 1; j++)
-	  if (j < NN && j >= 0)
-	    {
-	      if (i == j)
-		DD[i] = lmd[i];
-	      if (i == j)
-		evals_tmp[i] = lmd[i];
-	      if (j == (i - 1))
-		EE[j] = lme[j];
-	    }
-      LAPACK_INT evals_found;
-      LAPACK_INT lwork =
-	((18 * NN) >
-	 (1 + 4 * NN + NN * NN) ? (18 * NN) : (1 + 4 * NN + NN * NN));
-      LAPACK_INT liwork = 3 + NN * 10;
-      LAPACK_INT iwork[liwork];
-      double work[lwork];
-      LAPACK_INT isuppz[2 * NN];
-      char jobz = 'N';		// calculate evals only
-      char range = 'I';		// calculate il-th to iu-th evals
-      //    char range = 'A'; // calculate all evals
-      char uplo = 'U';		// refer to upper half of original matrix
-      char compz = 'I';		// Compute eigenvectors of tridiagonal matrix
-      int ifail[NN];
-      LAPACK_INT info;
-//  int total = QMP_get_number_of_nodes();
-//  int node = QMP_get_node_number();
-//  GridBase *grid = evec[0]._grid;
-      int total = grid->_Nprocessors;
-      int node = grid->_processor;
-      int interval = (NN / total) + 1;
-      double vl = 0.0, vu = 0.0;
-      LAPACK_INT il = interval * node + 1, iu = interval * (node + 1);
-      if (iu > NN)
-	iu = NN;
-      double tol = 0.0;
-      if (1)
-	{
-	  memset (evals_tmp, 0, sizeof (double) * NN);
-	  if (il <= NN)
-	    {
-	      printf ("total=%d node=%d il=%d iu=%d\n", total, node, il, iu);
-#ifdef USE_MKL
-	      dstegr (&jobz, &range, &NN,
-#else
-	      LAPACK_dstegr (&jobz, &range, &NN,
-#endif
-			     (double *) DD, (double *) EE, &vl, &vu, &il, &iu,	// these four are ignored if second parameteris 'A'
-			     &tol,	// tolerance
-			     &evals_found, evals_tmp, (double *) NULL, &NN,
-			     isuppz, work, &lwork, iwork, &liwork, &info);
-	      for (int i = iu - 1; i >= il - 1; i--)
-		{
-		  printf ("node=%d evals_found=%d evals_tmp[%d] = %g\n", node,
-			  evals_found, i - (il - 1), evals_tmp[i - (il - 1)]);
-		  evals_tmp[i] = evals_tmp[i - (il - 1)];
-		  if (il > 1)
-		    evals_tmp[i - (il - 1)] = 0.;
-		}
-	    }
-	  {
-	    grid->GlobalSumVector (evals_tmp, NN);
-	  }
-	}
-// cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
-    }
-#undef LAPACK_INT
-#endif
-
-
-    void diagonalize (std::vector  < RealD > &lmd,
-		      std::vector  < RealD > &lme,
-		      int N2, int N1, GridBase * grid)
-    {
-
-#ifdef USE_LAPACK
-      const int check_lapack = 0;	// just use lapack if 0, check against lapack if 1
-
-      if (!check_lapack)
-	return diagonalize_lapack (lmd, lme, N2, N1, grid);
-
-//      diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
-#endif
-    }
-#endif
-
-    static RealD normalise (Field & v)
-    {
-      RealD nn = norm2 (v);
-      nn = sqrt (nn);
-      v = v * (1.0 / nn);
-      return nn;
-    }
-
-    void orthogonalize (Field & w, std::vector < Field > &evec, int k)
-    {
-      double t0 = -usecond () / 1e6;
-      typedef typename Field::scalar_type MyComplex;
-      MyComplex ip;
-
-      if (0)
-	{
-	  for (int j = 0; j < k; ++j)
-	    {
-	      normalise (evec[j]);
-	      for (int i = 0; i < j; i++)
-		{
-		  ip = innerProduct (evec[i], evec[j]);	// are the evecs normalised? ; this assumes so.
-		  evec[j] = evec[j] - ip * evec[i];
-		}
-	    }
-	}
-
-      for (int j = 0; j < k; ++j)
-	{
-	  ip = innerProduct (evec[j], w);	// are the evecs normalised? ; this assumes so.
-	  w = w - ip * evec[j];
-	}
-      normalise (w);
-      t0 += usecond () / 1e6;
-      OrthoTime += t0;
-    }
-
-    void setUnit_Qt (int Nm, std::vector < RealD > &Qt)
-    {
-      for (int i = 0; i < Qt.size (); ++i)
-	Qt[i] = 0.0;
-      for (int k = 0; k < Nm; ++k)
-	Qt[k + k * Nm] = 1.0;
-    }
-
-
-    void calc (std::vector < RealD > &eval, const Field & src, int &Nconv)
-    {
-
-      GridBase *grid = src.Grid();
-//      assert(grid == src._grid);
-
-      std::
-	cout << GridLogMessage << " -- Nk = " << Nk << " Np = " << Np << std::
-	endl;
-      std::cout << GridLogMessage << " -- Nm = " << Nm << std::endl;
-      std::cout << GridLogMessage << " -- size of eval   = " << eval.
-	size () << std::endl;
-
-//      assert(c.size() && Nm == eval.size());
-
-      std::vector < RealD > lme (Nm);
-      std::vector < RealD > lmd (Nm);
-
-
-      Field current (grid);
-      Field last (grid);
-      Field next (grid);
-
-      Nconv = 0;
-
-      RealD beta_k;
-
-      // Set initial vector
-      // (uniform vector) Why not src??
-      //      evec[0] = 1.0;
-      current = src;
-      std::cout << GridLogMessage << "norm2(src)= " << norm2 (src) << std::
-	endl;
-      normalise (current);
-      std::
-	cout << GridLogMessage << "norm2(evec[0])= " << norm2 (current) <<
-	std::endl;
-
-      // Initial Nk steps
-      OrthoTime = 0.;
-      double t0 = usecond () / 1e6;
-      RealD norm;		// sqrt norm of last vector
-
-      uint64_t iter = 0;
-
-      bool initted = false;
-      std::vector < RealD > low (Nstop * 10);
-      std::vector < RealD > high (Nstop * 10);
-      RealD cont = 0.;
-      while (1) {
-	  cont = 0.;
-	  std::vector < RealD > lme2 (Nm);
-	  std::vector < RealD > lmd2 (Nm);
-	  for (uint64_t k = 0; k < Nm; ++k, iter++) {
-	      step (lmd, lme, last, current, next, iter);
-	      last = current;
-	      current = next;
-	    }
-	  double t1 = usecond () / 1e6;
-	  std::cout << GridLogMessage << "IRL::Initial steps: " << t1 -
-	    t0 << "seconds" << std::endl;
-	  t0 = t1;
-	  std::
-	    cout << GridLogMessage << "IRL::Initial steps:OrthoTime " <<
-	    OrthoTime << "seconds" << std::endl;
-
-	  // getting eigenvalues
-	  lmd2.resize (iter + 2);
-	  lme2.resize (iter + 2);
-	  for (uint64_t k = 0; k < iter; ++k) {
-	      lmd2[k + 1] = lmd[k];
-	      lme2[k + 2] = lme[k];
-	    }
-	  t1 = usecond () / 1e6;
-	  std::cout << GridLogMessage << "IRL:: copy: " << t1 -
-	    t0 << "seconds" << std::endl;
-	  t0 = t1;
-	  {
-	    int total = grid->_Nprocessors;
-	    int node = grid->_processor;
-	    int interval = (Nstop / total) + 1;
-	    int iu = (iter + 1) - (interval * node + 1);
-	    int il = (iter + 1) - (interval * (node + 1));
-	    std::vector < RealD > eval2 (iter + 3);
-	    RealD eps2;
-	    Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
-			      eps2);
-//        diagonalize(eval2,lme2,iter,Nk,grid);
-	    RealD diff = 0.;
-	    for (int i = il; i <= iu; i++) {
-		if (initted)
-		  diff =
-		    fabs (eval2[i] - high[iu-i]) / (fabs (eval2[i]) +
-						      fabs (high[iu-i]));
-		if (initted && (diff > eresid))
-		  cont = 1.;
-		if (initted)
-		  printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
-			  high[iu-i], diff);
-		high[iu-i] = eval2[i];
-	      }
-	    il = (interval * node + 1);
-	    iu = (interval * (node + 1));
-	    Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
-			      eps2);
-	    for (int i = il; i <= iu; i++) {
-		if (initted)
-		  diff =
-		    fabs (eval2[i] - low[i]) / (fabs (eval2[i]) +
-						fabs (low[i]));
-		if (initted && (diff > eresid))
-		  cont = 1.;
-		if (initted)
-		  printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
-			  low[i], diff);
-		low[i] = eval2[i];
-	      }
-	    t1 = usecond () / 1e6;
-	    std::cout << GridLogMessage << "IRL:: diagonalize: " << t1 -
-	      t0 << "seconds" << std::endl;
-	    t0 = t1;
-	  }
-
-	  for (uint64_t k = 0; k < Nk; ++k) {
-//          eval[k] = eval2[k];
-	    }
-	  if (initted)
-	    {
-	      grid->GlobalSumVector (&cont, 1);
-	      if (cont < 1.) return;
-	    }
-	  initted = true;
-	}
-
-    }
-
-
-
-
-
-#if 0
-
-/**
-   There is some matrix Q such that for any vector y
-   Q.e_1 = y and Q is unitary.
-**/
-    template < class T >
-      static T orthQ (DenseMatrix < T > &Q, std::vector < T > y)
-    {
-      int N = y.size ();	//Matrix Size
-      Fill (Q, 0.0);
-      T tau;
-      for (int i = 0; i < N; i++)
-	{
-	  Q[i][0] = y[i];
-	}
-      T sig = conj (y[0]) * y[0];
-      T tau0 = fabs (sqrt (sig));
-
-      for (int j = 1; j < N; j++)
-	{
-	  sig += conj (y[j]) * y[j];
-	  tau = abs (sqrt (sig));
-
-	  if (abs (tau0) > 0.0)
-	    {
-
-	      T gam = conj ((y[j] / tau) / tau0);
-	      for (int k = 0; k <= j - 1; k++)
-		{
-		  Q[k][j] = -gam * y[k];
-		}
-	      Q[j][j] = tau0 / tau;
-	    }
-	  else
-	    {
-	      Q[j - 1][j] = 1.0;
-	    }
-	  tau0 = tau;
-	}
-      return tau;
-    }
-
-/**
-	There is some matrix Q such that for any vector y
-	Q.e_k = y and Q is unitary.
-**/
-    template < class T >
-      static T orthU (DenseMatrix < T > &Q, std::vector < T > y)
-    {
-      T tau = orthQ (Q, y);
-      SL (Q);
-      return tau;
-    }
-
-
-/**
-	Wind up with a matrix with the first con rows untouched
-
-say con = 2
-	Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st colum
-	and the matrix is upper hessenberg
-	and with f and Q appropriately modidied with Q is the arnoldi factorization
-
-**/
-
-    template < class T > static void Lock (DenseMatrix < T > &H,	///Hess mtx     
-					   DenseMatrix < T > &Q,	///Lock Transform
-					   T val,	///value to be locked
-					   int con,	///number already locked
-					   RealD small, int dfg, bool herm)
-    {
-      //ForceTridiagonal(H);
-
-      int M = H.dim;
-      DenseVector < T > vec;
-      Resize (vec, M - con);
-
-      DenseMatrix < T > AH;
-      Resize (AH, M - con, M - con);
-      AH = GetSubMtx (H, con, M, con, M);
-
-      DenseMatrix < T > QQ;
-      Resize (QQ, M - con, M - con);
-
-      Unity (Q);
-      Unity (QQ);
-
-      DenseVector < T > evals;
-      Resize (evals, M - con);
-      DenseMatrix < T > evecs;
-      Resize (evecs, M - con, M - con);
-
-      Wilkinson < T > (AH, evals, evecs, small);
-
-      int k = 0;
-      RealD cold = abs (val - evals[k]);
-      for (int i = 1; i < M - con; i++)
-	{
-	  RealD cnew = abs (val - evals[i]);
-	  if (cnew < cold)
-	    {
-	      k = i;
-	      cold = cnew;
-	    }
-	}
-      vec = evecs[k];
-
-      ComplexD tau;
-      orthQ (QQ, vec);
-      //orthQM(QQ,AH,vec);
-
-      AH = Hermitian (QQ) * AH;
-      AH = AH * QQ;
-
-      for (int i = con; i < M; i++)
-	{
-	  for (int j = con; j < M; j++)
-	    {
-	      Q[i][j] = QQ[i - con][j - con];
-	      H[i][j] = AH[i - con][j - con];
-	    }
-	}
-
-      for (int j = M - 1; j > con + 2; j--)
-	{
-
-	  DenseMatrix < T > U;
-	  Resize (U, j - 1 - con, j - 1 - con);
-	  DenseVector < T > z;
-	  Resize (z, j - 1 - con);
-	  T nm = norm (z);
-	  for (int k = con + 0; k < j - 1; k++)
-	    {
-	      z[k - con] = conj (H (j, k + 1));
-	    }
-	  normalise (z);
-
-	  RealD tmp = 0;
-	  for (int i = 0; i < z.size () - 1; i++)
-	    {
-	      tmp = tmp + abs (z[i]);
-	    }
-
-	  if (tmp < small / ((RealD) z.size () - 1.0))
-	    {
-	      continue;
-	    }
-
-	  tau = orthU (U, z);
-
-	  DenseMatrix < T > Hb;
-	  Resize (Hb, j - 1 - con, M);
-
-	  for (int a = 0; a < M; a++)
-	    {
-	      for (int b = 0; b < j - 1 - con; b++)
-		{
-		  T sum = 0;
-		  for (int c = 0; c < j - 1 - con; c++)
-		    {
-		      sum += H[a][con + 1 + c] * U[c][b];
-		    }		//sum += H(a,con+1+c)*U(c,b);}
-		  Hb[b][a] = sum;
-		}
-	    }
-
-	  for (int k = con + 1; k < j; k++)
-	    {
-	      for (int l = 0; l < M; l++)
-		{
-		  H[l][k] = Hb[k - 1 - con][l];
-		}
-	    }			//H(Hb[k-1-con][l] , l,k);}}
-
-	  DenseMatrix < T > Qb;
-	  Resize (Qb, M, M);
-
-	  for (int a = 0; a < M; a++)
-	    {
-	      for (int b = 0; b < j - 1 - con; b++)
-		{
-		  T sum = 0;
-		  for (int c = 0; c < j - 1 - con; c++)
-		    {
-		      sum += Q[a][con + 1 + c] * U[c][b];
-		    }		//sum += Q(a,con+1+c)*U(c,b);}
-		  Qb[b][a] = sum;
-		}
-	    }
-
-	  for (int k = con + 1; k < j; k++)
-	    {
-	      for (int l = 0; l < M; l++)
-		{
-		  Q[l][k] = Qb[k - 1 - con][l];
-		}
-	    }			//Q(Qb[k-1-con][l] , l,k);}}
-
-	  DenseMatrix < T > Hc;
-	  Resize (Hc, M, M);
-
-	  for (int a = 0; a < j - 1 - con; a++)
-	    {
-	      for (int b = 0; b < M; b++)
-		{
-		  T sum = 0;
-		  for (int c = 0; c < j - 1 - con; c++)
-		    {
-		      sum += conj (U[c][a]) * H[con + 1 + c][b];
-		    }		//sum += conj( U(c,a) )*H(con+1+c,b);}
-		  Hc[b][a] = sum;
-		}
-	    }
-
-	  for (int k = 0; k < M; k++)
-	    {
-	      for (int l = con + 1; l < j; l++)
-		{
-		  H[l][k] = Hc[k][l - 1 - con];
-		}
-	    }			//H(Hc[k][l-1-con] , l,k);}}
-
-	}
-    }
-#endif
-
-
-  };
-
-}
-#endif
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -183,6 +183,7 @@ public:
 		      int recv_from_rank,
 		      int bytes);
  
+  int IsOffNode(int rank);
  double StencilSendToRecvFrom(void *xmit,
 			       int xmit_to_rank,int do_xmit,
 			       void *recv,
@@ -201,9 +202,9 @@ public:
  void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);

  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-				    void *xmit,
+				    void *xmit,void *xmit_comp,
 				    int xmit_to_rank,int do_xmit,
-				    void *recv,
+				    void *recv,void *recv_comp,
 				    int recv_from_rank,int do_recv,
 				    int xbytes,int rbytes,int dir);
  
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -260,32 +260,39 @@ CartesianCommunicator::~CartesianCommunicator()
 }
 #ifdef USE_GRID_REDUCTION
 void CartesianCommunicator::GlobalSum(float &f){
+  FlightRecorder::StepLog("GlobalSumP2P");
  CartesianCommunicator::GlobalSumP2P(f);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
+  FlightRecorder::StepLog("GlobalSumP2P");
  CartesianCommunicator::GlobalSumP2P(d);
 }
 #else
 void CartesianCommunicator::GlobalSum(float &f){
+  FlightRecorder::StepLog("AllReduce float");
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
+  FlightRecorder::StepLog("AllReduce double");
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
 #endif
 void CartesianCommunicator::GlobalSum(uint32_t &u){
+  FlightRecorder::StepLog("AllReduce uint32_t");
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(uint64_t &u){
+  FlightRecorder::StepLog("AllReduce uint64_t");
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
+  FlightRecorder::StepLog("AllReduceVector");
  int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
@@ -294,26 +301,31 @@ void CartesianCommunicator::GlobalXOR(uint32_t &u){
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalXOR(uint64_t &u){
+  FlightRecorder::StepLog("GlobalXOR");
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalMax(float &f)
 {
+  FlightRecorder::StepLog("GlobalMax");
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalMax(double &d)
 {
+  FlightRecorder::StepLog("GlobalMax");
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
+  FlightRecorder::StepLog("GlobalSumVector(float *)");
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
+  FlightRecorder::StepLog("GlobalSumVector(double *)");
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
@@ -388,11 +400,16 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 {
  std::vector<CommsRequest_t> list;
  double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
-  offbytes       += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  offbytes       += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
  return offbytes;
 }
-
+int CartesianCommunicator::IsOffNode(int rank)
+{
+  int grank = ShmRanks[rank];
+  if ( grank == MPI_UNDEFINED ) return true;
+  else return false;
+}

 #ifdef ACCELERATOR_AWARE_MPI
 void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
@@ -407,9 +424,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
  return 0.0; // Do nothing -- no preparation required
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-							 void *xmit,
+							 void *xmit,void *xmit_comp,
 							 int dest,int dox,
-							 void *recv,
+							 void *recv,void *recv_comp,
 							 int from,int dor,
 							 int xbytes,int rbytes,int dir)
 {
@@ -433,7 +450,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  if ( dor ) {
    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+from*32;
-      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+      //      std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
+      ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
      assert(ierr==0);
      list.push_back(rrq);
      off_node_bytes+=rbytes;
@@ -442,6 +460,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
    else { 
      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
      assert(shm!=NULL);
+      //      std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
      acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
    }
 #endif
@@ -450,7 +469,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  if (dox) {
    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
      tag= dir+_processor*32;
-      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
      assert(ierr==0);
      list.push_back(xrq);
      off_node_bytes+=xbytes;
@@ -669,9 +688,9 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
 }  

 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-							 void *xmit,
+							 void *xmit,void *xmit_comp,
 							 int dest,int dox,
-							 void *recv,
+							 void *recv,void *recv_comp,
 							 int from,int dor,
 							 int xbytes,int rbytes,int dir)
 {
@@ -794,6 +813,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque

 void CartesianCommunicator::StencilBarrier(void)
 {
+  FlightRecorder::StepLog("NodeBarrier");
  MPI_Barrier  (ShmComm);
 }
 //void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -801,11 +821,13 @@ void CartesianCommunicator::StencilBarrier(void)
 //}
 void CartesianCommunicator::Barrier(void)
 {
+  FlightRecorder::StepLog("GridBarrier");
  int ierr = MPI_Barrier(communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
+  FlightRecorder::StepLog("Broadcast");
  int ierr=MPI_Bcast(data,
 		     bytes,
 		     MPI_BYTE,
@@ -819,11 +841,13 @@ int CartesianCommunicator::RankWorld(void){
  return r;
 }
 void CartesianCommunicator::BarrierWorld(void){
+  FlightRecorder::StepLog("BarrierWorld");
  int ierr = MPI_Barrier(communicator_world);
  assert(ierr==0);
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
+  FlightRecorder::StepLog("BroadcastWorld");
  int ierr= MPI_Bcast(data,
 		      bytes,
 		      MPI_BYTE,
@@ -846,6 +870,7 @@ void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,
 }
 void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
 {
+  FlightRecorder::StepLog("AllToAll");
  // MPI is a pain and uses "int" arguments
  // 64*64*64*128*16 == 500Million elements of data.
  // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@@ -124,6 +124,8 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
  dest=0;
 }

+// Communicator_none variant: unconditionally reports that `rank` is not off node.
+int CartesianCommunicator::IsOffNode(int rank)
+{
+  return false;
+}
+
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 						     int xmit_to_rank,int dox,
 						     void *recv,
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -543,49 +543,21 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
  // printf("Host buffer allocate for GPU non-aware MPI\n");
-#if 0
-  HostCommBuf= acceleratorAllocHost(bytes);
-#else 
  HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
-#if 0
-  #warning "Moving host buffers to specific NUMA domain"
-  int numa;
-  char *numa_name=(char *)getenv("MPI_BUF_NUMA");
-  if(numa_name) {
-    unsigned long page_size = sysconf(_SC_PAGESIZE);
-    numa = atoi(numa_name);
-    unsigned long page_count = bytes/page_size;
-    std::vector<void *> pages(page_count);
-    std::vector<int>    nodes(page_count,numa);
-    std::vector<int>    status(page_count,-1);
-    for(unsigned long p=0;p<page_count;p++){
-      pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
-    }
-    int ret = move_pages(0,
-			 page_count,
-			 &pages[0],
-			 &nodes[0],
-			 &status[0],
-			 MPOL_MF_MOVE);
-    printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
-    if (ret) perror(" move_pages failed for reason:");
-  }
-#endif  
-  acceleratorPin(HostCommBuf,bytes);
-#endif  
-
 #endif  
  ShmCommBuf = acceleratorAllocDevice(bytes);
  if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
+    std::cerr << "SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
    exit(EXIT_FAILURE);  
  }
  if ( WorldRank == 0 ){
-    std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
+    std::cout << Mheader " acceleratorAllocDevice "<< bytes 
 	      << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);
-  std::cout<< "Setting up IPC"<<std::endl;
+  if ( WorldRank == 0 ){
+    std::cout<< Mheader "Setting up IPC"<<std::endl;
+  }
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Loop over ranks/gpu's on our node
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -616,8 +588,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
      if ( err != ZE_RESULT_SUCCESS ) {
 	std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
 	exit(EXIT_FAILURE);
-      } else {
-	std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
      }
      memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
      handle.pid = getpid();
@@ -676,12 +646,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef SHM_SOCKETS
      myfd=UnixSockets::RecvFileDescriptor();
 #else
-      std::cout<<"mapping seeking remote pid/fd "
-	       <<handle.pid<<"/"
-	       <<handle.fd<<std::endl;
+      //      std::cout<<"mapping seeking remote pid/fd "
+      //	       <<handle.pid<<"/"
+      //	       <<handle.fd<<std::endl;

      int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
-      std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
+      //      std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
      //      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
      myfd  = syscall(438,pidfd,handle.fd,0);
      int err_t = errno;
@@ -691,7 +661,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 	assert(0);
      }
 #endif
-      std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
+      //      std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
      memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
      memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));

@@ -700,9 +670,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 	std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
 	std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; 
 	exit(EXIT_FAILURE);
-      } else {
-	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
-	std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
      }
      assert(thisBuf!=nullptr);
    }
@@ -783,6 +750,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    WorldShmCommBufs[r] =ptr;
    //    std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
+  std::cout<< Mheader " Intra-node IPC setup is complete "<<std::endl;
  _ShmAlloc=1;
  _ShmAllocBytes  = bytes;
 };
@@ -990,7 +958,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  }
 #endif

-  SharedMemoryTest();
+  //  SharedMemoryTest();
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier
@@ -1039,11 +1007,13 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
  int gpeer = ShmRanks[rank];
  assert(gpeer!=ShmRank); // never send to self
+  //  std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
  if (gpeer == MPI_UNDEFINED){
    return NULL;
  } else { 
    uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
    uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
+    //    std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
    return (void *) remote;
  }
 }
--- a/Grid/communicator/SharedMemoryNone.cc
+++ b/Grid/communicator/SharedMemoryNone.cc
@@ -122,10 +122,10 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
 {
  acceleratorMemSet(dest,0,bytes);
 }
-void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
-{
-  acceleratorCopyToDevice(src,dest,bytes);
-}
+//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
+//{
+//  acceleratorCopyToDevice(src,dest,bytes);
+//}
 ////////////////////////////////////////////////////////
 // Global shared functionality finished
 // Now move to per communicator functionality
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -143,9 +143,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
    int comm_proc = ((x+sshift)/rd)%pd;
    
    if (comm_proc==0) {
+      FlightRecorder::StepLog("Cshift_Copy_plane");
      tcopy-=usecond();
      Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
      tcopy+=usecond();
+      FlightRecorder::StepLog("Cshift_Copy_plane_complete");
    } else {

      int words = buffer_size;
@@ -153,9 +155,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r

      int bytes = words * sizeof(vobj);

+      FlightRecorder::StepLog("Cshift_Gather_plane");
      tgather-=usecond();
      Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
      tgather+=usecond();
+      FlightRecorder::StepLog("Cshift_Gather_plane_complete");

      //      int rank           = grid->_processor;
      int recv_from_rank;
@@ -166,6 +170,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
      tcomms-=usecond();
      grid->Barrier();

+      FlightRecorder::StepLog("Cshift_SendRecv");
 #ifdef ACCELERATOR_AWARE_MPI
      grid->SendToRecvFrom((void *)&send_buf[0],
 			   xmit_to_rank,
@@ -182,10 +187,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 			   bytes);
      acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
 #endif
+      FlightRecorder::StepLog("Cshift_SendRecv_complete");

      xbytes+=bytes;
      grid->Barrier();
      tcomms+=usecond();
+      FlightRecorder::StepLog("Cshift_barrier_complete");

      tscatter-=usecond();
      Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -236,7 +236,7 @@ public:
  template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
    vobj vtmp;
    vtmp = r;
-#if 0
+#if 1
    deviceVector<vobj> vvtmp(1);
    acceleratorPut(vvtmp[0],vtmp);
    vobj *vvtmp_p = & vvtmp[0];
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -325,8 +325,8 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
    assert(ok);
  }
  FlightRecorder::StepLog("Start global sum");
-  //  grid->GlobalSumP2P(nrm);
-  grid->GlobalSum(nrm);
+  grid->GlobalSumP2P(nrm);
+  //  grid->GlobalSum(nrm);
  FlightRecorder::StepLog("Finished global sum");
  //  std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
  FlightRecorder::ReductionLog(local,real(nrm)); 
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -154,6 +154,12 @@ public:
  StencilImpl Stencil;
  StencilImpl StencilEven;
  StencilImpl StencilOdd;
+  // Forward the sloppy-comms setting to the full, even and odd stencils.
+  void SloppyComms(int sloppy)
+  {
+    this->Stencil.SetSloppyComms(sloppy);
+    this->StencilEven.SetSloppyComms(sloppy);
+    this->StencilOdd.SetSloppyComms(sloppy);
+  }

  // Copy of the gauge field , with even and odd subsets
  DoubledGaugeField Umu;
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -179,6 +179,12 @@ public:
  StencilImpl Stencil; 
  StencilImpl StencilEven; 
  StencilImpl StencilOdd; 
+  // Forward the sloppy-comms setting to the full, even and odd stencils.
+  void SloppyComms(int sloppy)
+  {
+    this->Stencil.SetSloppyComms(sloppy);
+    this->StencilEven.SetSloppyComms(sloppy);
+    this->StencilOdd.SetSloppyComms(sloppy);
+  }
    
  // Copy of the gauge field , with even and odd subsets
  DoubledGaugeField Umu;
--- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
@@ -146,6 +146,12 @@ public:
  StencilImpl Stencil;
  StencilImpl StencilEven;
  StencilImpl StencilOdd;
+  // Forward the sloppy-comms setting to the full, even and odd stencils.
+  void SloppyComms(int sloppy)
+  {
+    this->Stencil.SetSloppyComms(sloppy);
+    this->StencilEven.SetSloppyComms(sloppy);
+    this->StencilOdd.SetSloppyComms(sloppy);
+  }

  // Copy of the gauge field , with even and odd subsets
  DoubledGaugeField Umu;
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -32,209 +32,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-///////////////////////////////////////////////////////////////
-// Wilson compressor will need FaceGather policies for:
-// Periodic, Dirichlet, and partial Dirichlet for DWF
-///////////////////////////////////////////////////////////////
-const int dwf_compressor_depth=2;
-#define DWF_COMPRESS
-class FaceGatherPartialDWF
-{
-public:
-#ifdef DWF_COMPRESS
-  static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
-#else
-  static int PartialCompressionFactor(GridBase *grid) { return 1;}
-#endif
-  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
-				   const Lattice<vobj> &rhs,
-				   cobj *buffer,
-				   compressor &compress,
-				   int off,int so,int partial)
-  {
-    //DWF only hack: If a direction that is OFF node we use Partial Dirichlet
-    //  Shrinks local and remote comms buffers
-    GridBase *Grid = rhs.Grid();
-    int Ls = Grid->_rdimensions[0];
-#ifdef DWF_COMPRESS
-    int depth=dwf_compressor_depth;
-#else 
-    int depth=Ls/2;
-#endif
-    std::pair<int,int> *table_v = & table[0];
-    auto rhs_v = rhs.View(AcceleratorRead);
-    int vol=table.size()/Ls;
-    accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
-	Integer i=idx/Ls;
-	Integer s=idx%Ls;
-	Integer sc=depth+s-(Ls-depth);
-	if(s<depth)     compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
-	if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
-    });
-    rhs_v.ViewClose();
-  }
-  template<class decompressor,class Decompression>
-  static void DecompressFace(decompressor decompress,Decompression &dd)
-  {
-    auto Ls = dd.dims[0];
-#ifdef DWF_COMPRESS
-    int depth=dwf_compressor_depth;
-#else
-    int depth=Ls/2;
-#endif    
-    // Just pass in the Grid
-    auto kp = dd.kernel_p;
-    auto mp = dd.mpi_p;
-    int size= dd.buffer_size;
-    int vol= size/Ls;
-    accelerator_forNB(o,size,1,{
-	int idx=o/Ls;
-	int   s=o%Ls;
-	if ( s < depth ) {
-	  int oo=s*vol+idx;
-	  kp[o]=mp[oo];
-	} else if ( s >= Ls-depth ) {
-	  int sc = depth + s - (Ls-depth);
-	  int oo=sc*vol+idx; 
-	  kp[o]=mp[oo];
-	} else {
-	  kp[o] = Zero();//fill rest with zero if partial dirichlet
-	}
-    });
-  }
-  ////////////////////////////////////////////////////////////////////////////////////////////
-  // Need to gather *interior portions* for ALL s-slices in simd directions
-  // Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side
-  // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
-  ////////////////////////////////////////////////////////////////////////////////////////////
-  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
-				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
-				    compressor &compress,int type,int partial)
-  {
-    GridBase *Grid = rhs.Grid();
-    int Ls = Grid->_rdimensions[0];
-#ifdef DWF_COMPRESS
-    int depth=dwf_compressor_depth;
-#else
-    int depth = Ls/2;
-#endif
-    
-    // insertion of zeroes...
-    assert( (table.size()&0x1)==0);
-    int num=table.size()/2;
-    int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
-    
-    auto rhs_v = rhs.View(AcceleratorRead);
-    auto p0=&pointers[0][0];
-    auto p1=&pointers[1][0];
-    auto tp=&table[0];
-    int nnum=num/Ls;
-    accelerator_forNB(j, num, vobj::Nsimd(), {
-	//  Reorders both local and remote comms buffers
-	//  
-	int s  = j % Ls;
-	int sp1 = (s+depth)%Ls;  // peri incremented s slice
-	
-	int hxyz= j/Ls;
-
-	int xyz0= hxyz*2; // xyzt part of coor
-	int xyz1= hxyz*2+1;
-	
-	int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
-	
-	int kk0= xyz0*Ls + s ; // s=0 goes to s=1
-	int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
-	compress.CompressExchange(p0[jj],p1[jj],
-				  rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
-				  rhs_v[so+tp[kk1 ].second], 
-				  type);
-    });
-    rhs_v.ViewClose();
-  }
-  // Merge routine is for SIMD faces
-  template<class decompressor,class Merger>
-  static void MergeFace(decompressor decompress,Merger &mm)
-  {
-    auto Ls = mm.dims[0];
-#ifdef DWF_COMPRESS
-    int depth=dwf_compressor_depth;
-#else
-    int depth = Ls/2;
-#endif
-    int  num= mm.buffer_size/2; // relate vol and Ls to buffer size
-    auto mp = &mm.mpointer[0];
-    auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
-    auto vp1= &mm.vpointers[1][0];
-    auto type= mm.type;
-    int nnum = num/Ls;
-    accelerator_forNB(o,num,Merger::Nsimd,{
-
-	int  s=o%Ls;
-	int hxyz=o/Ls; // xyzt related component
-	int xyz0=hxyz*2;
-	int xyz1=hxyz*2+1;
-
-	int sp = (s+depth)%Ls; 
-	int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
-
-	int oo0= s+xyz0*Ls;
-	int oo1= s+xyz1*Ls;
-
-	// same ss0, ss1 pair goes to new layout
-	decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
-      });
-  }
-};
-class FaceGatherDWFMixedBCs
-{
-public:
-#ifdef DWF_COMPRESS
-  static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
-#else 
-  static int PartialCompressionFactor(GridBase *grid) {return 1;}
-#endif
-  
-  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
-					 const Lattice<vobj> &rhs,
-					 cobj *buffer,
-					 compressor &compress,
-					 int off,int so,int partial)
-  {
-    //    std::cout << " face gather simple DWF partial "<<partial <<std::endl;
-    if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
-    else        FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
-  }
-  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
-				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
-				    compressor &compress,int type,int partial)
-  {
-    //    std::cout << " face gather exch DWF partial "<<partial <<std::endl;
-    if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
-    else        FaceGatherSimple::Gather_plane_exchange    (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
-  }
-  template<class decompressor,class Merger>
-  static void MergeFace(decompressor decompress,Merger &mm)
-  {
-    int partial = mm.partial;
-    //    std::cout << " merge DWF partial "<<partial <<std::endl;
-    if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
-    else           FaceGatherSimple::MergeFace(decompress,mm);
-  }
-
-  template<class decompressor,class Decompression>
-  static void DecompressFace(decompressor decompress,Decompression &dd)
-  {
-    int partial = dd.partial;
-    //    std::cout << " decompress DWF partial "<<partial <<std::endl;
-    if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
-    else           FaceGatherSimple::DecompressFace(decompress,dd);
-  }
-};
-
 /////////////////////////////////////////////////////////////////////////////////////////////
 // optimised versions supporting half precision too??? Deprecate
 /////////////////////////////////////////////////////////////////////////////////////////////
@@ -242,8 +39,7 @@ public:

 //Could make FaceGather a template param, but then behaviour is runtime not compile time
 template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
-class WilsonCompressorTemplate  : public FaceGatherDWFMixedBCs
-//  : public FaceGatherSimple
+class WilsonCompressorTemplate : public FaceGatherSimple
 {
 public:
  
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@@ -165,6 +165,12 @@ public:
  StencilImpl Stencil;
  StencilImpl StencilEven;
  StencilImpl StencilOdd;
+  // Forward the sloppy-comms setting to the full, even and odd stencils.
+  void SloppyComms(int sloppy)
+  {
+    this->Stencil.SetSloppyComms(sloppy);
+    this->StencilEven.SetSloppyComms(sloppy);
+    this->StencilOdd.SetSloppyComms(sloppy);
+  }

  // Copy of the gauge field , with even and odd subsets
  DoubledGaugeField Umu;
--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -204,7 +204,14 @@ public:
  DoubledGaugeField Umu;
  DoubledGaugeField UmuEven;
  DoubledGaugeField UmuOdd;
-    
+
+
+  // Forward the sloppy-comms setting to the full, even and odd stencils.
+  void SloppyComms(int sloppy)
+  {
+    this->Stencil.SetSloppyComms(sloppy);
+    this->StencilEven.SetSloppyComms(sloppy);
+    this->StencilOdd.SetSloppyComms(sloppy);
+  }
  // Comms buffer
  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;

--- a/Grid/qcd/smearing/WilsonFlow.h
+++ b/Grid/qcd/smearing/WilsonFlow.h
@@ -252,6 +252,11 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{

  out = in;
  RealD taus = 0.;
+
+  // Perform initial t=0 measurements
+  for(auto const &meas : this->functions)
+    meas.second(0,taus,out);
+  
  for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
    auto start = std::chrono::high_resolution_clock::now();
    evolve_step(out, taus);
@@ -336,6 +341,11 @@ void WilsonFlowAdaptive<Gimpl>::smear(GaugeField& out, const GaugeField& in) con
  RealD taus = 0.;
  RealD eps = init_epsilon;
  unsigned int step = 0;
+
+  // Perform initial t=0 measurements
+  for(auto const &meas : this->functions)
+    meas.second(step,taus,out);
+  
  do{
    int step_success = evolve_step_adaptive(out, taus, eps); 
    step += step_success; //step will not be incremented if the integration step fails
--- a/Grid/stencil/Stencil.cc
+++ b/Grid/stencil/Stencil.cc
@@ -30,25 +30,26 @@
 NAMESPACE_BEGIN(Grid);

 uint64_t DslashFullCount;
-uint64_t DslashPartialCount;
+//uint64_t DslashPartialCount;
 uint64_t DslashDirichletCount;

 void DslashResetCounts(void)
 {
  DslashFullCount=0;
-  DslashPartialCount=0;
+  //  DslashPartialCount=0;
  DslashDirichletCount=0;
 }
 void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
 {
  dirichlet = DslashDirichletCount;
-  partial   = DslashPartialCount;
+  partial   = 0;
  full      = DslashFullCount;
 }
 void DslashLogFull(void)     { DslashFullCount++;}
-void DslashLogPartial(void)  { DslashPartialCount++;}
+//void DslashLogPartial(void)  { DslashPartialCount++;}
 void DslashLogDirichlet(void){ DslashDirichletCount++;}

+deviceVector<unsigned char> StencilBuffer::DeviceCommBuf; 

 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
 				 int off,std::vector<std::pair<int,int> > & table)
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
 // These can move into a params header and be given MacroMagic serialisation
 struct DefaultImplParams {
  Coordinate dirichlet; // Blocksize of dirichlet BCs
-  int  partialDirichlet;
+  //  int  partialDirichlet;
  DefaultImplParams()  {
    dirichlet.resize(0);
-    partialDirichlet=0;
+    //    partialDirichlet=0;
  };
 };

@@ -69,6 +69,12 @@ struct DefaultImplParams {
 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
 				 int off,std::vector<std::pair<int,int> > & table);

+// Holder for a single static device byte buffer.  Being static, the one buffer
+// is shared by every user of StencilBuffer; its definition lives in Stencil.cc.
+struct StencilBuffer
+{
+  static deviceVector<unsigned char> DeviceCommBuf;
+};
+
 void DslashResetCounts(void);
 void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
 void DslashLogFull(void);
@@ -113,8 +119,8 @@ class CartesianStencilAccelerator {
  ///////////////////////////////////////////////////
  // If true, this is partially communicated per face
  ///////////////////////////////////////////////////
-  StencilVector _comms_partial_send; 
-  StencilVector _comms_partial_recv;
+  //  StencilVector _comms_partial_send; 
+  //  StencilVector _comms_partial_recv;
  //
  StencilVector _comm_buf_size;
  StencilVector _permute_type;
@@ -205,16 +211,16 @@ public:
  struct Packet {
    void * send_buf;
    void * recv_buf;
-#ifndef ACCELERATOR_AWARE_MPI
-    void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
-    void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
-#endif
+    void * compressed_send_buf;
+    void * compressed_recv_buf;
    Integer to_rank;
    Integer from_rank;
    Integer do_send;
    Integer do_recv;
    Integer xbytes;
    Integer rbytes;
+    Integer xbytes_compressed;
+    Integer rbytes_compressed;
  };
  struct Merge {
    static constexpr int Nsimd = vobj::Nsimd();
@@ -223,7 +229,7 @@ public:
    std::vector<cobj *> vpointers;
    Integer buffer_size;
    Integer type;
-    Integer partial; // partial dirichlet BCs
+    //    Integer partial; // partial dirichlet BCs
    Coordinate dims;
  };
  struct Decompress {
@@ -231,7 +237,7 @@ public:
    cobj * kernel_p;
    cobj * mpi_p;
    Integer buffer_size;
-    Integer partial; // partial dirichlet BCs
+    //    Integer partial; // partial dirichlet BCs
    Coordinate dims;
  };
  struct CopyReceiveBuffer {
@@ -252,9 +258,45 @@ public:

 protected:
  GridBase *                        _grid;
+
+  ///////////////////////////////////////////////////
+  // Sloppy comms will make a second buffer upon comms
+  ///////////////////////////////////////////////////
+  size_t device_heap_top;  //
+  size_t device_heap_bytes;//
+  size_t device_heap_size; //
+  // Bump allocator: returns the current heap top and advances it by `bytes`.
+  // The running total device_heap_bytes is checked against device_heap_size
+  // after the bump; on overflow a diagnostic is printed and an assert fires.
+  // There is no per-allocation free; DeviceBufferFreeAll() resets the heap.
+  void *DeviceBufferMalloc(size_t bytes)
+  {
+    const size_t base = device_heap_top;
+    device_heap_top   = base + bytes;
+    device_heap_bytes = device_heap_bytes + bytes;
+
+    const bool fits = ( device_heap_bytes <= device_heap_size );
+    if ( !fits ) {
+      std::cout << "DeviceBufferMalloc overflow bytes "<<bytes<<" heap bytes "<<device_heap_bytes<<" heap size "<<device_heap_size<<std::endl;
+      assert (device_heap_bytes <= device_heap_size);
+    }
+    return (void *) base;
+  }
+  // Reset the bump allocator.  The shared StencilBuffer::DeviceCommBuf is grown
+  // (never shrunk) to at least _unified_buffer_size*sizeof(cobj) bytes, then the
+  // heap top is pointed at its start, the heap size is set to the buffer's
+  // actual size, and the allocated-byte count is zeroed.
+  void  DeviceBufferFreeAll(void)
+  {
+    auto &pool = StencilBuffer::DeviceCommBuf;
+    const size_t wanted = _unified_buffer_size*sizeof(cobj);
+    if ( pool.size() < wanted ) {
+      pool.resize(wanted);
+    }
+    device_heap_top   = (size_t) &pool[0];
+    device_heap_size  = pool.size();
+    device_heap_bytes = 0;
+  }
+
 public:
  GridBase *Grid(void) const { return _grid; }

+  /////////////////////////////////////////////////////////
+  // Control reduced precision comms
+  /////////////////////////////////////////////////////////
+  // Non-zero enables the reduced-precision comms path tested by
+  // CompressPacket/DecompressPacket.  Default-initialised to 0 (off) so the
+  // flag has a defined value even if no constructor assigns it.
+  int SloppyComms = 0;
+  // Set the flag: non-zero switches sloppy comms on, zero switches it off.
+  void SetSloppyComms(int sloppy) { SloppyComms = sloppy; }
+
  ////////////////////////////////////////////////////////////////////////
  // Needed to conveniently communicate gparity parameters into GPU memory
  // without adding parameters. Perhaps a template parameter to StenciView is
@@ -268,7 +310,7 @@ public:
  }

  int face_table_computed;
-  int partialDirichlet;
+  //  int partialDirichlet;
  int fullDirichlet;
  std::vector<deviceVector<std::pair<int,int> > > face_table ;
  deviceVector<int> surface_list;
@@ -361,24 +403,145 @@ public:
  ////////////////////////////////////////////////////////////////////////
  // Non blocking send and receive. Necessarily parallel.
  ////////////////////////////////////////////////////////////////////////
+  // Expand a received packet from compressed_recv_buf back into recv_buf.
+  // Does nothing unless SloppyComms is set, the packet has do_recv set, and
+  // _grid->IsOffNode(from_rank) is true.
+  //   8-byte words: each float in the compressed buffer is converted to double.
+  //   4-byte words: each stored 16-bit value becomes the upper 16 bits of a
+  //                 32-bit word, with the lower 16 bits zero.
+  // Kernels are launched with accelerator_forNB; presumably the caller must
+  // synchronise before reading recv_buf -- TODO confirm.
+  void DecompressPacket(Packet &packet)
+  {
+    if ( !SloppyComms ) return;
+    if ( !packet.do_recv ) return;
+    if ( !_grid->IsOffNode(packet.from_rank) ) return;
+
+    typedef typename getPrecision<cobj>::real_scalar_type word;
+    const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
+    const uint64_t nwords = packet.rbytes/sizeof(word);
+    const uint64_t outer  = nwords/nsimd;
+
+    switch ( sizeof(word) ) {
+    case 8: {
+      // fp32 on the wire, fp64 in the receive buffer
+      double *wide   =(double *) packet.recv_buf;
+      float  *narrow =(float  *) packet.compressed_recv_buf;
+
+      accelerator_forNB(ss,outer,nsimd,{
+        int lane = acceleratorSIMTlane(nsimd);
+        wide[ss*nsimd+lane] = narrow[ss*nsimd+lane]; //conversion
+      });
+      break;
+    }
+    case 4: {
+      // 16 bits on the wire, 32-bit words in the receive buffer
+      uint32_t *wide   =(uint32_t *) packet.recv_buf;
+      uint16_t *narrow =(uint16_t *) packet.compressed_recv_buf;
+
+      accelerator_forNB(ss,outer,nsimd,{
+        int lane = acceleratorSIMTlane(nsimd);
+        wide[ss*nsimd+lane] = ((uint32_t)narrow[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
+      });
+      break;
+    }
+    default:
+      assert(0 && "unknown floating point precision");
+    }
+  }
+  // Fill in the compressed_* fields of a packet.
+  // By default they alias the uncompressed buffers and byte counts.  When
+  // SloppyComms is set:
+  //   - a receive for which _grid->IsOffNode(from_rank) is true gets a
+  //     half-size buffer from DeviceBufferMalloc;
+  //   - a send for which _grid->IsOffNode(to_rank) is true gets a half-size
+  //     buffer and is packed into it: 8-byte words are converted double->float,
+  //     4-byte words keep only their upper 16 bits.
+  // The receive buffer is allocated before the send buffer.  Packing kernels
+  // are launched with accelerator_forNB and are not waited on here.
+  void CompressPacket(Packet &packet)
+  {
+    packet.compressed_send_buf = packet.send_buf;
+    packet.xbytes_compressed   = packet.xbytes;
+
+    packet.compressed_recv_buf = packet.recv_buf;
+    packet.rbytes_compressed   = packet.rbytes;
+
+    if ( !SloppyComms ) return;
+
+    typedef typename getPrecision<cobj>::real_scalar_type word;
+    const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
+    const uint64_t nwords = packet.xbytes/sizeof(word);
+    const uint64_t outer  = nwords/nsimd;
+
+    const bool shrink_recv = packet.do_recv && _grid->IsOffNode(packet.from_rank);
+    if ( shrink_recv ) {
+      packet.rbytes_compressed   = packet.rbytes/2;
+      packet.compressed_recv_buf = DeviceBufferMalloc(packet.rbytes_compressed);
+    }
+
+    const bool shrink_send = packet.do_send && _grid->IsOffNode(packet.to_rank);
+    if ( !shrink_send ) return;
+
+    packet.xbytes_compressed   = packet.xbytes/2;
+    packet.compressed_send_buf = DeviceBufferMalloc(packet.xbytes_compressed);
+
+    switch ( sizeof(word) ) {
+    case 8: {
+      double *wide   =(double *) packet.send_buf;
+      float  *narrow =(float  *) packet.compressed_send_buf;
+
+      accelerator_forNB(ss,outer,nsimd,{
+        int lane = acceleratorSIMTlane(nsimd);
+        narrow[ss*nsimd+lane] = wide[ss*nsimd+lane]; // convert fp64 to fp32
+      });
+      break;
+    }
+    case 4: {
+      uint32_t *wide   =(uint32_t *) packet.send_buf;
+      uint16_t *narrow =(uint16_t *) packet.compressed_send_buf;
+
+      accelerator_forNB(ss,outer,nsimd,{
+        int lane = acceleratorSIMTlane(nsimd);
+        narrow[ss*nsimd+lane] = wide[ss*nsimd+lane]>>16; // convert as in Bagel/BFM ; bfloat16 ; s7e8 Intel patent
+      });
+      break;
+    }
+    default:
+      assert(0 && "unknown floating point precision");
+    }
+
+    return;
+  }
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
-    //    std::cout << "Communicate Begin "<<std::endl;
-    //    _grid->Barrier();
    FlightRecorder::StepLog("Communicate begin");
+    ///////////////////////////////////////////////
    // All GPU kernel tasks must complete
-    //    accelerator_barrier();     // All kernels should ALREADY be complete
-    //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer
-                               // But the HaloGather had a barrier too.
+    //    accelerator_barrier();      All kernels should ALREADY be complete
+    // Everyone is here, so no one is running slow and still using the receive buffer
+    _grid->StencilBarrier();
+    // But the HaloGather had a barrier too.
+    ///////////////////////////////////////////////
+    if (SloppyComms) {
+      DeviceBufferFreeAll();
+    }
+    for(int i=0;i<Packets.size();i++){
+      this->CompressPacket(Packets[i]);
+    }
+    if (SloppyComms) { 
+      accelerator_barrier();
+#ifdef NVLINK_GET
+      _grid->StencilBarrier(); 
+#endif
+    }
+    
    for(int i=0;i<Packets.size();i++){
      //      std::cout << "Communicate prepare "<<i<<std::endl;
      //      _grid->Barrier();
      _grid->StencilSendToRecvFromPrepare(MpiReqs,
-					  Packets[i].send_buf,
+					  Packets[i].compressed_send_buf,
 					  Packets[i].to_rank,Packets[i].do_send,
-					  Packets[i].recv_buf,
+					  Packets[i].compressed_recv_buf,
 					  Packets[i].from_rank,Packets[i].do_recv,
-					  Packets[i].xbytes,Packets[i].rbytes,i);
+					  Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
    }
    //    std::cout << "Communicate PollDtoH "<<std::endl;
    //    _grid->Barrier();
@@ -389,18 +552,22 @@ public:
    // Starts intranode
    for(int i=0;i<Packets.size();i++){
      //      std::cout << "Communicate Begin "<<i<<std::endl;
+      //      _grid->Barrier();
      _grid->StencilSendToRecvFromBegin(MpiReqs,
-					Packets[i].send_buf,
+					Packets[i].send_buf,Packets[i].compressed_send_buf,
 					Packets[i].to_rank,Packets[i].do_send,
-					Packets[i].recv_buf,
+					Packets[i].recv_buf,Packets[i].compressed_recv_buf,
 					Packets[i].from_rank,Packets[i].do_recv,
-					Packets[i].xbytes,Packets[i].rbytes,i);
+					Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
+      //      std::cout << "Communicate Begin started "<<i<<std::endl;
+      //      _grid->Barrier();
    }
+    FlightRecorder::StepLog("Communicate begin has finished");
    // Get comms started then run checksums
    // Having this PRIOR to the dslash seems to make Sunspot work... (!)
    for(int i=0;i<Packets.size();i++){
      if ( Packets[i].do_send )
-	FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
+	FlightRecorder::xmitLog(Packets[i].compressed_send_buf,Packets[i].xbytes_compressed);
    }
  }

@@ -415,14 +582,15 @@ public:
    //    std::cout << "Communicate Complete Complete "<<std::endl;
    //    _grid->Barrier();
    _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
-    if   ( this->partialDirichlet ) DslashLogPartial();
-    else if ( this->fullDirichlet ) DslashLogDirichlet();
+    //    if   ( this->partialDirichlet ) DslashLogPartial();
+    if ( this->fullDirichlet ) DslashLogDirichlet();
    else DslashLogFull();
    //    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
    //    accelerator_barrier(); 
    for(int i=0;i<Packets.size();i++){
+      this->DecompressPacket(Packets[i]);
      if ( Packets[i].do_recv )
-	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
+	FlightRecorder::recvLog(Packets[i].compressed_recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank);
    }
    FlightRecorder::StepLog("Finish communicate complete");
  }
@@ -617,7 +785,7 @@ public:
  }
  void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
    Decompress d;
-    d.partial  = this->partialDirichlet;
+    //    d.partial  = this->partialDirichlet;
    d.dims     = _grid->_fdimensions;
    d.kernel_p = k_p;
    d.mpi_p    = m_p;
@@ -626,7 +794,7 @@ public:
  }
  void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
    Merge m;
-    m.partial  = this->partialDirichlet;
+    //    m.partial  = this->partialDirichlet;
    m.dims     = _grid->_fdimensions;
    m.type     = type;
    m.mpointer = merge_p;
@@ -731,8 +899,8 @@ public:
      int block = dirichlet_block[dimension];
      this->_comms_send[ii] = comm_dim;
      this->_comms_recv[ii] = comm_dim;
-      this->_comms_partial_send[ii] = 0;
-      this->_comms_partial_recv[ii] = 0;
+      //      this->_comms_partial_send[ii] = 0;
+      //      this->_comms_partial_recv[ii] = 0;
      if ( block && comm_dim ) {
 	assert(abs(displacement) < ld );
 	// Quiesce communication across block boundaries
@@ -753,10 +921,10 @@ public:
 	  if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
 	  if ( ( (ld*pc     ) % block ) == 0 ) this->_comms_recv[ii] = 0;
 	}
-	if ( partialDirichlet ) {
-	  this->_comms_partial_send[ii] = !this->_comms_send[ii];
-	  this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
-	}
+	//	if ( partialDirichlet ) {
+	//	  this->_comms_partial_send[ii] = !this->_comms_send[ii];
+	//	  this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
+	//	}
      }
    }
  }
@@ -768,6 +936,7 @@ public:
 		   Parameters p=Parameters(),
 		   bool preserve_shm=false)
  {
+    SloppyComms = 0;
    face_table_computed=0;
    _grid    = grid;
    this->parameters=p;
@@ -785,7 +954,7 @@ public:
    this->same_node.resize(npoints);

    if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
-    partialDirichlet = p.partialDirichlet;
+    //    partialDirichlet = p.partialDirichlet;
    DirichletBlock(p.dirichlet); // comms send/recv set up
    fullDirichlet=0;
    for(int d=0;d<p.dirichlet.size();d++){
@@ -866,7 +1035,7 @@ public:
    /////////////////////////////////////////////////////////////////////////////////
    const int Nsimd = grid->Nsimd();

-    // Allow for multiple stencils to exist simultaneously
+    // Allow for multiple stencils to be communicated simultaneously
    if (!preserve_shm)
      _grid->ShmBufferFreeAll();

@@ -934,7 +1103,8 @@ public:
    GridBase *grid=_grid;
    const int Nsimd = grid->Nsimd();

-    int comms_recv      = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
+    //    int comms_recv      = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
+    int comms_recv      = this->_comms_recv[point];
    int fd              = _grid->_fdimensions[dimension];
    int ld              = _grid->_ldimensions[dimension];
    int rd              = _grid->_rdimensions[dimension];
@@ -1123,8 +1293,8 @@ public:

    int comms_send   = this->_comms_send[point];
    int comms_recv   = this->_comms_recv[point];
-    int comms_partial_send   = this->_comms_partial_send[point] ;
-    int comms_partial_recv   = this->_comms_partial_recv[point] ;
+    //    int comms_partial_send   = this->_comms_partial_send[point] ;
+    //    int comms_partial_recv   = this->_comms_partial_recv[point] ;
    
    assert(rhs.Grid()==_grid);
    //	  conformable(_grid,rhs.Grid());
@@ -1159,11 +1329,11 @@ public:
 	int rbytes;

 	if ( comms_send ) xbytes = bytes; // Full send
-	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	//	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
 	else xbytes = 0; // full dirichlet

 	if ( comms_recv ) rbytes = bytes;
-	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	//	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
 	else rbytes = 0;
 	
 	int so  = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
@@ -1190,7 +1360,8 @@ public:
 	}


-	if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
+	//	if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
+	if ( compress.DecompressionStep()&&comms_recv) {
 	  recv_buf=u_simd_recv_buf[0];
 	} else {
 	  recv_buf=this->u_recv_buf_p;
@@ -1224,7 +1395,8 @@ public:
 #endif

 	//	std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
-	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
+	//	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
+	compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);

        int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
 	if ( !duplicate ) { // Force comms for now
@@ -1233,8 +1405,8 @@ public:
 	  // Build a list of things to do after we synchronise GPUs
 	  // Start comms now???
 	  ///////////////////////////////////////////////////////////
-	  int do_send = (comms_send|comms_partial_send) && (!shm_send );
-	  int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
+	  int do_send = (comms_send) && (!shm_send );
+	  int do_recv = (comms_send) && (!shm_recv );
 	  AddPacket((void *)&send_buf[comm_off],
 		    (void *)&recv_buf[comm_off],
 		    xmit_to_rank, do_send,
@@ -1242,7 +1414,7 @@ public:
 		    xbytes,rbytes);
 	}

-	if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
+	if ( (compress.DecompressionStep() && comms_recv) ) {
 	  AddDecompress(&this->u_recv_buf_p[comm_off],
 			&recv_buf[comm_off],
 			words,Decompressions);
@@ -1264,8 +1436,8 @@ public:

    int comms_send   = this->_comms_send[point];
    int comms_recv   = this->_comms_recv[point];
-    int comms_partial_send   = this->_comms_partial_send[point] ;
-    int comms_partial_recv   = this->_comms_partial_recv[point] ;
+    //    int comms_partial_send   = this->_comms_partial_send[point] ;
+    //    int comms_partial_recv   = this->_comms_partial_recv[point] ;

    int fd = _grid->_fdimensions[dimension];
    int rd = _grid->_rdimensions[dimension];
@@ -1340,18 +1512,20 @@ public:

 	
 	if ( comms_send ) xbytes = bytes;
-	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	//	else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
 	else xbytes = 0;

 	if ( comms_recv ) rbytes = bytes;
-	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+	//	else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
 	else rbytes = 0;

 	// Gathers SIMD lanes for send and merge
 	// Different faces can be full comms or partial comms with  multiple ranks per node
-	if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
+	//	if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
+	if ( comms_send || comms_recv ) {

-	  int partial = partialDirichlet;
+	  //	  int partial = partialDirichlet;
+	  int partial = 0;
 	  compressor::Gather_plane_exchange(face_table[face_idx],rhs,
 					    spointers,dimension,sx,cbmask,
 					    compress,permute_type,partial );
@@ -1417,7 +1591,8 @@ public:
 	      if ( (bytes != rbytes) && (rbytes!=0) ){
 		acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
 	      }
-	      int do_send = (comms_send|comms_partial_send) && (!shm_send );
+	      //	      int do_send = (comms_send|comms_partial_send) && (!shm_send );
+	      int do_send = (comms_send) && (!shm_send );
 	      AddPacket((void *)sp,(void *)rp,
 			xmit_to_rank,do_send,
 			recv_from_rank,do_send,
@@ -1431,7 +1606,8 @@ public:
 	  }
 	}
 	// rpointer may be doing a remote read in the gather over SHM
-	if ( comms_recv|comms_partial_recv ) {
+	//	if ( comms_recv|comms_partial_recv ) {
+	if ( comms_recv ) {
 	  AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
 	}

--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -67,7 +67,7 @@ void acceleratorInit(void)
 	printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);


-	GPU_PROP_FMT(totalGlobalMem,"%lld");
+	GPU_PROP_FMT(totalGlobalMem,"%zu");
 	GPU_PROP(managedMemory);
 	GPU_PROP(isMultiGpuBoard);
 	GPU_PROP(warpSize);
@@ -240,7 +240,7 @@ void acceleratorInit(void)

  char hostname[HOST_NAME_MAX+1];
  gethostname(hostname, HOST_NAME_MAX+1);
-  if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
+  if ( rank==0 ) printf("AcceleratorSyclInit world_rank %d is host %s \n",world_rank,hostname);

  auto devices = sycl::device::get_devices();
  for(int d = 0;d<devices.size();d++){
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -215,7 +215,7 @@ inline void *acceleratorAllocHost(size_t bytes)
  auto err = cudaMallocHost((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
-    printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err));
    assert(0);
  }
  return ptr;
@@ -226,7 +226,7 @@ inline void *acceleratorAllocShared(size_t bytes)
  auto err = cudaMallocManaged((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
-    printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err));
    assert(0);
  }
  return ptr;
@@ -237,7 +237,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
  auto err = cudaMalloc((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
-    printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err));
  }
  return ptr;
 };
@@ -251,7 +251,7 @@ inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { c
 inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
 inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
-  acceleratorCopyToDevice(to,from,bytes, cudaMemcpyHostToDevice);
+  acceleratorCopyToDevice(from,to,bytes);
  return 0;
 }
 inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
@@ -337,7 +337,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
    cgh.parallel_for(							\
 		     sycl::nd_range<3>(global,local),			\
 		     [=] (sycl::nd_item<3> item) /*mutable*/		\
-		     [[intel::reqd_sub_group_size(16)]]			\
+		     [[sycl::reqd_sub_group_size(16)]]			\
 		     {							\
 		       auto iter1    = item.get_global_id(0);		\
 		       auto iter2    = item.get_global_id(1);		\
--- a/Grid/threads/ThreadReduction.h
+++ b/Grid/threads/ThreadReduction.h
@@ -28,11 +28,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 #pragma once 

-#ifndef MIN
-#define MIN(x,y) ((x)>(y)?(y):(x))
-#endif
-
-
 // Introduce a class to gain deterministic bit reproducible reduction.
 // make static; perhaps just a namespace is required.
 NAMESPACE_BEGIN(Grid);
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -46,10 +46,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <cstdlib>
 #include <memory>

+
 #include <Grid/Grid.h>

 #include <Grid/util/CompilerCompatible.h>

+#ifdef HAVE_UNWIND
+#include <libunwind.h>
+#endif

 #include <fenv.h>
 #ifdef __APPLE__
@@ -295,6 +299,20 @@ void GridBanner(void)
    std::cout << std::setprecision(9);
 }

+//Some file local variables
+static int fileno_stdout;
+static int fileno_stderr;
+static int signal_delay;
+class dlRegion { // One executable mapping of this process, as read from a line of /proc/self/maps
+public:
+  uint64_t start; // first address of the mapping
+  uint64_t end; // end address from the maps line; lookups treat it as exclusive
+  uint64_t size; // end - start
+  uint64_t offset; // third hex field of the maps line (presumably the file offset of the mapping — confirm against proc(5))
+  std::string name; // path field of the maps line
+};
+std::vector<dlRegion> dlMap;
+
 void Grid_init(int *argc,char ***argv)
 {

@@ -347,6 +365,19 @@ void Grid_init(int *argc,char ***argv)
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
+  // Sleep n-seconds at end of handler
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--signal-delay") ){
+    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--signal-delay");
+    GridCmdOptionInt(arg,signal_delay);
+  }
+  // periodic wakeup with stack trace printed
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-heartbeat") ){
+    Grid_debug_heartbeat();
+  }
+  // periodic wakeup with empty handler (interrupts some system calls)
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--heartbeat") ){
+    Grid_heartbeat();
+  }

 #if defined(A64FX)
  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
@@ -396,15 +427,25 @@ void Grid_init(int *argc,char ***argv)
    fp=freopen(ename.str().c_str(),"w",stderr);
    assert(fp!=(FILE *)NULL);
  }
+  fileno_stdout = fileno(stdout);
+  fileno_stderr = fileno(stderr) ;
+    
  ////////////////////////////////////////////////////
  // OK to use GridLogMessage etc from here on
  ////////////////////////////////////////////////////
  std::cout << GridLogMessage << "================================================ "<<std::endl;
  std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
  std::cout << GridLogMessage << "================================================ "<<std::endl;
-
-  gethostname(hostname, HOST_NAME_MAX+1);
-  std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
+  {
+    gethostname(hostname, HOST_NAME_MAX+1);
+    time_t mytime;
+    struct tm *info;
+    char buffer[80];
+    time(&mytime);
+    info = localtime(&mytime);
+    strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", info);
+    std::cout << GridLogMessage << "This rank is running on host "<< hostname<<" at local time "<<buffer<<std::endl;
+  }

  /////////////////////////////////////////////////////////
  // Reporting
@@ -421,6 +462,47 @@ void Grid_init(int *argc,char ***argv)
    MemoryProfiler::stats = &dbgMemStats;
  }

+  /////////////////////////////////////////////////////////
+  // LD.so space
+  /////////////////////////////////////////////////////////
+#ifndef __APPLE__
+  {
+    // Record every executable (r-xp) mapping in /proc/self/maps so signal handlers can attribute an IP to a library
+    FILE *f = fopen("/proc/self/maps", "r");
+    if (f) {
+      char line[256];
+      while (fgets(line, sizeof(line), f)) {
+	if (strstr(line, "r-xp")) {
+	  dlRegion region;
+	  uint32_t major, minor, inode;
+	  uint64_t start,end,offset;
+	  char path[PATH_MAX] = ""; // mappings without a path field keep an empty name instead of uninitialised bytes
+	  int nconv = sscanf(line,"%lx-%lx r-xp %lx %x:%x %u %s", // %u: inode is unsigned
+		 &start,&end,&offset,&major,&minor,&inode,path);
+	  if ( nconv < 6 ) continue; // address range / offset not parsed: skip rather than store garbage
+	  region.start=start;
+	  region.end  =end;
+	  region.offset=offset;
+	  region.name = std::string(path);
+	  region.size = region.end-region.start;
+	  dlMap.push_back(region);
+	  //	  std::cout << GridLogMessage<< line;
+	}
+      }
+      fclose(f);
+    }
+    if( GridCmdOptionExists(*argv,*argv+*argc,"--dylib-map") ){
+      std::cout << GridLogMessage << "================================================ "<<std::endl;
+      std::cout << GridLogMessage<< " Dynamic library map: " <<std::endl; 
+      std::cout << GridLogMessage << "================================================ "<<std::endl;
+      for(size_t r=0;r<dlMap.size();r++){
+	const auto &region = dlMap[r]; // reference: no per-entry std::string copy
+	std::cout << GridLogMessage<<" "<<region.name<<" "<<std::hex<<region.start<<"-"<<region.end<<" sz "<<region.size<<std::dec<<std::endl;
+      }
+      std::cout << GridLogMessage << "================================================ "<<std::endl;
+    }
+  }
+#endif
  ////////////////////////////////////
  // Logging
  ////////////////////////////////////
@@ -453,14 +535,19 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<"  --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
    std::cout<<GridLogMessage<<"  --device-mem M  : Size of device software cache for lattice fields (MB) "<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
-    std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
+    std::cout<<GridLogMessage<<"Verbose:"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"  --log list      : comma separated list from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
-    std::cout<<GridLogMessage<<"  --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
-    std::cout<<GridLogMessage<<"  --debug-signals : catch sigsegv and print a blame report"<<std::endl;
-    std::cout<<GridLogMessage<<"  --debug-stdout  : print stdout from EVERY node"<<std::endl;
-    std::cout<<GridLogMessage<<"  --debug-mem     : print Grid allocator activity"<<std::endl;
    std::cout<<GridLogMessage<<"  --notimestamp   : suppress millisecond resolution stamps"<<std::endl;
+    std::cout<<GridLogMessage<<"  --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
+    std::cout<<GridLogMessage<<"Debug:"<<std::endl;
+    std::cout<<GridLogMessage<<"  --dylib-map     : print dynamic library map, useful for interpreting signal backtraces "<<std::endl;
+    std::cout<<GridLogMessage<<"  --heartbeat     : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl;
+    std::cout<<GridLogMessage<<"  --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl;
+    std::cout<<GridLogMessage<<"  --debug-stdout  : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl;
+    std::cout<<GridLogMessage<<"  --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl;
+    std::cout<<GridLogMessage<<"  --debug-heartbeat : periodically report backtrace "<<std::endl;
+    std::cout<<GridLogMessage<<"  --debug-mem     : print Grid allocator activity"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"Performance:"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
@@ -555,17 +642,56 @@ void GridLogLayout() {
 }

 void * Grid_backtrace_buffer[_NBACKTRACE];
+#define SIGLOG(A) ::write(fileno_stderr,A,strlen(A));

-void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
+void sig_print_dig(uint32_t dig) // Write one hex digit (0-15) to stderr via SIGLOG; any other value prints nothing
 {
-  fprintf(stderr,"Signal handler on host %s\n",hostname);
-  fprintf(stderr,"FlightRecorder step %d stage %s \n",
-	  FlightRecorder::StepLoggingCounter,
-	  FlightRecorder::StepName);
-  fprintf(stderr,"Caught signal %d\n",si->si_signo);
-  fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr);
-  fprintf(stderr,"         code %d\n",si->si_code);
-  // x86 64bit
+  static const char * const digits[] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f" };
+  if ( dig < 16 ){ // dig is unsigned, so only the upper bound can fail
+    SIGLOG(digits[dig]);
+  }
+}
+void sig_print_uint(uint32_t A) // Write A in decimal to stderr using only sig_print_dig/SIGLOG (no stdio)
+{
+  int dig; // current decimal digit, set by DIGIT
+  int nz=0; // becomes non-zero once a non-zero digit has been seen, so leading zeros are suppressed
+#define DIGIT(DIV) dig = (A/DIV)%10 ; if(dig|nz) sig_print_dig(dig); nz = nz|dig;
+  DIGIT(1000000000); // highest decimal place a 32-bit value can reach (2^32 is about 4 billion)
+  DIGIT(100000000);
+  DIGIT(10000000);
+  DIGIT(1000000);
+  DIGIT(100000);
+  DIGIT(10000);
+  DIGIT(1000);
+  DIGIT(100);
+  DIGIT(10);
+  DIGIT(1);
+  if (nz==0) SIGLOG("0"); // every digit was zero, so nothing has been printed yet
+}
+void sig_print_hex(uint64_t A) // Write A as "0x" followed by hex digits, without leading zeros, using only SIGLOG
+{
+  int nz=0; // becomes non-zero once a non-zero nibble has been seen
+  int dig; // current nibble, set by NIBBLE
+#define NIBBLE(A) dig = A ; if(dig|nz) sig_print_dig(dig); nz = nz|dig;  
+  SIGLOG("0x");
+  NIBBLE((A>>(15*4))&0xF); // most significant nibble first
+  NIBBLE((A>>(14*4))&0xF);
+  NIBBLE((A>>(13*4))&0xF);
+  NIBBLE((A>>(12*4))&0xF);
+  NIBBLE((A>>(11*4))&0xF);
+  NIBBLE((A>>(10*4))&0xF);
+  NIBBLE((A>>(9*4))&0xF);
+  NIBBLE((A>>(8*4))&0xF);
+  NIBBLE((A>>(7*4))&0xF);
+  NIBBLE((A>>(6*4))&0xF);
+  NIBBLE((A>>(5*4))&0xF);
+  NIBBLE((A>>(4*4))&0xF);
+  NIBBLE((A>>(3*4))&0xF);
+  NIBBLE((A>>(2*4))&0xF);
+  NIBBLE((A>>4)&0xF);
+  sig_print_dig(A&0xF); // lowest nibble is printed unconditionally, so zero appears as "0x0"
+}
+/*
 #ifdef __linux__
 #ifdef __x86_64__
  ucontext_t * uc= (ucontext_t *)ptr;
@@ -573,81 +699,158 @@ void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
  fprintf(stderr,"  instruction %llx\n",(unsigned long long)sc->rip);
 #endif
 #endif
-  fflush(stderr);
-  BACKTRACEFP(stderr);
-  fprintf(stderr,"Called backtrace\n");
-  fflush(stdout);
-  fflush(stderr);
+*/
+void Grid_generic_handler(int sig,siginfo_t *si,void * ptr) // Shared report: host/pid, FlightRecorder step, siginfo fields, then a backtrace, all written to stderr
+{
+  SIGLOG("Signal handler on host "); // SIGLOG is a raw ::write to fileno_stderr, so no stdio buffering is involved
+  SIGLOG(hostname);
+  SIGLOG(" process id ");
+  sig_print_uint((uint32_t)getpid());
+  SIGLOG("\n");
+  SIGLOG("FlightRecorder step ");
+  sig_print_uint(FlightRecorder::StepLoggingCounter);
+  SIGLOG(" stage ");
+  SIGLOG(FlightRecorder::StepName);
+  SIGLOG("\n");
+  SIGLOG("Caught signal ");
+  sig_print_uint(si->si_signo);
+  SIGLOG("\n");
+  SIGLOG("  mem address ");
+  sig_print_hex((uint64_t)si->si_addr);
+  SIGLOG("\n");
+  SIGLOG("  code ");
+  sig_print_uint(si->si_code); // NOTE(review): sig_print_uint takes uint32_t, so a negative si_code prints as a large unsigned value
+  SIGLOG("\n");
+
+  ucontext_t *uc= (ucontext_t *)ptr; // third SA_SIGINFO argument; only consumed by the libunwind branch below
+  
+  SIGLOG("Backtrace:\n");
+#ifdef HAVE_UNWIND
+  // Debug cross check on offsets
+  //  int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
+  //  backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
+  unw_cursor_t cursor;
+  unw_word_t ip, off; // NOTE(review): off is never used
+  if (!unw_init_local(&cursor, uc) ) { // a zero return is treated as success; on failure no frames are printed
+
+    SIGLOG("   frame     IP       function\n");
+    int level = 0;
+    int ret = 0;
+    while(1) {
+      char name[128]; // NOTE(review): unused
+      if (level >= _NBACKTRACE) return; // Grid_backtrace_buffer has _NBACKTRACE entries
+	
+      unw_get_reg(&cursor, UNW_REG_IP, &ip);
+
+      sig_print_uint(level); SIGLOG(" ");
+      sig_print_hex(ip);     SIGLOG(" ");
+      for(int r=0;r<dlMap.size();r++){ // find the recorded executable mapping that contains ip
+	if((ip>=dlMap[r].start) &&(ip<dlMap[r].end)){
+	  SIGLOG(dlMap[r].name.c_str());
+	  SIGLOG("+");
+	  sig_print_hex((ip-dlMap[r].start)); // offset from the start of this mapping; NOTE(review): dlMap[r].offset is not added — confirm this is the offset symbol tools expect
+	  break;
+	}
+      }
+      SIGLOG("\n");
+      Grid_backtrace_buffer[level]=(void *)ip;
+      level++;
+      ret = unw_step(&cursor);
+      if (ret <= 0) { // presumably: no further frame, or an unwind error — see libunwind unw_step
+	return;
+      }
+    }
+  }
+#else
+  // Fallback without libunwind: known not to be async-signal-safe
+  int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
+  backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
+#endif
+}
+
+void Grid_heartbeat_signal_handler(int sig,siginfo_t *si,void * ptr) // Report via Grid_generic_handler, then return; no signal_delay pause
+{
+  Grid_generic_handler(sig,si,ptr);
+  SIGLOG("\n"); // blank line separating successive reports
+}
+void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr) // Non-terminating handler: report, optionally pause signal_delay seconds, return
+{
+  Grid_generic_handler(sig,si,ptr);
+  if (signal_delay > 0) { // a negative delay would otherwise wrap to a huge unsigned pause
+    SIGLOG("Adding extra signal delay ");
+    sig_print_uint(signal_delay);
+    SIGLOG(" s\n");
+    sleep(signal_delay); // async-signal-safe; usleep() is only specified for intervals below one second
+  }
+  SIGLOG("\n");
  return;
 }

-void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
+void Grid_fatal_signal_handler(int sig,siginfo_t *si,void * ptr)
 {
-  fprintf(stderr,"Signal handler on host %s\n",hostname);
-  fprintf(stderr,"Caught signal %d\n",si->si_signo);
-  fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr);
-  fprintf(stderr,"         code %d\n",si->si_code);
-  // Linux/Posix
-#ifdef __linux__
-  // And x86 64bit
-#ifdef __x86_64__
-  ucontext_t * uc= (ucontext_t *)ptr;
-  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
-  fprintf(stderr,"  instruction %llx\n",(unsigned long long)sc->rip);
-#define REG(A)  fprintf(stderr,"  %s %lx\n",#A,sc-> A);
-  REG(rdi);
-  REG(rsi);
-  REG(rbp);
-  REG(rbx);
-  REG(rdx);
-  REG(rax);
-  REG(rcx);
-  REG(rsp);
-  REG(rip);
-
-
-  REG(r8);
-  REG(r9);
-  REG(r10);
-  REG(r11);
-  REG(r12);
-  REG(r13);
-  REG(r14);
-  REG(r15);
-#endif
-#endif
-  fflush(stderr);
-  BACKTRACEFP(stderr);
-  fprintf(stderr,"Called backtrace\n");
-  fflush(stdout);
-  fflush(stderr);
+  Grid_generic_handler(sig,si,ptr);
+  SIGLOG("\n");
  exit(0);
  return;
 };
+void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr) // Deliberately does nothing; installed for SIGALRM by Grid_heartbeat
+{
+  //  SIGLOG("heartbeat signal handled\n");
+  return;
+}
+void Grid_debug_heartbeat(void) // Install a SIGALRM handler that prints a report and arm a repeating 10s real-time timer
+{
+  struct sigaction sa_ping;

+  sigemptyset (&sa_ping.sa_mask);
+  sa_ping.sa_sigaction= Grid_usr_signal_handler; // NOTE(review): this handler also applies signal_delay on every tick; Grid_heartbeat_signal_handler (no delay) is defined but unused — confirm which is intended
+  sa_ping.sa_flags    = SA_SIGINFO;
+  sigaction(SIGALRM,&sa_ping,NULL);
+
+  // repeating 10s heartbeat: first expiry and reload interval are both 10s
+  struct itimerval it_val;
+  it_val.it_value.tv_sec = 10;
+  it_val.it_value.tv_usec = 0;
+  it_val.it_interval = it_val.it_value;
+  setitimer(ITIMER_REAL, &it_val, NULL);
+}
+void Grid_heartbeat(void) // Install the empty SIGALRM handler and arm a repeating real-time timer (periodic wakeup only, no output)
+{
+  struct sigaction sa_ping;
+
+  sigemptyset (&sa_ping.sa_mask);
+  sa_ping.sa_sigaction= Grid_empty_signal_handler;
+  sa_ping.sa_flags    = SA_SIGINFO;
+  sigaction(SIGALRM,&sa_ping,NULL); // replaces any SIGALRM handler installed earlier
+
+  // repeating heartbeat: 10s + 1000us for both first expiry and reload interval
+  struct itimerval it_val;
+  it_val.it_value.tv_sec = 10;
+  it_val.it_value.tv_usec = 1000;
+  it_val.it_interval = it_val.it_value;
+  setitimer(ITIMER_REAL, &it_val, NULL);
+}
 void Grid_exit_handler(void)
 {
-  //  BACKTRACEFP(stdout);
-  //  fflush(stdout);
+  BACKTRACEFP(stdout);
+  fflush(stdout);
 }
 void Grid_debug_handler_init(void)
 {
  struct sigaction sa;
  sigemptyset (&sa.sa_mask);
-  sa.sa_sigaction= Grid_sa_signal_handler;
+  sa.sa_sigaction= Grid_fatal_signal_handler;
  sa.sa_flags    = SA_SIGINFO;
-  //  sigaction(SIGSEGV,&sa,NULL);
  sigaction(SIGTRAP,&sa,NULL);
-  sigaction(SIGBUS,&sa,NULL);
-  //  sigaction(SIGUSR2,&sa,NULL);
-
-  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
-
-  sigaction(SIGFPE,&sa,NULL);
-  sigaction(SIGKILL,&sa,NULL);
  sigaction(SIGILL,&sa,NULL);
+#ifndef GRID_SYCL
+  sigaction(SIGSEGV,&sa,NULL); // SYCL is using SIGSEGV
+  sigaction(SIGBUS,&sa,NULL);
+  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
+  sigaction(SIGFPE,&sa,NULL);
+#endif

-  // Non terminating SIGUSR1/2 handler
+  // Non terminating SIGHUP handler
  struct sigaction sa_ping;
  sigemptyset (&sa_ping.sa_mask);
  sa_ping.sa_sigaction= Grid_usr_signal_handler;
--- a/Grid/util/Init.h
+++ b/Grid/util/Init.h
@@ -38,7 +38,11 @@ char * GridHostname(void);

 // internal, controled with --handle
 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
+void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr);
+void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr);
 void Grid_debug_handler_init(void);
+void Grid_debug_heartbeat(void);
+void Grid_heartbeat(void);
 void Grid_quiesce_nodes(void);
 void Grid_unquiesce_nodes(void);

--- a/HMC/ComputeWilsonFlow.cc
+++ b/HMC/ComputeWilsonFlow.cc
@@ -66,6 +66,7 @@ namespace Grid{
  };
 }

+
 template <class T> void writeFile(T& in, std::string const fname){  
 #ifdef HAVE_LIME
  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
@@ -73,7 +74,7 @@ template <class T> void writeFile(T& in, std::string const fname){
  Grid::emptyUserRecord record;
  Grid::ScidacWriter WR(in.Grid()->IsBoss());
  WR.open(fname);
-  WR.writeScidacFieldRecord(in,record,0);
+  WR.writeScidacFieldRecord(in,record,0); // Lexico
  WR.close();
 #endif
  // What is the appropriate way to throw error?
@@ -107,8 +108,18 @@ int main(int argc, char **argv) {

  for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){

+#if 0    
  CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG);
+#else
+  // Don't require Grid format RNGs
+  FieldMetaData header;
+  std::string file, filesmr;
+  file    = CPar.conf_path + "/" + CPar.conf_prefix      + "." + std::to_string(conf);
+  filesmr = CPar.conf_path + "/" + CPar.conf_smr_prefix  + "." + std::to_string(conf);

+  NerscIO::readConfiguration(Umu,header,file);
+#endif
+  
  std::cout << std::setprecision(15);
  std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;
  
@@ -116,6 +127,7 @@ int main(int argc, char **argv) {
  std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);

  WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);
+  
  WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){
    
    typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
@@ -165,33 +177,48 @@ int main(int argc, char **argv) {
    //double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
    //Plq = coeff * Plq;

-    int tau = std::round(t);
-    std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
-    writeFile(R,efile);
-    std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
-    writeFile(qfield,tfile);

+    RealD WFlow_TC5Li   = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(U);
+
+    int tau = std::round(t);
+
+    std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
+    //    writeFile(R,efile);
+
+    std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
+    //    writeFile(qfield,tfile);
+
+    std::string ufile = file_pre + "U_" + std::to_string(tau) + "_" + file_post;
+    {
+      //      PeriodicGimplR::GaugeField Ucopy = U;
+      //      NerscIO::writeConfiguration(Ucopy,ufile);
+    }
+    
    RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
    RealD T = real( sum(qfield) );
    Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
    RealD E0 = real(peekSite(R,scoor));
    RealD T0 = real(peekSite(qfield,scoor));
    std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: "  << conf << " " << step << "  " << tau << "  "
-	      << "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl;
+	      << "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << " Q5Li "<< WFlow_TC5Li << std::endl;
    
  });
  
  int t=WFPar.maxTau;
  WF.smear(Uflow, Umu);
-
+  //  NerscIO::writeConfiguration(Uflow,filesmr);
+  
+  
  RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
  RealD WFlow_TC   = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
+  RealD WFlow_TC5Li   = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(Uflow);
  RealD WFlow_T0   = WF.energyDensityPlaquette(t,Uflow); // t
  RealD WFlow_EC   = WF.energyDensityCloverleaf(t,Uflow);
-  std::cout << GridLogMessage << "Plaquette          "<< conf << "   " << WFlow_plaq << std::endl;
-  std::cout << GridLogMessage << "T0                 "<< conf << "   " << WFlow_T0 << std::endl;
-  std::cout << GridLogMessage << "TC0                 "<< conf << "   " << WFlow_EC << std::endl;
-  std::cout << GridLogMessage << "TopologicalCharge  "<< conf << "   " << WFlow_TC   << std::endl;
+  std::cout << GridLogMessage << "Plaquette            "<< conf << "   " << WFlow_plaq << std::endl;
+  std::cout << GridLogMessage << "T0                   "<< conf << "   " << WFlow_T0 << std::endl;
+  std::cout << GridLogMessage << "TC0                  "<< conf << "   " << WFlow_EC << std::endl;
+  std::cout << GridLogMessage << "TopologicalCharge    "<< conf << "   " << WFlow_TC   << std::endl;
+  std::cout << GridLogMessage << "TopologicalCharge5Li "<< conf << "   " << WFlow_TC5Li<< std::endl;

  std::cout<< GridLogMessage << " Admissibility check:\n";
  const double sp_adm = 0.067;                // admissible threshold
--- a/HMC/Mobius2p1f_DD_EOFA_96I_double.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_double.cc
@@ -201,8 +201,7 @@ int main(int argc, char **argv) {

  Params.dirichlet=NonDirichlet;
  ParamsDir.dirichlet=Dirichlet;
-  ParamsDir.partialDirichlet=0;
-  std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
+  //  ParamsDir.partialDirichlet=0;

  //  double StoppingCondition = 1e-14;
  //  double MDStoppingCondition = 1e-9;
@@ -298,11 +297,11 @@ int main(int argc, char **argv) {
    if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
    else                      ParamsDen.dirichlet = NonDirichlet;

-    if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
-    else                      ParamsNum.partialDirichlet = 0;
+    //    if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
+    //    else                      ParamsNum.partialDirichlet = 0;

-    if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
-    else                      ParamsDen.partialDirichlet = 0;
+    //    if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
+    //    else                      ParamsDen.partialDirichlet = 0;
    
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
--- a/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc
+++ b/HMC/Mobius2p1f_DD_EOFA_96I_mshift.cc
@@ -333,9 +333,9 @@ int main(int argc, char **argv) {
  ParamsF.dirichlet=NonDirichlet;
  ParamsDir.dirichlet=Dirichlet;
  ParamsDirF.dirichlet=Dirichlet;
-  ParamsDir.partialDirichlet=1;
-  ParamsDirF.partialDirichlet=1;
-  std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
+  //  ParamsDir.partialDirichlet=1;
+  //  ParamsDirF.partialDirichlet=1;
+  //  std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;

  //  double StoppingCondition = 1e-14;
  //  double MDStoppingCondition = 1e-9;
@@ -481,21 +481,21 @@ int main(int argc, char **argv) {
    if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
    else                      ParamsDen.dirichlet = NonDirichlet;

-    if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
-    else                      ParamsNum.partialDirichlet = 0;
+    //    if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
+    //    else                      ParamsNum.partialDirichlet = 0;

-    if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
-    else                      ParamsDen.partialDirichlet = 0;
+    //    if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
+    //    else                      ParamsDen.partialDirichlet = 0;
    
    Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
    Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));

    ParamsDenF.dirichlet = ParamsDen.dirichlet;
-    ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
+    //    ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
    DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));

    ParamsNumF.dirichlet = ParamsNum.dirichlet;
-    ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
+    //    ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
    NumeratorsF.push_back  (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));

    LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -166,18 +166,18 @@ int main (int argc, char ** argv)
  }  


+
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  header();
-
  for(int lat=8;lat<=maxlat;lat+=4){
    for(int Ls=8;Ls<=8;Ls*=2){

      Coordinate latt_size  ({lat*mpi_layout[0],
-	                      lat*mpi_layout[1],
-      			      lat*mpi_layout[2],
-      			      lat*mpi_layout[3]});
+    	                      lat*mpi_layout[1],
+                              lat*mpi_layout[2],
+	                      lat*mpi_layout[3]});

      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
      RealD Nrank = Grid._Nprocessors;
@@ -193,101 +193,6 @@ int main (int argc, char ** argv)
 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
      }

-      int ncomm;
-
-      double dbytes;
-      for(int i=0;i<Nloop;i++){
-	double start=usecond();
-
-	dbytes=0;
-	ncomm=0;
-
-	std::vector<CommsRequest_t> requests;
-
-	for(int mu=0;mu<4;mu++){
-	
-
-	  if (mpi_layout[mu]>1 ) {
-	  
-	    ncomm++;
-	    int comm_proc=1;
-	    int xmit_to_rank;
-	    int recv_from_rank;
-	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	    dbytes+=
-	      Grid.StencilSendToRecvFromBegin(requests,
-					      (void *)&xbuf[mu][0],
-					      xmit_to_rank,1,
-					      (void *)&rbuf[mu][0],
-					      recv_from_rank,1,
-					      bytes,bytes,mu);
-	
-	    comm_proc = mpi_layout[mu]-1;
-	  
-	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	    dbytes+=
-	      Grid.StencilSendToRecvFromBegin(requests,
-					      (void *)&xbuf[mu+4][0],
-					      xmit_to_rank,1,
-					      (void *)&rbuf[mu+4][0],
-					      recv_from_rank,1,
-					      bytes,bytes,mu+4);
-	  
-	  }
-	}
-	Grid.StencilSendToRecvFromComplete(requests,0);
-	Grid.Barrier();
-	double stop=usecond();
-	t_time[i] = stop-start; // microseconds
-	
-      }
-
-      timestat.statistics(t_time);
-
-      dbytes=dbytes*ppn;
-      double xbytes    = dbytes*0.5;
-      //      double rbytes    = dbytes*0.5;
-      double bidibytes = dbytes;
-
-      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
-               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
-               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
-               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
-               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
-               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
-
-
-    }
-  }    
-
-
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  header();
-
-  for(int lat=8;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=8;Ls*=2){
-
-      Coordinate latt_size  ({lat*mpi_layout[0],
-      			      lat*mpi_layout[1],
-      			      lat*mpi_layout[2],
-      			      lat*mpi_layout[3]});
-
-      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      RealD Nrank = Grid._Nprocessors;
-      RealD Nnode = Grid.NodeCount();
-      RealD ppn = Nrank/Nnode;
-
-      std::vector<HalfSpinColourVectorD *> xbuf(8);
-      std::vector<HalfSpinColourVectorD *> rbuf(8);
-      Grid.ShmBufferFreeAll();
-      uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
-      for(int d=0;d<8;d++){
-	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
-	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
-      }
-
      int ncomm;
      double dbytes;
      for(int i=0;i<Nloop;i++){
@@ -296,45 +201,34 @@ int main (int argc, char ** argv)
 	std::vector<CommsRequest_t> requests;
 	dbytes=0;
 	ncomm=0;
-	for(int mu=0;mu<4;mu++){
-	
+
+	for(int dir=0;dir<8;dir++) {
+
+	  double tbytes;
+	  int mu =dir % 4;
+
 	  if (mpi_layout[mu]>1 ) {
 	  
 	    ncomm++;
-	    int comm_proc=1;
 	    int xmit_to_rank;
 	    int recv_from_rank;
-	    
-	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	    dbytes+=
-	      Grid.StencilSendToRecvFromBegin(requests,
-					      (void *)&xbuf[mu][0],
-					      xmit_to_rank,1,
-					      (void *)&rbuf[mu][0],
-					      recv_from_rank,1,
-					      bytes,bytes,mu);
-	    Grid.StencilSendToRecvFromComplete(requests,mu);
-	    requests.resize(0);
+	    if ( dir == mu ) { 
+	      int comm_proc=1;
+	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	    } else { 
+	      int comm_proc = mpi_layout[mu]-1;
+	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+	    }
+            int tid = omp_get_thread_num();
+	    tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1,
+					       (void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid);

-	    comm_proc = mpi_layout[mu]-1;
-	  
-	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-	    dbytes+=
-	      Grid.StencilSendToRecvFromBegin(requests,
-					      (void *)&xbuf[mu+4][0],
-					      xmit_to_rank,1,
-					      (void *)&rbuf[mu+4][0],
-					      recv_from_rank,1,
-					      bytes,bytes,mu+4);
-	    Grid.StencilSendToRecvFromComplete(requests,mu+4);
-	    requests.resize(0);
-	  
+	    dbytes+=tbytes;
 	  }
-	}
+        }
 	Grid.Barrier();
 	double stop=usecond();
 	t_time[i] = stop-start; // microseconds
-	
      }

      timestat.statistics(t_time);
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -32,18 +32,18 @@
 using namespace std;
 using namespace Grid;

-template<class d>
-struct scal {
-  d internal;
+////////////////////////
+/// Move to domains ////
+////////////////////////
+
+Gamma::Algebra Gmu [] = {
+			 Gamma::Algebra::GammaX,
+			 Gamma::Algebra::GammaY,
+			 Gamma::Algebra::GammaZ,
+			 Gamma::Algebra::GammaT
 };

-  Gamma::Algebra Gmu [] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ,
-    Gamma::Algebra::GammaT
-  };
-
+void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);

 int main (int argc, char ** argv)
 {
@@ -52,39 +52,108 @@ int main (int argc, char ** argv)

  int threads = GridThread::GetThreads();

-  Coordinate latt4 = GridDefaultLatt();
-  int Ls=8;
-  for(int i=0;i<argc;i++)
+  int Ls=16;
+  for(int i=0;i<argc;i++) {
    if(std::string(argv[i]) == "-Ls"){
      std::stringstream ss(argv[i+1]); ss >> Ls;
    }
+  }

+  //////////////////
+  // With comms
+  //////////////////
+  Coordinate Dirichlet(Nd+1,0);
+
+  std::cout << "\n\n\n\n\n\n" <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  
+  Benchmark(Ls,Dirichlet,false);
+
+  std::cout << "\n\n\n\n\n\n" <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  
+  Benchmark(Ls,Dirichlet,true);
+
+  //////////////////
+  // Domain decomposed
+  //////////////////
+  /*
+  Coordinate latt4  = GridDefaultLatt();
+  Coordinate mpi    = GridDefaultMpi();
+  Coordinate CommDim(Nd);
+  Coordinate shm;
+  GlobalSharedMemory::GetShmDims(mpi,shm);
+
+
+  std::cout << "\n\n\n\n\n\n" <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  //  std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+
+  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
+  Dirichlet[0] = 0;
+  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
+  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
+  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
+  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
+
+  Benchmark(Ls,Dirichlet,false);
+
+  std::cout << "\n\n\n\n\n\n" <<std::endl;
+
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+
+  for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
+  
+  Benchmark(Ls,Dirichlet,true);
+  */
+  
+  Grid_finalize();
+  exit(0);
+}
+void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
+{
+  Coordinate latt4 = GridDefaultLatt();
  GridLogLayout();

  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);

-
-  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+#undef SINGLE
+#ifdef SINGLE
+  typedef vComplexF          Simd;
+  typedef LatticeFermionF    FermionField;
+  typedef LatticeGaugeFieldF GaugeField;
+  typedef LatticeColourMatrixF ColourMatrixField;
+  typedef DomainWallFermionF FermionAction;
+#else
+  typedef vComplexD          Simd;
+  typedef LatticeFermionD    FermionField;
+  typedef LatticeGaugeFieldD GaugeField;
+  typedef LatticeColourMatrixD ColourMatrixField;
+  typedef DomainWallFermionD FermionAction;
+#endif
+  
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

-  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
-  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
-  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
-  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
-  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
-
-  std::vector<int> seeds4({1,2,3,4});
-  std::vector<int> seeds5({5,6,7,8});
-
  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG"));
+
  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG"));
-  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

-  LatticeFermion src   (FGrid); random(RNG5,src);
+ 
+  FermionField src   (FGrid); random(RNG5,src);
 #if 0
  src = Zero();
  {
@@ -100,46 +169,39 @@ int main (int argc, char ** argv)
  src = src*N2;
 #endif

-
-  LatticeFermion result(FGrid); result=Zero();
-  LatticeFermion    ref(FGrid);    ref=Zero();
-  LatticeFermion    tmp(FGrid);
-  LatticeFermion    err(FGrid);
+  FermionField result(FGrid); result=Zero();
+  FermionField    ref(FGrid);    ref=Zero();
+  FermionField    tmp(FGrid);
+  FermionField    err(FGrid);

  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
-  LatticeGaugeField Umu(UGrid);
+  GaugeField Umu(UGrid);
+  GaugeField UmuCopy(UGrid);
  SU<Nc>::HotConfiguration(RNG4,Umu);
+  //  SU<Nc>::ColdConfiguration(Umu);
+  UmuCopy=Umu;
  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
-#if 0
-  Umu=1.0;
-  for(int mu=0;mu<Nd;mu++){
-    LatticeColourMatrix ttmp(UGrid);
-    ttmp = PeekIndex<LorentzIndex>(Umu,mu);
-    //    if (mu !=2 ) ttmp = 0;
-    //    ttmp = ttmp* pow(10.0,mu);
-    PokeIndex<LorentzIndex>(Umu,ttmp,mu);
-  }
-  std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
-#endif

+  ////////////////////////////////////
+  // Apply BCs
+  ////////////////////////////////////
+  Coordinate Block(4);
+  for(int d=0;d<4;d++)  Block[d]= Dirichlet[d+1];
+
+  std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
+  std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
+
+  DirichletFilter<GaugeField> Filter(Block);
+  Filter.applyFilter(Umu);
+  
  ////////////////////////////////////
  // Naive wilson implementation
  ////////////////////////////////////
-  // replicate across fifth dimension
-  LatticeGaugeField Umu5d(FGrid);
-  std::vector<LatticeColourMatrix> U(4,FGrid);
-  {
-    autoView( Umu5d_v, Umu5d, CpuWrite);
-    autoView( Umu_v  , Umu  , CpuRead);
-    for(int ss=0;ss<Umu.Grid()->oSites();ss++){
-      for(int s=0;s<Ls;s++){
-	Umu5d_v[Ls*ss+s] = Umu_v[ss];
-      }
-    }
-  }
+  std::vector<ColourMatrixField> U(4,UGrid);
  for(int mu=0;mu<Nd;mu++){
-    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }
+
  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;

  if (1)
@@ -147,10 +209,28 @@ int main (int argc, char ** argv)
    ref = Zero();
    for(int mu=0;mu<Nd;mu++){

-      tmp = U[mu]*Cshift(src,mu+1,1);
+      tmp = Cshift(src,mu+1,1);
+      {
+	autoView( tmp_v  , tmp  , CpuWrite);
+	autoView( U_v  , U[mu]  , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
+	  }
+	}
+      }
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;

-      tmp =adj(U[mu])*src;
+      {
+	autoView( tmp_v  , tmp  , CpuWrite);
+	autoView( U_v  , U[mu]  , CpuRead);
+	autoView( src_v, src    , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
+	  }
+	}
+      }
      tmp =Cshift(tmp,mu+1,-1);
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
    }
@@ -167,11 +247,9 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
-  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::Dhop                  "<<std::endl;
-  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
-  std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(vComplex)<< " B"<<std::endl;
-  if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-  if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop                  "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
+  std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
 #ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -181,9 +259,15 @@ int main (int argc, char ** argv)
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

-  DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  int ncall =1000;
-
+  FermionAction::ImplParams p;
+  p.dirichlet=Dirichlet;
+  FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
+  Dw.SloppyComms(sloppy);
+  Dw.ImportGauge(Umu);
+  
+  int ncall =300;
+  RealD n2e;
+  
  if (1) {
    FGrid->Barrier();
    Dw.Dhop(src,result,0);
@@ -198,8 +282,8 @@ int main (int argc, char ** argv)
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=single_site_flops*volume*ncall;

-    auto nsimd = vComplex::Nsimd();
-    auto simdwidth = sizeof(vComplex);
+    auto nsimd = Simd::Nsimd();
+    auto simdwidth = sizeof(Simd);

    // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
    double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
@@ -208,28 +292,27 @@ int main (int argc, char ** argv)
    double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);

    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
-    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-    //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
-    std::cout<<GridLogMessage << "RF  GiB/s (base 2) =   "<< 1000000. * data_rf/((t1-t0))<<std::endl;
-    std::cout<<GridLogMessage << "mem GiB/s (base 2) =   "<< 1000000. * data_mem/((t1-t0))<<std::endl;
    err = ref-result;
-    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-    //exit(0);
+    n2e = norm2(err);
+    std::cout<<GridLogMessage << "norm diff   "<< n2e<< "  Line "<<__LINE__ <<std::endl;

-    if(( norm2(err)>1.0e-4) ) {
-      /*
-      std::cout << "RESULT\n " << result<<std::endl;
-      std::cout << "REF   \n " << ref   <<std::endl;
-      std::cout << "ERR   \n " << err   <<std::endl;
-      */
+    if(( n2e>1.0e-4) ) {
      std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
      FGrid->Barrier();
+      std::cout<<GridLogMessage << "RESULT" << std::endl;
+      //      std::cout << result<<std::endl;
+      std::cout << norm2(result)<<std::endl;
+      std::cout<<GridLogMessage << "REF" << std::endl;
+      std::cout << norm2(ref)<<std::endl;
+      std::cout<<GridLogMessage << "ERR" << std::endl;
+      std::cout << norm2(err)<<std::endl;
+      FGrid->Barrier();
      exit(-1);
    }
-    assert (norm2(err)< 1.0e-4 );
+    assert (n2e< 1.0e-4 );
  }

  if (1)
@@ -238,16 +321,30 @@ int main (int argc, char ** argv)
    for(int mu=0;mu<Nd;mu++){

      //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
-      tmp = U[mu]*Cshift(src,mu+1,1);
+      tmp = Cshift(src,mu+1,1);
      {
 	autoView( ref_v, ref, CpuWrite);
 	autoView( tmp_v, tmp, CpuRead);
-	for(int i=0;i<ref_v.size();i++){
-	  ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
+	autoView( U_v  , U[mu]  , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    int i=s+Ls*ss;
+	    ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
+	  }
 	}
      }
-
-      tmp =adj(U[mu])*src;
+      
+      {
+	autoView( tmp_v  , tmp  , CpuWrite);
+	autoView( U_v  , U[mu]  , CpuRead);
+	autoView( src_v, src    , CpuRead);
+	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+	  for(int s=0;s<Ls;s++){
+	    tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
+	  }
+	}
+      }
+      //      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
      {
 	autoView( ref_v, ref, CpuWrite);
@@ -259,27 +356,27 @@ int main (int argc, char ** argv)
    }
    ref = -0.5*ref;
  }
-  //  dump=1;
-  Dw.Dhop(src,result,1);
+
+  Dw.Dhop(src,result,DaggerYes);
+
+  std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
+  std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
+
  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
  std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
  std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl;
  err = ref-result;
-  std::cout<<GridLogMessage << "norm dag diff   "<< norm2(err)<<std::endl;
-  if((norm2(err)>1.0e-4)){
-/*
-	std::cout<< "DAG RESULT\n "  <<ref     << std::endl;
-	std::cout<< "DAG sRESULT\n " <<result  << std::endl;
-	std::cout<< "DAG ERR   \n "  << err    <<std::endl;
-*/
-  }
-  LatticeFermion src_e (FrbGrid);
-  LatticeFermion src_o (FrbGrid);
-  LatticeFermion r_e   (FrbGrid);
-  LatticeFermion r_o   (FrbGrid);
-  LatticeFermion r_eo  (FGrid);
+  n2e= norm2(err);
+  std::cout<<GridLogMessage << "norm dag diff   "<< n2e<< "  Line "<<__LINE__ <<std::endl;

+  assert((n2e)<1.0e-4);
+  
+  FermionField src_e (FrbGrid);
+  FermionField src_o (FrbGrid);
+  FermionField r_e   (FrbGrid);
+  FermionField r_o   (FrbGrid);
+  FermionField r_eo  (FGrid);

  std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
  pickCheckerboard(Even,src_e,src);
@@ -291,10 +388,8 @@ int main (int argc, char ** argv)

  // S-direction is INNERMOST and takes no part in the parity.
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
-  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::DhopEO                "<<std::endl;
-  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
-  if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-  if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO                "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
 #ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -308,13 +403,7 @@ int main (int argc, char ** argv)
    Dw.DhopEO(src_o,r_e,DaggerNo);
    double t0=usecond();
    for(int i=0;i<ncall;i++){
-#ifdef CUDA_PROFILE
-      if(i==10) cudaProfilerStart();
-#endif
      Dw.DhopEO(src_o,r_e,DaggerNo);
-#ifdef CUDA_PROFILE
-      if(i==20) cudaProfilerStop();
-#endif
    }
    double t1=usecond();
    FGrid->Barrier();
@@ -338,14 +427,9 @@ int main (int argc, char ** argv)
  setCheckerboard(r_eo,r_e);

  err = r_eo-result;
-  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-  if((norm2(err)>1.0e-4)){
-    /*
-	std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
-	std::cout<< "Deo REF\n " <<result  << std::endl;
-	std::cout<< "Deo ERR   \n " << err <<std::endl;
-    */
-  }
+  n2e= norm2(err);
+  std::cout<<GridLogMessage << "norm diff   "<< n2e<<std::endl;
+  assert(n2e<1.0e-4);

  pickCheckerboard(Even,src_e,err);
  pickCheckerboard(Odd,src_o,err);
@@ -354,6 +438,4 @@ int main (int argc, char ** argv)

  assert(norm2(src_e)<1.0e-4);
  assert(norm2(src_o)<1.0e-4);
-  Grid_finalize();
-  exit(0);
 }
--- a/benchmarks/Benchmark_dwf_fp32.cc
+++ b/benchmarks/Benchmark_dwf_fp32.cc
@@ -43,7 +43,7 @@ Gamma::Algebra Gmu [] = {
 			 Gamma::Algebra::GammaT
 };

-void Benchmark(int Ls, Coordinate Dirichlet);
+void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);

 int main (int argc, char ** argv)
 {
@@ -69,11 +69,19 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
  
-  Benchmark(Ls,Dirichlet);
+  Benchmark(Ls,Dirichlet,false);
+
+  std::cout << "\n\n\n\n\n\n" <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  
+  Benchmark(Ls,Dirichlet,true);

  //////////////////
  // Domain decomposed
  //////////////////
+  /*
  Coordinate latt4  = GridDefaultLatt();
  Coordinate mpi    = GridDefaultMpi();
  Coordinate CommDim(Nd);
@@ -81,42 +89,35 @@ int main (int argc, char ** argv)
  GlobalSharedMemory::GetShmDims(mpi,shm);


-  //////////////////////
-  // Node level
-  //////////////////////
  std::cout << "\n\n\n\n\n\n" <<std::endl;
  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-  std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
+  //  std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;

  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
-  //  Dirichlet[0] = 0;
-  //  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
-  //  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
-  //  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
-  //  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
+  Dirichlet[0] = 0;
+  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
+  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
+  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
+  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];

-  Benchmark(Ls,Dirichlet);
+  Benchmark(Ls,Dirichlet,false);

  std::cout << "\n\n\n\n\n\n" <<std::endl;

  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-  std::cout << GridLogMessage<< " Testing without intranode communication " <<std::endl;
+  std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;

  for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
-  //  Dirichlet[0] = 0;
-  //  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
-  //  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
-  //  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
-  //  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
  
-  Benchmark(Ls,Dirichlet);
-
+  Benchmark(Ls,Dirichlet,true);
+  */
+  
  Grid_finalize();
  exit(0);
 }
-void Benchmark(int Ls, Coordinate Dirichlet)
+void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
 {
  Coordinate latt4 = GridDefaultLatt();
  GridLogLayout();
@@ -132,21 +133,13 @@ void Benchmark(int Ls, Coordinate Dirichlet)
  typedef LatticeGaugeFieldF GaugeField;
  typedef LatticeColourMatrixF ColourMatrixField;
  typedef DomainWallFermionF FermionAction;
-#endif
-#ifdef DOUBLE
+#else
  typedef vComplexD          Simd;
  typedef LatticeFermionD    FermionField;
  typedef LatticeGaugeFieldD GaugeField;
  typedef LatticeColourMatrixD ColourMatrixField;
  typedef DomainWallFermionD FermionAction;
 #endif
-#ifdef DOUBLE2
-  typedef vComplexD2          Simd;
-  typedef LatticeFermionD2    FermionField;
-  typedef LatticeGaugeFieldD2 GaugeField;
-  typedef LatticeColourMatrixD2 ColourMatrixField;
-  typedef DomainWallFermionD2 FermionAction;
-#endif
  
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -269,6 +262,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
  FermionAction::ImplParams p;
  p.dirichlet=Dirichlet;
  FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
+  Dw.SloppyComms(sloppy);
  Dw.ImportGauge(Umu);
  
  int ncall =300;
--- a/benchmarks/Benchmark_dwf_fp32_partial.cc
+++ b/benchmarks/Benchmark_dwf_fp32_partial.cc
@@ -1,465 +0,0 @@
- /*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./benchmarks/Benchmark_dwf.cc
-    Copyright (C) 2015
-
-    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-    Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/Grid.h>
-#ifdef GRID_CUDA
-#define CUDA_PROFILE
-#endif
-
-#ifdef CUDA_PROFILE
-#include <cuda_profiler_api.h>
-#endif
-
-using namespace std;
-using namespace Grid;
-
-////////////////////////
-/// Move to domains ////
-////////////////////////
-
-Gamma::Algebra Gmu [] = {
-			 Gamma::Algebra::GammaX,
-			 Gamma::Algebra::GammaY,
-			 Gamma::Algebra::GammaZ,
-			 Gamma::Algebra::GammaT
-};
-
-void Benchmark(int Ls, Coordinate Dirichlet, int partial);
-
-int main (int argc, char ** argv)
-{
-  Grid_init(&argc,&argv);
-
-
-  int threads = GridThread::GetThreads();
-
-  int Ls=8;
-  for(int i=0;i<argc;i++) {
-    if(std::string(argv[i]) == "-Ls"){
-      std::stringstream ss(argv[i+1]); ss >> Ls;
-    }
-  }
-
-  //////////////////
-  // With comms
-  //////////////////
-  Coordinate Dirichlet(Nd+1,0);
-
-  for(auto partial : {0}) {
-  std::cout << "\n\n\n\n\n\n" <<std::endl;
-  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-  std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
-  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-    Benchmark(Ls,Dirichlet,partial);
-  }
-
-  //////////////////
-  // Domain decomposed
-  //////////////////
-  Coordinate latt4  = GridDefaultLatt();
-  Coordinate mpi    = GridDefaultMpi();
-  Coordinate CommDim(Nd);
-  //Coordinate shm({2,1,1,1});
-  Coordinate shm;
-  GlobalSharedMemory::GetShmDims(mpi,shm);
-
-  std::cout <<GridLogMessage << " Shared memory MPI decomp is " <<shm<<std::endl;
-
-  //////////////////////
-  // Node level
-  //////////////////////
-  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
-  //  for(int d=0;d<Nd;d++) CommDim[d]= 1;
-  Dirichlet[0] = 0;
-  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
-  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
-  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
-  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
-
-  for(auto partial : {0,1}) {
-    std::cout << "\n\n\n\n\n\n" <<std::endl;
-    std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-    std::cout << GridLogMessage<< " Testing without internode communication partial dirichlet="<<partial <<std::endl;
-    std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-    Benchmark(Ls,Dirichlet,partial);
-  }
-  
-  for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
-  Dirichlet[0] = 0;
-  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
-  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
-  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
-  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
-  
-  for(auto partial : {0,1}) {
-    std::cout << "\n\n\n\n\n\n" <<std::endl;
-    std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-    std::cout << GridLogMessage<< " Testing without intranode communication; partial dirichlet= "<<partial <<std::endl;
-    std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-    Benchmark(Ls,Dirichlet,partial);
-  }
-  Grid_finalize();
-  exit(0);
-}
-void Benchmark(int Ls, Coordinate Dirichlet, int partial)
-{
-  Coordinate latt4 = GridDefaultLatt();
-  GridLogLayout();
-
-  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
-
-  std::vector<int> seeds4({1,2,3,4});
-  std::vector<int> seeds5({5,6,7,8});
-#define SINGLE
-#ifdef SINGLE
-  typedef vComplexF          Simd;
-  typedef LatticeFermionF    FermionField;
-  typedef LatticeGaugeFieldF GaugeField;
-  typedef LatticeColourMatrixF ColourMatrixField;
-  typedef DomainWallFermionF FermionAction;
-#endif
-#ifdef DOUBLE
-  typedef vComplexD          Simd;
-  typedef LatticeFermionD    FermionField;
-  typedef LatticeGaugeFieldD GaugeField;
-  typedef LatticeColourMatrixD ColourMatrixField;
-  typedef DomainWallFermionD FermionAction;
-#endif
-#ifdef DOUBLE2
-  typedef vComplexD2          Simd;
-  typedef LatticeFermionD2    FermionField;
-  typedef LatticeGaugeFieldD2 GaugeField;
-  typedef LatticeColourMatrixD2 ColourMatrixField;
-  typedef DomainWallFermionD2 FermionAction;
-#endif
-  
-  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
-  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
-  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
-  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG"));
-
-  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
-  GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG"));
-
- 
-  FermionField src   (FGrid); random(RNG5,src);
-#if 0
-  src = Zero();
-  {
-    Coordinate origin({0,0,0,latt4[2]-1,0});
-    SpinColourVectorF tmp;
-    tmp=Zero();
-    tmp()(0)(0)=Complex(-2.0,0.0);
-    std::cout << " source site 0 " << tmp<<std::endl;
-    pokeSite(tmp,src,origin);
-  }
-#else
-  RealD N2 = 1.0/::sqrt(norm2(src));
-  src = src*N2;
-#endif
-
-  FermionField result(FGrid); result=Zero();
-  FermionField    ref(FGrid);    ref=Zero();
-  FermionField    tmp(FGrid);
-  FermionField    err(FGrid);
-
-  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
-  GaugeField Umu(UGrid);
-  GaugeField UmuFull(UGrid);
-  GaugeField UmuCopy(UGrid);
-  SU<Nc>::HotConfiguration(RNG4,Umu);
-  UmuCopy=Umu;
-  UmuFull=Umu;
-  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
-
-  ////////////////////////////////////
-  // Apply BCs
-  ////////////////////////////////////
-  Coordinate Block(4);
-  for(int d=0;d<4;d++)  Block[d]= Dirichlet[d+1];
-
-  std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
-  std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
-
-  DirichletFilter<GaugeField> Filter(Block);
-  Filter.applyFilter(Umu);
-  if(!partial) Filter.applyFilter(UmuCopy);
-  
-  ////////////////////////////////////
-  // Naive wilson implementation
-  ////////////////////////////////////
-  std::vector<ColourMatrixField> U(4,UGrid);
-  std::vector<ColourMatrixField> Ucopy(4,UGrid);
-  for(int mu=0;mu<Nd;mu++){
-    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
-    Ucopy[mu] = PeekIndex<LorentzIndex>(UmuCopy,mu);
-  }
-
-  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
-
-  if (1)
-  {
-    ref = Zero();
-    for(int mu=0;mu<Nd;mu++){
-      int depth=dwf_compressor_depth;
-      tmp = Cshift(src,mu+1,1);
-      {
-	autoView( tmp_v  , tmp  , CpuWrite);
-	autoView( U_v    , U[mu]  , CpuRead);
-	autoView( Ucopy_v, Ucopy[mu]  , CpuRead);
-	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
-	  for(int s=0;s<Ls;s++){
-	    if ( (s<depth) || (s>=Ls-depth)){
-	      tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
-	    } else {
-	      tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
-	    }
-	  }
-	}
-      }
-      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
-      {
-	autoView( tmp_v  , tmp  , CpuWrite);
-	autoView( U_v  , U[mu]  , CpuRead);
-	autoView( Ucopy_v, Ucopy[mu]  , CpuRead);
-	autoView( src_v, src    , CpuRead);
-	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
-	  for(int s=0;s<Ls;s++){
-	    if ( (s<depth) || (s>=Ls-depth)){
-	      tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
-	    } else {
-	      tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
-	    }
-	  }
-	}
-      }
-      tmp =Cshift(tmp,mu+1,-1);
-      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
-    }
-    ref = -0.5*ref;
-  }
-
-  RealD mass=0.1;
-  RealD M5  =1.8;
-
-  RealD NP = UGrid->_Nprocessors;
-  RealD NN = UGrid->NodeCount();
-
-  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
-  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
-  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
-  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
-  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop                  "<<std::endl;
-  std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
-  std::cout << GridLogMessage <<"* BCs for Dirichlet Block4 " << Block << std::endl;
-  std::cout << GridLogMessage <<"* Partial Dirichlet BC = " << partial << std::endl;
-  std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
-#ifdef GRID_OMP
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
-#endif
-  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
-  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
-  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
-  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
-
-  FermionAction::ImplParams p;
-  p.dirichlet=Dirichlet;
-  p.partialDirichlet=partial;
-  FermionAction Dw(UmuFull,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
-  
-  int ncall =1;
-  RealD n2e;
-  
-  if (1) {
-    FGrid->Barrier();
-    Dw.Dhop(src,result,0);
-    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
-    double t0=usecond();
-    for(int i=0;i<ncall;i++){
-      Dw.Dhop(src,result,0);
-    }
-    double t1=usecond();
-    FGrid->Barrier();
-
-    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=single_site_flops*volume*ncall;
-
-    auto nsimd = Simd::Nsimd();
-    auto simdwidth = sizeof(Simd);
-
-    // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
-    double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
-
-    // mem: Nd Wilson * Ls, Nd gauge, Nc colors
-    double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
-
-    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
-    err = ref-result;
-    n2e = norm2(err);
-
-    std::cout<<GridLogMessage << "norm diff   "<< n2e<< "  Line "<<__LINE__ <<std::endl;
-
-    if(( n2e>1.0e-4) ) {
-      std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
-      FGrid->Barrier();
-
-      DumpSliceNorm("s-slice ref ",ref,1);
-      DumpSliceNorm("s-slice res ",result,1);
-      DumpSliceNorm("s-slice error ",err,1);
-      exit(-1);
-    }
-    assert (n2e< 1.0e-4 );
-  }
-
-  if (1)
-  { // Naive wilson dag implementation
-
-    ref = Zero();
-    for(int mu=0;mu<Nd;mu++){
-
-      int depth=dwf_compressor_depth;
-      tmp = Cshift(src,mu+1,1);
-      {
-	autoView( tmp_v  , tmp  , CpuWrite);
-	autoView( U_v    , U[mu]  , CpuRead);
-	autoView( Ucopy_v, Ucopy[mu]  , CpuRead);
-	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
-	  for(int s=0;s<Ls;s++){
-	    if ( (s<depth) || (s>=Ls-depth)){
-	      tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
-	    } else {
-	      tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
-	    }
-	  }
-	}
-      }
-      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
-      {
-	autoView( tmp_v  , tmp  , CpuWrite);
-	autoView( U_v  , U[mu]  , CpuRead);
-	autoView( Ucopy_v, Ucopy[mu]  , CpuRead);
-	autoView( src_v, src    , CpuRead);
-	for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
-	  for(int s=0;s<Ls;s++){
-	    if ( (s<depth) || (s>=Ls-depth)){
-	      tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
-	    } else {
-	      tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
-	    }
-	  }
-	}
-      }
-      tmp =Cshift(tmp,mu+1,-1);
-      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
-    }
-    ref = -0.5*ref;
-  }
-
-  Dw.Dhop(src,result,DaggerYes);
-
-  std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
-  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
-  std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
-
-  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
-  std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
-  std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl;
-  err = ref-result;
-  n2e= norm2(err);
-  std::cout<<GridLogMessage << "norm dag diff   "<< n2e<< "  Line "<<__LINE__ <<std::endl;
-
-  assert((n2e)<1.0e-4);
-  
-  FermionField src_e (FrbGrid);
-  FermionField src_o (FrbGrid);
-  FermionField r_e   (FrbGrid);
-  FermionField r_o   (FrbGrid);
-  FermionField r_eo  (FGrid);
-
-  std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
-  pickCheckerboard(Even,src_e,src);
-  pickCheckerboard(Odd,src_o,src);
-
-  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
-  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
-
-
-  // S-direction is INNERMOST and takes no part in the parity.
-  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
-  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO                "<<std::endl;
-  std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
-#ifdef GRID_OMP
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
-#endif
-  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
-  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
-  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
-  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
-  {
-    FGrid->Barrier();
-    Dw.DhopEO(src_o,r_e,DaggerNo);
-    double t0=usecond();
-    for(int i=0;i<ncall;i++){
-      Dw.DhopEO(src_o,r_e,DaggerNo);
-    }
-    double t1=usecond();
-    FGrid->Barrier();
-
-    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=(single_site_flops*volume*ncall)/2.0;
-
-    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
-    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl;
-  }
-  Dw.DhopEO(src_o,r_e,DaggerNo);
-  Dw.DhopOE(src_e,r_o,DaggerNo);
-  Dw.Dhop  (src  ,result,DaggerNo);
-
-  std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
-  std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
-  std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
-
-  setCheckerboard(r_eo,r_o);
-  setCheckerboard(r_eo,r_e);
-
-  err = r_eo-result;
-  n2e= norm2(err);
-  std::cout<<GridLogMessage << "norm diff   "<< n2e<< "  Line "<<__LINE__ <<std::endl;
-  assert(n2e<1.0e-4);
-
-  pickCheckerboard(Even,src_e,err);
-  pickCheckerboard(Odd,src_o,err);
-  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
-  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
-
-  assert(norm2(src_e)<1.0e-4);
-  assert(norm2(src_o)<1.0e-4);
-}
--- a/benchmarks/Benchmark_usqcd.cc
+++ b/benchmarks/Benchmark_usqcd.cc
@@ -873,7 +873,7 @@ int main (int argc, char ** argv)
  int do_su4=0;
  int do_memory=1;
  int do_comms =1;
-  int do_blas  =0;
+  int do_blas  =1;
  int do_dslash=1;

  int sel=4;
--- a/configure.ac
+++ b/configure.ac
@@ -86,6 +86,7 @@ AC_ARG_WITH([gmp],
    [try this for a non-standard install prefix of the GMP library])],
    [AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
    [AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])
+
 AC_ARG_WITH([mpfr],
    [AS_HELP_STRING([--with-mpfr=prefix],
    [try this for a non-standard install prefix of the MPFR library])],
@@ -106,6 +107,13 @@ AC_ARG_WITH([lime],
            [AM_CXXFLAGS="-I$with_lime/include $AM_CXXFLAGS"]
            [AM_LDFLAGS="-L$with_lime/lib $AM_LDFLAGS"])

+############### LIBUNWIND
+AC_ARG_WITH([unwind],
+            [AS_HELP_STRING([--with-unwind=prefix],
+            [try this for a non-standard install prefix of the libunwind library])],
+            [AM_CXXFLAGS="-I$with_unwind/include $AM_CXXFLAGS"]
+            [AM_LDFLAGS="-L$with_unwind/lib $AM_LDFLAGS"])
+
 ############### OpenSSL
 AC_ARG_WITH([openssl],
            [AS_HELP_STRING([--with-openssl=prefix],
@@ -373,6 +381,16 @@ AC_SEARCH_LIBS([limeCreateReader], [lime],
               [have_lime=true],
 	             [AC_MSG_WARN(LIME library was not found in your system.)])

+AC_SEARCH_LIBS([unw_backtrace], [unwind],
+               [AC_DEFINE([HAVE_UNWIND], [1], [Define to 1 if you have the `libunwind' library])]
+               [have_unwind=true],
+	             [AC_MSG_WARN(libunwind library was not found in your system.)])
+
+AC_SEARCH_LIBS([_Ux86_64_step], [unwind-x86_64],
+               [AC_DEFINE([HAVE_UNWIND_X86_64], [1], [Define to 1 if you have the `libunwind-x86_64' library])]
+               [have_unwind_x86_64=true],
+	             [AC_MSG_WARN(libunwind-x86_64 library was not found in your system.)])
+
 AC_SEARCH_LIBS([SHA256_Init], [crypto],
               [AC_DEFINE([HAVE_CRYPTO], [1], [Define to 1 if you have the `OpenSSL' library])]
               [have_crypto=true],
--- a/systems/Jupiter/benchmarks/dwf.1node.perf
+++ b/systems/Jupiter/benchmarks/dwf.1node.perf
@@ -0,0 +1,273 @@
+RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
+RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
+RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
+RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
+SLURM detected
+AcceleratorCudaInit[0]: ========================
+AcceleratorCudaInit[0]: Device Number    : 0
+AcceleratorCudaInit[0]: ========================
+AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
+AcceleratorCudaInit[0]:   totalGlobalMem: 102005473280 
+AcceleratorCudaInit[0]:   managedMemory: 1 
+AcceleratorCudaInit[0]:   isMultiGpuBoard: 0 
+AcceleratorCudaInit[0]:   warpSize: 32 
+AcceleratorCudaInit[0]:   pciBusID: 1 
+AcceleratorCudaInit[0]:   pciDeviceID: 0 
+AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
+AcceleratorCudaInit: using default device 
+AcceleratorCudaInit: assume user either uses
+AcceleratorCudaInit: a) IBM jsrun, or 
+AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
+AcceleratorCudaInit: Configure options --enable-setdevice=no 
+local rank 0 device 0 bus id: 0009:01:00.0
+AcceleratorCudaInit: ================================================
+SharedMemoryMpi:  World communicator of size 4
+SharedMemoryMpi:  Node  communicator of size 4
+0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002c0000000 - 40033fffffff for comms buffers 
+Setting up IPC
+
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
+__|_                                    _|__
+__|_   GGGG    RRRR    III    DDDD      _|__
+__|_  G        R   R    I     D   D     _|__
+__|_  G        R   R    I     D    D    _|__
+__|_  G  GG    RRRR     I     D    D    _|__
+__|_  G   G    R  R     I     D   D     _|__
+__|_   GGGG    R   R   III    DDDD      _|__
+__|_                                    _|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+  |  |  |  |  |  |  |  |  |  |  |  |  |  |  
+
+
+Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
+
+Grid : Message : ================================================ 
+Grid : Message : MPI is initialised and logging filters activated 
+Grid : Message : ================================================ 
+Grid : Message : This rank is running on host jpbo-119-30.jupiter.internal
+Grid : Message : Requested 2147483648 byte stencil comms buffers 
+Grid : Message : MemoryManager Cache 81604378624 bytes 
+Grid : Message : MemoryManager::Init() setting up
+Grid : Message : MemoryManager::Init() cache pool for recent host   allocations: SMALL 8 LARGE 2 HUGE 0
+Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
+Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
+Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
+Grid : Message : MemoryManager::Init() Using cudaMalloc
+
+
+
+
+
+
+
+Grid : Message : 0.303000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 0.309000 s :  Testing with full communication 
+Grid : Message : 0.312000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 0.313000 s : Grid Layout
+Grid : Message : 0.313000 s : 	Global lattice size  : 32 32 64 64 
+Grid : Message : 0.319000 s : 	OpenMP threads       : 4
+Grid : Message : 0.320000 s : 	MPI tasks            : 1 1 2 2 
+Grid : Message : 0.129590 s : Initialising 4d RNG
+Grid : Message : 0.764790 s : Intialising parallel RNG with unique string 'The 4D RNG'
+Grid : Message : 0.764920 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
+Grid : Message : 0.942440 s : Initialising 5d RNG
+Grid : Message : 1.149388 s : Intialising parallel RNG with unique string 'The 5D RNG'
+Grid : Message : 1.149404 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
+local rank 1 device 0 bus id: 0019:01:00.0
+local rank 2 device 0 bus id: 0029:01:00.0
+local rank 3 device 0 bus id: 0039:01:00.0
+Grid : Message : 43.893114 s : Drawing gauge field
+Grid : Message : 54.574150 s : Random gauge initialised 
+Grid : Message : 54.574170 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
+Grid : Message : 54.574172 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
+Grid : Message : 54.580032 s : Setting up Cshift based reference 
+Grid : Message : 60.407451 s : *****************************************************************
+Grid : Message : 60.407469 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
+Grid : Message : 60.407470 s : *****************************************************************
+Grid : Message : 60.407471 s : *****************************************************************
+Grid : Message : 60.407472 s : * Benchmarking DomainWallFermionR::Dhop                  
+Grid : Message : 60.407473 s : * Vectorising space-time by 8
+Grid : Message : 60.407475 s : * VComplex size is 64 B
+Grid : Message : 60.407477 s : * Using Overlapped Comms/Compute
+Grid : Message : 60.407479 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 60.407480 s : *****************************************************************
+Grid : Message : 61.102178 s : Called warmup
+Grid : Message : 62.177160 s : Called Dw 300 times in 1074958 us
+Grid : Message : 62.177198 s : mflop/s =   24721998.6
+Grid : Message : 62.177201 s : mflop/s per rank =  6180499.64
+Grid : Message : 62.177204 s : mflop/s per node =  24721998.6
+Grid : Message : 62.182696 s : norm diff   5.8108784e-14  Line 306
+Grid : Message : 71.328862 s : ----------------------------------------------------------------
+Grid : Message : 71.328884 s : Compare to naive wilson implementation Dag to verify correctness
+Grid : Message : 71.328885 s : ----------------------------------------------------------------
+Grid : Message : 71.328886 s : Called DwDag
+Grid : Message : 71.328887 s : norm dag result 4.12810493
+Grid : Message : 71.329493 s : norm dag ref    4.12810493
+Grid : Message : 71.331967 s : norm dag diff   3.40632318e-14  Line 377
+Grid : Message : 71.394727 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
+Grid : Message : 71.803650 s : src_e0.500003185
+Grid : Message : 71.819727 s : src_o0.499996882
+Grid : Message : 71.821991 s : *********************************************************
+Grid : Message : 71.821993 s : * Benchmarking DomainWallFermion::DhopEO                
+Grid : Message : 71.821995 s : * Vectorising space-time by 8
+Grid : Message : 71.821998 s : * Using Overlapped Comms/Compute
+Grid : Message : 71.822002 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 71.822003 s : *********************************************************
+Grid : Message : 72.377054 s : Deo mflop/s =   24065467
+Grid : Message : 72.377071 s : Deo mflop/s per rank   6016366.75
+Grid : Message : 72.377074 s : Deo mflop/s per node   24065467
+Grid : Message : 72.624877 s : r_e2.06377678
+Grid : Message : 72.625198 s : r_o2.06381058
+Grid : Message : 72.625507 s : res4.12758736
+Grid : Message : 73.759140 s : norm diff   0
+Grid : Message : 73.868204 s : norm diff even  0
+Grid : Message : 73.907201 s : norm diff odd   0
+
+
+
+
+
+
+
+Grid : Message : 74.414580 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 74.414582 s :  Testing without internode communication 
+Grid : Message : 74.414584 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 74.414586 s : Grid Layout
+Grid : Message : 74.414586 s : 	Global lattice size  : 32 32 64 64 
+Grid : Message : 74.414594 s : 	OpenMP threads       : 4
+Grid : Message : 74.414595 s : 	MPI tasks            : 1 1 2 2 
+Grid : Message : 74.679364 s : Initialising 4d RNG
+Grid : Message : 74.742332 s : Intialising parallel RNG with unique string 'The 4D RNG'
+Grid : Message : 74.742343 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
+Grid : Message : 74.759525 s : Initialising 5d RNG
+Grid : Message : 75.812412 s : Intialising parallel RNG with unique string 'The 5D RNG'
+Grid : Message : 75.812429 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
+Grid : Message : 119.252016 s : Drawing gauge field
+Grid : Message : 129.919846 s : Random gauge initialised 
+Grid : Message : 129.919863 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
+Grid : Message : 129.919865 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
+Grid : Message : 129.923611 s : Setting up Cshift based reference 
+Grid : Message : 135.522878 s : *****************************************************************
+Grid : Message : 135.522897 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
+Grid : Message : 135.522899 s : *****************************************************************
+Grid : Message : 135.522899 s : *****************************************************************
+Grid : Message : 135.522900 s : * Benchmarking DomainWallFermionR::Dhop                  
+Grid : Message : 135.522901 s : * Vectorising space-time by 8
+Grid : Message : 135.522903 s : * VComplex size is 64 B
+Grid : Message : 135.522905 s : * Using Overlapped Comms/Compute
+Grid : Message : 135.522907 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 135.522908 s : *****************************************************************
+Grid : Message : 136.151202 s : Called warmup
+Grid : Message : 137.224721 s : Called Dw 300 times in 1073490 us
+Grid : Message : 137.224748 s : mflop/s =   24755806
+Grid : Message : 137.224751 s : mflop/s per rank =  6188951.49
+Grid : Message : 137.224753 s : mflop/s per node =  24755806
+Grid : Message : 137.235239 s : norm diff   5.8108784e-14  Line 306
+Grid : Message : 146.451686 s : ----------------------------------------------------------------
+Grid : Message : 146.451708 s : Compare to naive wilson implementation Dag to verify correctness
+Grid : Message : 146.451710 s : ----------------------------------------------------------------
+Grid : Message : 146.451712 s : Called DwDag
+Grid : Message : 146.451714 s : norm dag result 4.12810493
+Grid : Message : 146.452323 s : norm dag ref    4.12810493
+Grid : Message : 146.454799 s : norm dag diff   3.40632318e-14  Line 377
+Grid : Message : 146.498557 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
+Grid : Message : 146.940894 s : src_e0.500003185
+Grid : Message : 146.953676 s : src_o0.499996882
+Grid : Message : 146.955927 s : *********************************************************
+Grid : Message : 146.955929 s : * Benchmarking DomainWallFermion::DhopEO                
+Grid : Message : 146.955932 s : * Vectorising space-time by 8
+Grid : Message : 146.955936 s : * Using Overlapped Comms/Compute
+Grid : Message : 146.955938 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 146.955941 s : *********************************************************
+Grid : Message : 147.511975 s : Deo mflop/s =   24036256.5
+Grid : Message : 147.511989 s : Deo mflop/s per rank   6009064.13
+Grid : Message : 147.511991 s : Deo mflop/s per node   24036256.5
+Grid : Message : 147.522100 s : r_e2.06377678
+Grid : Message : 147.522433 s : r_o2.06381058
+Grid : Message : 147.522745 s : res4.12758736
+Grid : Message : 148.229848 s : norm diff   0
+Grid : Message : 149.233474 s : norm diff even  0
+Grid : Message : 149.235815 s : norm diff odd   0
+
+
+
+
+
+
+
+Grid : Message : 149.960985 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 149.960990 s :  Testing without intranode communication 
+Grid : Message : 149.960991 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 149.960995 s : Grid Layout
+Grid : Message : 149.960995 s : 	Global lattice size  : 32 32 64 64 
+Grid : Message : 149.961003 s : 	OpenMP threads       : 4
+Grid : Message : 149.961004 s : 	MPI tasks            : 1 1 2 2 
+Grid : Message : 150.155810 s : Initialising 4d RNG
+Grid : Message : 150.800200 s : Intialising parallel RNG with unique string 'The 4D RNG'
+Grid : Message : 150.800340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
+Grid : Message : 150.973420 s : Initialising 5d RNG
+Grid : Message : 151.131117 s : Intialising parallel RNG with unique string 'The 5D RNG'
+Grid : Message : 151.131136 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
+Grid : Message : 193.933765 s : Drawing gauge field
+Grid : Message : 204.611551 s : Random gauge initialised 
+Grid : Message : 204.611574 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
+Grid : Message : 204.611576 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
+Grid : Message : 204.615265 s : Setting up Cshift based reference 
+Grid : Message : 210.117788 s : *****************************************************************
+Grid : Message : 210.117807 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
+Grid : Message : 210.117809 s : *****************************************************************
+Grid : Message : 210.117810 s : *****************************************************************
+Grid : Message : 210.117812 s : * Benchmarking DomainWallFermionR::Dhop                  
+Grid : Message : 210.117813 s : * Vectorising space-time by 8
+Grid : Message : 210.117814 s : * VComplex size is 64 B
+Grid : Message : 210.117817 s : * Using Overlapped Comms/Compute
+Grid : Message : 210.117818 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 210.117819 s : *****************************************************************
+Grid : Message : 210.714641 s : Called warmup
+Grid : Message : 211.892227 s : Called Dw 300 times in 1177557 us
+Grid : Message : 211.892252 s : mflop/s =   22568003.2
+Grid : Message : 211.892255 s : mflop/s per rank =  5642000.8
+Grid : Message : 211.892257 s : mflop/s per node =  22568003.2
+Grid : Message : 211.896037 s : norm diff   5.8108784e-14  Line 306
+Grid : Message : 220.751375 s : ----------------------------------------------------------------
+Grid : Message : 220.751406 s : Compare to naive wilson implementation Dag to verify correctness
+Grid : Message : 220.751409 s : ----------------------------------------------------------------
+Grid : Message : 220.751411 s : Called DwDag
+Grid : Message : 220.751412 s : norm dag result 4.12810493
+Grid : Message : 220.753307 s : norm dag ref    4.12810493
+Grid : Message : 220.755796 s : norm dag diff   3.40632318e-14  Line 377
+Grid : Message : 220.813226 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
+Grid : Message : 221.697800 s : src_e0.500003185
+Grid : Message : 221.890920 s : src_o0.499996882
+Grid : Message : 221.913430 s : *********************************************************
+Grid : Message : 221.913450 s : * Benchmarking DomainWallFermion::DhopEO                
+Grid : Message : 221.913480 s : * Vectorising space-time by 8
+Grid : Message : 221.913500 s : * Using Overlapped Comms/Compute
+Grid : Message : 221.913530 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 221.913550 s : *********************************************************
+Grid : Message : 221.645213 s : Deo mflop/s =   24114032
+Grid : Message : 221.645228 s : Deo mflop/s per rank   6028508.01
+Grid : Message : 221.645231 s : Deo mflop/s per node   24114032
+Grid : Message : 221.656021 s : r_e2.06377678
+Grid : Message : 221.656389 s : r_o2.06381058
+Grid : Message : 221.656698 s : res4.12758736
+Grid : Message : 222.110075 s : norm diff   0
+Grid : Message : 222.857692 s : norm diff even  0
+Grid : Message : 222.875763 s : norm diff odd   0
+Grid : Message : 223.598127 s : *******************************************
+Grid : Message : 223.598145 s : ******* Grid Finalize                ******
+Grid : Message : 223.598146 s : *******************************************
--- a/systems/Jupiter/benchmarks/dwf.4node.perf
+++ b/systems/Jupiter/benchmarks/dwf.4node.perf
@@ -0,0 +1,286 @@
+RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
+RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
+RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
+RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
+RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
+RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
+RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
+RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
+RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
+RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
+RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
+RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
+RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
+RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
+RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
+RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
+SLURM detected
+AcceleratorCudaInit[0]: ========================
+AcceleratorCudaInit[0]: Device Number    : 0
+AcceleratorCudaInit[0]: ========================
+AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
+AcceleratorCudaInit[0]:   totalGlobalMem: 102005473280 
+AcceleratorCudaInit[0]:   managedMemory: 1 
+AcceleratorCudaInit[0]:   isMultiGpuBoard: 0 
+AcceleratorCudaInit[0]:   warpSize: 32 
+AcceleratorCudaInit[0]:   pciBusID: 1 
+AcceleratorCudaInit[0]:   pciDeviceID: 0 
+AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
+AcceleratorCudaInit: using default device 
+AcceleratorCudaInit: assume user either uses
+AcceleratorCudaInit: a) IBM jsrun, or 
+AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
+AcceleratorCudaInit: Configure options --enable-setdevice=no 
+local rank 0 device 0 bus id: 0009:01:00.0
+AcceleratorCudaInit: ================================================
+SharedMemoryMpi:  World communicator of size 16
+SharedMemoryMpi:  Node  communicator of size 4
+0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002a0000000 - 40031fffffff for comms buffers 
+Setting up IPC
+
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|_ |  |  |  |  |  |  |  |  |  |  |  | _|__
+__|_                                    _|__
+__|_   GGGG    RRRR    III    DDDD      _|__
+__|_  G        R   R    I     D   D     _|__
+__|_  G        R   R    I     D    D    _|__
+__|_  G  GG    RRRR     I     D    D    _|__
+__|_  G   G    R  R     I     D   D     _|__
+__|_   GGGG    R   R   III    DDDD      _|__
+__|_                                    _|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+  |  |  |  |  |  |  |  |  |  |  |  |  |  |  
+
+
+Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
+
+Grid : Message : ================================================ 
+Grid : Message : MPI is initialised and logging filters activated 
+Grid : Message : ================================================ 
+Grid : Message : This rank is running on host jpbo-012-11.jupiter.internal
+Grid : Message : Requested 2147483648 byte stencil comms buffers 
+Grid : Message : MemoryManager Cache 81604378624 bytes 
+Grid : Message : MemoryManager::Init() setting up
+Grid : Message : MemoryManager::Init() cache pool for recent host   allocations: SMALL 8 LARGE 2 HUGE 0
+Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
+Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
+Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
+Grid : Message : MemoryManager::Init() Using cudaMalloc
+
+
+
+
+
+
+
+Grid : Message : 0.834000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 0.838000 s :  Testing with full communication 
+Grid : Message : 0.839000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 0.840000 s : Grid Layout
+Grid : Message : 0.840000 s : 	Global lattice size  : 64 64 64 64 
+Grid : Message : 0.846000 s : 	OpenMP threads       : 4
+Grid : Message : 0.846000 s : 	MPI tasks            : 2 2 2 2 
+Grid : Message : 0.165970 s : Initialising 4d RNG
+Grid : Message : 0.787270 s : Intialising parallel RNG with unique string 'The 4D RNG'
+Grid : Message : 0.787340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
+Grid : Message : 0.960410 s : Initialising 5d RNG
+Grid : Message : 1.142344 s : Intialising parallel RNG with unique string 'The 5D RNG'
+Grid : Message : 1.142352 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
+local rank 2 device 0 bus id: 0029:01:00.0
+local rank 3 device 0 bus id: 0039:01:00.0
+local rank 1 device 0 bus id: 0019:01:00.0
+Grid : Message : 44.657270 s : Drawing gauge field
+Grid : Message : 55.247733 s : Random gauge initialised 
+Grid : Message : 55.247745 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
+Grid : Message : 55.247747 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
+Grid : Message : 55.253053 s : Setting up Cshift based reference 
+Grid : Message : 62.191747 s : *****************************************************************
+Grid : Message : 62.191767 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
+Grid : Message : 62.191768 s : *****************************************************************
+Grid : Message : 62.191769 s : *****************************************************************
+Grid : Message : 62.191769 s : * Benchmarking DomainWallFermionR::Dhop                  
+Grid : Message : 62.191769 s : * Vectorising space-time by 8
+Grid : Message : 62.191770 s : * VComplex size is 64 B
+Grid : Message : 62.191771 s : * Using Overlapped Comms/Compute
+Grid : Message : 62.191771 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 62.191772 s : *****************************************************************
+Grid : Message : 62.857568 s : Called warmup
+Grid : Message : 65.581790 s : Called Dw 300 times in 2200540 us
+Grid : Message : 65.582120 s : mflop/s =   48306525
+Grid : Message : 65.582140 s : mflop/s per rank =  3019157.81
+Grid : Message : 65.582150 s : mflop/s per node =  12076631.3
+Grid : Message : 65.637550 s : norm diff   5.80156793e-14  Line 306
+Grid : Message : 75.122153 s : ----------------------------------------------------------------
+Grid : Message : 75.122166 s : Compare to naive wilson implementation Dag to verify correctness
+Grid : Message : 75.122167 s : ----------------------------------------------------------------
+Grid : Message : 75.122167 s : Called DwDag
+Grid : Message : 75.122167 s : norm dag result 4.12801829
+Grid : Message : 75.123295 s : norm dag ref    4.12801829
+Grid : Message : 75.125890 s : norm dag diff   3.42093991e-14  Line 377
+Grid : Message : 75.188462 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
+Grid : Message : 75.605683 s : src_e0.500004005
+Grid : Message : 75.617824 s : src_o0.499996067
+Grid : Message : 75.620089 s : *********************************************************
+Grid : Message : 75.620091 s : * Benchmarking DomainWallFermion::DhopEO                
+Grid : Message : 75.620093 s : * Vectorising space-time by 8
+Grid : Message : 75.620094 s : * Using Overlapped Comms/Compute
+Grid : Message : 75.620095 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 75.620096 s : *********************************************************
+Grid : Message : 76.732272 s : Deo mflop/s =   48068252.4
+Grid : Message : 76.732283 s : Deo mflop/s per rank   3004265.77
+Grid : Message : 76.732285 s : Deo mflop/s per node   12017063.1
+Grid : Message : 76.749317 s : r_e2.06443136
+Grid : Message : 76.749652 s : r_o2.06378451
+Grid : Message : 76.749955 s : res4.12821587
+Grid : Message : 77.198827 s : norm diff   0
+Grid : Message : 77.981760 s : norm diff even  0
+Grid : Message : 78.455900 s : norm diff odd   0
+
+
+
+
+
+
+
+Grid : Message : 78.539333 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 78.539337 s :  Testing without internode communication 
+Grid : Message : 78.539338 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 78.539339 s : Grid Layout
+Grid : Message : 78.539339 s : 	Global lattice size  : 64 64 64 64 
+Grid : Message : 78.539347 s : 	OpenMP threads       : 4
+Grid : Message : 78.539348 s : 	MPI tasks            : 2 2 2 2 
+Grid : Message : 78.798501 s : Initialising 4d RNG
+Grid : Message : 78.862916 s : Intialising parallel RNG with unique string 'The 4D RNG'
+Grid : Message : 78.862925 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
+Grid : Message : 78.879916 s : Initialising 5d RNG
+Grid : Message : 79.941271 s : Intialising parallel RNG with unique string 'The 5D RNG'
+Grid : Message : 79.941280 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
+Grid : Message : 124.586264 s : Drawing gauge field
+Grid : Message : 135.338090 s : Random gauge initialised 
+Grid : Message : 135.338102 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
+Grid : Message : 135.338103 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
+Grid : Message : 135.341266 s : Setting up Cshift based reference 
+Grid : Message : 142.604280 s : *****************************************************************
+Grid : Message : 142.604450 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
+Grid : Message : 142.604460 s : *****************************************************************
+Grid : Message : 142.604470 s : *****************************************************************
+Grid : Message : 142.604480 s : * Benchmarking DomainWallFermionR::Dhop                  
+Grid : Message : 142.604480 s : * Vectorising space-time by 8
+Grid : Message : 142.604500 s : * VComplex size is 64 B
+Grid : Message : 142.604510 s : * Using Overlapped Comms/Compute
+Grid : Message : 142.604510 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 142.604520 s : *****************************************************************
+Grid : Message : 142.686034 s : Called warmup
+Grid : Message : 144.868543 s : Called Dw 300 times in 2182483 us
+Grid : Message : 144.868559 s : mflop/s =   48706194.1
+Grid : Message : 144.868561 s : mflop/s per rank =  3044137.13
+Grid : Message : 144.868562 s : mflop/s per node =  12176548.5
+Grid : Message : 144.887595 s : norm diff   5.80156793e-14  Line 306
+Grid : Message : 153.622978 s : ----------------------------------------------------------------
+Grid : Message : 153.622994 s : Compare to naive wilson implementation Dag to verify correctness
+Grid : Message : 153.622995 s : ----------------------------------------------------------------
+Grid : Message : 153.622995 s : Called DwDag
+Grid : Message : 153.622996 s : norm dag result 4.12801829
+Grid : Message : 153.623604 s : norm dag ref    4.12801829
+Grid : Message : 153.626098 s : norm dag diff   3.42093991e-14  Line 377
+Grid : Message : 153.691426 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
+Grid : Message : 154.148319 s : src_e0.500004005
+Grid : Message : 154.151454 s : src_o0.499996067
+Grid : Message : 154.153722 s : *********************************************************
+Grid : Message : 154.153724 s : * Benchmarking DomainWallFermion::DhopEO                
+Grid : Message : 154.153725 s : * Vectorising space-time by 8
+Grid : Message : 154.153726 s : * Using Overlapped Comms/Compute
+Grid : Message : 154.153727 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 154.153728 s : *********************************************************
+Grid : Message : 155.200671 s : Deo mflop/s =   51121022.4
+Grid : Message : 155.200682 s : Deo mflop/s per rank   3195063.9
+Grid : Message : 155.200684 s : Deo mflop/s per node   12780255.6
+Grid : Message : 155.217204 s : r_e2.06443136
+Grid : Message : 155.217550 s : r_o2.06378451
+Grid : Message : 155.217869 s : res4.12821587
+Grid : Message : 155.673744 s : norm diff   0
+Grid : Message : 156.463329 s : norm diff even  0
+Grid : Message : 156.878866 s : norm diff odd   0
+
+
+
+
+
+
+
+Grid : Message : 157.620761 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 157.620764 s :  Testing without intranode communication 
+Grid : Message : 157.620765 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 157.620766 s : Grid Layout
+Grid : Message : 157.620766 s : 	Global lattice size  : 64 64 64 64 
+Grid : Message : 157.620773 s : 	OpenMP threads       : 4
+Grid : Message : 157.620774 s : 	MPI tasks            : 2 2 2 2 
+Grid : Message : 157.671479 s : Initialising 4d RNG
+Grid : Message : 157.738691 s : Intialising parallel RNG with unique string 'The 4D RNG'
+Grid : Message : 157.738698 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
+Grid : Message : 157.755651 s : Initialising 5d RNG
+Grid : Message : 158.848676 s : Intialising parallel RNG with unique string 'The 5D RNG'
+Grid : Message : 158.848685 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
+Grid : Message : 202.465158 s : Drawing gauge field
+Grid : Message : 213.214546 s : Random gauge initialised 
+Grid : Message : 213.214561 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
+Grid : Message : 213.214563 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
+Grid : Message : 213.217711 s : Setting up Cshift based reference 
+Grid : Message : 219.662772 s : *****************************************************************
+Grid : Message : 219.662786 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
+Grid : Message : 219.662787 s : *****************************************************************
+Grid : Message : 219.662788 s : *****************************************************************
+Grid : Message : 219.662788 s : * Benchmarking DomainWallFermionR::Dhop                  
+Grid : Message : 219.662789 s : * Vectorising space-time by 8
+Grid : Message : 219.662790 s : * VComplex size is 64 B
+Grid : Message : 219.662791 s : * Using Overlapped Comms/Compute
+Grid : Message : 219.662791 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 219.662791 s : *****************************************************************
+Grid : Message : 220.425592 s : Called warmup
+Grid : Message : 222.536249 s : Called Dw 300 times in 2110597 us
+Grid : Message : 222.536267 s : mflop/s =   50365105.5
+Grid : Message : 222.536269 s : mflop/s per rank =  3147819.09
+Grid : Message : 222.536270 s : mflop/s per node =  12591276.4
+Grid : Message : 222.541053 s : norm diff   5.80156793e-14  Line 306
+Grid : Message : 232.135901 s : ----------------------------------------------------------------
+Grid : Message : 232.135915 s : Compare to naive wilson implementation Dag to verify correctness
+Grid : Message : 232.135916 s : ----------------------------------------------------------------
+Grid : Message : 232.135917 s : Called DwDag
+Grid : Message : 232.135918 s : norm dag result 4.12801829
+Grid : Message : 232.151938 s : norm dag ref    4.12801829
+Grid : Message : 232.154451 s : norm dag diff   3.42093991e-14  Line 377
+Grid : Message : 232.216117 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
+Grid : Message : 232.630529 s : src_e0.500004005
+Grid : Message : 232.643197 s : src_o0.499996067
+Grid : Message : 232.645527 s : *********************************************************
+Grid : Message : 232.645529 s : * Benchmarking DomainWallFermion::DhopEO                
+Grid : Message : 232.645532 s : * Vectorising space-time by 8
+Grid : Message : 232.645533 s : * Using Overlapped Comms/Compute
+Grid : Message : 232.645534 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 232.645535 s : *********************************************************
+Grid : Message : 233.774184 s : Deo mflop/s =   47432091.9
+Grid : Message : 233.774194 s : Deo mflop/s per rank   2964505.74
+Grid : Message : 233.774196 s : Deo mflop/s per node   11858023
+Grid : Message : 233.791552 s : r_e2.06443136
+Grid : Message : 233.791899 s : r_o2.06378451
+Grid : Message : 233.792204 s : res4.12821587
+Grid : Message : 234.230783 s : norm diff   0
+Grid : Message : 235.162780 s : norm diff even  0
+Grid : Message : 235.291950 s : norm diff odd   0
+Grid : Message : 235.765411 s : *******************************************
+Grid : Message : 235.765424 s : ******* Grid Finalize                ******
+Grid : Message : 235.765425 s : *******************************************
+
--- a/systems/Jupiter/benchmarks/dwf1.slurm
+++ b/systems/Jupiter/benchmarks/dwf1.slurm
@@ -0,0 +1,57 @@
+#!/bin/sh
+#SBATCH --account=jureap14
+#SBATCH --nodes=1
+#SBATCH --ntasks=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=64
+#SBATCH --time=2:00:00
+#SBATCH --partition=booster
+#SBATCH --gres=gpu:4
+
+export OMP_NUM_THREADS=4
+export OMPI_MCA_btl=^uct,openib
+export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
+export UCX_RNDV_SCHEME=put_zcopy
+export UCX_RNDV_THRESH=16384
+export UCX_IB_GPU_DIRECT_RDMA=yes
+export UCX_MEMTYPE_CACHE=n
+
+OPT="--comms-overlap"
+
+source ../sourceme.sh
+
+cat << EOF > bind_gpu
+#!/bin/bash
+export GPU_MAP=(0 1 2 3)
+export NUMA_MAP=(0 1 2 3)
+export NIC_MAP=(0 1 2 3)
+export GPU=\$SLURM_LOCALID
+export NUMA=\$SLURM_LOCALID
+export NIC=\$SLURM_LOCALID
+export CUDA_VISIBLE_DEVICES=\$GPU
+export UCX_NET_DEVICES=mlx5_\${NIC}:1
+
+echo RANK \$SLURM_LOCALID using NUMA \$NUMA  GPU \$GPU NIC \$UCX_NET_DEVICES
+exec numactl -m \$NUMA -N \$NUMA \$*
+EOF
+
+chmod +x ./bind_gpu
+
+srun --cpu-bind=no -N 1 -n $SLURM_NTASKS \
+        ./bind_gpu ./Benchmark_dwf_fp32 \
+	$OPT \
+	--mpi 1.1.2.2 \
+	--accelerator-threads 8 \
+	--grid 32.32.64.64 \
+	--shm 2048 > dwf.1node.perf
+
+srun --cpu-bind=no -N 1  -n $SLURM_NTASKS \
+	./bind_gpu ./Benchmark_comms_host_device \
+	--mpi 1.1.2.2 \
+	--accelerator-threads 8 \
+	--grid 32.32.64.64 \
+	--shm 2048 > comms.1node.perf
+
+
+
+
--- a/systems/Jupiter/benchmarks/dwf4.slurm
+++ b/systems/Jupiter/benchmarks/dwf4.slurm
@@ -0,0 +1,57 @@
+#!/bin/sh
+#SBATCH --account=jureap14
+#SBATCH --nodes=4
+#SBATCH --ntasks=16
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=64
+#SBATCH --time=2:00:00
+#SBATCH --partition=booster
+#SBATCH --gres=gpu:4
+
+export OMP_NUM_THREADS=4
+export OMPI_MCA_btl=^uct,openib
+export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
+export UCX_RNDV_SCHEME=put_zcopy
+export UCX_RNDV_THRESH=16384
+export UCX_IB_GPU_DIRECT_RDMA=yes
+export UCX_MEMTYPE_CACHE=n
+
+OPT="--comms-overlap"
+
+source ../sourceme.sh
+
+cat << EOF > bind_gpu
+#!/bin/bash
+export GPU_MAP=(0 1 2 3)
+export NUMA_MAP=(0 1 2 3)
+export NIC_MAP=(0 1 2 3)
+export GPU=\$SLURM_LOCALID
+export NUMA=\$SLURM_LOCALID
+export NIC=\$SLURM_LOCALID
+export CUDA_VISIBLE_DEVICES=\$GPU
+export UCX_NET_DEVICES=mlx5_\${NIC}:1
+
+echo RANK \$SLURM_LOCALID using NUMA \$NUMA  GPU \$GPU NIC \$UCX_NET_DEVICES
+exec numactl -m \$NUMA -N \$NUMA \$*
+EOF
+
+chmod +x ./bind_gpu
+
+srun --cpu-bind=no -N 4 -n $SLURM_NTASKS \
+        ./bind_gpu ./Benchmark_dwf_fp32 \
+	$OPT \
+	--mpi 2.2.2.2 \
+	--accelerator-threads 8 \
+	--grid 64.64.64.64 \
+	--shm 2048 > dwf.4node.perf
+
+srun --cpu-bind=no -N 4  -n $SLURM_NTASKS \
+	./bind_gpu ./Benchmark_comms_host_device \
+	--mpi 2.2.2.2 \
+	--accelerator-threads 8 \
+	--grid 32.32.64.64 \
+	--shm 2048 > comms.4node.perf
+
+
+
+
--- a/systems/Jupiter/config-command
+++ b/systems/Jupiter/config-command
@@ -0,0 +1,16 @@
+export CXX=nvcc
+export OPENMPI=/p/software/default/stages/2025/software/OpenMPI/5.0.5-NVHPC-24.9-CUDA-12/
+export LDFLAGS="-cudart shared -L${OPENMPI}/lib" 
+export CXXFLAGS="-ccbin clang++ -gencode arch=compute_90,code=sm_90 -std=c++17 -cudart shared -lcublas -lmpi -I${OPENMPI}/include"
+
+../../configure \
+    --enable-comms=mpi \
+    --enable-simd=GPU \
+    --enable-gen-simd-width=64 \
+    --enable-shm=nvlink \
+    --enable-accelerator=cuda \
+    --with-lime=$CLIME \
+    --disable-gparity \
+    --disable-fermion-reps \
+    --disable-unified 
+
--- a/systems/Jupiter/sourceme.sh
+++ b/systems/Jupiter/sourceme.sh
@@ -0,0 +1,10 @@
+export CLIME=$HOME/install/   # exported so a config-command run as a child process still sees it
+module load Clang
+module load CUDA
+module load FFTW
+module load OpenSSL
+module load MPFR
+module load NVHPC
+module load UCX
+module load OpenMPI
+ulimit -c 0
--- a/systems/mac-arm/config-command-mpi
+++ b/systems/mac-arm/config-command-mpi
@@ -7,8 +7,6 @@ CXX=mpicxx ../../configure \
 	   --enable-unified=yes \
 	   --prefix /Users/peterboyle/QCD/vtk/Grid/install \
 	   --with-lime=$CLIME \
-	   --with-hdf5=$HDF5 \
-	   --with-fftw=$FFTW \
 	   --with-openssl=$OPENSSL \
 	   --with-gmp=$GMP \
 	   --with-mpfr=$MPFR \
--- a/systems/sdcc-genoa/config-command
+++ b/systems/sdcc-genoa/config-command
@@ -1,3 +1,12 @@
+
+spack load c-lime
+spack load fftw
+spack load hdf5+cxx
+
+export FFTW=`spack find --paths fftw       | grep ^fftw   | awk '{print $2}' `
+export HDF5=`spack find --paths hdf5+cxx   | grep ^hdf5   | awk '{print $2}' `
+export CLIME=`spack find --paths c-lime    | grep ^c-lime | awk '{print $2}' `
+
 ../../configure \
 --enable-comms=mpi-auto \
 --enable-unified=yes \
@@ -5,12 +14,16 @@
 --enable-shm-fast-path=shmopen \
 --enable-accelerator=none \
 --enable-simd=AVX512 \
--disable-accelerator-cshift \
+--with-lime=$CLIME \
+--with-hdf5=$HDF5 \
+--with-fftw=$FFTW \
 --disable-fermion-reps \
 --disable-gparity \
 CXX=clang++ \
 MPICXX=mpicxx \
-CXXFLAGS="-std=c++17"
+LIBS=-llime \
+LDFLAGS=-L$CLIME/lib/ \
+CXXFLAGS="-std=c++17 -fPIE"



--- a/systems/sdcc-genoa/sourceme.sh
+++ b/systems/sdcc-genoa/sourceme.sh
@@ -1,4 +1,5 @@
 source $HOME/spack/share/spack/setup-env.sh
 spack load llvm@17.0.4
 export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
-module load openmpi
+module load openmpi/4.1.8
+spack load c-lime
--- a/tests/lanczos/LanParams.xml
+++ b/tests/lanczos/LanParams.xml
@@ -1,15 +1,14 @@
 <?xml version="1.0"?>
 <grid>
  <LanczosParameters>
-    <mass>-1.025</mass>
-    <mstep>-0.025</mstep>
+    <mass>0.00107</mass>
    <M5>1.8</M5>
    <Ls>48</Ls>
    <Nstop>10</Nstop>
-    <Nk>12</Nk>
-    <Np>30</Np>
-    <ChebyLow>0.1</ChebyLow>
-    <ChebyHigh>50</ChebyHigh>
-    <ChebyOrder>51</ChebyOrder>
+    <Nk>15</Nk>
+    <Np>85</Np>
+    <ChebyLow>0.003</ChebyLow>
+    <ChebyHigh>60</ChebyHigh>
+    <ChebyOrder>201</ChebyOrder>
  </LanczosParameters>
 </grid>
--- a/tests/lanczos/Test_dwf_G5R5.cc
+++ b/tests/lanczos/Test_dwf_G5R5.cc
@@ -31,16 +31,23 @@ directory

 using namespace std;
 using namespace Grid;
- ;

-#if 0
+//typedef WilsonFermionD FermionOp;
 typedef DomainWallFermionD FermionOp;
 typedef typename DomainWallFermionD::FermionField FermionField;
-#else
-typedef MobiusFermionD FermionOp;
-typedef typename MobiusFermionD::FermionField FermionField;
-#endif

+template <class T> void writeFile(T& in, std::string const fname){  // Write `in` to the file `fname` through Grid::ScidacWriter (only when built with HAVE_LIME)
+#ifdef HAVE_LIME
+  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
+  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
+  Grid::emptyUserRecord record; // empty user record passed alongside the field
+  Grid::ScidacWriter WR(in.Grid()->IsBoss()); // writer is constructed with the IsBoss() flag of the field's grid
+  WR.open(fname);
+  WR.writeScidacFieldRecord(in,record,0);
+  WR.close();
+#endif
+  // Without HAVE_LIME this function is a silent no-op. TODO: What is the appropriate way to throw error?
+}

 RealD AllZero(RealD x) { return 0.; }

@@ -125,7 +132,7 @@ int main(int argc, char** argv) {

  int Ls=16;
  RealD M5=1.8;
-  RealD mass = -1.0;
+  RealD mass = 0.01;

  mass=LanParams.mass;
  Ls=LanParams.Ls;
@@ -163,22 +170,21 @@ int main(int argc, char** argv) {
    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
  }
 */
-
-  int Nstop = 10;
  int Nk = 20;
+  int Nstop = Nk;
  int Np = 80;
+
  Nstop=LanParams.Nstop;
  Nk=LanParams.Nk;
  Np=LanParams.Np;

  int Nm = Nk + Np;
-  int MaxIt = 10000;
-  RealD resid = 1.0e-5;
-  RealD mob_b=1.5;
+  int MaxIt = 100;
+  RealD resid = 1.0e-4;
+

 //while ( mass > - 5.0){
-//  FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.);
+  FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <-----
 //  Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
  Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf);
@@ -206,8 +212,12 @@ int main(int argc, char** argv) {
  int Nconv;
  IRL.calc(eval, evec, src, Nconv);

-  std::cout << mass <<" : " << eval << std::endl;
-
+  std::cout << mass <<" : " << eval        << std::endl;
+  std::cout << " #evecs "   << evec.size() << std::endl;
+  std::cout << " Nconv  "   << Nconv       << std::endl;
+  std::cout << " Nm     "   << Nm          << std::endl;
+  if ( Nconv > evec.size() ) Nconv = evec.size();
+  
 #if 0
  Gamma g5(Gamma::Algebra::Gamma5) ;
  ComplexD dot;
@@ -237,6 +247,7 @@ int main(int argc, char** argv) {
  vector<LatticeFermion> finalevec(Nconv, FGrid);
  vector<RealD> eMe(Nconv), eMMe(Nconv);
  for(int i = 0; i < Nconv; i++){
+    cout << "calculate the matrix element["<<i<<"]" << endl;
    G5R5Herm.HermOpAndNorm(evec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
  }
  cout << "Re<evec, G5R5M(evec)>: " << endl;
@@ -303,7 +314,7 @@ int main(int argc, char** argv) {
      }
    }
  }
-    for(int i = 0; i < Nconv; i++){
+  for(int i = 0; i < Nconv; i++){
    G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
  }
  cout << "Re<evec, G5R5M(evec)>: " << endl;
@@ -311,6 +322,7 @@ int main(int argc, char** argv) {
  cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
  cout << eMMe << endl;

+  

 //  vector<LatticeFermion> finalevec(Nconv, FGrid);
 // temporary, until doing rotation
@@ -331,13 +343,41 @@ int main(int argc, char** argv) {
      axpby_ssp(G5evec[i], -1., finalevec[i], 0., G5evec[i], j, j);
    }
  }
+  
+  for(int i = 0; i < Nconv; i++){
+    Ddwf.M(finalevec[i], G5R5Mevec[i]);
+    for(int j = 0; j < Nconv; j++){
+      std::cout << "<"<<j<<"|Ddwf|"<<i<<"> = "<<innerProduct(finalevec[j],G5R5Mevec[i])<<std::endl;
+    }
+  }
+  for(int i = 0; i < Nconv; i++){
+    RealD t1,t2;
+    G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], t1, t2);
+    for(int j = 0; j < Nconv; j++){
+      std::cout << "<"<<j<<"|G5R5 M|"<<i<<"> = "<<innerProduct(finalevec[j],G5R5Mevec[i])<<std::endl;
+    }
+  }
+  
  for(int i = 0; i < Nconv; i++){
    chiral_matrix_real[i].resize(Nconv);
    chiral_matrix[i].resize(Nconv);
+
+    std::string evfile("./evec_density");
+    evfile = evfile+"_"+std::to_string(i);
+    auto evdensity = localInnerProduct(finalevec[i],finalevec[i] );
+    writeFile(evdensity,evfile);
+
    for(int j = 0; j < Nconv; j++){
      chiral_matrix[i][j] = innerProduct(finalevec[i], G5evec[j]);
+      std::cout <<" chiral_matrix_real signed "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
      chiral_matrix_real[i][j] = abs(chiral_matrix[i][j]);
      std::cout <<" chiral_matrix_real "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
+      if ( chiral_matrix_real[i][j] > 0.8 ) {
+	auto g5density = localInnerProduct(finalevec[i], G5evec[j]);
+	std::string chfile("./chiral_density_");
+	chfile = chfile +std::to_string(i)+"_"+std::to_string(j);
+	writeFile(g5density,chfile);
+      }
    }
  }
  for(int i = 0; i < Nconv; i++){
@@ -346,6 +386,43 @@ int main(int argc, char** argv) {
    }
  }

-
+  FILE *fp = fopen("lego-plot.py","w"); assert(fp!=NULL);
+#define PYTHON_LINE(A)  fprintf(fp,A"\n");
+  PYTHON_LINE("import matplotlib.pyplot as plt");
+  PYTHON_LINE("import numpy as np");
+  PYTHON_LINE("");
+  PYTHON_LINE("fig = plt.figure()");
+  PYTHON_LINE("ax = fig.add_subplot(projection='3d')");
+  PYTHON_LINE("");
+  PYTHON_LINE("x, y = np.random.rand(2, 100) * 4");
+  fprintf(fp,"hist, xedges, yedges = np.histogram2d(x, y, bins=%d, range=[[0, %d], [0, %d]])\n",Nconv,Nconv-1,Nconv-1);
+  PYTHON_LINE("");
+  PYTHON_LINE("# Construct arrays for the anchor positions of the 16 bars");
+  PYTHON_LINE("xpos, ypos = np.meshgrid(xedges[:-1] + 0.25, yedges[:-1] + 0.25, indexing=\"ij\")");
+  PYTHON_LINE("xpos = xpos.ravel()");
+  PYTHON_LINE("ypos = ypos.ravel()");
+  PYTHON_LINE("zpos = 0");
+  PYTHON_LINE("");
+  PYTHON_LINE("# Construct arrays with the dimensions for the 16 bars.");
+  PYTHON_LINE("dx = dy = 0.5 * np.ones_like(zpos)");
+  PYTHON_LINE("dz = np.array([");
+  for(int i = 0; i < Nconv; i++){
+    fprintf(fp,"\t[ ");
+    for(int j = 0; j < Nconv; j++){
+      fprintf(fp,"%lf ",chiral_matrix_real[i][j]);
+      if(j<Nconv-1) fprintf(fp,",");
+      else          fprintf(fp," ");
+    }
+    fprintf(fp,"]");
+    if(i<Nconv-1) fprintf(fp,",\n");
+    else          fprintf(fp,"\n");
+  }
+	      
+  PYTHON_LINE("\t])");
+  PYTHON_LINE("dz = dz.ravel()");
+  PYTHON_LINE("ax.bar3d(xpos, ypos, zpos, dx, dy, dz, zsort='average')");
+  PYTHON_LINE("plt.show()");
+  fclose(fp);
+  
  Grid_finalize();
 }
--- a/tests/lanczos/Test_wilson_DWFKernel.cc
+++ b/tests/lanczos/Test_wilson_DWFKernel.cc
@@ -113,9 +113,6 @@ struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 		  		RealD, mass , 
 				RealD, resid,
-				Integer, Nstop,
-                                Integer, Nk,
-                                Integer, Np,
 	  			RealD, ChebyLow,
 	  			RealD, ChebyHigh,
 	  			Integer, ChebyOrder)
@@ -207,6 +204,7 @@ int main(int argc, char** argv) {
  int Nstop = 5;
  int Nk = 10;
  int Np = 90;
+  int Nm = Nk + Np;
  int MaxIt = 10000;
  RealD resid = 1.0e-5;

@@ -228,14 +226,10 @@ int main(int argc, char** argv) {
    XmlWriter HMCwr("LanParams.xml.out");
    write(HMCwr,"LanczosParameters",LanParams);
  }
-  Nstop=LanParams.Nstop;
-  Nk=LanParams.Nk;
-  Np=LanParams.Np;
+
  mass=LanParams.mass;
  resid=LanParams.resid;

-  int Nm = Nk + Np;
-

 while ( mass > - 5.0){
  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass);
--- a/tests/lanczos/Test_wilson_lanczos.cc
+++ b/tests/lanczos/Test_wilson_lanczos.cc
@@ -61,8 +61,7 @@ int main(int argc, char** argv) {
  RNG5.SeedFixedIntegers(seeds5);

  LatticeGaugeField Umu(UGrid);
-//  SU<Nc>::HotConfiguration(RNG4, Umu);
-  SU<Nc>::ColdConfiguration(Umu);
+  SU<Nc>::HotConfiguration(RNG4, Umu);

 /*
  std::vector<LatticeColourMatrix> U(4, UGrid);
@@ -70,15 +69,9 @@ int main(int argc, char** argv) {
    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
  }
 */
-//  std::vector<Complex> boundary = {1,1,1,-1};
-  std::vector<Complex> boundary = {1,1,1,1};
-  FermionOp::ImplParams Params(boundary);

-
-
-  RealD mass = 0.0;
-//  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
-  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass,Params);
+  RealD mass = -0.1;
+  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
  MdagMLinearOperator<FermionOp,LatticeFermion> HermOp(WilsonOperator); /// <-----
  //SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);

@@ -96,8 +89,7 @@ int main(int argc, char** argv) {
  FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
     PlainHermOp<FermionField> Op     (HermOp);

-//  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
-  SimpleLanczos<FermionField> IRL(Op,Nstop, Nk, Nm, resid, MaxIt);
+  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);

  std::vector<RealD> eval(Nm);
  FermionField src(FGrid);
@@ -109,8 +101,7 @@ int main(int argc, char** argv) {
  };

  int Nconv;
-//  IRL.calc(eval, evec, src, Nconv);
-  IRL.calc(eval, src, Nconv);
+  IRL.calc(eval, evec, src, Nconv);

  std::cout << eval << std::endl;

--- a/tests/lanczos/Test_wilson_specflow.cc
+++ b/tests/lanczos/Test_wilson_specflow.cc
@@ -27,7 +27,6 @@ directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
-#include <Grid/parallelIO/IldgIOtypes.h>

 using namespace std;
 using namespace Grid;
@@ -39,29 +38,11 @@ typedef typename WilsonFermionD::FermionField FermionField;

 RealD AllZero(RealD x) { return 0.; }

-template <class T> void writeFile(T& in, std::string const fname){
-#if 1
-  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
-  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
-  Grid::emptyUserRecord record;
-  Grid::ScidacWriter WR(in.Grid()->IsBoss());
-  WR.open(fname);
-  WR.writeScidacFieldRecord(in,record,0);
-  WR.close();
-#endif
-  // What is the appropriate way to throw error?
-}
-
-
 namespace Grid {

 struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 		  		RealD, mass , 
-		  		RealD, mstep , 
-				Integer, Nstop,
-                                Integer, Nk,
-                                Integer, Np,
 	  			RealD, ChebyLow,
 	  			RealD, ChebyHigh,
 	  			Integer, ChebyOrder)
@@ -134,7 +115,6 @@ int main(int argc, char** argv) {

  LatticeGaugeField Umu(UGrid);
 //  SU<Nc>::HotConfiguration(RNG4, Umu);
-//  SU<Nc>::ColdConfiguration(Umu);

  FieldMetaData header;
  std::string file("./config");
@@ -178,20 +158,10 @@ int main(int argc, char** argv) {
  }

  mass=LanParams.mass;
-  Nstop=LanParams.Nstop;
-  Nk=LanParams.Nk;
-  Np=LanParams.Np;
-  Nm = Nk + Np;
-
-  FermionField src(FGrid);
-  gaussian(RNG5, src);
-  std::vector<Complex> boundary = {1,1,1,-1};
-//  std::vector<Complex> boundary = {1,1,1,1};
-  FermionOp::ImplParams Params(boundary);


-while ( mass > - 2.5){
-  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass,Params);
+while ( mass > - 5.0){
+  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
  MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
  //SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
  Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
@@ -210,9 +180,10 @@ while ( mass > - 2.5){
     PlainHermOp<FermionField> Op2     (HermOp2);

  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
-//  SimpleLanczos<FermionField> IRL(Op,Nstop, Nk, Nm, resid, MaxIt);

  std::vector<RealD> eval(Nm);
+  FermionField src(FGrid);
+  gaussian(RNG5, src);
  std::vector<FermionField> evec(Nm, FGrid);
  for (int i = 0; i < 1; i++) {
    std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
@@ -221,7 +192,6 @@ while ( mass > - 2.5){

  int Nconv;
  IRL.calc(eval, evec, src, Nconv);
-//  IRL.calc(eval,  src, Nconv);

  std::cout << mass <<" : " << eval << std::endl;

@@ -232,17 +202,9 @@ while ( mass > - 2.5){
    tmp = g5*evec[i];
    dot = innerProduct(tmp,evec[i]);
    std::cout << mass << " : " << eval[i]  << " " << real(dot) << " " << imag(dot)  << std::endl ;
-//    if ( i<1)
-    {
-	std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
-        auto evdensity = localInnerProduct(evec[i],evec[i] );
-	writeFile(evdensity,evfile);
-    }
  }
  src  = evec[0]+evec[1]+evec[2];
-  src  += evec[3]+evec[4]+evec[5];
-  src  += evec[6]+evec[7]+evec[8];
-  mass += LanParams.mstep;
+  mass += -0.1;
 }

  Grid_finalize();
--- a/visualisation/CMakeLists.txt
+++ b/visualisation/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.12 FATAL_ERROR)

 project(GridViewer)

-list(APPEND CMAKE_PREFIX_PATH "/Users/peterboyle/QCD/vtk/VTK-9.4.2-install/")
+list(APPEND CMAKE_PREFIX_PATH "/home/paboyle/Visualisation/install/")

 find_package(VTK COMPONENTS 
  CommonColor
--- a/visualisation/FieldDensityAnimate.cxx
+++ b/visualisation/FieldDensityAnimate.cxx
@@ -48,15 +48,14 @@ typedef vtkMarchingCubes isosurface;

 int mpeg = 0 ;
 int xlate = 0 ;
+int framerate = 10;

 template <class T> void readFile(T& out, std::string const fname){
-#ifdef HAVE_LIME
  Grid::emptyUserRecord record;
  Grid::ScidacReader RD;
  RD.open(fname);
  RD.readScidacFieldRecord(out,record);
  RD.close();
-#endif
 }
 using namespace Grid;

@@ -208,6 +207,10 @@ int main(int argc, char* argv[])
    xlate = 1;
  }

+  if( GridCmdOptionExists(argv,argv+argc,"--fps") ){
+    arg=GridCmdOptionPayload(argv,argv+argc,"--fps");
+    GridCmdOptionInt(arg,framerate);
+  }
  if( GridCmdOptionExists(argv,argv+argc,"--isosurface") ){
    arg=GridCmdOptionPayload(argv,argv+argc,"--isosurface");
    GridCmdOptionFloat(arg,default_contour);
@@ -420,7 +423,7 @@ int main(int argc, char* argv[])
    
    vtkFFMPEGWriter *writer = vtkFFMPEGWriter::New();
    writer->SetFileName("movie.avi");
-    writer->SetRate(1);
+    writer->SetRate(framerate);
    writer->SetInputConnection(imageFilter->GetOutputPort());
    writer->Start();

@@ -477,7 +480,7 @@ int main(int argc, char* argv[])
    slidercallback->fu_list = fu_list;
    sliderWidget->AddObserver(vtkCommand::InteractionEvent, slidercallback);

-    int timerId = iren->CreateRepeatingTimer(300);
+    int timerId = iren->CreateRepeatingTimer(1000/framerate);
    std::cout << "timerId: " << timerId << std::endl;

    // Start the interaction and timer
--- a/visualisation/README
+++ b/visualisation/README
@@ -73,6 +73,21 @@ each to:

   VTK really should make it easier to pick up the flags required for FFMPEG linkage, especially as they are very quirky on MacOS.

+========================================
+Aurora compilation:
+========================================
+module load ffmpeg
+download & untar: VTK-7.0.2
+mkdir build
+cd build 
+ccmake ../
+
+"t"
+Enable: VTK_MODULE_ENABLE_VTK_IOFFMPEG   YES   
+"configure" ; should "discover" the installed ffmpeg module
+
+Still need an "X" connection to make the MPEG files.
+

 ========================================
 Grid:
@@ -110,4 +125,29 @@ Extensions

 8) Example python code: FieldDensity.py . This is not interfaced to Grid.

+================
+Windowless generation of AVI files: must enable offscreen rendering. From Shuhei Yamamoto:
+================
+Hi Peter,
+ 
+To make visualization work on Frontier, I did the following.
+ 
+For headless off-screen rendering, ccmake tabs in advanced mode shown below are set as indicated.
+VTK_OPENGL_HAS_* off  
+VTK_USE_X off  
+VTK_DEFAULT_RENDER_WINDOW_OFFSCREEN on
+VTK_DEFAULT_RENDER_WINDOW_HEADLESS on 
+This list may include more settings than are strictly necessary.
+ 
+VTK can fall back to EGL or OSMesa at runtime. So I installed mesa via spack (as well as nasm and yasm).  Either mesa or meson package requires llvm-config, which is included after rocm6.1.  On Frontier, I used /opt/rocm-6.2.4.  The only problem is that llvm-config is located on /opt/rocm-6.2.4/llvm/bin, instead of /opt/rocm-6.2.4/bin.  So I edited packages.yaml for spack so that the prefix for rocm compiler is /opt/rocm-6.2.4/llvm.  Just in case, I also changed c and cxx to /opt/rocm-6.2.4/llvm/bin/amdclang, amdclang++, respectively, but this change might not be necessary. 
+After installation, I added a path to libOSMesa.so to LD_LIBRARY_PATH, for which there might be a better way such as specifying -rpath for OSMesa lib by editing cmake files.
+ 
+In addition, I have edited CMakeLists.txt for vtk to force vtk to find the OSMesa package via find_package(OSMesa REQUIRED) after list(INSERT CMAKE_MODULE_PATH 0 "${vtk_cmake_dir}"), as there is a Find package for it in vtk/CMake.  There is probably a more elegant method, but I was not able to find a tab to switch on OSMesa. 
+ 
+When I compiled vtk and linked to Grid visualization code, with ffmpeg option, it produces avi file.
+ 
+Best,
+Shuhei
+
+

--- a/visualisation/cmake-command
+++ b/visualisation/cmake-command
@@ -1,9 +1,17 @@
-libs=`grid-config --libs`
-ldflags=`grid-config --ldflags`
-cxxflags=`grid-config --cxxflags`
-cxx=`grid-config --cxx`
+export grid_config=/home/paboyle/GPT/install/bin/grid-config
+libs=`$grid_config --libs`
+ldflags=`$grid_config --ldflags`
+cxxflags=`$grid_config --cxxflags`
+cxx=`$grid_config --cxx`
+cc=icx

 mkdir build
 cd build

-LDFLAGS="$ldflags $libs " cmake .. -DCMAKE_CXX_COMPILER=$cxx -DCMAKE_CXX_FLAGS=$cxxflags 
+echo CC $cc
+echo CXX $cxx
+echo CXXFLAGS $cxxflags
+echo LDFLAGS  $ldflags
+echo LIBS  $libs
+
+LDFLAGS="$ldflags $libs " cmake .. -DCMAKE_C_COMPILER=$cc -DCMAKE_CXX_COMPILER="$cxx" -DCMAKE_CXX_FLAGS="$cxxflags "
Author	SHA1	Message	Date
Peter Boyle	73af020f98	improved	2025-06-27 06:08:54 +00:00
Peter Boyle	bffb83c46e	std::cout<<GridLogMessage<<"Debug:"<<std::endl; std::cout<<GridLogMessage<<" --dylib-map : print dynamic library map, useful for interpreting signal backtraces "<<std::endl; std::cout<<GridLogMessage<<" --heartbeat : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl; std::cout<<GridLogMessage<<" --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl; std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl; std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl; std::cout<<GridLogMessage<<" --debug-heartbeat : periodically report backtrace "<<std::endl; --dylib-map : Grid prints its dylib regions --heartbeat : itimer based / SIGALRM wake up which seems to make Aurora more stable --debug-heartbeat : periodically report to stderr where we are in code Now have libunwind option (configure: --with-unwind=<prefix>) to give an Asynch-Signal safe backtrace. Avoid glibc backtrace due to mallocs.	2025-06-27 06:08:54 +00:00
Peter Boyle	7031f37350	Use libunwind for backtrace as it is signal asynch safe	2025-06-27 06:08:54 +00:00
Peter Boyle	829dd74cb2	Verbose change	2025-06-27 06:08:54 +00:00
Peter Boyle	66e671985d	P2P	2025-06-27 06:08:54 +00:00
Peter Boyle	5afcbcf0f3	Cshift uses flight recorder	2025-06-27 06:08:54 +00:00
Peter Boyle	9730579312	Simplify and verbose	2025-06-27 06:08:51 +00:00
Peter Boyle	bfae14d035	More flight logging	2025-06-27 06:07:34 +00:00
Peter Boyle	b78fc73d19	Better signal handler	2025-06-27 06:07:34 +00:00
Peter Boyle	709f8ae76c	Update README	2025-06-26 23:06:11 -04:00
Peter Boyle	7aa06329d0	Update for new stencil compression options	2025-06-17 18:06:19 +02:00
Peter Boyle	9d6a38c44c	Compressed comms options as Sloppy	2025-06-17 16:43:53 +02:00
Peter Boyle	6ec5cee368	Preparing for compressed comms	2025-06-17 16:38:10 +02:00
Peter Boyle	f2e9a68825	Simplify	2025-06-13 17:32:05 +02:00
Peter Boyle	d88750e6b6	Sloppy + non-sloppy	2025-06-13 16:42:01 +02:00
Peter Boyle	821358eda7	Remove partial dirichlet. Favour intro reduced prec comms options	2025-06-13 05:08:45 +02:00
Peter Boyle	fce6e1f135	Kill core files for quota reasons	2025-06-13 05:08:15 +02:00
Peter Boyle	8f0bb3e676	remove partial dirichlet	2025-06-13 05:07:56 +02:00
Peter Boyle	262c70d967	USe sloppy comms options	2025-06-13 05:07:23 +02:00
Peter Boyle	da43ef7c2d	REmove partial dirichlet option. It's going nowhere	2025-06-13 05:05:15 +02:00
Peter Boyle	7b60ab5df1	Warning suppress	2025-06-13 05:04:55 +02:00
Peter Boyle	f6b961a64e	Warning suppress	2025-06-13 05:04:47 +02:00
Peter Boyle	f1ed988aa3	Interface to reduced precision comms	2025-06-13 05:04:12 +02:00
Peter Boyle	eea51bb604	Suppress annoying warns	2025-06-13 05:03:36 +02:00
Peter Boyle	9203126aa5	Scripts	2025-06-11 15:30:16 +02:00
Peter Boyle	f90ba4712a	Update for Jupiter	2025-06-11 15:24:34 +02:00
Peter Boyle	3737a24096	Updated python output	2025-06-03 14:09:29 -04:00
Peter Boyle	d418f78352	Making running on Aurora more debuggable	2025-05-23 20:58:16 +00:00
Peter Boyle	25163998a0	Makes SYCL compiler happy	2025-05-23 20:57:11 +00:00
Peter Boyle	dc546aaa4b	Updated config options for BNL cluster	2025-05-13 18:44:47 -04:00
Peter Boyle	5364d580c9	Output chirality, eigenvector density files and python source lego plot	2025-05-13 18:44:47 -04:00
Peter Boyle	2a9a6347e3	Do not require Grid format RNGs and also to the 5Li reporting	2025-05-13 18:44:47 -04:00
Peter Boyle	cfdb56f314	Run measurements at t=0 too	2025-05-13 18:44:46 -04:00
Peter Boyle	b517e88db3	Update README	2025-05-13 16:49:21 -04:00
Peter Boyle	bb317aba8d	Lattice = for sycl	2025-05-13 12:50:58 +00:00
Peter Boyle	644cc6647e	JSON update	2025-05-13 12:50:58 +00:00
Peter Boyle	72397ce23b	SYCL interface change	2025-05-13 12:50:58 +00:00
Peter Boyle	d60a80c098	Fixes and visualisation	2025-04-29 18:04:23 -04:00
Peter Boyle	bb8b6d9d73	Fix	2025-04-29 18:04:04 -04:00