Default dimensions fixed

IcosahedralVerted() checks
Config
2025-11-06 22:59:32 +00:00 · 2025-10-09 14:57:22 -04:00 · 2025-10-09 13:35:16 -04:00 · 2025-10-09 13:30:16 -04:00 · 2025-10-08 15:18:31 -04:00 · 2025-10-07 18:49:00 -04:00
73 changed files with 2828 additions and 1159 deletions
--- a/Grid/GridQCDcore.h
+++ b/Grid/GridQCDcore.h
@@ -37,6 +37,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
 #include <Grid/qcd/gparity/Gparity.h>
+#include <Grid/qcd/spin/Pauli.h> // depends on Gparity
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 NAMESPACE_CHECK(GridQCDCore);
--- a/Grid/algorithms/Algorithms.h
+++ b/Grid/algorithms/Algorithms.h
@@ -73,7 +73,6 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
-#include <Grid/algorithms/iterative/SimpleLanczos.h>
 #include <Grid/algorithms/iterative/PowerMethod.h>
 #include <Grid/algorithms/iterative/AdefGeneric.h>
 #include <Grid/algorithms/iterative/AdefMrhs.h>
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -269,7 +269,9 @@ public:
    RealD xscale = 2.0/(hi-lo);
    RealD mscale = -(hi+lo)/(hi-lo);
    Linop.HermOp(T0,y);
+    grid->Barrier();
    axpby(T1,xscale,mscale,y,in);
+    grid->Barrier();

    // sum = .5 c[0] T0 + c[1] T1
    //    out = ()*T0 + Coeffs[1]*T1;
--- a/Grid/algorithms/iterative/SimpleLanczos.h
+++ b/Grid/algorithms/iterative/SimpleLanczos.h
@@ -1,931 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
-
-    Copyright (C) 2015
-
-Author: Chulwoo Jung <chulwoo@bnl.gov>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_LANC_H
-#define GRID_LANC_H
-
-#include <string.h>		//memset
-
-#ifdef USE_LAPACK
-#ifdef USE_MKL
-#include<mkl_lapack.h>
-#else
-void LAPACK_dstegr (char *jobz, char *range, int *n, double *d, double *e,
-		    double *vl, double *vu, int *il, int *iu, double *abstol,
-		    int *m, double *w, double *z, int *ldz, int *isuppz,
-		    double *work, int *lwork, int *iwork, int *liwork,
-		    int *info);
-//#include <lapacke/lapacke.h>
-#endif
-#endif
-
-//#include <Grid/algorithms/densematrix/DenseMatrix.h>
-
-// eliminate temorary vector in calc()
-#define MEM_SAVE
-
-namespace Grid
-{
-
-  struct Bisection
-  {
-
-#if 0
-    static void get_eig2 (int row_num, std::vector < RealD > &ALPHA,
-			  std::vector < RealD > &BETA,
-			  std::vector < RealD > &eig)
-    {
-      int i, j;
-        std::vector < RealD > evec1 (row_num + 3);
-        std::vector < RealD > evec2 (row_num + 3);
-      RealD eps2;
-        ALPHA[1] = 0.;
-        BETHA[1] = 0.;
-      for (i = 0; i < row_num - 1; i++)
-	{
-	  ALPHA[i + 1] = A[i * (row_num + 1)].real ();
-	  BETHA[i + 2] = A[i * (row_num + 1) + 1].real ();
-	}
-      ALPHA[row_num] = A[(row_num - 1) * (row_num + 1)].real ();
-        bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-10, 1e-10, evec1, eps2);
-        bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-16, 1e-16, evec2, eps2);
-
-      // Do we really need to sort here?
-      int begin = 1;
-      int end = row_num;
-      int swapped = 1;
-      while (swapped)
-	{
-	  swapped = 0;
-	  for (i = begin; i < end; i++)
-	    {
-	      if (mag (evec2[i]) > mag (evec2[i + 1]))
-		{
-		  swap (evec2 + i, evec2 + i + 1);
-		  swapped = 1;
-		}
-	    }
-	  end--;
-	  for (i = end - 1; i >= begin; i--)
-	    {
-	      if (mag (evec2[i]) > mag (evec2[i + 1]))
-		{
-		  swap (evec2 + i, evec2 + i + 1);
-		  swapped = 1;
-		}
-	    }
-	  begin++;
-	}
-
-      for (i = 0; i < row_num; i++)
-	{
-	  for (j = 0; j < row_num; j++)
-	    {
-	      if (i == j)
-		H[i * row_num + j] = evec2[i + 1];
-	      else
-		H[i * row_num + j] = 0.;
-	    }
-	}
-    }
-#endif
-
-    static void bisec (std::vector < RealD > &c,
-		       std::vector < RealD > &b,
-		       int n,
-		       int m1,
-		       int m2,
-		       RealD eps1,
-		       RealD relfeh, std::vector < RealD > &x, RealD & eps2)
-    {
-      std::vector < RealD > wu (n + 2);
-
-      RealD h, q, x1, xu, x0, xmin, xmax;
-      int i, a, k;
-
-      b[1] = 0.0;
-      xmin = c[n] - fabs (b[n]);
-      xmax = c[n] + fabs (b[n]);
-      for (i = 1; i < n; i++)
-	{
-	  h = fabs (b[i]) + fabs (b[i + 1]);
-	  if (c[i] + h > xmax)
-	    xmax = c[i] + h;
-	  if (c[i] - h < xmin)
-	    xmin = c[i] - h;
-	}
-      xmax *= 2.;
-
-      eps2 = relfeh * ((xmin + xmax) > 0.0 ? xmax : -xmin);
-      if (eps1 <= 0.0)
-	eps1 = eps2;
-      eps2 = 0.5 * eps1 + 7.0 * (eps2);
-      x0 = xmax;
-      for (i = m1; i <= m2; i++)
-	{
-	  x[i] = xmax;
-	  wu[i] = xmin;
-	}
-
-      for (k = m2; k >= m1; k--)
-	{
-	  xu = xmin;
-	  i = k;
-	  do
-	    {
-	      if (xu < wu[i])
-		{
-		  xu = wu[i];
-		  i = m1 - 1;
-		}
-	      i--;
-	    }
-	  while (i >= m1);
-	  if (x0 > x[k])
-	    x0 = x[k];
-	  while ((x0 - xu) > 2 * relfeh * (fabs (xu) + fabs (x0)) + eps1)
-	    {
-	      x1 = (xu + x0) / 2;
-
-	      a = 0;
-	      q = 1.0;
-	      for (i = 1; i <= n; i++)
-		{
-		  q =
-		    c[i] - x1 -
-		    ((q != 0.0) ? b[i] * b[i] / q : fabs (b[i]) / relfeh);
-		  if (q < 0)
-		    a++;
-		}
-//      printf("x1=%0.14e a=%d\n",x1,a);
-	      if (a < k)
-		{
-		  if (a < m1)
-		    {
-		      xu = x1;
-		      wu[m1] = x1;
-		    }
-		  else
-		    {
-		      xu = x1;
-		      wu[a + 1] = x1;
-		      if (x[a] > x1)
-			x[a] = x1;
-		    }
-		}
-	      else
-		x0 = x1;
-	    }
-	  printf ("x0=%0.14e xu=%0.14e k=%d\n", x0, xu, k);
-	  x[k] = (x0 + xu) / 2;
-	}
-    }
-  };
-
-/////////////////////////////////////////////////////////////
-// Implicitly restarted lanczos
-/////////////////////////////////////////////////////////////
-
-
-  template < class Field > class SimpleLanczos
-  {
-
-    const RealD small = 1.0e-16;
-  public:
-    int lock;
-    int get;
-    int Niter;
-    int converged;
-
-    int Nstop;			// Number of evecs checked for convergence
-    int Nk;			// Number of converged sought
-    int Np;			// Np -- Number of spare vecs in kryloc space
-    int Nm;			// Nm -- total number of vectors
-
-
-    RealD OrthoTime;
-
-    RealD eresid;
-
-//    SortEigen < Field > _sort;
-
-    LinearFunction < Field > &_Linop;
-
-//    OperatorFunction < Field > &_poly;
-
-    /////////////////////////
-    // Constructor
-    /////////////////////////
-    void init (void)
-    {
-    };
-//    void Abort (int ff, std::vector < RealD > &evals, DenseVector < Denstd::vector  < RealD > >&evecs);
-
-    SimpleLanczos (LinearFunction < Field > &Linop,	// op
-//		   OperatorFunction < Field > &poly,	// polynmial
-		   int _Nstop,	// sought vecs
-		   int _Nk,	// sought vecs
-		   int _Nm,	// spare vecs
-		   RealD _eresid,	// resid in lmdue deficit 
-		   int _Niter):	// Max iterations
-     
-      _Linop (Linop),
- //     _poly (poly),
-      Nstop (_Nstop), Nk (_Nk), Nm (_Nm), eresid (_eresid), Niter (_Niter)
-    {
-      Np = Nm - Nk;
-      assert (Np > 0);
-    };
-
-    /////////////////////////
-    // Sanity checked this routine (step) against Saad.
-    /////////////////////////
-    void RitzMatrix (std::vector < Field > &evec, int k)
-    {
-
-      if (1)
-	return;
-
-      GridBase *grid = evec[0].Grid();
-      Field w (grid);
-      std::cout << GridLogMessage << "RitzMatrix " << std::endl;
-      for (int i = 0; i < k; i++)
-	{
-	  _Linop(evec[i], w);
-//      _poly(_Linop,evec[i],w);
-	  std::cout << GridLogMessage << "[" << i << "] ";
-	  for (int j = 0; j < k; j++)
-	    {
-	      ComplexD in = innerProduct (evec[j], w);
-	      if (fabs ((double) i - j) > 1)
-		{
-		  if (abs (in) > 1.0e-9)
-		    {
-		      std::cout << GridLogMessage << "oops" << std::endl;
-		      abort ();
-		    }
-		  else
-		    std::cout << GridLogMessage << " 0 ";
-		}
-	      else
-		{
-		  std::cout << GridLogMessage << " " << in << " ";
-		}
-	    }
-	  std::cout << GridLogMessage << std::endl;
-	}
-    }
-
-    void step (std::vector < RealD > &lmd,
-	       std::vector < RealD > &lme,
-	       Field & last, Field & current, Field & next, uint64_t k)
-    {
-      if (lmd.size () <= k)
-	lmd.resize (k + Nm);
-      if (lme.size () <= k)
-	lme.resize (k + Nm);
-
-
-//      _poly(_Linop,current,next );   // 3. wk:=Avk−βkv_{k−1}
-      _Linop(current, next);	// 3. wk:=Avk−βkv_{k−1}
-      if (k > 0)
-	{
-	  next -= lme[k - 1] * last;
-	}
-//      std::cout<<GridLogMessage << "<last|next>" << innerProduct(last,next) <<std::endl;
-
-      ComplexD zalph = innerProduct (current, next);	// 4. αk:=(wk,vk)
-      RealD alph = real (zalph);
-
-      next = next - alph * current;	// 5. wk:=wk−αkvk
-//      std::cout<<GridLogMessage << "<current|next>" << innerProduct(current,next) <<std::endl;
-
-      RealD beta = normalise (next);	// 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
-      // 7. vk+1 := wk/βk+1
-//       norm=beta;
-
-      int interval = Nm / 100 + 1;
-      if ((k % interval) == 0)
-	std::
-	  cout << GridLogMessage << k << " : alpha = " << zalph << " beta " <<
-	  beta << std::endl;
-      const RealD tiny = 1.0e-20;
-      if (beta < tiny)
-	{
-	  std::cout << GridLogMessage << " beta is tiny " << beta << std::
-	    endl;
-	}
-      lmd[k] = alph;
-      lme[k] = beta;
-
-    }
-
-    void qr_decomp (std::vector < RealD > &lmd,
-		    std::vector  < RealD > &lme,
-		    int Nk,
-		    int Nm,
-		    std::vector  < RealD > &Qt, RealD Dsh, int kmin, int kmax)
-    {
-      int k = kmin - 1;
-      RealD x;
-
-      RealD Fden = 1.0 / hypot (lmd[k] - Dsh, lme[k]);
-      RealD c = (lmd[k] - Dsh) * Fden;
-      RealD s = -lme[k] * Fden;
-
-      RealD tmpa1 = lmd[k];
-      RealD tmpa2 = lmd[k + 1];
-      RealD tmpb = lme[k];
-
-      lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
-      lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
-      lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
-      x = -s * lme[k + 1];
-      lme[k + 1] = c * lme[k + 1];
-
-      for (int i = 0; i < Nk; ++i)
-	{
-	  RealD Qtmp1 = Qt[i + Nm * k];
-	  RealD Qtmp2 = Qt[i + Nm * (k + 1)];
-	  Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
-	  Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
-	}
-
-      // Givens transformations
-      for (int k = kmin; k < kmax - 1; ++k)
-	{
-
-	  RealD Fden = 1.0 / hypot (x, lme[k - 1]);
-	  RealD c = lme[k - 1] * Fden;
-	  RealD s = -x * Fden;
-
-	  RealD tmpa1 = lmd[k];
-	  RealD tmpa2 = lmd[k + 1];
-	  RealD tmpb = lme[k];
-
-	  lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
-	  lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
-	  lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
-	  lme[k - 1] = c * lme[k - 1] - s * x;
-
-	  if (k != kmax - 2)
-	    {
-	      x = -s * lme[k + 1];
-	      lme[k + 1] = c * lme[k + 1];
-	    }
-
-	  for (int i = 0; i < Nk; ++i)
-	    {
-	      RealD Qtmp1 = Qt[i + Nm * k];
-	      RealD Qtmp2 = Qt[i + Nm * (k + 1)];
-	      Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
-	      Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
-	    }
-	}
-    }
-
-#if 0
-#ifdef USE_LAPACK
-#ifdef USE_MKL
-#define LAPACK_INT MKL_INT
-#else
-#define LAPACK_INT long long
-#endif
-    void diagonalize_lapack (std::vector  < RealD > &lmd, std::vector  < RealD > &lme, int N1,	// all
-			     int N2,	// get
-			     GridBase * grid)
-    {
-      const int size = Nm;
-      LAPACK_INT NN = N1;
-      double evals_tmp[NN];
-      double DD[NN];
-      double EE[NN];
-      for (int i = 0; i < NN; i++)
-	for (int j = i - 1; j <= i + 1; j++)
-	  if (j < NN && j >= 0)
-	    {
-	      if (i == j)
-		DD[i] = lmd[i];
-	      if (i == j)
-		evals_tmp[i] = lmd[i];
-	      if (j == (i - 1))
-		EE[j] = lme[j];
-	    }
-      LAPACK_INT evals_found;
-      LAPACK_INT lwork =
-	((18 * NN) >
-	 (1 + 4 * NN + NN * NN) ? (18 * NN) : (1 + 4 * NN + NN * NN));
-      LAPACK_INT liwork = 3 + NN * 10;
-      LAPACK_INT iwork[liwork];
-      double work[lwork];
-      LAPACK_INT isuppz[2 * NN];
-      char jobz = 'N';		// calculate evals only
-      char range = 'I';		// calculate il-th to iu-th evals
-      //    char range = 'A'; // calculate all evals
-      char uplo = 'U';		// refer to upper half of original matrix
-      char compz = 'I';		// Compute eigenvectors of tridiagonal matrix
-      int ifail[NN];
-      LAPACK_INT info;
-//  int total = QMP_get_number_of_nodes();
-//  int node = QMP_get_node_number();
-//  GridBase *grid = evec[0]._grid;
-      int total = grid->_Nprocessors;
-      int node = grid->_processor;
-      int interval = (NN / total) + 1;
-      double vl = 0.0, vu = 0.0;
-      LAPACK_INT il = interval * node + 1, iu = interval * (node + 1);
-      if (iu > NN)
-	iu = NN;
-      double tol = 0.0;
-      if (1)
-	{
-	  memset (evals_tmp, 0, sizeof (double) * NN);
-	  if (il <= NN)
-	    {
-	      printf ("total=%d node=%d il=%d iu=%d\n", total, node, il, iu);
-#ifdef USE_MKL
-	      dstegr (&jobz, &range, &NN,
-#else
-	      LAPACK_dstegr (&jobz, &range, &NN,
-#endif
-			     (double *) DD, (double *) EE, &vl, &vu, &il, &iu,	// these four are ignored if second parameteris 'A'
-			     &tol,	// tolerance
-			     &evals_found, evals_tmp, (double *) NULL, &NN,
-			     isuppz, work, &lwork, iwork, &liwork, &info);
-	      for (int i = iu - 1; i >= il - 1; i--)
-		{
-		  printf ("node=%d evals_found=%d evals_tmp[%d] = %g\n", node,
-			  evals_found, i - (il - 1), evals_tmp[i - (il - 1)]);
-		  evals_tmp[i] = evals_tmp[i - (il - 1)];
-		  if (il > 1)
-		    evals_tmp[i - (il - 1)] = 0.;
-		}
-	    }
-	  {
-	    grid->GlobalSumVector (evals_tmp, NN);
-	  }
-	}
-// cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
-    }
-#undef LAPACK_INT
-#endif
-
-
-    void diagonalize (std::vector  < RealD > &lmd,
-		      std::vector  < RealD > &lme,
-		      int N2, int N1, GridBase * grid)
-    {
-
-#ifdef USE_LAPACK
-      const int check_lapack = 0;	// just use lapack if 0, check against lapack if 1
-
-      if (!check_lapack)
-	return diagonalize_lapack (lmd, lme, N2, N1, grid);
-
-//      diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
-#endif
-    }
-#endif
-
-    static RealD normalise (Field & v)
-    {
-      RealD nn = norm2 (v);
-      nn = sqrt (nn);
-      v = v * (1.0 / nn);
-      return nn;
-    }
-
-    void orthogonalize (Field & w, std::vector < Field > &evec, int k)
-    {
-      double t0 = -usecond () / 1e6;
-      typedef typename Field::scalar_type MyComplex;
-      MyComplex ip;
-
-      if (0)
-	{
-	  for (int j = 0; j < k; ++j)
-	    {
-	      normalise (evec[j]);
-	      for (int i = 0; i < j; i++)
-		{
-		  ip = innerProduct (evec[i], evec[j]);	// are the evecs normalised? ; this assumes so.
-		  evec[j] = evec[j] - ip * evec[i];
-		}
-	    }
-	}
-
-      for (int j = 0; j < k; ++j)
-	{
-	  ip = innerProduct (evec[j], w);	// are the evecs normalised? ; this assumes so.
-	  w = w - ip * evec[j];
-	}
-      normalise (w);
-      t0 += usecond () / 1e6;
-      OrthoTime += t0;
-    }
-
-    void setUnit_Qt (int Nm, std::vector < RealD > &Qt)
-    {
-      for (int i = 0; i < Qt.size (); ++i)
-	Qt[i] = 0.0;
-      for (int k = 0; k < Nm; ++k)
-	Qt[k + k * Nm] = 1.0;
-    }
-
-
-    void calc (std::vector < RealD > &eval, const Field & src, int &Nconv)
-    {
-
-      GridBase *grid = src.Grid();
-//      assert(grid == src._grid);
-
-      std::
-	cout << GridLogMessage << " -- Nk = " << Nk << " Np = " << Np << std::
-	endl;
-      std::cout << GridLogMessage << " -- Nm = " << Nm << std::endl;
-      std::cout << GridLogMessage << " -- size of eval   = " << eval.
-	size () << std::endl;
-
-//      assert(c.size() && Nm == eval.size());
-
-      std::vector < RealD > lme (Nm);
-      std::vector < RealD > lmd (Nm);
-
-
-      Field current (grid);
-      Field last (grid);
-      Field next (grid);
-
-      Nconv = 0;
-
-      RealD beta_k;
-
-      // Set initial vector
-      // (uniform vector) Why not src??
-      //      evec[0] = 1.0;
-      current = src;
-      std::cout << GridLogMessage << "norm2(src)= " << norm2 (src) << std::
-	endl;
-      normalise (current);
-      std::
-	cout << GridLogMessage << "norm2(evec[0])= " << norm2 (current) <<
-	std::endl;
-
-      // Initial Nk steps
-      OrthoTime = 0.;
-      double t0 = usecond () / 1e6;
-      RealD norm;		// sqrt norm of last vector
-
-      uint64_t iter = 0;
-
-      bool initted = false;
-      std::vector < RealD > low (Nstop * 10);
-      std::vector < RealD > high (Nstop * 10);
-      RealD cont = 0.;
-      while (1) {
-	  cont = 0.;
-	  std::vector < RealD > lme2 (Nm);
-	  std::vector < RealD > lmd2 (Nm);
-	  for (uint64_t k = 0; k < Nm; ++k, iter++) {
-	      step (lmd, lme, last, current, next, iter);
-	      last = current;
-	      current = next;
-	    }
-	  double t1 = usecond () / 1e6;
-	  std::cout << GridLogMessage << "IRL::Initial steps: " << t1 -
-	    t0 << "seconds" << std::endl;
-	  t0 = t1;
-	  std::
-	    cout << GridLogMessage << "IRL::Initial steps:OrthoTime " <<
-	    OrthoTime << "seconds" << std::endl;
-
-	  // getting eigenvalues
-	  lmd2.resize (iter + 2);
-	  lme2.resize (iter + 2);
-	  for (uint64_t k = 0; k < iter; ++k) {
-	      lmd2[k + 1] = lmd[k];
-	      lme2[k + 2] = lme[k];
-	    }
-	  t1 = usecond () / 1e6;
-	  std::cout << GridLogMessage << "IRL:: copy: " << t1 -
-	    t0 << "seconds" << std::endl;
-	  t0 = t1;
-	  {
-	    int total = grid->_Nprocessors;
-	    int node = grid->_processor;
-	    int interval = (Nstop / total) + 1;
-	    int iu = (iter + 1) - (interval * node + 1);
-	    int il = (iter + 1) - (interval * (node + 1));
-	    std::vector < RealD > eval2 (iter + 3);
-	    RealD eps2;
-	    Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
-			      eps2);
-//        diagonalize(eval2,lme2,iter,Nk,grid);
-	    RealD diff = 0.;
-	    for (int i = il; i <= iu; i++) {
-		if (initted)
-		  diff =
-		    fabs (eval2[i] - high[iu-i]) / (fabs (eval2[i]) +
-						      fabs (high[iu-i]));
-		if (initted && (diff > eresid))
-		  cont = 1.;
-		if (initted)
-		  printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
-			  high[iu-i], diff);
-		high[iu-i] = eval2[i];
-	      }
-	    il = (interval * node + 1);
-	    iu = (interval * (node + 1));
-	    Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
-			      eps2);
-	    for (int i = il; i <= iu; i++) {
-		if (initted)
-		  diff =
-		    fabs (eval2[i] - low[i]) / (fabs (eval2[i]) +
-						fabs (low[i]));
-		if (initted && (diff > eresid))
-		  cont = 1.;
-		if (initted)
-		  printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
-			  low[i], diff);
-		low[i] = eval2[i];
-	      }
-	    t1 = usecond () / 1e6;
-	    std::cout << GridLogMessage << "IRL:: diagonalize: " << t1 -
-	      t0 << "seconds" << std::endl;
-	    t0 = t1;
-	  }
-
-	  for (uint64_t k = 0; k < Nk; ++k) {
-//          eval[k] = eval2[k];
-	    }
-	  if (initted)
-	    {
-	      grid->GlobalSumVector (&cont, 1);
-	      if (cont < 1.) return;
-	    }
-	  initted = true;
-	}
-
-    }
-
-
-
-
-
-#if 0
-
-/**
-   There is some matrix Q such that for any vector y
-   Q.e_1 = y and Q is unitary.
-**/
-    template < class T >
-      static T orthQ (DenseMatrix < T > &Q, std::vector < T > y)
-    {
-      int N = y.size ();	//Matrix Size
-      Fill (Q, 0.0);
-      T tau;
-      for (int i = 0; i < N; i++)
-	{
-	  Q[i][0] = y[i];
-	}
-      T sig = conj (y[0]) * y[0];
-      T tau0 = fabs (sqrt (sig));
-
-      for (int j = 1; j < N; j++)
-	{
-	  sig += conj (y[j]) * y[j];
-	  tau = abs (sqrt (sig));
-
-	  if (abs (tau0) > 0.0)
-	    {
-
-	      T gam = conj ((y[j] / tau) / tau0);
-	      for (int k = 0; k <= j - 1; k++)
-		{
-		  Q[k][j] = -gam * y[k];
-		}
-	      Q[j][j] = tau0 / tau;
-	    }
-	  else
-	    {
-	      Q[j - 1][j] = 1.0;
-	    }
-	  tau0 = tau;
-	}
-      return tau;
-    }
-
-/**
-	There is some matrix Q such that for any vector y
-	Q.e_k = y and Q is unitary.
-**/
-    template < class T >
-      static T orthU (DenseMatrix < T > &Q, std::vector < T > y)
-    {
-      T tau = orthQ (Q, y);
-      SL (Q);
-      return tau;
-    }
-
-
-/**
-	Wind up with a matrix with the first con rows untouched
-
-say con = 2
-	Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st colum
-	and the matrix is upper hessenberg
-	and with f and Q appropriately modidied with Q is the arnoldi factorization
-
-**/
-
-    template < class T > static void Lock (DenseMatrix < T > &H,	///Hess mtx     
-					   DenseMatrix < T > &Q,	///Lock Transform
-					   T val,	///value to be locked
-					   int con,	///number already locked
-					   RealD small, int dfg, bool herm)
-    {
-      //ForceTridiagonal(H);
-
-      int M = H.dim;
-      DenseVector < T > vec;
-      Resize (vec, M - con);
-
-      DenseMatrix < T > AH;
-      Resize (AH, M - con, M - con);
-      AH = GetSubMtx (H, con, M, con, M);
-
-      DenseMatrix < T > QQ;
-      Resize (QQ, M - con, M - con);
-
-      Unity (Q);
-      Unity (QQ);
-
-      DenseVector < T > evals;
-      Resize (evals, M - con);
-      DenseMatrix < T > evecs;
-      Resize (evecs, M - con, M - con);
-
-      Wilkinson < T > (AH, evals, evecs, small);
-
-      int k = 0;
-      RealD cold = abs (val - evals[k]);
-      for (int i = 1; i < M - con; i++)
-	{
-	  RealD cnew = abs (val - evals[i]);
-	  if (cnew < cold)
-	    {
-	      k = i;
-	      cold = cnew;
-	    }
-	}
-      vec = evecs[k];
-
-      ComplexD tau;
-      orthQ (QQ, vec);
-      //orthQM(QQ,AH,vec);
-
-      AH = Hermitian (QQ) * AH;
-      AH = AH * QQ;
-
-      for (int i = con; i < M; i++)
-	{
-	  for (int j = con; j < M; j++)
-	    {
-	      Q[i][j] = QQ[i - con][j - con];
-	      H[i][j] = AH[i - con][j - con];
-	    }
-	}
-
-      for (int j = M - 1; j > con + 2; j--)
-	{
-
-	  DenseMatrix < T > U;
-	  Resize (U, j - 1 - con, j - 1 - con);
-	  DenseVector < T > z;
-	  Resize (z, j - 1 - con);
-	  T nm = norm (z);
-	  for (int k = con + 0; k < j - 1; k++)
-	    {
-	      z[k - con] = conj (H (j, k + 1));
-	    }
-	  normalise (z);
-
-	  RealD tmp = 0;
-	  for (int i = 0; i < z.size () - 1; i++)
-	    {
-	      tmp = tmp + abs (z[i]);
-	    }
-
-	  if (tmp < small / ((RealD) z.size () - 1.0))
-	    {
-	      continue;
-	    }
-
-	  tau = orthU (U, z);
-
-	  DenseMatrix < T > Hb;
-	  Resize (Hb, j - 1 - con, M);
-
-	  for (int a = 0; a < M; a++)
-	    {
-	      for (int b = 0; b < j - 1 - con; b++)
-		{
-		  T sum = 0;
-		  for (int c = 0; c < j - 1 - con; c++)
-		    {
-		      sum += H[a][con + 1 + c] * U[c][b];
-		    }		//sum += H(a,con+1+c)*U(c,b);}
-		  Hb[b][a] = sum;
-		}
-	    }
-
-	  for (int k = con + 1; k < j; k++)
-	    {
-	      for (int l = 0; l < M; l++)
-		{
-		  H[l][k] = Hb[k - 1 - con][l];
-		}
-	    }			//H(Hb[k-1-con][l] , l,k);}}
-
-	  DenseMatrix < T > Qb;
-	  Resize (Qb, M, M);
-
-	  for (int a = 0; a < M; a++)
-	    {
-	      for (int b = 0; b < j - 1 - con; b++)
-		{
-		  T sum = 0;
-		  for (int c = 0; c < j - 1 - con; c++)
-		    {
-		      sum += Q[a][con + 1 + c] * U[c][b];
-		    }		//sum += Q(a,con+1+c)*U(c,b);}
-		  Qb[b][a] = sum;
-		}
-	    }
-
-	  for (int k = con + 1; k < j; k++)
-	    {
-	      for (int l = 0; l < M; l++)
-		{
-		  Q[l][k] = Qb[k - 1 - con][l];
-		}
-	    }			//Q(Qb[k-1-con][l] , l,k);}}
-
-	  DenseMatrix < T > Hc;
-	  Resize (Hc, M, M);
-
-	  for (int a = 0; a < j - 1 - con; a++)
-	    {
-	      for (int b = 0; b < M; b++)
-		{
-		  T sum = 0;
-		  for (int c = 0; c < j - 1 - con; c++)
-		    {
-		      sum += conj (U[c][a]) * H[con + 1 + c][b];
-		    }		//sum += conj( U(c,a) )*H(con+1+c,b);}
-		  Hc[b][a] = sum;
-		}
-	    }
-
-	  for (int k = 0; k < M; k++)
-	    {
-	      for (int l = con + 1; l < j; l++)
-		{
-		  H[l][k] = Hc[k][l - 1 - con];
-		}
-	    }			//H(Hc[k][l-1-con] , l,k);}}
-
-	}
-    }
-#endif
-
-
-  };
-
-}
-#endif
--- a/Grid/cartesian/Cartesian.h
+++ b/Grid/cartesian/Cartesian.h
@@ -31,5 +31,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cartesian/Cartesian_base.h>
 #include <Grid/cartesian/Cartesian_full.h>
 #include <Grid/cartesian/Cartesian_red_black.h> 
+#include <Grid/cartesian/CartesianCrossIcosahedron.h>

 #endif
--- a/Grid/cartesian/CartesianCrossIcosahedron.h
+++ b/Grid/cartesian/CartesianCrossIcosahedron.h
@@ -0,0 +1,199 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/cartesian/CartesianCrossIcosahedron.h
+
+    Copyright (C) 2025
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+    
+/////////////////////////////////////////////////////////////////////////////////////////
+// Grid Support.
+/////////////////////////////////////////////////////////////////////////////////////////
+
+enum IcosahedralMeshType {
+  IcosahedralVertices,
+  IcosahedralEdges
+} ;
+enum NorthSouth {
+  North = 1,
+  South = 0
+};
+
+const int num_icosahedron_tiles = 10;
+
+class GridCartesianCrossIcosahedron: public GridCartesian {
+
+public:
+
+  IcosahedralMeshType meshType;
+
+  IcosahedralMeshType MeshType(void) { return meshType; };
+  
+  /////////////////////////////////////////////////////////////////////////
+  // Constructor takes a parent grid and possibly subdivides communicator.
+  /////////////////////////////////////////////////////////////////////////
+  /*
+  GridCartesian(const Coordinate &dimensions,
+		const Coordinate &simd_layout,
+		const Coordinate &processor_grid,
+		const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
+  {
+    assert(0); // No subdivision
+  }
+  GridCartesian(const Coordinate &dimensions,
+		const Coordinate &simd_layout,
+		const Coordinate &processor_grid,
+		const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
+  {
+    assert(0); // No subdivision
+  }
+  */
+  /////////////////////////////////////////////////////////////////////////
+  // Construct from comm world
+  /////////////////////////////////////////////////////////////////////////
+  GridCartesianCrossIcosahedron(const Coordinate &dimensions,
+				const Coordinate &simd_layout,
+				const Coordinate &processor_grid,
+				IcosahedralMeshType _meshType) : GridCartesian(dimensions,simd_layout,processor_grid)
+  {
+    meshType = _meshType;
+    Coordinate S2dimensions=dimensions;
+    Coordinate S2simd      =simd_layout;
+    Coordinate S2procs     =processor_grid;
+
+    assert(simd_layout[0]==1); // Force simd into perpendicular dimensions
+    assert(simd_layout[1]==1); // to avoid pole storage complexity interacting with SIMD.
+    assert(dimensions[_ndimension-1]==num_icosahedron_tiles);
+    assert(processor_grid[_ndimension-1]<=2); // Keeps the patches that need a pole on the same node
+
+    // allocate the pole storage if we are seeking vertex domain data
+    if ( meshType == IcosahedralVertices ) {
+      InitPoles();
+    }
+  }
+
+  virtual ~GridCartesianCrossIcosahedron() = default;
+
+  ////////////////////////////////////////////////
+  // Use to decide if a given grid is icosahedral
+  ////////////////////////////////////////////////
+  int hasNorthPole;
+  int hasSouthPole;
+  int northPoleOsite;
+  int southPoleOsite;
+  int northPoleOsites;
+  int southPoleOsites;
+
+  virtual int isIcosahedral(void)           override { return 1;}
+  virtual int isIcosahedralVertex(void)     override { return meshType==IcosahedralVertices;}
+  virtual int isIcosahedralEdge  (void)     override { return meshType==IcosahedralEdges;}
+  virtual int ownsNorthPole(void)   const override { return hasNorthPole; };
+  virtual int NorthPoleOsite(void)  const override { return northPoleOsite; };
+  virtual int NorthPoleOsites(void) const override { return northPoleOsites; };
+  virtual int ownsSouthPole(void)   const override { return hasSouthPole; };
+  virtual int SouthPoleOsite(void)  const override { return southPoleOsite; };
+  virtual int SouthPoleOsites(void) const override { return southPoleOsites; };
+
+  void InitPoles(void)
+  {
+    int Ndm1 = _ndimension-1;
+    ///////////////////////
+    // Add the extra pole storage
+    ///////////////////////
+    // Vertices = 1x LxLx D1...Dn + 2.D1...Dn
+    // Start after the LxL and don't include the 10 patch dim
+    int OrthogSize = 1;
+    for (int d = 2; d < Ndm1; d++) {
+      OrthogSize *= _gdimensions[d];
+    }
+    _fsites += OrthogSize*2;
+    _gsites += OrthogSize*2;
+
+    // Simd reduced sizes are multiplied up.
+    // If the leading LxL are simd-ized, the vector objects will contain "redundant" lanes
+    // which should contain identical north (south) pole data
+    OrthogSize = 1;
+    for (int d = 2; d < Ndm1; d++) {
+      OrthogSize *= _rdimensions[d];
+    }
+
+    // Grow the local volume to hold pole data
+    // on rank (0,0) in the LxL planes
+    // since SIMD must be placed in the orthogonal directions
+    Coordinate pcoor = this->ThisProcessorCoor();
+    Coordinate pgrid = this->ProcessorGrid();
+
+    const int xdim=0;
+    const int ydim=1;
+    /*
+     *
+     *  /\/\/\/\/\
+     * /\/\/\/\/\/
+     * \/\/\/\/\/
+     *
+     *  y
+     * /
+     * \x
+     *
+     * Labelling patches as 5 6 7 8 9
+     *                      0 1 2 3 4
+     *
+     * Will ban distribution of the patch dimension by more than 2.
+     *
+     * Hence all 5 patches associated with the pole must have the
+     * appropriate "corner" of the patch L^2 located on the SAME rank.
+     */ 
+     
+    if( (pcoor[xdim]==pgrid[xdim]-1) && (pcoor[ydim]==0) && (pcoor[Ndm1]==0) ){
+      hasSouthPole   =1;
+      southPoleOsite=this->_osites;
+      southPoleOsites=OrthogSize;
+      this->_osites += OrthogSize;
+    } else {
+      hasSouthPole   =0;
+      southPoleOsites=0;
+      southPoleOsite=0;
+    }
+    if( (pcoor[xdim]==0) && (pcoor[ydim]==pgrid[ydim]-1) && (pcoor[Ndm1]==pgrid[Ndm1]-1) ){
+      hasNorthPole   =1;
+      northPoleOsite=this->_osites;
+      northPoleOsites=OrthogSize;
+      this->_osites += OrthogSize;
+    } else {
+      hasNorthPole   =0;
+      northPoleOsites=0;
+      northPoleOsite=0;
+    }
+    std::cout << "Icosahedral vertex field volume " << this->_osites<<std::endl;
+    std::cout << "Icosahedral south pole offset   " << this->southPoleOsite<<std::endl;
+    std::cout << "Icosahedral north pole offset   " << this->northPoleOsite<<std::endl;
+    std::cout << "Icosahedral south pole size     " << this->southPoleOsites<<std::endl;
+    std::cout << "Icosahedral north pole size     " << this->northPoleOsites<<std::endl;
+  };
+
+};
+
+NAMESPACE_END(Grid);
--- a/Grid/cartesian/Cartesian_base.h
+++ b/Grid/cartesian/Cartesian_base.h
@@ -86,10 +86,22 @@ public:

 public:

+  // Icosahedral decisions
+  virtual int isIcosahedral(void) { return 0;}
+  virtual int isIcosahedralVertex(void) { return 0;}
+  virtual int isIcosahedralEdge  (void) { return 0;}
+  virtual int ownsNorthPole(void) const { return 0; };
+  virtual int ownsSouthPole(void) const { return 0; };
+  virtual int NorthPoleOsite(void) const { return 0; };
+  virtual int SouthPoleOsite(void) const { return 0; };
+  virtual int NorthPoleOsites(void) const { std::cout << "base osites" <<std::endl;return 0; };
+  virtual int SouthPoleOsites(void) const { std::cout << "base osites" <<std::endl;return 0; };
+
  ////////////////////////////////////////////////////////////////
  // Checkerboarding interface is virtual and overridden by 
  // GridCartesian / GridRedBlackCartesian
  ////////////////////////////////////////////////////////////////
+
  virtual int CheckerBoarded(int dim) =0;
  virtual int CheckerBoard(const Coordinate &site)=0;
  virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
@@ -176,6 +188,8 @@ public:
    }
    return permute_type;
  }
+
+  
  ////////////////////////////////////////////////////////////////
  // Array sizing queries
  ////////////////////////////////////////////////////////////////
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -260,32 +260,39 @@ CartesianCommunicator::~CartesianCommunicator()
 }
 #ifdef USE_GRID_REDUCTION
 void CartesianCommunicator::GlobalSum(float &f){
+  FlightRecorder::StepLog("GlobalSumP2P");
  CartesianCommunicator::GlobalSumP2P(f);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
+  FlightRecorder::StepLog("GlobalSumP2P");
  CartesianCommunicator::GlobalSumP2P(d);
 }
 #else
 void CartesianCommunicator::GlobalSum(float &f){
+  FlightRecorder::StepLog("AllReduce");
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
+  FlightRecorder::StepLog("AllReduce");
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
 #endif
 void CartesianCommunicator::GlobalSum(uint32_t &u){
+  FlightRecorder::StepLog("AllReduce");
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(uint64_t &u){
+  FlightRecorder::StepLog("AllReduce");
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
+  FlightRecorder::StepLog("AllReduceVector");
  int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
@@ -794,6 +801,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque

 void CartesianCommunicator::StencilBarrier(void)
 {
+  FlightRecorder::StepLog("NodeBarrier");
  MPI_Barrier  (ShmComm);
 }
 //void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -801,11 +809,13 @@ void CartesianCommunicator::StencilBarrier(void)
 //}
 void CartesianCommunicator::Barrier(void)
 {
+  FlightRecorder::StepLog("GridBarrier");
  int ierr = MPI_Barrier(communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
+  FlightRecorder::StepLog("Broadcast");
  int ierr=MPI_Bcast(data,
 		     bytes,
 		     MPI_BYTE,
@@ -824,6 +834,7 @@ void CartesianCommunicator::BarrierWorld(void){
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
+  FlightRecorder::StepLog("BroadcastWorld");
  int ierr= MPI_Bcast(data,
 		      bytes,
 		      MPI_BYTE,
@@ -846,6 +857,7 @@ void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,
 }
 void CartesianCommunicator::AllToAll(void  *in,void *out,uint64_t words,uint64_t bytes)
 {
+  FlightRecorder::StepLog("AllToAll");
  // MPI is a pain and uses "int" arguments
  // 64*64*64*128*16 == 500Million elements of data.
  // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -990,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
  }
 #endif

-  SharedMemoryTest();
+  //  SharedMemoryTest();
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier
--- a/Grid/communicator/SharedMemoryNone.cc
+++ b/Grid/communicator/SharedMemoryNone.cc
@@ -122,10 +122,10 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
 {
  acceleratorMemSet(dest,0,bytes);
 }
-void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
-{
-  acceleratorCopyToDevice(src,dest,bytes);
-}
+//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
+//{
+//  acceleratorCopyToDevice(src,dest,bytes);
+//}
 ////////////////////////////////////////////////////////
 // Global shared functionality finished
 // Now move to per communicator functionality
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid);
 const int Cshift_verbose=0;
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
+  assert(!rhs.Grid()->isIcosahedral());
+  
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;

--- a/Grid/cshift/Cshift_none.h
+++ b/Grid/cshift/Cshift_none.h
@@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
+  assert(!rhs.Grid()->isIcosahedral());
  Lattice<vobj> ret(rhs.Grid());
  ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
  Cshift_local(ret,rhs,dimension,shift);
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -236,7 +236,7 @@ public:
  template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
    vobj vtmp;
    vtmp = r;
-#if 0
+#if 1
    deviceVector<vobj> vvtmp(1);
    acceleratorPut(vvtmp[0],vtmp);
    vobj *vvtmp_p = & vvtmp[0];
@@ -373,14 +373,17 @@ public:

 template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
  typedef typename vobj::scalar_object sobj;
-  for(int64_t g=0;g<o.Grid()->_gsites;g++){
+  uint64_t gsites=1;
+  uint64_t polesites=0;
+  for(int d=0;d<o.Grid()->_ndimension;d++) gsites *= o.Grid()->_gdimensions[d];
+  for(int64_t g=0;g<gsites;g++){

    Coordinate gcoor;
    o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);

    sobj ss;
    peekSite(ss,o,gcoor);
-    stream<<"[";
+    stream<<"["<<  g<<" : ";
    for(int d=0;d<gcoor.size();d++){
      stream<<gcoor[d];
      if(d!=gcoor.size()-1) stream<<",";
@@ -388,6 +391,41 @@ template<class vobj> std::ostream& operator<< (std::ostream& stream, const Latti
    stream<<"]\t";
    stream<<ss<<std::endl;
  }
+  if ( o.Grid()->isIcosahedral() ) {
+    uint64_t psites=1;
+    Coordinate perpdims;
+    for(int d=2;d<o.Grid()->_ndimension-1;d++){
+      int pd=o.Grid()->_gdimensions[d];
+      psites*=pd;
+      perpdims.push_back(pd);
+    }
+    for(uint64_t p=0;p<psites;p++){
+      sobj ss;
+      Coordinate orthog;
+      Lexicographic::CoorFromIndex(orthog,p,perpdims);
+      peekPole(ss,o,orthog,South);
+      stream<<"[ SouthPole : ";
+      for(int d=0;d<orthog.size();d++){
+	stream<<orthog[d];
+	if(d!=orthog.size()-1) stream<<",";
+      }
+      stream<<"]\t";
+      stream<<ss<<std::endl;
+    }
+    for(uint64_t p=0;p<psites;p++){
+      sobj ss;
+      Coordinate orthog;
+      Lexicographic::CoorFromIndex(orthog,p,perpdims);
+      peekPole(ss,o,orthog,North);
+      stream<<"[ NorthPole : ";
+      for(int d=0;d<orthog.size();d++){
+	stream<<orthog[d];
+	if(d!=orthog.size()-1) stream<<",";
+      }
+      stream<<"]\t";
+      stream<<ss<<std::endl;
+    }
+  }
  return stream;
 }
  
--- a/Grid/lattice/Lattice_coordinate.h
+++ b/Grid/lattice/Lattice_coordinate.h
@@ -34,22 +34,86 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
  typedef typename iobj::scalar_type scalar_type;
  typedef typename iobj::vector_type vector_type;

+  l=Zero();
+  
  GridBase *grid = l.Grid();
  int Nsimd = grid->iSites();

-  autoView(l_v, l, CpuWrite);
-  thread_for( o, grid->oSites(), {
-    vector_type vI;
-    Coordinate gcoor;
-    ExtractBuffer<scalar_type> mergebuf(Nsimd);
-    for(int i=0;i<grid->iSites();i++){
-      grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
-      mergebuf[i]=(Integer)gcoor[mu];
+  int cartesian_vol = grid->oSites();
+  if ( grid->isIcosahedral() ) {
+    cartesian_vol = cartesian_vol - grid->NorthPoleOsites()-grid->SouthPoleOsites();
+  }
+  {
+    autoView(l_v, l, CpuWrite);
+    thread_for( o, cartesian_vol, {
+	vector_type vI;
+	Coordinate gcoor;
+	ExtractBuffer<scalar_type> mergebuf(Nsimd);
+	for(int i=0;i<grid->iSites();i++){
+	  grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
+	  mergebuf[i]=(Integer)gcoor[mu];
+	}
+	merge<vector_type,scalar_type>(vI,mergebuf);
+	l_v[o]=vI;
+      });
+  }
+
+  if (grid->isIcosahedralVertex()) {
+    uint64_t psites=1;
+    Coordinate perpdims;
+    typename iobj::scalar_object ss;
+    for(int d=2;d<grid->_ndimension-1;d++){
+      int pd=grid->_gdimensions[d];
+      psites*=pd;
+      perpdims.push_back(pd);
    }
-    merge<vector_type,scalar_type>(vI,mergebuf);
-    l_v[o]=vI;
-  });
+    for(uint64_t p=0;p<psites;p++){
+      Coordinate orthog;
+      Lexicographic::CoorFromIndex(orthog,p,perpdims);
+
+      int icoor;
+      if ( mu>=2 && mu < grid->_ndimension-1) {
+	icoor = orthog[mu-2];
+      } else {
+	icoor = -1;
+      }
+
+      ss=scalar_type(icoor);
+
+      pokePole(ss,l,orthog,South);
+      pokePole(ss,l,orthog,North);
+    }
+  }
+};
+template<class iobj> inline void LatticePole(Lattice<iobj> &l,NorthSouth pole)
+{
+  typedef typename iobj::scalar_object sobj;
+  typedef typename iobj::scalar_type scalar_type;
+  typedef typename iobj::vector_type vector_type;
+
+  GridBase *grid = l.Grid();
+
+  l=Zero();
+
+  assert(grid->isIcosahedralVertex());
+  
+  if (grid->isIcosahedralVertex()) {
+    uint64_t psites=1;
+    Coordinate perpdims;
+    sobj ss;
+    scalar_type one(1.0);
+    ss=one;
+    for(int d=2;d<l.Grid()->_ndimension-1;d++){
+      int pd=l.Grid()->_gdimensions[d];
+      psites*=pd;
+      perpdims.push_back(pd);
+    }
+    for(uint64_t p=0;p<psites;p++){
+      Coordinate orthog;
+      Lexicographic::CoorFromIndex(orthog,p,perpdims);
+      pokePole(ss,l,orthog,pole);
+    }
+  }
 };

 NAMESPACE_END(Grid);
-
--- a/Grid/lattice/Lattice_peekpoke.h
+++ b/Grid/lattice/Lattice_peekpoke.h
@@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
  grid->GlobalCoorToRankIndex(rank,odx,idx,site);

  ExtractBuffer<sobj> buf(Nsimd);
-  autoView( l_v , l, CpuWrite);
+  autoView( l_v , l, CpuRead);
  extract(l_v[odx],buf);

  s = buf[idx];
@@ -151,6 +151,136 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
  return;
 };

+// zero for south pole, one for north pole
+template<class vobj,class sobj>
+void peekPole(sobj &s,const Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
+{
+  s=Zero();
+  
+  GridBase *grid=l.Grid();
+
+  assert(grid->isIcosahedral());
+  assert(grid->isIcosahedralVertex());
+
+  int Nsimd = grid->Nsimd();
+
+  int rank;
+
+  int Ndm1         = grid->_ndimension-1;
+  Coordinate pgrid = grid->ProcessorGrid();
+  const int xdim=0;
+  const int ydim=1;
+  const int pdim=Ndm1;
+
+  int64_t pole_osite;
+  int64_t pole_isite;
+  Coordinate rdims;
+  Coordinate idims;
+  Coordinate ocoor;
+  Coordinate icoor;
+  Coordinate pcoor(grid->_ndimension);
+  for(int d=2;d<Ndm1;d++){
+    int dd=d-2;
+    rdims.push_back(grid->_rdimensions[d]);
+    idims.push_back(grid->_simd_layout[d]);
+    icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
+    ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
+    pcoor[d] = orthog[dd]/grid->_ldimensions[d];
+  }
+  Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
+  Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
+  
+  int64_t osite;
+  if(isNorth == North){
+    pcoor[xdim] = 0;
+    pcoor[ydim] = pgrid[ydim]-1;
+    pcoor[Ndm1] = pgrid[Ndm1]-1;
+    osite = pole_osite + grid->NorthPoleOsite();
+  } else {
+    pcoor[xdim] = pgrid[xdim]-1;
+    pcoor[ydim] = 0;
+    pcoor[Ndm1] = 0;
+    osite = pole_osite + grid->SouthPoleOsite();
+  }
+
+  rank = grid->RankFromProcessorCoor(pcoor);
+
+  if ( rank == grid->ThisRank() ) {
+    ExtractBuffer<sobj> buf(Nsimd);
+    autoView( l_v , l, CpuWrite);
+    extract(l_v[osite],buf);
+    s = buf[pole_isite];
+  }
+  grid->Broadcast(rank,s);
+
+  return;
+};
+template<class vobj,class sobj>
+void pokePole(const sobj &s,Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
+{
+  GridBase *grid=l.Grid();
+
+  assert(grid->isIcosahedral());
+  assert(grid->isIcosahedralVertex());
+
+  grid->Broadcast(grid->BossRank(),s);
+
+  int Nsimd = grid->Nsimd();
+  int rank;
+  int Ndm1         = grid->_ndimension-1;
+  Coordinate pgrid = grid->ProcessorGrid();
+  const int xdim=0;
+  const int ydim=1;
+  const int pdim=Ndm1;
+
+  int64_t pole_osite;
+  int64_t pole_isite;
+  Coordinate rdims;
+  Coordinate idims;
+  Coordinate ocoor;
+  Coordinate icoor;
+  Coordinate pcoor(grid->_ndimension,0);
+  for(int d=2;d<Ndm1;d++){
+    int dd = d-2;
+    rdims.push_back(grid->_rdimensions[d]);
+    idims.push_back(grid->_simd_layout[d]);
+    icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
+    ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
+    pcoor[d] = orthog[dd]/grid->_ldimensions[d];
+
+    int o = orthog[dd];
+    int r = grid->_rdimensions[d];
+    int omr = o % r;
+  }
+  Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
+  Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
+  
+  int64_t osite;
+  if(isNorth ==North){
+    pcoor[xdim] = 0;
+    pcoor[ydim] = pgrid[ydim]-1;
+    pcoor[Ndm1] = pgrid[Ndm1]-1;
+    osite = pole_osite + grid->NorthPoleOsite();
+  } else {
+    pcoor[xdim] = pgrid[xdim]-1;
+    pcoor[ydim] = 0;
+    pcoor[Ndm1] = 0;
+    osite = pole_osite + grid->SouthPoleOsite();
+  }
+
+  rank = grid->RankFromProcessorCoor(pcoor);
+
+  // extract-modify-merge cycle is easiest way and this is not perf critical
+  if ( rank == grid->ThisRank() ) {
+    ExtractBuffer<sobj> buf(Nsimd);
+    autoView( l_v , l, CpuWrite);
+    extract(l_v[osite],buf);
+    buf[pole_isite] = s;
+    merge(l_v[osite],buf);
+  }
+  return;
+};
+
 //////////////////////////////////////////////////////////
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
@@ -179,7 +309,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
  for(int w=0;w<words;w++){
    pt[w] = getlane(vp[w],idx);
  }
-  //  std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
+
  return;
 };
 template<class vobj,class sobj>
--- a/Grid/qcd/QCD.h
+++ b/Grid/qcd/QCD.h
@@ -49,7 +49,7 @@ static constexpr int Tm = 7;

 static constexpr int Nc=Config_Nc;
 static constexpr int Ns=4;
-static constexpr int Nd=4;
+static constexpr int Nd=Config_Nd;
 static constexpr int Nhs=2; // half spinor
 static constexpr int Nds=8; // double stored gauge field
 static constexpr int Ngp=2; // gparity index range
@@ -75,6 +75,7 @@ static constexpr int InverseYes=1;
 //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;

 const int SpinorIndex = 2;
+const int PauliIndex  = 2; //TensorLevel counts from the bottom!
 template<typename T> struct isSpinor {
  static constexpr bool value = (SpinorIndex==T::TensorLevel);
 };
--- a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h
+++ b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h
@@ -123,10 +123,10 @@ public:
      GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
      
      peekLocalSite(ScalarUmu, Umu_v, lcoor);
-      for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
+      for (int mu = 0; mu < Nd; mu++) ScalarUds(mu) = ScalarUmu(mu);
      
      peekLocalSite(ScalarUmu, Uadj_v, lcoor);
-      for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
+      for (int mu = 0; mu < Nd; mu++) ScalarUds(mu + Nd) = ScalarUmu(mu);
      
      pokeLocalSite(ScalarUds, Uds_v, lcoor);
    });
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -85,6 +85,15 @@ NAMESPACE_CHECK(DomainWall);
 #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
 #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
 NAMESPACE_CHECK(Overlap);
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Two spin wilson fermion based
+///////////////////////////////////////////////////////////////////////////////
+
+#include <Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h>
+NAMESPACE_CHECK(TwoSpinWilson);
+
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 ///////////////////////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/FermionCore.h
+++ b/Grid/qcd/action/fermion/FermionCore.h
@@ -41,8 +41,9 @@ NAMESPACE_CHECK(Compressor);
 NAMESPACE_CHECK(FermionOperatorImpl);
 #include <Grid/qcd/action/fermion/FermionOperator.h>
 NAMESPACE_CHECK(FermionOperator);
-#include <Grid/qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions
+#include <Grid/qcd/action/fermion/WilsonKernels.h>           //used by all wilson type fermions
 #include <Grid/qcd/action/fermion/StaggeredKernels.h>        //used by all wilson type fermions
+#include <Grid/qcd/action/fermion/TwoSpinWilsonKernels.h>    //used for 3D fermions, pauli in place of Dirac
 NAMESPACE_CHECK(Kernels);

 #endif
--- a/Grid/qcd/action/fermion/FermionOperatorImpl.h
+++ b/Grid/qcd/action/fermion/FermionOperatorImpl.h
@@ -180,6 +180,12 @@ NAMESPACE_CHECK(ImplGparityWilson);
 #include <Grid/qcd/action/fermion/StaggeredImpl.h> 
 NAMESPACE_CHECK(ImplStaggered);  

+/////////////////////////////////////////////////////////////////////////////
+// Two component spinor Wilson action for 3d / Boston
+/////////////////////////////////////////////////////////////////////////////
+#include <Grid/qcd/action/fermion/TwoSpinWilsonImpl.h> 
+NAMESPACE_CHECK(ImplTwoSpinWilson);  
+
 /////////////////////////////////////////////////////////////////////////////
 // Single flavour one component spinors with colour index. 5d vec
 /////////////////////////////////////////////////////////////////////////////
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -274,7 +274,7 @@ public:
 	autoView( Uds_v , Uds, CpuWrite);
 	autoView( Utmp_v, Utmp, CpuWrite);
 	thread_foreach(ss,Utmp_v,{
-	    Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
+	    Uds_v[ss](0)(mu+Nd) = Utmp_v[ss]();
 	  });
      }
      Utmp = Uconj;
@@ -286,7 +286,7 @@ public:
 	autoView( Uds_v , Uds, CpuWrite);
 	autoView( Utmp_v, Utmp, CpuWrite);
 	thread_foreach(ss,Utmp_v,{
-	    Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
+	    Uds_v[ss](1)(mu+Nd) = Utmp_v[ss]();
        });
      }
    }
@@ -320,7 +320,7 @@ public:
      }
      
      Uconj = conjugate(*Upoke);
-      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
+      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + Nd);
    }
  }
      
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h
@@ -36,6 +36,8 @@ public:
  static const std::vector<int> directions;
  static const std::vector<int> displacements;
  static const int npoint = 16;
+  static std::vector<int> MakeDirections(void);
+  static std::vector<int> MakeDisplacements(void);
 };

 template <class Impl>
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -40,6 +40,8 @@ public:
  static const std::vector<int> directions;
  static const std::vector<int> displacements;
  const int npoint = 16;
+  static std::vector<int> MakeDirections(void);
+  static std::vector<int> MakeDisplacements(void);
 };

 template<class Impl>
--- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
+++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h
@@ -36,6 +36,8 @@ public:
  static const std::vector<int> directions;
  static const std::vector<int> displacements;
  static const int npoint = 8;
+  static std::vector<int> MakeDirections(void);
+  static std::vector<int> MakeDisplacements(void);
 };

 template <class Impl>
--- a/Grid/qcd/action/fermion/StaggeredImpl.h
+++ b/Grid/qcd/action/fermion/StaggeredImpl.h
@@ -141,9 +141,9 @@ public:
      Udag = Udag *phases;

 	InsertGaugeField(Uds,U,mu);
-	InsertGaugeField(Uds,Udag,mu+4);
+	InsertGaugeField(Uds,Udag,mu+Nd);
 	//	PokeIndex<LorentzIndex>(Uds, U, mu);
-	//	PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
+	//	PokeIndex<LorentzIndex>(Uds, Udag, mu + Nd);

      // 3 hop based on thin links. Crazy huh ?
      U  = PeekIndex<LorentzIndex>(Uthin, mu);
@@ -156,7 +156,7 @@ public:
      UUUdag = UUUdag *phases;

 	InsertGaugeField(UUUds,UUU,mu);
-	InsertGaugeField(UUUds,UUUdag,mu+4);
+	InsertGaugeField(UUUds,UUUdag,mu+Nd);

    }
  }
--- a/Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h
+++ b/Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h
@@ -0,0 +1,175 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma one 
+
+NAMESPACE_BEGIN(Grid);
+
+class TwoSpinWilsonFermion3plus1DStatic { 
+public:
+  // S-direction is INNERMOST and takes no part in the parity.
+  static const std::vector<int> directions;
+  static const std::vector<int> displacements;
+  static constexpr int npoint = 6;
+  static std::vector<int> MakeDirections(void);
+  static std::vector<int> MakeDisplacements(void);
+};
+
+template<class Impl>
+class TwoSpinWilsonFermion3plus1D : public TwoSpinWilsonKernels<Impl>, public TwoSpinWilsonFermion3plus1DStatic
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+  typedef TwoSpinWilsonKernels<Impl> Kernels;
+
+  FermionField _tmp;
+  FermionField &tmp(void) { return _tmp; }
+
+  int Dirichlet;
+  Coordinate Block; 
+
+  ///////////////////////////////////////////////////////////////
+  // Implement the abstract base
+  ///////////////////////////////////////////////////////////////
+  GridBase *GaugeGrid(void)              { return _ThreeDimGrid ;}
+  GridBase *GaugeRedBlackGrid(void)      { return _ThreeDimRedBlackGrid ;}
+  GridBase *FermionGrid(void)            { return _FourDimGrid;}
+  GridBase *FermionRedBlackGrid(void)    { return _FourDimRedBlackGrid;}
+
+  // full checkerboard operations; leave unimplemented as abstract for now
+  virtual void   M    (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   Mdag (const FermionField &in, FermionField &out){assert(0);};
+
+  // half checkerboard operations; leave unimplemented as abstract for now
+  virtual void   Meooe       (const FermionField &in, FermionField &out);
+  virtual void   Mooee       (const FermionField &in, FermionField &out);
+  virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+
+  virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+  virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+  virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+  virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+  virtual void   MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+
+  // These can be overridden by fancy 5d chiral action
+  virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+
+  //  void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  
+  // Implement hopping term non-hermitian hopping term; half cb or both
+  // Implement s-diagonal DW
+  void DW    (const FermionField &in, FermionField &out,int dag);
+  void Dhop  (const FermionField &in, FermionField &out,int dag);
+  void DhopOE(const FermionField &in, FermionField &out,int dag);
+  void DhopEO(const FermionField &in, FermionField &out,int dag);
+
+  void DhopComms  (const FermionField &in, FermionField &out);
+  void DhopCalc   (const FermionField &in, FermionField &out,uint64_t *ids);
+  
+  // add a DhopComm
+  // -- suboptimal interface will presently trigger multiple comms.
+  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
+  void DhopDirAll(const FermionField &in,std::vector<FermionField> &out);
+  void DhopDirComms(const FermionField &in);
+  void DhopDirCalc(const FermionField &in, FermionField &out,int point);
+    
+  ///////////////////////////////////////////////////////////////
+  // New methods added 
+  ///////////////////////////////////////////////////////////////
+  void DerivInternal(StencilImpl & st,
+		     DoubledGaugeField & U,
+		     GaugeField &mat,
+		     const FermionField &A,
+		     const FermionField &B,
+		     int dag);
+    
+  void DhopInternal(StencilImpl & st,
+		    DoubledGaugeField &U,
+		    const FermionField &in, 
+		    FermionField &out,
+		    int dag);
+
+  void DhopInternalOverlappedComms(StencilImpl & st,
+				   DoubledGaugeField &U,
+				   const FermionField &in, 
+				   FermionField &out,
+				   int dag);
+
+  void DhopInternalSerialComms(StencilImpl & st,
+			       DoubledGaugeField &U,
+			       const FermionField &in, 
+			       FermionField &out,
+			       int dag);
+    
+  // Constructors
+  TwoSpinWilsonFermion3plus1D(GaugeField &_Umu,
+		  GridCartesian         &FourDimGrid,
+		  GridRedBlackCartesian &FourDimRedBlackGrid,
+		  GridCartesian         &ThreeDimGrid,
+		  GridRedBlackCartesian &ThreeDimRedBlackGrid,
+		  double _M5,const ImplParams &p= ImplParams());
+
+  virtual void DirichletBlock(const Coordinate & block)
+  {
+  }
+    
+  // DoubleStore
+  void ImportGauge(const GaugeField &_Umu);
+    
+  ///////////////////////////////////////////////////////////////
+  // Data members require to support the functionality
+  ///////////////////////////////////////////////////////////////
+public:
+    
+  // Add these to the support from Wilson
+  GridBase *_ThreeDimGrid;
+  GridBase *_ThreeDimRedBlackGrid;
+  GridBase *_FourDimGrid;
+  GridBase *_FourDimRedBlackGrid;
+    
+  double                        M5;
+  int Ls;
+    
+  //Defines the stencils for even and odd
+  StencilImpl Stencil; 
+  StencilImpl StencilEven; 
+  StencilImpl StencilOdd; 
+    
+  // Copy of the gauge field , with even and odd subsets
+  DoubledGaugeField Umu;
+  DoubledGaugeField UmuEven;
+  DoubledGaugeField UmuOdd;
+
+};
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/TwoSpinWilsonImpl.h
+++ b/Grid/qcd/action/fermion/TwoSpinWilsonImpl.h
@@ -0,0 +1,222 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/FermionOperatorImpl.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+  
+/////////////////////////////////////////////////////////////////////////////
+// Single flavour four spinors with colour index
+/////////////////////////////////////////////////////////////////////////////
+template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal >
+class TwoSpinWilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
+public:
+  
+  static const int Dimension = Representation::Dimension;
+  static const bool isFundamental = Representation::isFundamental;
+
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
+  INHERIT_GIMPL_TYPES(Gimpl);
+      
+  //Necessary?
+  constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
+    
+  typedef typename Options::_Coeff_t Coeff_t;
+      
+  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+    
+  typedef iImplSpinor<Simd>            SiteSpinor;
+  typedef iImplPropagator<Simd>        SitePropagator;
+  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+  typedef iImplHalfCommSpinor<Simd>    SiteHalfCommSpinor;
+  typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
+    
+  typedef Lattice<SiteSpinor>            FermionField;
+  typedef Lattice<SitePropagator>        PropagatorField;
+  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+    
+  typedef SimpleCompressor<SiteSpinor> Compressor;
+  typedef WilsonImplParams ImplParams;
+  typedef CartesianStencil<SiteSpinor, SiteSpinor, ImplParams> StencilImpl;
+  typedef const typename StencilImpl::View_type StencilView;
+    
+  ImplParams Params;
+
+  TwoSpinWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){
+  };
+
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi,
+					  const SiteDoubledGaugeField &U,
+					  const _Spinor &chi,
+					  int mu) 
+  {
+    auto UU = coalescedRead(U(mu));
+    mult(&phi(), &UU, &chi());
+  }
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi,
+					  const SiteDoubledGaugeField &U,
+					  const _Spinor &chi,
+					  int mu,
+					  StencilEntry *SE,
+					  StencilView &St) 
+  {
+    multLink(phi,U,chi,mu);
+  }
+
+  template<class _SpinorField> 
+  inline void multLinkField(_SpinorField & out,
+			    const DoubledGaugeField &Umu,
+			    const _SpinorField & phi,
+			    int mu)
+  {
+    const int Nsimd = SiteHalfSpinor::Nsimd();
+    autoView( out_v, out, AcceleratorWrite);
+    autoView( phi_v, phi, AcceleratorRead);
+    autoView( Umu_v, Umu, AcceleratorRead);
+    typedef decltype(coalescedRead(out_v[0]))   calcSpinor;
+    accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
+	calcSpinor tmp;
+	multLink(tmp,Umu_v[sss],phi_v(sss),mu);
+	coalescedWrite(out_v[sss],tmp);
+    });
+  }
+					   
+  template <class ref>
+  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
+  {
+    reg = memory;
+  }
+      
+  inline void DoubleStore(GridBase *GaugeGrid,
+			  DoubledGaugeField &Uds,
+			  const GaugeField &Umu) 
+  {
+    typedef typename Simd::scalar_type scalar_type;
+
+    conformable(Uds.Grid(), GaugeGrid);
+    conformable(Umu.Grid(), GaugeGrid);
+
+    GaugeLinkField U(GaugeGrid);
+    GaugeLinkField tmp(GaugeGrid);
+
+    Lattice<iScalar<vInteger> > coor(GaugeGrid);
+      ////////////////////////////////////////////////////
+      // apply any boundary phase or twists
+      ////////////////////////////////////////////////////
+    for (int mu = 0; mu < Nd; mu++) {
+
+	////////// boundary phase /////////////
+      auto pha = Params.boundary_phases[mu];
+      scalar_type phase( real(pha),imag(pha) );
+
+	int L   = GaugeGrid->GlobalDimensions()[mu];
+        int Lmu = L - 1;
+
+      LatticeCoordinate(coor, mu);
+
+      U = PeekIndex<LorentzIndex>(Umu, mu);
+
+	// apply any twists
+	RealD theta = Params.twist_n_2pi_L[mu] * 2*M_PI / L;
+	if ( theta != 0.0) { 
+	  scalar_type twphase(::cos(theta),::sin(theta));
+	  U = twphase*U;
+	  std::cout << GridLogMessage << " Twist ["<<mu<<"] "<< Params.twist_n_2pi_L[mu]<< " phase"<<phase <<std::endl;
+	}
+
+      tmp = where(coor == Lmu, phase * U, U);
+      PokeIndex<LorentzIndex>(Uds, tmp, mu);
+
+      U = adj(Cshift(U, mu, -1));
+      U = where(coor == 0, conjugate(phase) * U, U); 
+      PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
+    }
+  }
+
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+    GaugeLinkField link(mat.Grid());
+    link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
+    PokeIndex<LorentzIndex>(mat,link,mu);
+  }   
+      
+    inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){
+      mat = outerProduct(B,A); 
+    }  
+
+    inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+      mat = TraceIndex<SpinIndex>(P); 
+    }
+      
+    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds)
+    {
+      for (int mu = 0; mu < Nd; mu++)
+      mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
+    }
+
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu)
+  {
+    int Ls=Btilde.Grid()->_fdimensions[0];
+    autoView( mat_v , mat, AcceleratorWrite);
+    {
+      const int Nsimd = SiteSpinor::Nsimd();
+      autoView( Btilde_v , Btilde, AcceleratorRead);
+      autoView( Atilde_v , Atilde, AcceleratorRead);
+      accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
+	  int sU=sss;
+  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
+  	  ColorMatrixType sum;
+	  zeroit(sum);  
+	  for(int s=0;s<Ls;s++){
+	    int sF = s+Ls*sU;
+  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
+  	      auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
+  	      auto aa = coalescedRead(Atilde_v[sF]()(spn) );
+	      auto op = outerProduct(bb,aa);
+  	      sum = sum + op;
+	    }
+	  }
+  	  coalescedWrite(mat_v[sU](mu)(), sum);
+      });
+    }
+  }
+};
+
+
+typedef TwoSpinWilsonImpl<vComplex,  FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplR;  // Real.. whichever prec
+typedef TwoSpinWilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplF;  // Float
+typedef TwoSpinWilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplD;  // Double
+typedef TwoSpinWilsonImpl<vComplexD2, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplD2;  // Double
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/TwoSpinWilsonKernels.h
+++ b/Grid/qcd/action/fermion/TwoSpinWilsonKernels.h
@@ -0,0 +1,84 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Helper routines that implement Wilson stencil for a single site.
+// Common to both the WilsonFermion and WilsonFermion5D
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ 
+template<class Impl> class TwoSpinWilsonKernels : public FermionOperator<Impl>  { 
+public:
+
+  INHERIT_IMPL_TYPES(Impl);
+  typedef FermionOperator<Impl> Base;
+  typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;   
+public:
+
+  static void DhopKernel(StencilImpl &st,  DoubledGaugeField &U, SiteSpinor * buf,
+			 int Ls, int Nsite, const FermionField &in, FermionField &out,
+			 int interior=1,int exterior=1) ;
+
+  static void DhopKernel(StencilImpl &st,  DoubledGaugeField &U, SiteSpinor * buf,
+			 int Ls, int Nsite, const FermionField &in, FermionField &out,
+			 uint64_t *ids);
+  
+  static void DhopDagKernel(StencilImpl &st,  DoubledGaugeField &U, SiteSpinor * buf,
+			    int Ls, int Nsite, const FermionField &in, FermionField &out,
+			    int interior=1,int exterior=1) ;
+
+  static void DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
+			  int Nsite, const FermionField &in, std::vector<FermionField> &out) ;
+
+  static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteSpinor * buf,
+			    int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);
+
+private:
+
+  static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteSpinor * buf,
+				   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma);
+
+  static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+
+ public:
+  TwoSpinWilsonKernels(const ImplParams &p = ImplParams()) : Base(p){};
+};
+    
+NAMESPACE_END(Grid);
+
+
--- a/Grid/qcd/action/fermion/WilsonFermion.h
+++ b/Grid/qcd/action/fermion/WilsonFermion.h
@@ -38,6 +38,8 @@ public:
  static int MortonOrder;
  static const std::vector<int> directions;
  static const std::vector<int> displacements;
+  static std::vector<int> MakeDirections(void);
+  static std::vector<int> MakeDisplacements(void);
  static const int npoint = 8;
 };

--- a/Grid/qcd/action/fermion/WilsonFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonFermion5D.h
@@ -62,6 +62,8 @@ public:
  static const std::vector<int> directions;
  static const std::vector<int> displacements;
  static constexpr int npoint = 8;
+  static std::vector<int> MakeDirections(void);
+  static std::vector<int> MakeDisplacements(void);
 };

 template<class Impl>
--- a/Grid/qcd/action/fermion/WilsonImpl.h
+++ b/Grid/qcd/action/fermion/WilsonImpl.h
@@ -166,7 +166,7 @@ public:

      U = adj(Cshift(U, mu, -1));
      U = where(coor == 0, conjugate(phase) * U, U); 
-      PokeIndex<LorentzIndex>(Uds, U, mu + 4);
+      PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
    }
  }

--- a/Grid/qcd/action/fermion/WilsonTMFermion5D.h
+++ b/Grid/qcd/action/fermion/WilsonTMFermion5D.h
@@ -56,7 +56,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
 			Frbgrid,
 			Ugrid,
 			Urbgrid,
-			4.0,p)
+			Nd*1.0,p)
   
    {
      update(_mass,_mu);
@@ -83,7 +83,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
    out.Checkerboard() = in.Checkerboard();
    //axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
    for (int s=0;s<(int)this->mass.size();s++) {
-      ComplexD a = 4.0+this->mass[s];
+      ComplexD a = Nd*1.0+this->mass[s];
      ComplexD b(0.0,this->mu[s]);
      axpbg5y_ssp(out,a,in,b,in,s,s);
    }
@@ -92,7 +92,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
  virtual void MooeeDag(const FermionField &in, FermionField &out) {
    out.Checkerboard() = in.Checkerboard();
    for (int s=0;s<(int)this->mass.size();s++) {
-      ComplexD a = 4.0+this->mass[s];
+      ComplexD a = Nd*1.0+this->mass[s];
      ComplexD b(0.0,-this->mu[s]);
      axpbg5y_ssp(out,a,in,b,in,s,s);
    }
@@ -101,7 +101,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
    for (int s=0;s<(int)this->mass.size();s++) {
      RealD m    = this->mass[s];
      RealD tm   = this->mu[s];
-      RealD mtil = 4.0+this->mass[s];
+      RealD mtil = Nd*1.0+this->mass[s];
      RealD sq   = mtil*mtil+tm*tm;
      ComplexD a    = mtil/sq;
      ComplexD b(0.0, -tm /sq);
@@ -112,7 +112,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
    for (int s=0;s<(int)this->mass.size();s++) {
      RealD m    = this->mass[s];
      RealD tm   = this->mu[s];
-      RealD mtil = 4.0+this->mass[s];
+      RealD mtil = Nd*1.0+this->mass[s];
      RealD sq   = mtil*mtil+tm*tm;
      ComplexD a    = mtil/sq;
      ComplexD b(0.0,tm /sq);
@@ -126,7 +126,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
    this->Dhop(in, out, DaggerNo);
    FermionField tmp(out.Grid());
    for (int s=0;s<(int)this->mass.size();s++) {
-      ComplexD a = 4.0+this->mass[s];
+      ComplexD a = Nd*1.0+this->mass[s];
      ComplexD b(0.0,this->mu[s]);
      axpbg5y_ssp(tmp,a,in,b,in,s,s);
    }
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h
@@ -240,7 +240,7 @@ void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::ve
  this->ceo.resize(Ls);

  for(int i=0; i<Ls; ++i){
-    this->bee[i] = 4.0 - this->M5 + 1.0;
+    this->bee[i] = Nd*1.0 - this->M5 + 1.0;
    this->cee[i] = 1.0;
  }

--- a/Grid/qcd/action/fermion/implementation/TwoSpinWilsonFermion3plus1DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/TwoSpinWilsonFermion3plus1DImplementation.h
@@ -0,0 +1,486 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/TwoSpinWilsonFermion2plus1D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
+#include <Grid/perfmon/PerfCount.h>
+
+NAMESPACE_BEGIN(Grid);
+  
+  // 5d lattice for DWF.
+template<class Impl>
+TwoSpinWilsonFermion3plus15D<Impl>::TwoSpinWilsonFermion3plus1D(GaugeField &_Umu,
+								GridCartesian         &FourDimGrid,
+								GridRedBlackCartesian &FourDimRedBlackGrid,
+								GridCartesian         &ThreeDimGrid,
+								GridRedBlackCartesian &ThreeDimRedBlackGrid,
+               RealD _M5,const ImplParams &p) :
+  Kernels(p),
+  _FourDimGrid        (&FourDimGrid),
+  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
+  _ThreeDimGrid        (&ThreeDimGrid),
+  _ThreeDimRedBlackGrid(&ThreeDimRedBlackGrid),
+  Stencil    (_FourDimGrid,npoint,Even,directions,displacements,p),
+  StencilEven(_FourDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
+  StencilOdd (_FourDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
+  M5(_M5),
+  Umu(_ThreeDimGrid),
+  UmuEven(_ThreeDimRedBlackGrid),
+  UmuOdd (_ThreeDimRedBlackGrid),
+  _tmp(&FourDimRedBlackGrid),
+  Dirichlet(0)
+{
+  // some assertions
+  assert(FourDimGrid._ndimension==Nd+1);
+  assert(ThreeDimGrid._ndimension==Nd);
+  assert(ThreeDimRedBlackGrid._ndimension==Nd);
+  assert(FourDimRedBlackGrid._ndimension==Nd+1);
+  assert(FourDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
+
+  // extent of fifth dim and not spread out
+  Ls=FourDimGrid._fdimensions[0];
+  assert(FourDimRedBlackGrid._fdimensions[0]==Ls);
+  assert(FourDimGrid._processors[0]         ==1);
+  assert(FourDimRedBlackGrid._processors[0] ==1);
+
+  // Other dimensions must match the decomposition of the four-D fields 
+  for(int d=0;d<Nd;d++){
+
+    assert(FourDimGrid._processors[d+1]         ==ThreeDimGrid._processors[d]);
+    assert(FourDimRedBlackGrid._processors[d+1] ==ThreeDimGrid._processors[d]);
+    assert(ThreeDimRedBlackGrid._processors[d]   ==ThreeDimGrid._processors[d]);
+
+    assert(FourDimGrid._fdimensions[d+1]        ==ThreeDimGrid._fdimensions[d]);
+    assert(FourDimRedBlackGrid._fdimensions[d+1]==ThreeDimGrid._fdimensions[d]);
+    assert(ThreeDimRedBlackGrid._fdimensions[d]  ==ThreeDimGrid._fdimensions[d]);
+
+    assert(FourDimGrid._simd_layout[d+1]        ==ThreeDimGrid._simd_layout[d]);
+    assert(FourDimRedBlackGrid._simd_layout[d+1]==ThreeDimGrid._simd_layout[d]);
+    assert(ThreeDimRedBlackGrid._simd_layout[d]  ==ThreeDimGrid._simd_layout[d]);
+  }
+
+  if ( p.dirichlet.size() == Nd+1) {
+    Coordinate block = p.dirichlet;
+    for(int d=0;d<Nd+1;d++) {
+      if ( block[d] ){
+	Dirichlet = 1;
+	std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
+	std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
+	Block = block;
+      }
+    }
+  } else {
+    Coordinate block(Nd+1,0);
+    Block = block;
+  }
+
+  // Dimension zero of the five-d is the Ls direction
+  assert(FourDimRedBlackGrid._simd_layout[0]==1);
+  assert(FourDimGrid._simd_layout[0]        ==1);
+    
+  // Allocate the required comms buffer
+  ImportGauge(_Umu);
+  // Build lists of exterior only nodes
+  int LLs = FourDimGrid._rdimensions[0];
+  int vol3;
+  vol3=ThreeDimGrid.oSites();
+  Stencil.BuildSurfaceList(LLs,vol3);
+
+  vol3=ThreeDimRedBlackGrid.oSites();
+  StencilEven.BuildSurfaceList(LLs,vol3);
+   StencilOdd.BuildSurfaceList(LLs,vol3);
+
+}
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::ImportGauge(const GaugeField &_Umu)
+{
+  GaugeField HUmu(_Umu.Grid());
+  HUmu = _Umu*(-0.5);
+  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
+  pickCheckerboard(Even,UmuEven,Umu);
+  pickCheckerboard(Odd ,UmuOdd,Umu);
+}
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
+{
+  int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil
+                    // we drop off the innermost fifth dimension
+  //  assert( (disp==1)||(disp==-1) );
+  //  assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
+
+  int skip = (disp==1) ? 0 : 1;
+  int dirdisp = dir+skip*Nd;
+  int gamma   = dir+(1-skip)*Nd;
+
+  Compressor compressor(DaggerNo);
+  Stencil.HaloExchange(in,compressor);
+  
+  uint64_t Nsite = Umu.Grid()->oSites();
+  Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma);
+
+};
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
+{
+  Compressor compressor(DaggerNo);
+  Stencil.HaloExchange(in,compressor);
+  uint64_t Nsite = Umu.Grid()->oSites();
+  Kernels::DhopDirAll(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out);
+};
+
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DerivInternal(StencilImpl & st,
+					  DoubledGaugeField & U,
+					  GaugeField &mat,
+					  const FermionField &A,
+					  const FermionField &B,
+					  int dag)
+{
+  assert((dag==DaggerNo) ||(dag==DaggerYes));
+
+  conformable(st.Grid(),A.Grid());
+  conformable(st.Grid(),B.Grid());
+
+  Compressor compressor(dag);
+  
+  FermionField Btilde(B.Grid());
+  FermionField Atilde(B.Grid());
+
+  st.HaloExchange(B,compressor);
+
+  Atilde=A;
+  int LLs = B.Grid()->_rdimensions[0];
+
+
+  for (int mu = 0; mu < Nd; mu++) {
+    ////////////////////////////////////////////////////////////////////////
+    // Flip gamma if dag
+    ////////////////////////////////////////////////////////////////////////
+    int gamma = mu;
+    if (!dag) gamma += Nd;
+
+    ////////////////////////
+    // Call the single hop
+    ////////////////////////
+
+    int Usites = U.Grid()->oSites();
+
+    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);
+
+    ////////////////////////////
+    // spin trace outer product
+    ////////////////////////////
+    Impl::InsertForce5D(mat, Btilde, Atilde, mu);
+  }
+}
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopDeriv(GaugeField &mat,
+                                      const FermionField &A,
+                                      const FermionField &B,
+                                      int dag)
+{
+  conformable(A.Grid(),FermionGrid());  
+  conformable(A.Grid(),B.Grid());
+
+  //conformable(GaugeGrid(),mat.Grid());// this is not general! leaving as a comment
+
+  mat.Checkerboard() = A.Checkerboard();
+  //  mat.checkerboard = A.checkerboard;
+
+  DerivInternal(Stencil,Umu,mat,A,B,dag);
+}
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopDerivEO(GaugeField &mat,
+                                        const FermionField &A,
+                                        const FermionField &B,
+                                        int dag)
+{
+  conformable(A.Grid(),FermionRedBlackGrid());
+  conformable(A.Grid(),B.Grid());
+
+  assert(B.Checkerboard()==Odd);
+  assert(A.Checkerboard()==Even);
+  mat.Checkerboard() = Even;
+
+  DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
+}
+
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopDerivOE(GaugeField &mat,
+                                        const FermionField &A,
+                                        const FermionField &B,
+                                        int dag)
+{
+  conformable(A.Grid(),FermionRedBlackGrid());
+  conformable(A.Grid(),B.Grid());
+
+  assert(B.Checkerboard()==Even);
+  assert(A.Checkerboard()==Odd);
+  mat.Checkerboard() = Odd;
+
+  DerivInternal(StencilEven,UmuOdd,mat,A,B,dag);
+}
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternal(StencilImpl & st,
+                                         DoubledGaugeField & U,
+                                         const FermionField &in, FermionField &out,int dag)
+{
+  DhopInternalSerialComms(st,U,in,out,dag);
+}
+
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
+							DoubledGaugeField & U,
+							const FermionField &in, FermionField &out,int dag)
+{
+  GRID_TRACE("DhopInternalOverlappedComms");
+  Compressor compressor(dag);
+
+  int LLs = in.Grid()->_rdimensions[0];
+  int len =  U.Grid()->oSites();
+
+  /////////////////////////////
+  // Start comms  // Gather intranode and extra node differentiated??
+  /////////////////////////////
+  {
+    //    std::cout << " TwoSpinWilsonFermion3plus1D gather " <<std::endl;
+    GRID_TRACE("Gather");
+    st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
+  }
+  
+  //  std::cout << " TwoSpinWilsonFermion3plus1D Communicate Begin " <<std::endl;
+  std::vector<std::vector<CommsRequest_t> > requests;
+
+#if 1
+  /////////////////////////////
+  // Overlap with comms
+  /////////////////////////////
+  st.CommunicateBegin(requests);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms 
+#endif
+
+  /////////////////////////////
+  // do the compute interior
+  /////////////////////////////
+  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDagInterior");
+    Kernels::DhopDagKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
+  } else {
+    GRID_TRACE("DhopInterior");
+    Kernels::DhopKernel   (st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
+  }
+  
+  //ifdef GRID_ACCELERATED
+#if 0
+  /////////////////////////////
+  // Overlap with comms -- on GPU the interior kernel call is nonblocking
+  /////////////////////////////
+  st.CommunicateBegin(requests);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+#endif
+  
+  
+  /////////////////////////////
+  // Complete comms
+  /////////////////////////////
+  //  std::cout << " TwoSpinWilsonFermion3plus1D Comms Complete " <<std::endl;
+  st.CommunicateComplete(requests);
+  //  traceStop(id);
+
+  /////////////////////////////
+  // do the compute exterior
+  /////////////////////////////
+  {
+    //    std::cout << " TwoSpinWilsonFermion3plus1D Comms Merge " <<std::endl;
+    GRID_TRACE("Merge");
+    st.CommsMerge(compressor);
+  }
+  
+
+  //  std::cout << " TwoSpinWilsonFermion3plus1D Exterior " <<std::endl;
+  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDagExterior");
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
+  } else {
+    GRID_TRACE("DhopExterior");
+    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
+  }
+  //  std::cout << " TwoSpinWilsonFermion3plus1D Done " <<std::endl;
+}
+
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternalSerialComms(StencilImpl & st, 
+						    DoubledGaugeField & U,
+						    const FermionField &in, 
+						    FermionField &out,int dag)
+{
+  GRID_TRACE("DhopInternalSerialComms");
+  Compressor compressor(dag);
+
+  int LLs = in.Grid()->_rdimensions[0];
+
+  //  std::cout << " TwoSpinWilsonFermion3plus1D Halo exch " <<std::endl;
+  {
+    GRID_TRACE("HaloExchange");
+    st.HaloExchangeOpt(in,compressor);
+  }
+  
+  //  std::cout << " TwoSpinWilsonFermion3plus1D Dhop " <<std::endl;
+  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDag");
+    Kernels::DhopDagKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out);
+  } else {
+    GRID_TRACE("Dhop");
+    Kernels::DhopKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out);
+  }
+  //  std::cout << " TwoSpinWilsonFermion3plus1D Done " <<std::endl;
+}
+
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
+{
+  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
+  conformable(in.Grid(),out.Grid()); // drops the cb check
+
+  assert(in.Checkerboard()==Even);
+  out.Checkerboard() = Odd;
+
+  DhopInternal(StencilEven,UmuOdd,in,out,dag);
+}
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
+{
+  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
+  conformable(in.Grid(),out.Grid()); // drops the cb check
+
+  assert(in.Checkerboard()==Odd);
+  out.Checkerboard() = Even;
+
+  DhopInternal(StencilOdd,UmuEven,in,out,dag);
+}
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopComms(const FermionField &in, FermionField &out)
+{
+  int dag =0 ;
+  conformable(in.Grid(),FermionGrid()); // verifies full grid
+  conformable(in.Grid(),out.Grid());
+  out.Checkerboard() = in.Checkerboard();
+  Compressor compressor(dag);
+  Stencil.HaloExchangeOpt(in,compressor);
+}
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids)
+{
+  conformable(in.Grid(),FermionGrid()); // verifies full grid
+  conformable(in.Grid(),out.Grid());
+
+  out.Checkerboard() = in.Checkerboard();
+
+  int LLs = in.Grid()->_rdimensions[0];
+  Kernels::DhopKernel(Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids);
+}
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
+{
+  conformable(in.Grid(),FermionGrid()); // verifies full grid
+  conformable(in.Grid(),out.Grid());
+
+  out.Checkerboard() = in.Checkerboard();
+
+  DhopInternal(Stencil,Umu,in,out,dag);
+}
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
+{
+  out.Checkerboard()=in.Checkerboard();
+  Dhop(in,out,dag); // -0.5 is included
+  axpy(out,Nd*1.0-M5,in,out);
+}
+template <class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::Meooe(const FermionField &in, FermionField &out)
+{
+  if (in.Checkerboard() == Odd) {
+    DhopEO(in, out, DaggerNo);
+  } else {
+    DhopOE(in, out, DaggerNo);
+  }
+}
+
+template <class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
+{
+  if (in.Checkerboard() == Odd) {
+    DhopEO(in, out, DaggerYes);
+  } else {
+    DhopOE(in, out, DaggerYes);
+  }
+}
+
+template <class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::Mooee(const FermionField &in, FermionField &out)
+{
+  out.Checkerboard() = in.Checkerboard();
+  typename FermionField::scalar_type scal(Nd*1.0 + M5);
+  out = scal * in;
+}
+
+template <class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
+{
+  out.Checkerboard() = in.Checkerboard();
+  Mooee(in, out);
+}
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
+{
+  out.Checkerboard() = in.Checkerboard();
+  out = (1.0/(Nd*1.0 + M5))*in;
+}
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
+{
+  out.Checkerboard() = in.Checkerboard();
+  MooeeInv(in,out);
+}
+  
+NAMESPACE_END(Grid);
+
+
+
+
--- a/Grid/qcd/action/fermion/implementation/TwoSpinWilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/TwoSpinWilsonKernelsImplementation.h
@@ -0,0 +1,441 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/TwoSpinWilsonKernels.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+NAMESPACE_BEGIN(Grid);
+
+
+////////////////////////////////////////////
+// Generic implementation; move to different file?
+////////////////////////////////////////////
+
+#define GENERIC_STENCIL_LEG(Dir,spProj,Recon)			\
+  SE = st.GetEntry(ptype, Dir, sF);				\
+  if (SE->_is_local) {						\
+    int perm= SE->_permute;					\
+    auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
+    spProj(chi,tmp);						\
+  } else {							\
+    chi = coalescedRead(buf[SE->_offset],lane);			\
+  }								\
+  acceleratorSynchronise();					\
+  Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
+  Recon(result, Uchi);
+
+#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon)		\
+  SE = st.GetEntry(ptype, Dir, sF);				\
+  if (SE->_is_local) {						\
+    int perm= SE->_permute;					\
+    auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
+    spProj(chi,tmp);							\
+    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);			\
+    Recon(result, Uchi);						\
+  }									\
+  acceleratorSynchronise();
+
+#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		\
+  SE = st.GetEntry(ptype, Dir, sF);				\
+  if (!SE->_is_local ) {		\
+    auto chi = coalescedRead(buf[SE->_offset],lane);		\
+    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
+    Recon(result, Uchi);					\
+    nmu++;							\
+  }								\
+  acceleratorSynchronise();
+
+#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon)		\
+    if (SE->_is_local ) {					\
+      int perm= SE->_permute;					\
+      auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
+      spProj(chi,tmp);						\
+    } else {							\
+      chi = coalescedRead(buf[SE->_offset],lane);		\
+    }								\
+    acceleratorSynchronise();					\
+    Impl::multLink(Uchi, U[sU], chi, dir, SE, st);		\
+    Recon(result, Uchi);
+
+#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)			\
+  if (gamma == Dir) {						\
+    GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon);			\
+  }
+
+////////////////////////////////////////////////////////////////////
+// All legs kernels ; comms then compute
+////////////////////////////////////////////////////////////////////
+template <class Impl> accelerator_inline
+void TwoSpinWilsonKernels<Impl>::DhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
+					     SiteSpinor *buf, int sF,
+					     int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef decltype(coalescedRead(in[0])) calcSpinor;
+  calcSpinor chi;
+  calcSpinor Uchi;
+  calcSpinor result;
+  StencilEntry *SE;
+  int ptype;
+  const int Nsimd = SiteSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  GENERIC_STENCIL_LEG(Xp,pauliProjXp,pauliAssign);
+  GENERIC_STENCIL_LEG(Yp,pauliProjYp,pauliAdd);
+  GENERIC_STENCIL_LEG(Zp,pauliProjZp,pauliAdd);
+  GENERIC_STENCIL_LEG(Xm,pauliProjXm,pauliAdd);
+  GENERIC_STENCIL_LEG(Ym,pauliProjYm,pauliAdd);
+  GENERIC_STENCIL_LEG(Zm,pauliProjZm,pauliAdd);
+  coalescedWrite(out[sF],result,lane);
+};
+
+template <class Impl> accelerator_inline
+void TwoSpinWilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
+					  SiteSpinor *buf, int sF,
+					  int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;
+  calcSpinor chi;
+  //  calcSpinor *chi_p;
+  calcSpinor Uchi;
+  calcSpinor result;
+  StencilEntry *SE;
+  int ptype;
+
+  const int Nsimd = SiteSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  GENERIC_STENCIL_LEG(Xm,pauliProjXp,pauliAssign);
+  GENERIC_STENCIL_LEG(Ym,pauliProjYp,pauliAdd);
+  GENERIC_STENCIL_LEG(Zm,pauliProjZp,pauliAdd);
+  GENERIC_STENCIL_LEG(Xp,pauliProjXm,pauliAdd);
+  GENERIC_STENCIL_LEG(Yp,pauliProjYm,pauliAdd);
+  GENERIC_STENCIL_LEG(Zp,pauliProjZm,pauliAdd);
+  coalescedWrite(out[sF], result,lane);
+};
+  ////////////////////////////////////////////////////////////////////
+  // Interior kernels
+  ////////////////////////////////////////////////////////////////////
+template <class Impl> accelerator_inline
+void TwoSpinWilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U,
+						       SiteSpinor *buf, int sF,
+						       int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;
+  calcSpinor chi;
+  //  calcSpinor *chi_p;
+  calcSpinor Uchi;
+  calcSpinor result;
+  StencilEntry *SE;
+  int ptype;
+  const int Nsimd = SiteSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  result=Zero();
+  GENERIC_STENCIL_LEG_INT(Xp,pauliProjXp,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Yp,pauliProjYp,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Zp,pauliProjZp,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Xm,pauliProjXm,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Ym,pauliProjYm,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Zm,pauliProjZm,pauliAdd);
+  coalescedWrite(out[sF], result,lane);
+};
+
+template <class Impl> accelerator_inline
+void TwoSpinWilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U,
+						    SiteSpinor *buf, int sF,
+						    int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;
+  const int Nsimd = SiteSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  calcSpinor chi;
+  //  calcSpinor *chi_p;
+  calcSpinor Uchi;
+  calcSpinor result;
+  StencilEntry *SE;
+  int ptype;
+  result=Zero();
+  GENERIC_STENCIL_LEG_INT(Xm,pauliProjXp,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Ym,pauliProjYp,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Zm,pauliProjZp,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Xp,pauliProjXm,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Yp,pauliProjYm,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Zp,pauliProjZm,pauliAdd);
+  coalescedWrite(out[sF], result,lane);
+};
+////////////////////////////////////////////////////////////////////
+// Exterior kernels
+////////////////////////////////////////////////////////////////////
+template <class Impl> accelerator_inline
+void TwoSpinWilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U,
+						SiteSpinor *buf, int sF,
+						int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;
+  //  calcSpinor *chi_p;
+  calcSpinor Uchi;
+  calcSpinor result;
+  StencilEntry *SE;
+  int ptype;
+  int nmu=0;
+  const int Nsimd = SiteSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  result=Zero();
+  GENERIC_STENCIL_LEG_EXT(Xp,pauliProjXp,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Yp,pauliProjYp,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Zp,pauliProjZp,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Xm,pauliProjXm,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Ym,pauliProjYm,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Zm,pauliProjZm,pauliAdd);
+  if ( nmu ) {
+    auto out_t = coalescedRead(out[sF],lane);
+    out_t = out_t + result;
+    coalescedWrite(out[sF],out_t,lane);
+  }
+};
+
+template <class Impl> accelerator_inline
+void TwoSpinWilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U,
+					     SiteSpinor *buf, int sF,
+					     int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;
+  //  calcSpinor *chi_p;
+  calcSpinor Uchi;
+  calcSpinor result;
+  StencilEntry *SE;
+  int ptype;
+  int nmu=0;
+  const int Nsimd = SiteSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  result=Zero();
+  GENERIC_STENCIL_LEG_EXT(Xm,pauliProjXp,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Ym,pauliProjYp,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Zm,pauliProjZp,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Xp,pauliProjXm,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Yp,pauliProjYm,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Zp,pauliProjZm,pauliAdd);
+  if ( nmu ) {
+    auto out_t = coalescedRead(out[sF],lane);
+    out_t = out_t + result;
+    coalescedWrite(out[sF],out_t,lane);
+  }
+};
+
+#define DhopDirMacro(Dir,spProj,spRecon)	\
+  template <class Impl> accelerator_inline				\
+  void TwoSpinWilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteSpinor *buf, int sF, \
+					 int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
+  {									\
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;			\
+  calcSpinor chi;							\
+  calcSpinor result;							\
+  calcSpinor Uchi;							\
+  StencilEntry *SE;							\
+  int ptype;								\
+  const int Nsimd = SiteSpinor::Nsimd();				\
+  const int lane=acceleratorSIMTlane(Nsimd);					\
+									\
+  SE = st.GetEntry(ptype, dir, sF);					\
+  GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon);				\
+  coalescedWrite(out[sF], result,lane);					\
+  }
+
+DhopDirMacro(Xp,pauliProjXp,pauliAssign);
+DhopDirMacro(Yp,pauliProjYp,pauliAssign);
+DhopDirMacro(Zp,pauliProjZp,pauliAssign);
+DhopDirMacro(Xm,pauliProjXm,pauliAssign);
+DhopDirMacro(Ym,pauliProjYm,pauliAssign);
+DhopDirMacro(Zm,pauliProjZm,pauliAssign);
+
+template <class Impl> accelerator_inline
+void TwoSpinWilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteSpinor *buf, int sF,
+				    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
+{
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;
+  calcSpinor chi;
+  calcSpinor result;
+  calcSpinor Uchi;
+  StencilEntry *SE;
+  int ptype;
+  const int Nsimd = SiteSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  SE = st.GetEntry(ptype, dir, sF);
+  GENERIC_DHOPDIR_LEG(Xp,pauliProjXp,pauliAssign);
+  GENERIC_DHOPDIR_LEG(Yp,pauliProjYp,pauliAssign);
+  GENERIC_DHOPDIR_LEG(Zp,pauliProjZp,pauliAssign);
+  GENERIC_DHOPDIR_LEG(Xm,pauliProjXm,pauliAssign);
+  GENERIC_DHOPDIR_LEG(Ym,pauliProjYm,pauliAssign);
+  GENERIC_DHOPDIR_LEG(Zm,pauliProjZm,pauliAssign);
+  coalescedWrite(out[sF], result,lane);
+}
+
+template <class Impl>
+void TwoSpinWilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
+				      int Nsite, const FermionField &in, std::vector<FermionField> &out)
+{
+   autoView(U_v  ,U,AcceleratorRead);
+   autoView(in_v ,in,AcceleratorRead);
+   autoView(st_v ,st,AcceleratorRead);
+
+   autoView(out_Xm,out[0],AcceleratorWrite);
+   autoView(out_Ym,out[1],AcceleratorWrite);
+   autoView(out_Zm,out[2],AcceleratorWrite);
+   autoView(out_Xp,out[4],AcceleratorWrite);
+   autoView(out_Yp,out[5],AcceleratorWrite);
+   autoView(out_Zp,out[6],AcceleratorWrite);
+   auto CBp=st.CommBuf();
+   accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{
+      int sU=sss/Ls;
+      int sF =sss;
+      DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
+      DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
+      DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
+      DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,3);
+      DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,4);
+      DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,5);
+   });
+}
+
+
+template <class Impl>
+void TwoSpinWilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
+					 int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma)
+{
+  assert(dirdisp<=5);
+  assert(dirdisp>=0);
+
+   autoView(U_v  ,U  ,AcceleratorRead);
+   autoView(in_v ,in ,AcceleratorRead);
+   autoView(out_v,out,AcceleratorWrite);
+   autoView(st_v ,st ,AcceleratorRead);
+   auto CBp=st.CommBuf();
+#define LoopBody(Dir)				\
+   case Dir :					\
+     accelerator_for(ss,Nsite,Simd::Nsimd(),{	\
+       for(int s=0;s<Ls;s++){			\
+	 int sU=ss;				\
+	 int sF = s+Ls*sU;						\
+	 DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
+       }							       \
+       });							       \
+     break;
+
+   switch(gamma){
+   LoopBody(Xp);
+   LoopBody(Yp);
+   LoopBody(Zp);
+
+   LoopBody(Xm);
+   LoopBody(Ym);
+   LoopBody(Zm);
+   default:
+     assert(0);
+     break;
+   }
+#undef LoopBody
+}
+
+
+#define KERNEL_CALLNB(A)						\
+  const uint64_t    NN = Nsite*Ls;					\
+  accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
+      int sF = ss;							\
+      int sU = ss/Ls;							\
+      TwoSpinWilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
+    });
+
+#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
+
+#define KERNEL_CALL_EXT(A)						\
+  const uint64_t    sz = st.surface_list.size();			\
+  auto ptr = &st.surface_list[0];					\
+  accelerator_forNB( ss, sz, Simd::Nsimd(), {				\
+      int sF = ptr[ss];							\
+      int sU = sF/Ls;							\
+      TwoSpinWilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
+    });									\
+  accelerator_barrier();
+
+
+template <class Impl>
+void TwoSpinWilsonKernels<Impl>::DhopKernel(StencilImpl &st,  DoubledGaugeField &U, SiteSpinor * buf,
+					    int Ls, int Nsite, const FermionField &in, FermionField &out,
+					    int interior,int exterior)
+{
+  autoView(U_v  ,  U,AcceleratorRead);
+  autoView(in_v , in,AcceleratorRead);
+  autoView(out_v,out,AcceleratorWrite);
+  autoView(st_v , st,AcceleratorRead);
+  
+  if( interior && exterior ) {
+    acceleratorFenceComputeStream();
+    KERNEL_CALL(GenericDhopSite);
+    return;
+  } else if( interior ) {
+    KERNEL_CALLNB(GenericDhopSiteInt);
+    return;
+  } else if( exterior ) {
+    //     // dependent on result of merge
+    acceleratorFenceComputeStream();
+    KERNEL_CALL_EXT(GenericDhopSiteExt);
+    return;
+  }
+  assert(0 && " Kernel optimisation case not covered ");
+}
+
+template <class Impl>
+void TwoSpinWilsonKernels<Impl>::DhopDagKernel(StencilImpl &st,  DoubledGaugeField &U, SiteSpinor * buf,
+					       int Ls, int Nsite, const FermionField &in, FermionField &out,
+					       int interior,int exterior)
+{
+  autoView(U_v  ,U,AcceleratorRead);
+  autoView(in_v ,in,AcceleratorRead);
+  autoView(out_v,out,AcceleratorWrite);
+  autoView(st_v ,st,AcceleratorRead);
+  
+  if( interior && exterior ) {
+    acceleratorFenceComputeStream();
+    KERNEL_CALL(GenericDhopSiteDag);
+    return;
+  } else if( interior ) {
+    KERNEL_CALLNB(GenericDhopSiteDagInt); return;
+  } else if( exterior ) {
+    // Dependent on result of merge
+    acceleratorFenceComputeStream();
+    KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;
+  }
+  assert(0 && " Kernel optimisation case not covered ");
+}
+
+#undef KERNEL_CALLNB
+#undef KERNEL_CALL
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h
@@ -61,7 +61,7 @@ WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField&
    diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
  } else {
    csw_r     = _csw_r * 0.5;
-    diag_mass = 4.0 + _mass;
+    diag_mass = Nd*1.0 + _mass;
  }
  csw_t = _csw_t * 0.5;

@@ -297,9 +297,9 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const F
    {
      if (mu == nu)
      continue;
-
+      
      RealD factor;
-      if (nu == 4 || mu == 4)
+      if (nu == (Nd-1) || mu == (Nd-1)) // This was a bug - surely mu/nu is NEVER 4 but rather (Nd-1)=3 ??
      {
        factor = 2.0 * csw_t;
      }
@@ -307,9 +307,11 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const F
      {
        factor = 2.0 * csw_r;
      }
-      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
-      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
-      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);                   // checked
+      if ( mu < Nd && nu < Nd ) { // Allow to restrict range to Nd=3, but preserve orders of SigmaMuNu in table by counting ALL
+	PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
+	Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
+	force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu);                   // checked
+      }
      count++;
    }

--- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h
@@ -63,10 +63,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  Dirichlet(0)
 {
  // some assertions
-  assert(FiveDimGrid._ndimension==5);
-  assert(FourDimGrid._ndimension==4);
-  assert(FourDimRedBlackGrid._ndimension==4);
-  assert(FiveDimRedBlackGrid._ndimension==5);
+  assert(FiveDimGrid._ndimension==Nd+1);
+  assert(FourDimGrid._ndimension==Nd);
+  assert(FourDimRedBlackGrid._ndimension==Nd);
+  assert(FiveDimRedBlackGrid._ndimension==Nd+1);
  assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction

  // extent of fifth dim and not spread out
@@ -76,7 +76,7 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  assert(FiveDimRedBlackGrid._processors[0] ==1);

  // Other dimensions must match the decomposition of the four-D fields 
-  for(int d=0;d<4;d++){
+  for(int d=0;d<Nd;d++){

    assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
@@ -93,11 +93,13 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,

  if ( p.dirichlet.size() == Nd+1) {
    Coordinate block = p.dirichlet;
-    if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
-      Dirichlet = 1;
-      std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
-      std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
-      Block = block;
+    for(int d=0;d<Nd+1;d++) {
+      if ( block[d] ){
+	Dirichlet = 1;
+	std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
+	std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
+	Block = block;
+      }
    }
  } else {
    Coordinate block(Nd+1,0);
@@ -112,7 +114,7 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
    assert(FiveDimGrid._simd_layout[0]        ==nsimd);
    assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);

-    for(int d=0;d<4;d++){
+    for(int d=0;d<Nd;d++){
      assert(FourDimGrid._simd_layout[d]==1);
      assert(FourDimRedBlackGrid._simd_layout[d]==1);
      assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
@@ -183,8 +185,8 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
  //  assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;

  int skip = (disp==1) ? 0 : 1;
-  int dirdisp = dir+skip*4;
-  int gamma   = dir+(1-skip)*4;
+  int dirdisp = dir+skip*Nd;
+  int gamma   = dir+(1-skip)*Nd;

  Compressor compressor(DaggerNo);
  Stencil.HaloExchange(in,compressor);
@@ -483,7 +485,7 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
 {
  out.Checkerboard()=in.Checkerboard();
  Dhop(in,out,dag); // -0.5 is included
-  axpy(out,4.0-M5,in,out);
+  axpy(out,Nd*1.0-M5,in,out);
 }
 template <class Impl>
 void WilsonFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
@@ -509,7 +511,7 @@ template <class Impl>
 void WilsonFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
 {
  out.Checkerboard() = in.Checkerboard();
-  typename FermionField::scalar_type scal(4.0 + M5);
+  typename FermionField::scalar_type scal(Nd*1.0 + M5);
  out = scal * in;
 }

@@ -524,7 +526,7 @@ template<class Impl>
 void WilsonFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
 {
  out.Checkerboard() = in.Checkerboard();
-  out = (1.0/(4.0 + M5))*in;
+  out = (1.0/(Nd*1.0 + M5))*in;
 }

 template<class Impl>
@@ -635,7 +637,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
  A = one / (abs(W) * sinha * 2.0) * one / (sinhaLs * 2.0);
  F = eaLs * (one - Wea + (Wema - one) * mass*mass);
  F = F + emaLs * (Wema - one + (one - Wea) * mass*mass);
-  F = F - abs(W) * sinha * 4.0 * mass;
+  F = F - abs(W) * sinha * (Nd* 1.0) * mass;

  Bpp =  (A/F) * (ema2Ls - one) * (one - Wema) * (one - mass*mass * one);
  Bmm =  (A/F) * (one - ea2Ls)  * (one - Wea) * (one - mass*mass * one);
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -63,7 +63,7 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
  if  (anisotropyCoeff.isAnisotropic){
    diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
  } else {
-    diag_mass = 4.0 + mass;
+    diag_mass = Nd*1.0 + mass;
  }

  int vol4;
@@ -354,8 +354,8 @@ void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int
  Stencil.HaloExchange(in, compressor);

  int skip = (disp == 1) ? 0 : 1;
-  int dirdisp = dir + skip * 4;
-  int gamma = dir + (1 - skip) * 4;
+  int dirdisp = dir + skip * Nd;
+  int gamma = dir + (1 - skip) * Nd;

  DhopDirCalc(in, out, dirdisp, gamma, DaggerNo);
 };
@@ -370,8 +370,8 @@ void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<Fermion
    for(int disp=-1;disp<=1;disp+=2){

      int skip = (disp == 1) ? 0 : 1;
-      int dirdisp = dir + skip * 4;
-      int gamma = dir + (1 - skip) * 4;
+      int dirdisp = dir + skip * Nd;
+      int gamma = dir + (1 - skip) * Nd;

      DhopDirCalc(in, out[dirdisp], dirdisp, gamma, DaggerNo);
    }
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandGparityImplementation.h
@@ -97,7 +97,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
  distance = st._distances[DIR];				\
  sl = st._simd_layout[direction];			        \
  inplace_twist = 0;						\
-  if(SE->_around_the_world && st.parameters.twists[DIR % 4]){		\
+  if(SE->_around_the_world && st.parameters.twists[DIR % Nd]){		\
    if(sl == 1){							\
      g = (F+1) % 2;							\
    }else{								\
--- a/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermion5DInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermion5DInstantiation.cc
@@ -32,8 +32,30 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
  
 // S-direction is INNERMOST and takes no part in the parity.
-const std::vector<int> ImprovedStaggeredFermion5DStatic::directions({1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4});
-const std::vector<int> ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
+const std::vector<int> ImprovedStaggeredFermion5DStatic::directions(ImprovedStaggeredFermion5DStatic::MakeDirections());
+const std::vector<int> ImprovedStaggeredFermion5DStatic::displacements(ImprovedStaggeredFermion5DStatic::MakeDisplacements());
+std::vector<int> ImprovedStaggeredFermion5DStatic::MakeDirections(void)
+{
+  std::vector<int> directions(4*Nd);
+  for(int d=0;d<Nd;d++){
+    directions[d+Nd*0] = d+1;
+    directions[d+Nd*1] = d+1;
+    directions[d+Nd*2] = d+1;
+    directions[d+Nd*3] = d+1;
+  }
+  return directions;
+}
+std::vector<int> ImprovedStaggeredFermion5DStatic::MakeDisplacements(void)
+{
+  std::vector<int> displacements(4*Nd);
+  for(int d=0;d<Nd;d++){
+    displacements[d+Nd*0] =+1;
+    displacements[d+Nd*1] =-1;
+    displacements[d+Nd*2] =+3;
+    displacements[d+Nd*3] =-3;
+  }
+  return displacements;
+}
  
 NAMESPACE_END(Grid);

--- a/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/ImprovedStaggeredFermionInstantiation.cc
@@ -32,5 +32,26 @@ NAMESPACE_BEGIN(Grid);

 const std::vector<int> ImprovedStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
 const std::vector<int> ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
-
+std::vector<int> ImprovedStaggeredFermionStatic::MakeDirections(void)
+{
+  std::vector<int> directions(4*Nd);
+  for(int d=0;d<Nd;d++){
+    directions[d+Nd*0] = d;
+    directions[d+Nd*1] = d;
+    directions[d+Nd*2] = d;
+    directions[d+Nd*3] = d;
+  }
+  return directions;
+}
+std::vector<int> ImprovedStaggeredFermionStatic::MakeDisplacements(void)
+{
+  std::vector<int> displacements(4*Nd);
+  for(int d=0;d<Nd;d++){
+    displacements[d+Nd*0] =+1;
+    displacements[d+Nd*1] =-1;
+    displacements[d+Nd*2] =+3;
+    displacements[d+Nd*3] =-3;
+  }
+  return displacements;
+}
 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/NaiveStaggeredFermionInstantiation.cc
@@ -30,7 +30,27 @@ directory

 NAMESPACE_BEGIN(Grid);

-const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
-const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
+//const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
+//const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
+const std::vector<int> NaiveStaggeredFermionStatic::directions(NaiveStaggeredFermionStatic::MakeDirections());
+const std::vector<int> NaiveStaggeredFermionStatic::displacements(NaiveStaggeredFermionStatic::MakeDisplacements());
+std::vector<int> NaiveStaggeredFermionStatic::MakeDirections(void)
+{
+  std::vector<int> directions(4*Nd);
+  for(int d=0;d<Nd;d++){
+    directions[d+Nd*0] = d;
+    directions[d+Nd*1] = d;
+  }
+  return directions;
+}
+std::vector<int> NaiveStaggeredFermionStatic::MakeDisplacements(void)
+{
+  std::vector<int> displacements(4*Nd);
+  for(int d=0;d<Nd;d++){
+    displacements[d+Nd*0] =+1;
+    displacements[d+Nd*1] =-1;
+  }
+  return displacements;
+}

 NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/TwoSpinWilsonFermion3plus1DInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/TwoSpinWilsonFermion3plus1DInstantiation.cc
@@ -0,0 +1,61 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// S-direction is INNERMOST and takes no part in the parity.
+
+const std::vector<int> TwoSpinWilsonFermion3plus1DStatic::directions   (TwoSpinWilsonFermion3plus1DStatic::MakeDirections());
+const std::vector<int> TwoSpinWilsonFermion3plus1DStatic::displacements(TwoSpinWilsonFermion3plus1DStatic::MakeDisplacements());
+
+std::vector<int> TwoSpinWilsonFermion3plus1DStatic::MakeDirections (void)
+{
+  std::vector<int> directions(2*Nd);
+  for(int d=0;d<Nd;d++){
+    directions[d]    = d+1;
+    directions[d+Nd] = d+1;
+  }
+  return directions;
+}
+std::vector<int> TwoSpinWilsonFermion3plus1DStatic::MakeDisplacements(void)
+{
+  std::vector<int> displacements(2*Nd);
+  for(int d=0;d<Nd;d++){
+    displacements[d]    = +1;
+    displacements[d+Nd] = -1;
+  }
+  return displacements;
+}
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/instantiation/TwoSpinWilsonFermion3plus1DInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/TwoSpinWilsonFermion3plus1DInstantiation.cc.master
@@ -0,0 +1,40 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/TwoSpinWilsonFermion3plus1DImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h"
+template class TwoSpinWilsonFermion3plus1D<IMPLEMENTATION>; 
+
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/action/fermion/instantiation/TwoSpinWilsonKernelsInstantiation.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/TwoSpinWilsonKernelsInstantiation.cc.master
@@ -0,0 +1,40 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/TwoSpinWilsonKernelsImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h"
+template class TwoSpinWilsonKernels<IMPLEMENTATION>;
+
+NAMESPACE_END(Grid);
--- a/Grid/qcd/action/fermion/instantiation/WilsonFermion5DInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonFermion5DInstantiation.cc
@@ -34,8 +34,28 @@ directory
 NAMESPACE_BEGIN(Grid);

 // S-direction is INNERMOST and takes no part in the parity.
-const std::vector<int> WilsonFermion5DStatic::directions   ({1,2,3,4, 1, 2, 3, 4});
-const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
+
+const std::vector<int> WilsonFermion5DStatic::directions   (WilsonFermion5DStatic::MakeDirections());
+const std::vector<int> WilsonFermion5DStatic::displacements(WilsonFermion5DStatic::MakeDisplacements());
+
+std::vector<int> WilsonFermion5DStatic::MakeDirections (void)
+{
+  std::vector<int> directions(2*Nd);
+  for(int d=0;d<Nd;d++){
+    directions[d]    = d+1;
+    directions[d+Nd] = d+1;
+  }
+  return directions;
+}
+std::vector<int> WilsonFermion5DStatic::MakeDisplacements(void)
+{
+  std::vector<int> displacements(2*Nd);
+  for(int d=0;d<Nd;d++){
+    displacements[d]    = +1;
+    displacements[d+Nd] = -1;
+  }
+  return displacements;
+}

 NAMESPACE_END(Grid);

--- a/Grid/qcd/action/fermion/instantiation/WilsonFermionInstantiation.cc
+++ b/Grid/qcd/action/fermion/instantiation/WilsonFermionInstantiation.cc
@@ -33,9 +33,27 @@ directory

 NAMESPACE_BEGIN(Grid);

-const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
-const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
+const std::vector<int> WilsonFermionStatic::directions(WilsonFermionStatic::MakeDirections());
+const std::vector<int> WilsonFermionStatic::displacements(WilsonFermionStatic::MakeDisplacements());
 int WilsonFermionStatic::HandOptDslash;
+std::vector<int> WilsonFermionStatic::MakeDirections (void)
+{
+  std::vector<int> directions(2*Nd);
+  for(int d=0;d<Nd;d++){
+    directions[d]    = d;
+    directions[d+Nd] = d;
+  }
+  return directions;
+}
+std::vector<int> WilsonFermionStatic::MakeDisplacements(void)
+{
+  std::vector<int> displacements(2*Nd);
+  for(int d=0;d<Nd;d++){
+    displacements[d]    = +1;
+    displacements[d+Nd] = -1;
+  }
+  return displacements;
+}

 NAMESPACE_END(Grid);

--- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
+++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh
@@ -36,11 +36,16 @@ DWF_IMPL_LIST=" \
 	   ZWilsonImplF \
 	   ZWilsonImplD2 "

+TWOSPIN_WILSON_IMPL_LIST=" \
+	   TwoSpinWilsonImplF \
+	   TwoSpinWilsonImplD "
+
+
 GDWF_IMPL_LIST=" \
 	   GparityWilsonImplF \
 	   GparityWilsonImplD "

-IMPL_LIST="$STAG_IMPL_LIST  $WILSON_IMPL_LIST $DWF_IMPL_LIST $GDWF_IMPL_LIST"
+IMPL_LIST="$STAG_IMPL_LIST  $WILSON_IMPL_LIST $DWF_IMPL_LIST $GDWF_IMPL_LIST $TWOSPIN_WILSON_IMPL_LIST"

 for impl in $IMPL_LIST
 do
@@ -110,7 +115,12 @@ do
 done
 done

-CC_LIST=" \
-  ImprovedStaggeredFermion5DInstantiation \
-  StaggeredKernelsInstantiation "
+CC_LIST="TwoSpinWilsonFermion3plus1DInstantiation.cc.master	TwoSpinWilsonKernelsInstantiation.cc.master"

+for impl in $TWOSPIN_WILSON_IMPL_LIST
+do
+for f in $CC_LIST
+do
+  ln -f -s ../$f.cc.master $impl/$f$impl.cc
+done
+done
--- a/Grid/qcd/smearing/WilsonFlow.h
+++ b/Grid/qcd/smearing/WilsonFlow.h
@@ -158,8 +158,8 @@ RealD WilsonFlowBase<Gimpl>::energyDensityCloverleaf(const RealD t, const GaugeF
  LatticeComplexD R(U.Grid());
  R = Zero();
  
-  for(int mu=0;mu<3;mu++){
-    for(int nu=mu+1;nu<4;nu++){
+  for(int mu=0;mu<Nd-1;mu++){
+    for(int nu=mu+1;nu<Nd;nu++){
      WilsonLoops<Gimpl>::FieldStrength(F, U, mu, nu);
      R = R + trace(F*F);
    }
@@ -252,6 +252,11 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{

  out = in;
  RealD taus = 0.;
+
+  // Perform initial t=0 measurements
+  for(auto const &meas : this->functions)
+    meas.second(0,taus,out);
+  
  for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
    auto start = std::chrono::high_resolution_clock::now();
    evolve_step(out, taus);
@@ -336,6 +341,11 @@ void WilsonFlowAdaptive<Gimpl>::smear(GaugeField& out, const GaugeField& in) con
  RealD taus = 0.;
  RealD eps = init_epsilon;
  unsigned int step = 0;
+
+  // Perform initial t=0 measurements
+  for(auto const &meas : this->functions)
+    meas.second(step,taus,out);
+  
  do{
    int step_success = evolve_step_adaptive(out, taus, eps); 
    step += step_success; //step will not be incremented if the integration step fails
--- a/Grid/qcd/spin/Pauli.h
+++ b/Grid/qcd/spin/Pauli.h
@@ -0,0 +1,220 @@
+#ifndef GRID_QCD_PAULI_H
+#define GRID_QCD_PAULI_H
+
+#include <array>
+
+NAMESPACE_BEGIN(Grid);
+//
+/*
+ * Pauli basis
+ * sx        sy       sz       ident
+ * (0 1)  , (0 -i) , ( 1 0 )
+ * (1 0)    (i  0)   ( 0 -1)
+ *
+ * These are hermitian.
+ *
+ * Also supply wilson "projectors" (1+/-sx), (1+/-sy), (1+/-sz)
+ *
+ * spPauliProjXm
+ * spPauliProjYm etc...
+ */
+class Pauli {
+  public:
+    GRID_SERIALIZABLE_ENUM(Algebra, undef,
+                           SigmaX           , 0,
+                           MinusSigmaX      , 1,
+                           SigmaY           , 2,
+                           MinusSigmaY      , 3,
+                           SigmaZ           , 4,
+                           MinusSigmaZ      , 5,
+                           Identity         , 6,
+			   MinusIdentity    , 7);
+  
+    static constexpr unsigned int nPauli = 8;
+    static const std::array<const char *, nPauli>                name;
+    static const std::array<std::array<Algebra, nPauli>, nPauli> mul;
+    static const std::array<Algebra, nPauli>                     adj;
+    static const std::array<const Pauli, 4>                      gmu;
+    static const std::array<const Pauli, 16>                     gall;
+    Algebra                                                      g;
+  public:
+  accelerator Pauli(Algebra initg): g(initg) {}  
+};
+
+#define CopyImplementation(iTemplate,multPauli,multFlavour)	\
+  template<class vtype>							\
+  accelerator_inline void multPauli(iTemplate<vtype, Nhs> &ret, const iTemplate<vtype, Nhs> &rhs) {	\
+    multFlavour(ret,rhs);						\
+}
+
+CopyImplementation(iVector,multPauliSigmaX,multFlavourSigmaX);
+CopyImplementation(iMatrix,lmultPauliSigmaX,lmultFlavourSigmaX);
+CopyImplementation(iMatrix,rmultPauliSigmaX,rmultFlavourSigmaX);
+
+CopyImplementation(iVector,multPauliMinusSigmaX ,multFlavourMinusSigmaX);
+CopyImplementation(iMatrix,lmultPauliMinusSigmaX,lmultFlavourMinusSigmaX);
+CopyImplementation(iMatrix,rmultPauliMinusSigmaX,rmultFlavourMinusSigmaX);
+
+CopyImplementation(iVector,multPauliSigmaY,multFlavourSigmaY);
+CopyImplementation(iMatrix,lmultPauliSigmaY,lmultFlavourSigmaY);
+CopyImplementation(iMatrix,rmultPauliSigmaY,rmultFlavourSigmaY);
+
+CopyImplementation(iVector,multPauliMinusSigmaY ,multFlavourMinusSigmaY);
+CopyImplementation(iMatrix,lmultPauliMinusSigmaY,lmultFlavourMinusSigmaY);
+CopyImplementation(iMatrix,rmultPauliMinusSigmaY,rmultFlavourMinusSigmaY);
+
+CopyImplementation(iVector,multPauliSigmaZ,multFlavourSigmaZ);
+CopyImplementation(iMatrix,lmultPauliSigmaZ,lmultFlavourSigmaZ);
+CopyImplementation(iMatrix,rmultPauliSigmaZ,rmultFlavourSigmaZ);
+
+CopyImplementation(iVector,multPauliMinusSigmaZ ,multFlavourMinusSigmaZ);
+CopyImplementation(iMatrix,lmultPauliMinusSigmaZ,lmultFlavourMinusSigmaZ);
+CopyImplementation(iMatrix,rmultPauliMinusSigmaZ,rmultFlavourMinusSigmaZ);
+
+CopyImplementation(iVector,multPauliIdentity,multFlavourIdentity);
+CopyImplementation(iMatrix,lmultPauliIdentity,lmultFlavourIdentity);
+CopyImplementation(iMatrix,rmultPauliIdentity,rmultFlavourIdentity);
+
+CopyImplementation(iVector,multPauliMinusIdentity ,multFlavourMinusIdentity);
+CopyImplementation(iMatrix,lmultPauliMinusIdentity,lmultFlavourMinusIdentity);
+CopyImplementation(iMatrix,rmultPauliMinusIdentity,rmultFlavourMinusIdentity);
+
+/*
+ * sx        sy       sz       ident
+ * (0 1)  , (0 -i) , ( 1 0 )
+ * (1 0)    (i  0)   ( 0 -1)
+ */
+template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjXp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
+{
+  hspin(0)=fspin(0)+fspin(1);
+  hspin(1)=fspin(1)+fspin(0);
+}
+template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjXm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
+{
+  hspin(0)=fspin(0)-fspin(1);
+  hspin(1)=fspin(1)-fspin(0);
+}
+
+template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjYp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
+{
+  hspin(0)=fspin(0)-timesI(fspin(1));
+  hspin(1)=fspin(1)+timesI(fspin(0));
+}
+template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjYm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
+{
+  hspin(0)=fspin(0)+timesI(fspin(1));
+  hspin(1)=fspin(1)-timesI(fspin(0));
+}
+template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjZp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
+{
+  hspin(0)=fspin(0)+fspin(0);
+  hspin(1)=Zero();
+}
+template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjZm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
+{
+  hspin(0)=Zero();
+  hspin(1)=fspin(1)+fspin(1);
+}
+template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliAssign(iVector<vtype,Nhs> &fspin,const iVector<vtype,Nhs> &hspin)
+{
+  fspin = hspin;
+}
+template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliAdd   (iVector<vtype,Nhs> &fspin,const iVector<vtype,Nhs> &hspin)
+{
+  fspin = fspin + hspin;
+}
+
+template<class vtype> 
+accelerator_inline auto operator*(const Pauli &G, const iVector<vtype, Nhs> &arg)
+->typename std::enable_if<matchGridTensorIndex<iVector<vtype, Nhs>, PauliIndex>::value, iVector<vtype, Nhs>>::type
+{
+  iVector<vtype, Nhs> ret;
+
+  switch (G.g) 
+  {
+  case Pauli::Algebra::SigmaX:
+    multPauliSigmaX(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaX:
+    multPauliMinusSigmaX(ret, arg); break;
+  case Pauli::Algebra::SigmaY:
+    multPauliSigmaY(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaY:
+    multPauliMinusSigmaY(ret, arg); break;
+  case Pauli::Algebra::SigmaZ:
+    multPauliSigmaZ(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaZ:
+    multPauliMinusSigmaZ(ret, arg); break;
+  case Pauli::Algebra::Identity:
+    multPauliIdentity(ret, arg); break;
+  case Pauli::Algebra::MinusIdentity:
+    multPauliMinusIdentity(ret, arg); break;
+  default: assert(0);
+  }
+ 
+  return ret;
+}
+
+template<class vtype> 
+accelerator_inline auto operator*(const Pauli &G, const iMatrix<vtype, Nhs> &arg)
+->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Nhs>, PauliIndex>::value, iMatrix<vtype, Nhs>>::type
+{
+  iMatrix<vtype, Nhs> ret;
+
+  switch (G.g) 
+  {
+  case Pauli::Algebra::SigmaX:
+    lmultPauliSigmaX(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaX:
+    lmultPauliMinusSigmaX(ret, arg); break;
+  case Pauli::Algebra::SigmaY:
+    lmultPauliSigmaY(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaY:
+    lmultPauliMinusSigmaY(ret, arg); break;
+  case Pauli::Algebra::SigmaZ:
+    lmultPauliSigmaZ(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaZ:
+    lmultPauliMinusSigmaZ(ret, arg); break;
+  case Pauli::Algebra::Identity:
+    lmultPauliIdentity(ret, arg); break;
+  case Pauli::Algebra::MinusIdentity:
+    lmultPauliMinusIdentity(ret, arg); break;
+  default: assert(0);
+  }
+  
+  return ret;
+}
+
+template<class vtype> 
+accelerator_inline auto operator*(const iMatrix<vtype, Nhs> &arg, const Pauli &G)
+->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Nhs>, PauliIndex>::value, iMatrix<vtype, Nhs>>::type
+{
+  iMatrix<vtype, Nhs> ret;
+
+  switch (G.g) 
+  {
+  case Pauli::Algebra::SigmaX:
+    rmultPauliSigmaX(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaX:
+    rmultPauliMinusSigmaX(ret, arg); break;
+  case Pauli::Algebra::SigmaY:
+    rmultPauliSigmaY(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaY:
+    rmultPauliMinusSigmaY(ret, arg); break;
+  case Pauli::Algebra::SigmaZ:
+    rmultPauliSigmaZ(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaZ:
+    rmultPauliMinusSigmaZ(ret, arg); break;
+  case Pauli::Algebra::Identity:
+    rmultPauliIdentity(ret, arg); break;
+  case Pauli::Algebra::MinusIdentity:
+    rmultPauliMinusIdentity(ret, arg); break;
+  default: assert(0);
+  }
+
+  return ret;
+}
+
+
+NAMESPACE_END(Grid);
+
+#endif // GRID_QCD_GAMMA_H
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@@ -179,20 +179,17 @@ public:
  //////////////////////////////////////////////////
  // average over all x,y,z the temporal loop
  //////////////////////////////////////////////////
-  static ComplexD avgPolyakovLoop(const GaugeField &Umu) {  //assume Nd=4
+  static ComplexD avgPolyakovLoop(const GaugeField &Umu) {  
    GaugeMat Ut(Umu.Grid()), P(Umu.Grid());
    ComplexD out;
-    int T = Umu.Grid()->GlobalDimensions()[3];
-    int X = Umu.Grid()->GlobalDimensions()[0];
-    int Y = Umu.Grid()->GlobalDimensions()[1];
-    int Z = Umu.Grid()->GlobalDimensions()[2];
-
-    Ut = peekLorentz(Umu,3); //Select temporal direction
+    uint64_t vol = Umu.Grid()->gSites();
+    int T = Umu.Grid()->GlobalDimensions()[Nd-1];
+    Ut = peekLorentz(Umu,Nd-1); //Select temporal direction
    P = Ut;
    for (int t=1;t<T;t++){ 
-      P = Gimpl::CovShiftForward(Ut,3,P);
+      P = Gimpl::CovShiftForward(Ut,Nd-1,P);
    }
-   RealD norm = 1.0/(Nc*X*Y*Z*T);
+   RealD norm = 1.0/(Nc*vol);
   out = sum(trace(P))*norm;
   return out;   
 }
@@ -215,7 +212,7 @@ public:

    double vol = Umu.Grid()->gSites();

-    return p.real() / vol / (4.0 * Nc ) ;
+    return p.real() / vol / (Nd * Nc ) ;
  };

  //////////////////////////////////////////////////
@@ -740,6 +737,7 @@ public:
  //cf  https://arxiv.org/pdf/hep-lat/9701012.pdf  Eq 6
  //output is the charge by timeslice: sum over timeslices to obtain the total
  static std::vector<Real> TimesliceTopologicalChargeMxN(const GaugeLorentz &U, int M, int N){
+    // Audit: 4D epsilon is hard coded
    assert(Nd == 4);
    std::vector<std::vector<GaugeMat*> > F(Nd,std::vector<GaugeMat*>(Nd,nullptr));
    //Note F_numu = - F_munu
@@ -829,6 +827,25 @@ public:
    return out;
  }

+  //Compute the 5Li topological charge density
+  static std::vector<Real> TopologicalChargeDensity5Li(const GaugeLorentz &U){
+
+    static const int exts[5][2] = { {1,1}, {2,2}, {1,2}, {1,3}, {3,3} };
+    std::vector<std::vector<Real> > loops = TimesliceTopologicalCharge5LiContributions(U);
+
+    double c5=1./20.;
+    double c4=1./5.-2.*c5;
+    double c3=(-64.+640.*c5)/45.;
+    double c2=(1-64.*c5)/9.;
+    double c1=(19.-55.*c5)/9.;
+
+    int Lt = loops[0].size();
+    std::vector<Real> out(Lt,0.);
+    for(int t=0;t<Lt;t++)
+      out[t] += c1*loops[0][t] + c2*loops[1][t] + c3*loops[2][t] + c4*loops[3][t] + c5*loops[4][t];
+    return out;
+  }
+  
  static Real TopologicalCharge5Li(const GaugeLorentz &U){
    std::vector<Real> Qt = TimesliceTopologicalCharge5Li(U);
    Real Q = 0.;
@@ -1455,7 +1472,7 @@ public:
  //////////////////////////////////////////////////
  static Real sumWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
-    std::vector<GaugeMat> U(4, Umu.Grid());
+    std::vector<GaugeMat> U(Nd, Umu.Grid());

    for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
@@ -1474,7 +1491,7 @@ public:
  //////////////////////////////////////////////////
  static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
-    std::vector<GaugeMat> U(4, Umu.Grid());
+    std::vector<GaugeMat> U(Nd, Umu.Grid());

    for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
@@ -1492,8 +1509,8 @@ public:
  // sum over all x,y,z,t and over all planes of spatial Wilson loop
  //////////////////////////////////////////////////
  static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
-                            const int R1, const int R2) {
-    std::vector<GaugeMat> U(4, Umu.Grid());
+				   const int R1, const int R2) {
+    std::vector<GaugeMat> U(Nd, Umu.Grid());

    for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -396,6 +396,7 @@ public:
 					Packets[i].from_rank,Packets[i].do_recv,
 					Packets[i].xbytes,Packets[i].rbytes,i);
    }
+    FlightRecorder::StepLog("Communicate begin has finished");
    // Get comms started then run checksums
    // Having this PRIOR to the dslash seems to make Sunspot work... (!)
    for(int i=0;i<Packets.size();i++){
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -251,7 +251,7 @@ inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { c
 inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
 inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
-  acceleratorCopyToDevice(to,from,bytes, cudaMemcpyHostToDevice);
+  acceleratorCopyToDevice(from,to,bytes);
  return 0;
 }
 inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
@@ -337,7 +337,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
    cgh.parallel_for(							\
 		     sycl::nd_range<3>(global,local),			\
 		     [=] (sycl::nd_item<3> item) /*mutable*/		\
-		     [[intel::reqd_sub_group_size(16)]]			\
+		     [[sycl::reqd_sub_group_size(16)]]			\
 		     {							\
 		       auto iter1    = item.get_global_id(0);		\
 		       auto iter2    = item.get_global_id(1);		\
--- a/Grid/threads/ThreadReduction.h
+++ b/Grid/threads/ThreadReduction.h
@@ -28,11 +28,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /*  END LEGAL */
 #pragma once 

-#ifndef MIN
-#define MIN(x,y) ((x)>(y)?(y):(x))
-#endif
-
-
 // Introduce a class to gain deterministic bit reproducible reduction.
 // make static; perhaps just a namespace is required.
 NAMESPACE_BEGIN(Grid);
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -187,9 +187,10 @@ void GridParseLayout(char **argv,int argc,
 		     Coordinate &latt_c,
 		     Coordinate &mpi_c)
 {
-  auto mpi =std::vector<int>({1,1,1,1});
-  auto latt=std::vector<int>({8,8,8,8});
-
+  auto mpi =std::vector<int>(Nd,1);
+  auto latt=std::vector<int>(Nd,8);
+  std::cout << "Default mpi "<<mpi<<std::endl;
+  std::cout << "Default latt"<<latt<<std::endl;
  GridThread::SetMaxThreads();

  std::string arg;
@@ -228,6 +229,9 @@ void GridParseLayout(char **argv,int argc,
  }
  // Copy back into coordinate format
  int nd = mpi.size();
+  std::cout << "mpi.size() "<<nd<<std::endl;
+  std::cout << "latt.size() "<<latt.size()<<std::endl;
+  std::cout << "Nd "<<Nd<<std::endl;
  assert(latt.size()==nd);
  latt_c.resize(nd);
   mpi_c.resize(nd);
@@ -638,12 +642,11 @@ void Grid_debug_handler_init(void)
  sa.sa_flags    = SA_SIGINFO;
  //  sigaction(SIGSEGV,&sa,NULL);
  sigaction(SIGTRAP,&sa,NULL);
-  sigaction(SIGBUS,&sa,NULL);
+  //  sigaction(SIGBUS,&sa,NULL);
  //  sigaction(SIGUSR2,&sa,NULL);

-  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
-
-  sigaction(SIGFPE,&sa,NULL);
+  //  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
+  //  sigaction(SIGFPE,&sa,NULL);
  sigaction(SIGKILL,&sa,NULL);
  sigaction(SIGILL,&sa,NULL);

--- a/HMC/ComputeWilsonFlow.cc
+++ b/HMC/ComputeWilsonFlow.cc
@@ -66,6 +66,7 @@ namespace Grid{
  };
 }

+
 template <class T> void writeFile(T& in, std::string const fname){  
 #ifdef HAVE_LIME
  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
@@ -73,7 +74,7 @@ template <class T> void writeFile(T& in, std::string const fname){
  Grid::emptyUserRecord record;
  Grid::ScidacWriter WR(in.Grid()->IsBoss());
  WR.open(fname);
-  WR.writeScidacFieldRecord(in,record,0);
+  WR.writeScidacFieldRecord(in,record,0); // Lexico
  WR.close();
 #endif
  // What is the appropriate way to throw error?
@@ -107,8 +108,18 @@ int main(int argc, char **argv) {

  for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){

+#if 0    
  CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG);
+#else
+  // Don't require Grid format RNGs
+  FieldMetaData header;
+  std::string file, filesmr;
+  file    = CPar.conf_path + "/" + CPar.conf_prefix      + "." + std::to_string(conf);
+  filesmr = CPar.conf_path + "/" + CPar.conf_smr_prefix  + "." + std::to_string(conf);

+  NerscIO::readConfiguration(Umu,header,file);
+#endif
+  
  std::cout << std::setprecision(15);
  std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;
  
@@ -116,6 +127,7 @@ int main(int argc, char **argv) {
  std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);

  WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);
+  
  WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){
    
    typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
@@ -165,33 +177,48 @@ int main(int argc, char **argv) {
    //double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
    //Plq = coeff * Plq;

-    int tau = std::round(t);
-    std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
-    writeFile(R,efile);
-    std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
-    writeFile(qfield,tfile);

+    RealD WFlow_TC5Li   = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(U);
+
+    int tau = std::round(t);
+
+    std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
+    //    writeFile(R,efile);
+
+    std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
+    //    writeFile(qfield,tfile);
+
+    std::string ufile = file_pre + "U_" + std::to_string(tau) + "_" + file_post;
+    {
+      //      PeriodicGimplR::GaugeField Ucopy = U;
+      //      NerscIO::writeConfiguration(Ucopy,ufile);
+    }
+    
    RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
    RealD T = real( sum(qfield) );
    Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
    RealD E0 = real(peekSite(R,scoor));
    RealD T0 = real(peekSite(qfield,scoor));
    std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: "  << conf << " " << step << "  " << tau << "  "
-	      << "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl;
+	      << "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << " Q5Li "<< WFlow_TC5Li << std::endl;
    
  });
  
  int t=WFPar.maxTau;
  WF.smear(Uflow, Umu);
-
+  //  NerscIO::writeConfiguration(Uflow,filesmr);
+  
+  
  RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
  RealD WFlow_TC   = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
+  RealD WFlow_TC5Li   = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(Uflow);
  RealD WFlow_T0   = WF.energyDensityPlaquette(t,Uflow); // t
  RealD WFlow_EC   = WF.energyDensityCloverleaf(t,Uflow);
-  std::cout << GridLogMessage << "Plaquette          "<< conf << "   " << WFlow_plaq << std::endl;
-  std::cout << GridLogMessage << "T0                 "<< conf << "   " << WFlow_T0 << std::endl;
-  std::cout << GridLogMessage << "TC0                 "<< conf << "   " << WFlow_EC << std::endl;
-  std::cout << GridLogMessage << "TopologicalCharge  "<< conf << "   " << WFlow_TC   << std::endl;
+  std::cout << GridLogMessage << "Plaquette            "<< conf << "   " << WFlow_plaq << std::endl;
+  std::cout << GridLogMessage << "T0                   "<< conf << "   " << WFlow_T0 << std::endl;
+  std::cout << GridLogMessage << "TC0                  "<< conf << "   " << WFlow_EC << std::endl;
+  std::cout << GridLogMessage << "TopologicalCharge    "<< conf << "   " << WFlow_TC   << std::endl;
+  std::cout << GridLogMessage << "TopologicalCharge5Li "<< conf << "   " << WFlow_TC5Li<< std::endl;

  std::cout<< GridLogMessage << " Admissibility check:\n";
  const double sp_adm = 0.067;                // admissible threshold
--- a/5
+++ b/5
@@ -1,3 +1,8 @@
+
+* Clean up the extract merge and replace with insertLane/extractLane
+
+-----
+
 i)    Refine subspace with HDCG & recompute
 ii)   Block Lanczos in coarse space
 iii)  Batched block project in the operator computation
--- a/benchmarks/Benchmark_usqcd.cc
+++ b/benchmarks/Benchmark_usqcd.cc
@@ -873,7 +873,7 @@ int main (int argc, char ** argv)
  int do_su4=0;
  int do_memory=1;
  int do_comms =1;
-  int do_blas  =0;
+  int do_blas  =1;
  int do_dslash=1;

  int sel=4;
--- a/configure.ac
+++ b/configure.ac
@@ -198,6 +198,8 @@ AC_ARG_ENABLE([Nc],
    [ac_Nc=${enable_Nc}], [ac_Nc=3])

 case ${ac_Nc} in
+     1)
+        AC_DEFINE([Config_Nc],[1],[Gauge group Nc]);;
    2)
        AC_DEFINE([Config_Nc],[2],[Gauge group Nc]);;
    3)
@@ -211,6 +213,21 @@ case ${ac_Nc} in
    *)
      AC_MSG_ERROR(["Unsupport gauge group choice Nc = ${ac_Nc}"]);;
 esac
+############### Nd
+AC_ARG_ENABLE([Nd],
+    [AS_HELP_STRING([--enable-Nd=2|3|4],[enable default LGT dimension])],
+    [ac_Nd=${enable_Nd}], [ac_Nd=4])
+
+case ${ac_Nd} in
+    2)
+        AC_DEFINE([Config_Nd],[2],[Gauge field dimension Nd]);;
+    3)
+        AC_DEFINE([Config_Nd],[3],[Gauge field dimension Nd]);;
+    4)
+        AC_DEFINE([Config_Nd],[4],[Gauge field dimension Nd]);;
+    *)
+      AC_MSG_ERROR(["Unsupport dimension Nd = ${ac_Nd}"]);;
+esac

 ############### Symplectic group
 AC_ARG_ENABLE([Sp],
@@ -818,6 +835,7 @@ os (target)                 : $target_os
 compiler vendor             : ${ax_cv_cxx_compiler_vendor}
 compiler version            : ${ax_cv_gxx_version}
 ----- BUILD OPTIONS -----------------------------------
+Nd                          : ${ac_Nd}
 Nc                          : ${ac_Nc}
 SIMD                        : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
 Threading                   : ${ac_openmp}
--- a/systems/mac-arm/config-command
+++ b/systems/mac-arm/config-command
@@ -0,0 +1,11 @@
+MPICXX=mpicxx CXXFLAGS=-I/opt/local/include LDFLAGS=-L/opt/local/lib/ CXX=clang++ ../../configure \
+	      --enable-simd=GEN \
+	      --enable-comms=mpi-auto \
+	      --enable-unified=yes \
+	      --prefix $HOME/QCD/GridInstall \
+	      --with-lime=/Users/peterboyle/QCD/SciDAC/install/ \
+	      --with-openssl=$BREW \
+	      --disable-fermion-reps \
+	      --disable-gparity \
+	      --enable-debug
+
--- a/systems/mac-arm/config-command-mpi
+++ b/systems/mac-arm/config-command-mpi
@@ -7,8 +7,6 @@ CXX=mpicxx ../../configure \
 	   --enable-unified=yes \
 	   --prefix /Users/peterboyle/QCD/vtk/Grid/install \
 	   --with-lime=$CLIME \
-	   --with-hdf5=$HDF5 \
-	   --with-fftw=$FFTW \
 	   --with-openssl=$OPENSSL \
 	   --with-gmp=$GMP \
 	   --with-mpfr=$MPFR \
--- a/systems/sdcc-genoa/config-command
+++ b/systems/sdcc-genoa/config-command
@@ -1,3 +1,12 @@
+
+spack load c-lime
+spack load fftw
+spack load hdf5+cxx
+
+export FFTW=`spack find --paths fftw       | grep ^fftw   | awk '{print $2}' `
+export HDF5=`spack find --paths hdf5+cxx   | grep ^hdf5   | awk '{print $2}' `
+export CLIME=`spack find --paths c-lime    | grep ^c-lime | awk '{print $2}' `
+
 ../../configure \
 --enable-comms=mpi-auto \
 --enable-unified=yes \
@@ -5,12 +14,16 @@
 --enable-shm-fast-path=shmopen \
 --enable-accelerator=none \
 --enable-simd=AVX512 \
--disable-accelerator-cshift \
+--with-lime=$CLIME \
+--with-hdf5=$HDF5 \
+--with-fftw=$FFTW \
 --disable-fermion-reps \
 --disable-gparity \
 CXX=clang++ \
 MPICXX=mpicxx \
-CXXFLAGS="-std=c++17"
+LIBS=-llime \
+LDFLAGS=-L$CLIME/lib/ \
+CXXFLAGS="-std=c++17 -fPIE"



--- a/systems/sdcc-genoa/sourceme.sh
+++ b/systems/sdcc-genoa/sourceme.sh
@@ -1,4 +1,5 @@
 source $HOME/spack/share/spack/setup-env.sh
 spack load llvm@17.0.4
 export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
-module load openmpi
+module load openmpi/4.1.8
+spack load c-lime
--- a/tests/debug/Test_icosahedron.cc
+++ b/tests/debug/Test_icosahedron.cc
@@ -0,0 +1,93 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_icosahedron.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+const int MyNd=3;
+template<typename vtype> using iIcosahedralLorentzComplex = iVector<iScalar<iScalar<vtype> >, MyNd+1 > ;
+
+typedef iIcosahedralLorentzComplex<ComplexD >  IcosahedralLorentzComplexD;
+typedef iIcosahedralLorentzComplex<vComplexD> vIcosahedralLorentzComplexD;
+typedef Lattice<vIcosahedralLorentzComplexD> LatticeIcosahedralLorentzComplexD;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  const int L=8;
+  const int Npatch = num_icosahedron_tiles;
+
+  // Put SIMD all in time direction
+  Coordinate latt_size = GridDefaultLatt();
+  Coordinate simd_layout({1,1,vComplexD::Nsimd(),1});
+  Coordinate mpi_layout = GridDefaultMpi();
+
+  std::cout << GridLogMessage << " mpi "<<mpi_layout<<std::endl;
+  std::cout << GridLogMessage << " simd "<<simd_layout<<std::endl;
+  std::cout << GridLogMessage << " latt "<<latt_size<<std::endl;
+  GridCartesianCrossIcosahedron EdgeGrid(latt_size,simd_layout,mpi_layout,IcosahedralEdges);
+  std::cout << GridLogMessage << " Created edge grid "<<std::endl;
+  GridCartesianCrossIcosahedron VertexGrid(latt_size,simd_layout,mpi_layout,IcosahedralVertices);
+
+  std::cout << GridLogMessage << " Created vertex grid "<<std::endl;
+  LatticeIcosahedralLorentzComplexD Umu(&EdgeGrid);
+  LatticeComplex Phi(&VertexGrid);
+  std::cout << GridLogMessage << " Created two fields "<<std::endl;
+
+  Phi = Zero();
+  Umu = Zero();
+  std::cout << GridLogMessage << " Zeroed two fields "<<std::endl;
+
+  ComplexD one (1.0);
+  Phi = one;
+  Umu = one;
+  
+  std::cout << GridLogMessage << " V = "<<norm2(Phi)<<std::endl;
+  std::cout << GridLogMessage << " Expect "<<latt_size[0]*latt_size[1]*latt_size[2]*10+2*latt_size[2]<<std::endl;
+  
+  std::cout << GridLogMessage << " E = "<<norm2(Umu)<<std::endl;
+  std::cout << GridLogMessage << " Expect "<<latt_size[0]*latt_size[1]*latt_size[2]*10*4<<std::endl;
+
+  //  std::cout << " Umu "<<Umu<<std::endl;
+  //  std::cout << " Phi "<<Phi<<std::endl;
+  LatticePole(Phi,South);
+  std::cout << " Phi South Pole set\n"<<Phi<<std::endl;
+
+  LatticePole(Phi,North);
+  std::cout << " Phi North Pole set\n"<<Phi<<std::endl;
+
+  for(int mu=0;mu<VertexGrid._ndimension;mu++){
+    std::cout << " Calling lattice coordinate mu="<<mu<<std::endl;
+    LatticeCoordinate(Phi,mu);
+    std::cout << " Phi coor mu="<<mu<<"\n"<<Phi<<std::endl;
+  }
+
+  Grid_finalize();
+}
--- a/tests/lanczos/LanParams.xml
+++ b/tests/lanczos/LanParams.xml
@@ -1,15 +1,14 @@
 <?xml version="1.0"?>
 <grid>
  <LanczosParameters>
-    <mass>-1.025</mass>
-    <mstep>-0.025</mstep>
+    <mass>0.00107</mass>
    <M5>1.8</M5>
    <Ls>48</Ls>
    <Nstop>10</Nstop>
-    <Nk>12</Nk>
-    <Np>30</Np>
-    <ChebyLow>0.1</ChebyLow>
-    <ChebyHigh>50</ChebyHigh>
-    <ChebyOrder>51</ChebyOrder>
+    <Nk>15</Nk>
+    <Np>85</Np>
+    <ChebyLow>0.003</ChebyLow>
+    <ChebyHigh>60</ChebyHigh>
+    <ChebyOrder>201</ChebyOrder>
  </LanczosParameters>
 </grid>
--- a/tests/lanczos/Test_dwf_G5R5.cc
+++ b/tests/lanczos/Test_dwf_G5R5.cc
@@ -31,16 +31,23 @@ directory

 using namespace std;
 using namespace Grid;
- ;

-#if 0
+//typedef WilsonFermionD FermionOp;
 typedef DomainWallFermionD FermionOp;
 typedef typename DomainWallFermionD::FermionField FermionField;
-#else
-typedef MobiusFermionD FermionOp;
-typedef typename MobiusFermionD::FermionField FermionField;
-#endif

+template <class T> void writeFile(T& in, std::string const fname){  
+#ifdef HAVE_LIME
+  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
+  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
+  Grid::emptyUserRecord record;
+  Grid::ScidacWriter WR(in.Grid()->IsBoss());
+  WR.open(fname);
+  WR.writeScidacFieldRecord(in,record,0);
+  WR.close();
+#endif
+  // What is the appropriate way to throw error?
+}

 RealD AllZero(RealD x) { return 0.; }

@@ -125,7 +132,7 @@ int main(int argc, char** argv) {

  int Ls=16;
  RealD M5=1.8;
-  RealD mass = -1.0;
+  RealD mass = 0.01;

  mass=LanParams.mass;
  Ls=LanParams.Ls;
@@ -163,10 +170,10 @@ int main(int argc, char** argv) {
    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
  }
 */
-
-  int Nstop = 10;
  int Nk = 20;
+  int Nstop = Nk;
  int Np = 80;
+
  Nstop=LanParams.Nstop;
  Nk=LanParams.Nk;
  Np=LanParams.Np;
@@ -174,11 +181,10 @@ int main(int argc, char** argv) {
  int Nm = Nk + Np;
  int MaxIt = 10000;
  RealD resid = 1.0e-5;
-  RealD mob_b=1.5;
+

 //while ( mass > - 5.0){
-//  FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.);
+  FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <-----
 //  Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
  Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf);
@@ -206,8 +212,12 @@ int main(int argc, char** argv) {
  int Nconv;
  IRL.calc(eval, evec, src, Nconv);

-  std::cout << mass <<" : " << eval << std::endl;
-
+  std::cout << mass <<" : " << eval        << std::endl;
+  std::cout << " #evecs "   << evec.size() << std::endl;
+  std::cout << " Nconv  "   << Nconv       << std::endl;
+  std::cout << " Nm     "   << Nm          << std::endl;
+  if ( Nconv > evec.size() ) Nconv = evec.size();
+  
 #if 0
  Gamma g5(Gamma::Algebra::Gamma5) ;
  ComplexD dot;
@@ -237,6 +247,7 @@ int main(int argc, char** argv) {
  vector<LatticeFermion> finalevec(Nconv, FGrid);
  vector<RealD> eMe(Nconv), eMMe(Nconv);
  for(int i = 0; i < Nconv; i++){
+    cout << "calculate the matrix element["<<i<<"]" << endl;
    G5R5Herm.HermOpAndNorm(evec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
  }
  cout << "Re<evec, G5R5M(evec)>: " << endl;
@@ -331,13 +342,27 @@ int main(int argc, char** argv) {
      axpby_ssp(G5evec[i], -1., finalevec[i], 0., G5evec[i], j, j);
    }
  }
+  
  for(int i = 0; i < Nconv; i++){
    chiral_matrix_real[i].resize(Nconv);
    chiral_matrix[i].resize(Nconv);
+
+    std::string evfile("./evec_density");
+    evfile = evfile+"_"+std::to_string(i);
+    auto evdensity = localInnerProduct(finalevec[i],finalevec[i] );
+    writeFile(evdensity,evfile);
+
    for(int j = 0; j < Nconv; j++){
      chiral_matrix[i][j] = innerProduct(finalevec[i], G5evec[j]);
+      std::cout <<" chiral_matrix_real signed "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
      chiral_matrix_real[i][j] = abs(chiral_matrix[i][j]);
      std::cout <<" chiral_matrix_real "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
+      if ( chiral_matrix_real[i][j] > 0.8 ) {
+	auto g5density = localInnerProduct(finalevec[i], G5evec[j]);
+	std::string chfile("./chiral_density_");
+	chfile = chfile +std::to_string(i)+"_"+std::to_string(j);
+	writeFile(g5density,chfile);
+      }
    }
  }
  for(int i = 0; i < Nconv; i++){
@@ -346,6 +371,43 @@ int main(int argc, char** argv) {
    }
  }

-
+  FILE *fp = fopen("lego-plot.py","w"); assert(fp!=NULL);
+#define PYTHON_LINE(A)  fprintf(fp,A"\n");
+  PYTHON_LINE("import matplotlib.pyplot as plt");
+  PYTHON_LINE("import numpy as np");
+  PYTHON_LINE("");
+  PYTHON_LINE("fig = plt.figure()");
+  PYTHON_LINE("ax = fig.add_subplot(projection='3d')");
+  PYTHON_LINE("");
+  PYTHON_LINE("x, y = np.random.rand(2, 100) * 4");
+  PYTHON_LINE("hist, xedges, yedges = np.histogram2d(x, y, bins=10, range=[[0, 9], [0, 9]])");
+  PYTHON_LINE("");
+  PYTHON_LINE("# Construct arrays for the anchor positions of the 16 bars");
+  PYTHON_LINE("xpos, ypos = np.meshgrid(xedges[:-1] + 0.25, yedges[:-1] + 0.25, indexing=\"ij\")");
+  PYTHON_LINE("xpos = xpos.ravel()");
+  PYTHON_LINE("ypos = ypos.ravel()");
+  PYTHON_LINE("zpos = 0");
+  PYTHON_LINE("");
+  PYTHON_LINE("# Construct arrays with the dimensions for the 16 bars.");
+  PYTHON_LINE("dx = dy = 0.5 * np.ones_like(zpos)");
+  PYTHON_LINE("dz = np.array([");
+  for(int i = 0; i < Nconv; i++){
+    fprintf(fp,"\t[ ");
+    for(int j = 0; j < Nconv; j++){
+      fprintf(fp,"%lf ",chiral_matrix_real[i][j]);
+      if(j<Nconv-1) fprintf(fp,",");
+      else          fprintf(fp," ");
+    }
+    fprintf(fp,"]");
+    if(i<Nconv-1) fprintf(fp,",\n");
+    else          fprintf(fp,"\n");
+  }
+	      
+  PYTHON_LINE("\t])");
+  PYTHON_LINE("dz = dz.ravel()");
+  PYTHON_LINE("ax.bar3d(xpos, ypos, zpos, dx, dy, dz, zsort='average')");
+  PYTHON_LINE("plt.show()");
+  fclose(fp);
+  
  Grid_finalize();
 }
--- a/tests/lanczos/Test_wilson_DWFKernel.cc
+++ b/tests/lanczos/Test_wilson_DWFKernel.cc
@@ -113,9 +113,6 @@ struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 		  		RealD, mass , 
 				RealD, resid,
-				Integer, Nstop,
-                                Integer, Nk,
-                                Integer, Np,
 	  			RealD, ChebyLow,
 	  			RealD, ChebyHigh,
 	  			Integer, ChebyOrder)
@@ -207,6 +204,7 @@ int main(int argc, char** argv) {
  int Nstop = 5;
  int Nk = 10;
  int Np = 90;
+  int Nm = Nk + Np;
  int MaxIt = 10000;
  RealD resid = 1.0e-5;

@@ -228,14 +226,10 @@ int main(int argc, char** argv) {
    XmlWriter HMCwr("LanParams.xml.out");
    write(HMCwr,"LanczosParameters",LanParams);
  }
-  Nstop=LanParams.Nstop;
-  Nk=LanParams.Nk;
-  Np=LanParams.Np;
+
  mass=LanParams.mass;
  resid=LanParams.resid;

-  int Nm = Nk + Np;
-

 while ( mass > - 5.0){
  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass);
--- a/tests/lanczos/Test_wilson_lanczos.cc
+++ b/tests/lanczos/Test_wilson_lanczos.cc
@@ -61,8 +61,7 @@ int main(int argc, char** argv) {
  RNG5.SeedFixedIntegers(seeds5);

  LatticeGaugeField Umu(UGrid);
-//  SU<Nc>::HotConfiguration(RNG4, Umu);
-  SU<Nc>::ColdConfiguration(Umu);
+  SU<Nc>::HotConfiguration(RNG4, Umu);

 /*
  std::vector<LatticeColourMatrix> U(4, UGrid);
@@ -70,15 +69,9 @@ int main(int argc, char** argv) {
    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
  }
 */
-//  std::vector<Complex> boundary = {1,1,1,-1};
-  std::vector<Complex> boundary = {1,1,1,1};
-  FermionOp::ImplParams Params(boundary);

-
-
-  RealD mass = 0.0;
-//  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
-  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass,Params);
+  RealD mass = -0.1;
+  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
  MdagMLinearOperator<FermionOp,LatticeFermion> HermOp(WilsonOperator); /// <-----
  //SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);

@@ -96,8 +89,7 @@ int main(int argc, char** argv) {
  FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
     PlainHermOp<FermionField> Op     (HermOp);

-//  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
-  SimpleLanczos<FermionField> IRL(Op,Nstop, Nk, Nm, resid, MaxIt);
+  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);

  std::vector<RealD> eval(Nm);
  FermionField src(FGrid);
@@ -109,8 +101,7 @@ int main(int argc, char** argv) {
  };

  int Nconv;
-//  IRL.calc(eval, evec, src, Nconv);
-  IRL.calc(eval, src, Nconv);
+  IRL.calc(eval, evec, src, Nconv);

  std::cout << eval << std::endl;

--- a/tests/lanczos/Test_wilson_specflow.cc
+++ b/tests/lanczos/Test_wilson_specflow.cc
@@ -27,7 +27,6 @@ directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/Grid.h>
-#include <Grid/parallelIO/IldgIOtypes.h>

 using namespace std;
 using namespace Grid;
@@ -39,29 +38,11 @@ typedef typename WilsonFermionD::FermionField FermionField;

 RealD AllZero(RealD x) { return 0.; }

-template <class T> void writeFile(T& in, std::string const fname){
-#if 1
-  // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
-  std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
-  Grid::emptyUserRecord record;
-  Grid::ScidacWriter WR(in.Grid()->IsBoss());
-  WR.open(fname);
-  WR.writeScidacFieldRecord(in,record,0);
-  WR.close();
-#endif
-  // What is the appropriate way to throw error?
-}
-
-
 namespace Grid {

 struct LanczosParameters: Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
 		  		RealD, mass , 
-		  		RealD, mstep , 
-				Integer, Nstop,
-                                Integer, Nk,
-                                Integer, Np,
 	  			RealD, ChebyLow,
 	  			RealD, ChebyHigh,
 	  			Integer, ChebyOrder)
@@ -134,7 +115,6 @@ int main(int argc, char** argv) {

  LatticeGaugeField Umu(UGrid);
 //  SU<Nc>::HotConfiguration(RNG4, Umu);
-//  SU<Nc>::ColdConfiguration(Umu);

  FieldMetaData header;
  std::string file("./config");
@@ -178,20 +158,10 @@ int main(int argc, char** argv) {
  }

  mass=LanParams.mass;
-  Nstop=LanParams.Nstop;
-  Nk=LanParams.Nk;
-  Np=LanParams.Np;
-  Nm = Nk + Np;
-
-  FermionField src(FGrid);
-  gaussian(RNG5, src);
-  std::vector<Complex> boundary = {1,1,1,-1};
-//  std::vector<Complex> boundary = {1,1,1,1};
-  FermionOp::ImplParams Params(boundary);


-while ( mass > - 2.5){
-  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass,Params);
+while ( mass > - 5.0){
+  FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
  MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
  //SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
  Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
@@ -210,9 +180,10 @@ while ( mass > - 2.5){
     PlainHermOp<FermionField> Op2     (HermOp2);

  ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
-//  SimpleLanczos<FermionField> IRL(Op,Nstop, Nk, Nm, resid, MaxIt);

  std::vector<RealD> eval(Nm);
+  FermionField src(FGrid);
+  gaussian(RNG5, src);
  std::vector<FermionField> evec(Nm, FGrid);
  for (int i = 0; i < 1; i++) {
    std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
@@ -221,7 +192,6 @@ while ( mass > - 2.5){

  int Nconv;
  IRL.calc(eval, evec, src, Nconv);
-//  IRL.calc(eval,  src, Nconv);

  std::cout << mass <<" : " << eval << std::endl;

@@ -232,17 +202,9 @@ while ( mass > - 2.5){
    tmp = g5*evec[i];
    dot = innerProduct(tmp,evec[i]);
    std::cout << mass << " : " << eval[i]  << " " << real(dot) << " " << imag(dot)  << std::endl ;
-//    if ( i<1)
-    {
-	std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
-        auto evdensity = localInnerProduct(evec[i],evec[i] );
-	writeFile(evdensity,evfile);
-    }
  }
  src  = evec[0]+evec[1]+evec[2];
-  src  += evec[3]+evec[4]+evec[5];
-  src  += evec[6]+evec[7]+evec[8];
-  mass += LanParams.mstep;
+  mass += -0.1;
 }

  Grid_finalize();
--- a/visualisation/CMakeLists.txt
+++ b/visualisation/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.12 FATAL_ERROR)

 project(GridViewer)

-list(APPEND CMAKE_PREFIX_PATH "/Users/peterboyle/QCD/vtk/VTK-9.4.2-install/")
+list(APPEND CMAKE_PREFIX_PATH "/home/paboyle/Visualisation/install/")

 find_package(VTK COMPONENTS 
  CommonColor
--- a/visualisation/FieldDensityAnimate.cxx
+++ b/visualisation/FieldDensityAnimate.cxx
@@ -48,15 +48,14 @@ typedef vtkMarchingCubes isosurface;

 int mpeg = 0 ;
 int xlate = 0 ;
+int framerate = 10;

 template <class T> void readFile(T& out, std::string const fname){
-#ifdef HAVE_LIME
  Grid::emptyUserRecord record;
  Grid::ScidacReader RD;
  RD.open(fname);
  RD.readScidacFieldRecord(out,record);
  RD.close();
-#endif
 }
 using namespace Grid;

@@ -208,6 +207,10 @@ int main(int argc, char* argv[])
    xlate = 1;
  }

+  if( GridCmdOptionExists(argv,argv+argc,"--fps") ){
+    arg=GridCmdOptionPayload(argv,argv+argc,"--fps");
+    GridCmdOptionInt(arg,framerate);
+  }
  if( GridCmdOptionExists(argv,argv+argc,"--isosurface") ){
    arg=GridCmdOptionPayload(argv,argv+argc,"--isosurface");
    GridCmdOptionFloat(arg,default_contour);
@@ -420,7 +423,7 @@ int main(int argc, char* argv[])
    
    vtkFFMPEGWriter *writer = vtkFFMPEGWriter::New();
    writer->SetFileName("movie.avi");
-    writer->SetRate(1);
+    writer->SetRate(framerate);
    writer->SetInputConnection(imageFilter->GetOutputPort());
    writer->Start();

@@ -477,7 +480,7 @@ int main(int argc, char* argv[])
    slidercallback->fu_list = fu_list;
    sliderWidget->AddObserver(vtkCommand::InteractionEvent, slidercallback);

-    int timerId = iren->CreateRepeatingTimer(300);
+    int timerId = iren->CreateRepeatingTimer(1000/framerate);
    std::cout << "timerId: " << timerId << std::endl;

    // Start the interaction and timer
--- a/visualisation/README
+++ b/visualisation/README
@@ -73,6 +73,21 @@ each to:

   VTK really should make it easier to pick up the flags required for FFMPEG linkage, especially as they are very quirky on MacOS.

+========================================
+Aurora compilation:
+========================================
+module load ffmpeg
+download & untar: VTK-7.0.2
+mkdir build
+cd build 
+ccmake ../
+
+"t"
+Enable: VTK_MODULE_ENABLE_VTK_IOFFMPEG   YES   
+"configure" ; should "discover" the installed ffmpeg module
+
+Still need an "X" connection to make the MPEG files.
+

 ========================================
 Grid:
--- a/visualisation/cmake-command
+++ b/visualisation/cmake-command
@@ -1,9 +1,17 @@
-libs=`grid-config --libs`
-ldflags=`grid-config --ldflags`
-cxxflags=`grid-config --cxxflags`
-cxx=`grid-config --cxx`
+export grid_config=/home/paboyle/GPT/install/bin/grid-config
+libs=`$grid_config --libs`
+ldflags=`$grid_config --ldflags`
+cxxflags=`$grid_config --cxxflags`
+cxx=`$grid_config --cxx`
+cc=icx

 mkdir build
 cd build

-LDFLAGS="$ldflags $libs " cmake .. -DCMAKE_CXX_COMPILER=$cxx -DCMAKE_CXX_FLAGS=$cxxflags 
+echo CC $cc
+echo CXX $cxx
+echo CXXFLAGS $cxxflags
+echo LDFLAGS  $ldflags
+echo LIBS  $libs
+
+LDFLAGS="$ldflags $libs " cmake .. -DCMAKE_C_COMPILER=$cc -DCMAKE_CXX_COMPILER="$cxx" -DCMAKE_CXX_FLAGS="$cxxflags "
Author	SHA1	Message	Date
Peter Boyle	c7b74db317	Default dimensions fixed	2025-10-09 14:57:22 -04:00
Peter Boyle	0ce201efbe	IcosahedralVerted() checks	2025-10-09 13:35:16 -04:00
Peter Boyle	6d8a3d8bb2	Config	2025-10-09 13:30:16 -04:00
Peter Boyle	7dfd207ebb	Need to protect pole operatoins to only take place on IcosahedralVertices mesh	2025-10-08 15:18:31 -04:00
Peter Boyle	3a65a096f2	Nd verbose	2025-10-07 18:49:00 -04:00
Peter Boyle	85b2bd4c93	Beginnings of S2xR	2025-10-07 16:11:06 -04:00
Peter Boyle	35e10a1159	Changes for Nd=3	2025-10-03 12:17:13 -04:00
Peter Boyle	d418f78352	Making running on Aurora more debuggable	2025-05-23 20:58:16 +00:00
Peter Boyle	25163998a0	Makes SYCL compiler happy	2025-05-23 20:57:11 +00:00
Peter Boyle	dc546aaa4b	Updated config options for BNL cluster	2025-05-13 18:44:47 -04:00
Peter Boyle	5364d580c9	Output chirality, eigenvector density files and python source lego plot	2025-05-13 18:44:47 -04:00
Peter Boyle	2a9a6347e3	Do not require Grid format RNGs and also to the 5Li reporting	2025-05-13 18:44:47 -04:00
Peter Boyle	cfdb56f314	Run measurements at t=0 too	2025-05-13 18:44:46 -04:00
Peter Boyle	b517e88db3	Update README	2025-05-13 16:49:21 -04:00
Peter Boyle	bb317aba8d	Lattice = for sycl	2025-05-13 12:50:58 +00:00
Peter Boyle	644cc6647e	JSON update	2025-05-13 12:50:58 +00:00
Peter Boyle	72397ce23b	SYCL interface change	2025-05-13 12:50:58 +00:00
Peter Boyle	d60a80c098	Fixes and visualisation	2025-04-29 18:04:23 -04:00
Peter Boyle	bb8b6d9d73	Fix	2025-04-29 18:04:04 -04:00