From 85b2bd4c93314b759b426278f314ed4dec195abd Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 7 Oct 2025 16:11:06 -0400
Subject: [PATCH] Beginnings of S2xR

---
 Grid/cartesian/Cartesian.h                    |   1 +
 Grid/cartesian/CartesianCrossIcosahedron.h    | 197 +++++++
 Grid/cartesian/Cartesian_base.h               |  13 +-
 Grid/cshift/Cshift_mpi.h                      |   2 +
 Grid/cshift/Cshift_none.h                     |   1 +
 Grid/lattice/Lattice_base.h                   |  42 +-
 Grid/lattice/Lattice_coordinate.h             |  86 +++-
 Grid/lattice/Lattice_peekpoke.h               | 132 ++++-
 .../fermion/TwoSpinWilsonFermion3plus1D.h     | 175 +++++++
 Grid/qcd/action/fermion/TwoSpinWilsonImpl.h   | 222 ++++++++
 .../qcd/action/fermion/TwoSpinWilsonKernels.h |  84 +++
 ...woSpinWilsonFermion3plus1DImplementation.h | 486 ++++++++++++++++++
 .../TwoSpinWilsonKernelsImplementation.h      | 441 ++++++++++++++++
 ...woSpinWilsonFermion3plus1DInstantiation.cc |  61 +++
 ...ilsonFermion3plus1DInstantiation.cc.master |  40 ++
 ...woSpinWilsonKernelsInstantiation.cc.master |  40 ++
 Grid/qcd/spin/Pauli.h                         | 220 ++++++++
 TODO                                          |   5 +
 tests/debug/Test_icosahedron.cc               |  93 ++++
 19 files changed, 2324 insertions(+), 17 deletions(-)
 create mode 100644 Grid/cartesian/CartesianCrossIcosahedron.h
 create mode 100644 Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h
 create mode 100644 Grid/qcd/action/fermion/TwoSpinWilsonImpl.h
 create mode 100644 Grid/qcd/action/fermion/TwoSpinWilsonKernels.h
 create mode 100644 Grid/qcd/action/fermion/implementation/TwoSpinWilsonFermion3plus1DImplementation.h
 create mode 100644 Grid/qcd/action/fermion/implementation/TwoSpinWilsonKernelsImplementation.h
 create mode 100644 Grid/qcd/action/fermion/instantiation/TwoSpinWilsonFermion3plus1DInstantiation.cc
 create mode 100644 Grid/qcd/action/fermion/instantiation/TwoSpinWilsonFermion3plus1DInstantiation.cc.master
 create mode 100644 Grid/qcd/action/fermion/instantiation/TwoSpinWilsonKernelsInstantiation.cc.master
 create mode 100644 Grid/qcd/spin/Pauli.h
 create mode 100644 tests/debug/Test_icosahedron.cc

diff --git a/Grid/cartesian/Cartesian.h b/Grid/cartesian/Cartesian.h
index 070cad95..79fef8ec 100644
--- a/Grid/cartesian/Cartesian.h
+++ b/Grid/cartesian/Cartesian.h
@@ -31,5 +31,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cartesian/Cartesian_base.h>
 #include <Grid/cartesian/Cartesian_full.h>
 #include <Grid/cartesian/Cartesian_red_black.h> 
+#include <Grid/cartesian/CartesianCrossIcosahedron.h>
 
 #endif
diff --git a/Grid/cartesian/CartesianCrossIcosahedron.h b/Grid/cartesian/CartesianCrossIcosahedron.h
new file mode 100644
index 00000000..76b17a3e
--- /dev/null
+++ b/Grid/cartesian/CartesianCrossIcosahedron.h
@@ -0,0 +1,197 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/cartesian/CartesianCrossIcosahedron.h
+
+    Copyright (C) 2025
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+    
+/////////////////////////////////////////////////////////////////////////////////////////
+// Grid Support.
+/////////////////////////////////////////////////////////////////////////////////////////
+
+enum IcosahedralMeshType {
+  IcosahedralVertices,
+  IcosahedralEdges
+} ;
+enum NorthSouth {
+  North = 1,
+  South = 0
+};
+
+const int num_icosahedron_tiles = 10;
+
+class GridCartesianCrossIcosahedron: public GridCartesian {
+
+public:
+
+  IcosahedralMeshType meshType;
+
+  IcosahedralMeshType MeshType(void) { return meshType; };
+  
+  /////////////////////////////////////////////////////////////////////////
+  // Constructor takes a parent grid and possibly subdivides communicator.
+  /////////////////////////////////////////////////////////////////////////
+  /*
+  GridCartesian(const Coordinate &dimensions,
+		const Coordinate &simd_layout,
+		const Coordinate &processor_grid,
+		const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
+  {
+    assert(0); // No subdivision
+  }
+  GridCartesian(const Coordinate &dimensions,
+		const Coordinate &simd_layout,
+		const Coordinate &processor_grid,
+		const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
+  {
+    assert(0); // No subdivision
+  }
+  */
+  /////////////////////////////////////////////////////////////////////////
+  // Construct from comm world
+  /////////////////////////////////////////////////////////////////////////
+  GridCartesianCrossIcosahedron(const Coordinate &dimensions,
+				const Coordinate &simd_layout,
+				const Coordinate &processor_grid,
+				IcosahedralMeshType _meshType) : GridCartesian(dimensions,simd_layout,processor_grid)
+  {
+    meshType = _meshType;
+    Coordinate S2dimensions=dimensions;
+    Coordinate S2simd      =simd_layout;
+    Coordinate S2procs     =processor_grid;
+
+    assert(simd_layout[0]==1); // Force simd into perpendicular dimensions
+    assert(simd_layout[1]==1); // to avoid pole storage complexity interacting with SIMD.
+    assert(dimensions[_ndimension-1]==num_icosahedron_tiles);
+    assert(processor_grid[_ndimension-1]<=2); // Keeps the patches that need a pole on the same node
+
+    // allocate the pole storage if we are seeking vertex domain data
+    if ( meshType == IcosahedralVertices ) {
+      InitPoles();
+    }
+  }
+
+  virtual ~GridCartesianCrossIcosahedron() = default;
+
+  ////////////////////////////////////////////////
+  // Use to decide if a given grid is icosahedral
+  ////////////////////////////////////////////////
+  int hasNorthPole;
+  int hasSouthPole;
+  int northPoleOsite;
+  int southPoleOsite;
+  int northPoleOsites;
+  int southPoleOsites;
+
+  virtual int Icosahedral(void)     override { return 1;}
+  virtual int ownsNorthPole(void)   const override { return hasNorthPole; };
+  virtual int NorthPoleOsite(void)  const override { return northPoleOsite; };
+  virtual int NorthPoleOsites(void) const override { return northPoleOsites; };
+  virtual int ownsSouthPole(void)   const override { return hasSouthPole; };
+  virtual int SouthPoleOsite(void)  const override { return southPoleOsite; };
+  virtual int SouthPoleOsites(void) const override { return southPoleOsites; };
+
+  void InitPoles(void)
+  {
+    int Ndm1 = _ndimension-1;
+    ///////////////////////
+    // Add the extra pole storage
+    ///////////////////////
+    // Vertices = 1x LxLx D1...Dn + 2.D1...Dn
+    // Start after the LxL and don't include the 10 patch dim
+    int OrthogSize = 1;
+    for (int d = 2; d < Ndm1; d++) {
+      OrthogSize *= _gdimensions[d];
+    }
+    _fsites += OrthogSize*2;
+    _gsites += OrthogSize*2;
+
+    // Simd reduced sizes are multiplied up.
+    // If the leading LxL are simd-ized, the vector objects will contain "redundant" lanes
+    // which should contain identical north (south) pole data
+    OrthogSize = 1;
+    for (int d = 2; d < Ndm1; d++) {
+      OrthogSize *= _rdimensions[d];
+    }
+
+    // Grow the local volume to hold pole data
+    // on rank (0,0) in the LxL planes
+    // since SIMD must be placed in the orthogonal directions
+    Coordinate pcoor = this->ThisProcessorCoor();
+    Coordinate pgrid = this->ProcessorGrid();
+
+    const int xdim=0;
+    const int ydim=1;
+    /*
+     *
+     *  /\/\/\/\/\
+     * /\/\/\/\/\/
+     * \/\/\/\/\/
+     *
+     *  y
+     * /
+     * \x
+     *
+     * Labelling patches as 5 6 7 8 9
+     *                      0 1 2 3 4
+     *
+     * Will ban distribution of the patch dimension by more than 2.
+     *
+     * Hence all 5 patches associated with the pole must have the
+     * appropriate "corner" of the patch L^2 located on the SAME rank.
+     */ 
+     
+    if( (pcoor[xdim]==pgrid[xdim]-1) && (pcoor[ydim]==0) && (pcoor[Ndm1]==0) ){
+      hasSouthPole   =1;
+      southPoleOsite=this->_osites;
+      southPoleOsites=OrthogSize;
+      this->_osites += OrthogSize;
+    } else {
+      hasSouthPole   =0;
+      southPoleOsites=0;
+      southPoleOsite=0;
+    }
+    if( (pcoor[xdim]==0) && (pcoor[ydim]==pgrid[ydim]-1) && (pcoor[Ndm1]==pgrid[Ndm1]-1) ){
+      hasNorthPole   =1;
+      northPoleOsite=this->_osites;
+      northPoleOsites=OrthogSize;
+      this->_osites += OrthogSize;
+    } else {
+      hasNorthPole   =0;
+      northPoleOsites=0;
+      northPoleOsite=0;
+    }
+    std::cout << "Icosahedral vertex field volume " << this->_osites<<std::endl;
+    std::cout << "Icosahedral south pole offset   " << this->southPoleOsite<<std::endl;
+    std::cout << "Icosahedral north pole offset   " << this->northPoleOsite<<std::endl;
+    std::cout << "Icosahedral south pole size     " << this->southPoleOsites<<std::endl;
+    std::cout << "Icosahedral north pole size     " << this->northPoleOsites<<std::endl;
+  };
+
+};
+
+NAMESPACE_END(Grid);
diff --git a/Grid/cartesian/Cartesian_base.h b/Grid/cartesian/Cartesian_base.h
index d029b9a7..f825011c 100644
--- a/Grid/cartesian/Cartesian_base.h
+++ b/Grid/cartesian/Cartesian_base.h
@@ -86,11 +86,20 @@ public:
 
 public:
 
+  // Icosahedral decisions
+  virtual int Icosahedral(void) { return 0;}
+  virtual int ownsNorthPole(void) const { return 0; };
+  virtual int ownsSouthPole(void) const { return 0; };
+  virtual int NorthPoleOsite(void) const { return 0; };
+  virtual int SouthPoleOsite(void) const { return 0; };
+  virtual int NorthPoleOsites(void) const { std::cout << "base osites" <<std::endl;return 0; };
+  virtual int SouthPoleOsites(void) const { std::cout << "base osites" <<std::endl;return 0; };
+
   ////////////////////////////////////////////////////////////////
   // Checkerboarding interface is virtual and overridden by 
   // GridCartesian / GridRedBlackCartesian
   ////////////////////////////////////////////////////////////////
-  virtual int Icosahedral(void) { return 0;}
+
   virtual int CheckerBoarded(int dim) =0;
   virtual int CheckerBoard(const Coordinate &site)=0;
   virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
@@ -177,6 +186,8 @@ public:
     }
     return permute_type;
   }
+
+  
   ////////////////////////////////////////////////////////////////
   // Array sizing queries
   ////////////////////////////////////////////////////////////////
diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h
index 6f2e2699..b74ef958 100644
--- a/Grid/cshift/Cshift_mpi.h
+++ b/Grid/cshift/Cshift_mpi.h
@@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid);
 const int Cshift_verbose=0;
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
+  assert(!rhs.Grid()->Icosahedral());
+  
   typedef typename vobj::vector_type vector_type;
   typedef typename vobj::scalar_type scalar_type;
 
diff --git a/Grid/cshift/Cshift_none.h b/Grid/cshift/Cshift_none.h
index d3c0bfa0..79a360b4 100644
--- a/Grid/cshift/Cshift_none.h
+++ b/Grid/cshift/Cshift_none.h
@@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
+  assert(!rhs.Grid()->Icosahedral());
   Lattice<vobj> ret(rhs.Grid());
   ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
   Cshift_local(ret,rhs,dimension,shift);
diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h
index d8b9a618..e6b0603f 100644
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -373,14 +373,17 @@ public:
 
 template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
   typedef typename vobj::scalar_object sobj;
-  for(int64_t g=0;g<o.Grid()->_gsites;g++){
+  uint64_t gsites=1;
+  uint64_t polesites=0;
+  for(int d=0;d<o.Grid()->_ndimension;d++) gsites *= o.Grid()->_gdimensions[d];
+  for(int64_t g=0;g<gsites;g++){
 
     Coordinate gcoor;
     o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
 
     sobj ss;
     peekSite(ss,o,gcoor);
-    stream<<"[";
+    stream<<"["<<  g<<" : ";
     for(int d=0;d<gcoor.size();d++){
       stream<<gcoor[d];
       if(d!=gcoor.size()-1) stream<<",";
@@ -388,6 +391,41 @@ template<class vobj> std::ostream& operator<< (std::ostream& stream, const Latti
     stream<<"]\t";
     stream<<ss<<std::endl;
   }
+  if ( o.Grid()->Icosahedral() ) {
+    uint64_t psites=1;
+    Coordinate perpdims;
+    for(int d=2;d<o.Grid()->_ndimension-1;d++){
+      int pd=o.Grid()->_gdimensions[d];
+      psites*=pd;
+      perpdims.push_back(pd);
+    }
+    for(uint64_t p=0;p<psites;p++){
+      sobj ss;
+      Coordinate orthog;
+      Lexicographic::CoorFromIndex(orthog,p,perpdims);
+      peekPole(ss,o,orthog,South);
+      stream<<"[ SouthPole : ";
+      for(int d=0;d<orthog.size();d++){
+	stream<<orthog[d];
+	if(d!=orthog.size()-1) stream<<",";
+      }
+      stream<<"]\t";
+      stream<<ss<<std::endl;
+    }
+    for(uint64_t p=0;p<psites;p++){
+      sobj ss;
+      Coordinate orthog;
+      Lexicographic::CoorFromIndex(orthog,p,perpdims);
+      peekPole(ss,o,orthog,North);
+      stream<<"[ NorthPole : ";
+      for(int d=0;d<orthog.size();d++){
+	stream<<orthog[d];
+	if(d!=orthog.size()-1) stream<<",";
+      }
+      stream<<"]\t";
+      stream<<ss<<std::endl;
+    }
+  }
   return stream;
 }
   
diff --git a/Grid/lattice/Lattice_coordinate.h b/Grid/lattice/Lattice_coordinate.h
index cd0f11ee..e08019a1 100644
--- a/Grid/lattice/Lattice_coordinate.h
+++ b/Grid/lattice/Lattice_coordinate.h
@@ -34,22 +34,84 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
   typedef typename iobj::scalar_type scalar_type;
   typedef typename iobj::vector_type vector_type;
 
+  l=Zero();
+  
   GridBase *grid = l.Grid();
   int Nsimd = grid->iSites();
 
-  autoView(l_v, l, CpuWrite);
-  thread_for( o, grid->oSites(), {
-    vector_type vI;
-    Coordinate gcoor;
-    ExtractBuffer<scalar_type> mergebuf(Nsimd);
-    for(int i=0;i<grid->iSites();i++){
-      grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
-      mergebuf[i]=(Integer)gcoor[mu];
+  int cartesian_vol = grid->oSites();
+  if ( grid->Icosahedral() ) {
+    cartesian_vol = cartesian_vol - grid->NorthPoleOsites()-grid->SouthPoleOsites();
+  }
+  {
+    autoView(l_v, l, CpuWrite);
+    thread_for( o, cartesian_vol, {
+	vector_type vI;
+	Coordinate gcoor;
+	ExtractBuffer<scalar_type> mergebuf(Nsimd);
+	for(int i=0;i<grid->iSites();i++){
+	  grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
+	  mergebuf[i]=(Integer)gcoor[mu];
+	}
+	merge<vector_type,scalar_type>(vI,mergebuf);
+	l_v[o]=vI;
+      });
+  }
+
+  if (grid->Icosahedral()) {
+    uint64_t psites=1;
+    Coordinate perpdims;
+    typename iobj::scalar_object ss;
+    for(int d=2;d<grid->_ndimension-1;d++){
+      int pd=grid->_gdimensions[d];
+      psites*=pd;
+      perpdims.push_back(pd);
     }
-    merge<vector_type,scalar_type>(vI,mergebuf);
-    l_v[o]=vI;
-  });
+    for(uint64_t p=0;p<psites;p++){
+      Coordinate orthog;
+      Lexicographic::CoorFromIndex(orthog,p,perpdims);
+
+      int icoor;
+      if ( mu>=2 && mu < grid->_ndimension-1) {
+	icoor = orthog[mu-2];
+      } else {
+	icoor = -1;
+      }
+
+      ss=scalar_type(icoor);
+
+      pokePole(ss,l,orthog,South);
+      pokePole(ss,l,orthog,North);
+    }
+  }
+};
+template<class iobj> inline void LatticePole(Lattice<iobj> &l,NorthSouth pole)
+{
+  typedef typename iobj::scalar_object sobj;
+  typedef typename iobj::scalar_type scalar_type;
+  typedef typename iobj::vector_type vector_type;
+
+  GridBase *grid = l.Grid();
+
+  l=Zero();
+
+  if (grid->Icosahedral()) {
+    uint64_t psites=1;
+    Coordinate perpdims;
+    sobj ss;
+    scalar_type one(1.0);
+    ss=one;
+    for(int d=2;d<l.Grid()->_ndimension-1;d++){
+      int pd=l.Grid()->_gdimensions[d];
+      psites*=pd;
+      perpdims.push_back(pd);
+    }
+    for(uint64_t p=0;p<psites;p++){
+      Coordinate orthog;
+      Lexicographic::CoorFromIndex(orthog,p,perpdims);
+      pokePole(ss,l,orthog,pole);
+    }
+  }
 };
 
 NAMESPACE_END(Grid);
-
diff --git a/Grid/lattice/Lattice_peekpoke.h b/Grid/lattice/Lattice_peekpoke.h
index 6106962c..422d28f1 100644
--- a/Grid/lattice/Lattice_peekpoke.h
+++ b/Grid/lattice/Lattice_peekpoke.h
@@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
   grid->GlobalCoorToRankIndex(rank,odx,idx,site);
 
   ExtractBuffer<sobj> buf(Nsimd);
-  autoView( l_v , l, CpuWrite);
+  autoView( l_v , l, CpuRead);
   extract(l_v[odx],buf);
 
   s = buf[idx];
@@ -151,6 +151,134 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
   return;
 };
 
+// zero for south pole, one for north pole
+template<class vobj,class sobj>
+void peekPole(sobj &s,const Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
+{
+  s=Zero();
+  
+  GridBase *grid=l.Grid();
+
+  assert(grid->Icosahedral());
+
+  int Nsimd = grid->Nsimd();
+
+  int rank;
+
+  int Ndm1         = grid->_ndimension-1;
+  Coordinate pgrid = grid->ProcessorGrid();
+  const int xdim=0;
+  const int ydim=1;
+  const int pdim=Ndm1;
+
+  int64_t pole_osite;
+  int64_t pole_isite;
+  Coordinate rdims;
+  Coordinate idims;
+  Coordinate ocoor;
+  Coordinate icoor;
+  Coordinate pcoor(grid->_ndimension);
+  for(int d=2;d<Ndm1;d++){
+    int dd=d-2;
+    rdims.push_back(grid->_rdimensions[d]);
+    idims.push_back(grid->_simd_layout[d]);
+    icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
+    ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
+    pcoor[d] = orthog[dd]/grid->_ldimensions[d];
+  }
+  Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
+  Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
+  
+  int64_t osite;
+  if(isNorth == North){
+    pcoor[xdim] = 0;
+    pcoor[ydim] = pgrid[ydim]-1;
+    pcoor[Ndm1] = pgrid[Ndm1]-1;
+    osite = pole_osite + grid->NorthPoleOsite();
+  } else {
+    pcoor[xdim] = pgrid[xdim]-1;
+    pcoor[ydim] = 0;
+    pcoor[Ndm1] = 0;
+    osite = pole_osite + grid->SouthPoleOsite();
+  }
+
+  rank = grid->RankFromProcessorCoor(pcoor);
+
+  if ( rank == grid->ThisRank() ) {
+    ExtractBuffer<sobj> buf(Nsimd);
+    autoView( l_v , l, CpuWrite);
+    extract(l_v[osite],buf);
+    s = buf[pole_isite];
+  }
+  grid->Broadcast(rank,s);
+
+  return;
+};
+template<class vobj,class sobj>
+void pokePole(const sobj &s,Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
+{
+  GridBase *grid=l.Grid();
+
+  assert(grid->Icosahedral());
+
+  grid->Broadcast(grid->BossRank(),s);
+
+  int Nsimd = grid->Nsimd();
+  int rank;
+  int Ndm1         = grid->_ndimension-1;
+  Coordinate pgrid = grid->ProcessorGrid();
+  const int xdim=0;
+  const int ydim=1;
+  const int pdim=Ndm1;
+
+  int64_t pole_osite;
+  int64_t pole_isite;
+  Coordinate rdims;
+  Coordinate idims;
+  Coordinate ocoor;
+  Coordinate icoor;
+  Coordinate pcoor(grid->_ndimension,0);
+  for(int d=2;d<Ndm1;d++){
+    int dd = d-2;
+    rdims.push_back(grid->_rdimensions[d]);
+    idims.push_back(grid->_simd_layout[d]);
+    icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
+    ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
+    pcoor[d] = orthog[dd]/grid->_ldimensions[d];
+
+    int o = orthog[dd];
+    int r = grid->_rdimensions[d];
+    int omr = o % r;
+  }
+  Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
+  Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
+  
+  int64_t osite;
+  if(isNorth ==North){
+    pcoor[xdim] = 0;
+    pcoor[ydim] = pgrid[ydim]-1;
+    pcoor[Ndm1] = pgrid[Ndm1]-1;
+    osite = pole_osite + grid->NorthPoleOsite();
+  } else {
+    pcoor[xdim] = pgrid[xdim]-1;
+    pcoor[ydim] = 0;
+    pcoor[Ndm1] = 0;
+    osite = pole_osite + grid->SouthPoleOsite();
+  }
+
+  rank = grid->RankFromProcessorCoor(pcoor);
+
+  // extract-modify-merge cycle is easiest way and this is not perf critical
+  if ( rank == grid->ThisRank() ) {
+    ExtractBuffer<sobj> buf(Nsimd);
+    autoView( l_v , l, CpuWrite);
+    extract(l_v[osite],buf);
+    buf[pole_isite] = s;
+    merge(l_v[osite],buf);
+  }
+  return;
+};
+
 //////////////////////////////////////////////////////////
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
@@ -179,7 +307,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
   for(int w=0;w<words;w++){
     pt[w] = getlane(vp[w],idx);
   }
-  //  std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
+
   return;
 };
 template<class vobj,class sobj>
diff --git a/Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h b/Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h
new file mode 100644
index 00000000..0a8cc02e
--- /dev/null
+++ b/Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h
@@ -0,0 +1,175 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma one 
+
+NAMESPACE_BEGIN(Grid);
+
+class TwoSpinWilsonFermion3plus1DStatic { 
+public:
+  // S-direction is INNERMOST and takes no part in the parity.
+  static const std::vector<int> directions;
+  static const std::vector<int> displacements;
+  static constexpr int npoint = 6;
+  static std::vector<int> MakeDirections(void);
+  static std::vector<int> MakeDisplacements(void);
+};
+
+template<class Impl>
+class TwoSpinWilsonFermion3plus1D : public TwoSpinWilsonKernels<Impl>, public TwoSpinWilsonFermion3plus1DStatic
+{
+public:
+  INHERIT_IMPL_TYPES(Impl);
+  typedef TwoSpinWilsonKernels<Impl> Kernels;
+
+  FermionField _tmp;
+  FermionField &tmp(void) { return _tmp; }
+
+  int Dirichlet;
+  Coordinate Block; 
+
+  ///////////////////////////////////////////////////////////////
+  // Implement the abstract base
+  ///////////////////////////////////////////////////////////////
+  GridBase *GaugeGrid(void)              { return _ThreeDimGrid ;}
+  GridBase *GaugeRedBlackGrid(void)      { return _ThreeDimRedBlackGrid ;}
+  GridBase *FermionGrid(void)            { return _FourDimGrid;}
+  GridBase *FermionRedBlackGrid(void)    { return _FourDimRedBlackGrid;}
+
+  // full checkerboard operations; leave unimplemented as abstract for now
+  virtual void   M    (const FermionField &in, FermionField &out){assert(0);};
+  virtual void   Mdag (const FermionField &in, FermionField &out){assert(0);};
+
+  // half checkerboard operations; leave unimplemented as abstract for now
+  virtual void   Meooe       (const FermionField &in, FermionField &out);
+  virtual void   Mooee       (const FermionField &in, FermionField &out);
+  virtual void   MooeeInv    (const FermionField &in, FermionField &out);
+
+  virtual void   MeooeDag    (const FermionField &in, FermionField &out);
+  virtual void   MooeeDag    (const FermionField &in, FermionField &out);
+  virtual void   MooeeInvDag (const FermionField &in, FermionField &out);
+  virtual void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+  virtual void   MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);};   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
+
+  // These can be overridden by fancy 5d chiral action
+  virtual void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+  virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
+
+  //  void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
+  
+  // Implement hopping term non-hermitian hopping term; half cb or both
+  // Implement s-diagonal DW
+  void DW    (const FermionField &in, FermionField &out,int dag);
+  void Dhop  (const FermionField &in, FermionField &out,int dag);
+  void DhopOE(const FermionField &in, FermionField &out,int dag);
+  void DhopEO(const FermionField &in, FermionField &out,int dag);
+
+  void DhopComms  (const FermionField &in, FermionField &out);
+  void DhopCalc   (const FermionField &in, FermionField &out,uint64_t *ids);
+  
+  // add a DhopComm
+  // -- suboptimal interface will presently trigger multiple comms.
+  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
+  void DhopDirAll(const FermionField &in,std::vector<FermionField> &out);
+  void DhopDirComms(const FermionField &in);
+  void DhopDirCalc(const FermionField &in, FermionField &out,int point);
+    
+  ///////////////////////////////////////////////////////////////
+  // New methods added 
+  ///////////////////////////////////////////////////////////////
+  void DerivInternal(StencilImpl & st,
+		     DoubledGaugeField & U,
+		     GaugeField &mat,
+		     const FermionField &A,
+		     const FermionField &B,
+		     int dag);
+    
+  void DhopInternal(StencilImpl & st,
+		    DoubledGaugeField &U,
+		    const FermionField &in, 
+		    FermionField &out,
+		    int dag);
+
+  void DhopInternalOverlappedComms(StencilImpl & st,
+				   DoubledGaugeField &U,
+				   const FermionField &in, 
+				   FermionField &out,
+				   int dag);
+
+  void DhopInternalSerialComms(StencilImpl & st,
+			       DoubledGaugeField &U,
+			       const FermionField &in, 
+			       FermionField &out,
+			       int dag);
+    
+  // Constructors
+  TwoSpinWilsonFermion3plus1D(GaugeField &_Umu,
+		  GridCartesian         &FourDimGrid,
+		  GridRedBlackCartesian &FourDimRedBlackGrid,
+		  GridCartesian         &ThreeDimGrid,
+		  GridRedBlackCartesian &ThreeDimRedBlackGrid,
+		  double _M5,const ImplParams &p= ImplParams());
+
+  virtual void DirichletBlock(const Coordinate & block)
+  {
+  }
+    
+  // DoubleStore
+  void ImportGauge(const GaugeField &_Umu);
+    
+  ///////////////////////////////////////////////////////////////
+  // Data members require to support the functionality
+  ///////////////////////////////////////////////////////////////
+public:
+    
+  // Add these to the support from Wilson
+  GridBase *_ThreeDimGrid;
+  GridBase *_ThreeDimRedBlackGrid;
+  GridBase *_FourDimGrid;
+  GridBase *_FourDimRedBlackGrid;
+    
+  double                        M5;
+  int Ls;
+    
+  //Defines the stencils for even and odd
+  StencilImpl Stencil; 
+  StencilImpl StencilEven; 
+  StencilImpl StencilOdd; 
+    
+  // Copy of the gauge field , with even and odd subsets
+  DoubledGaugeField Umu;
+  DoubledGaugeField UmuEven;
+  DoubledGaugeField UmuOdd;
+
+};
+
+NAMESPACE_END(Grid);
+
diff --git a/Grid/qcd/action/fermion/TwoSpinWilsonImpl.h b/Grid/qcd/action/fermion/TwoSpinWilsonImpl.h
new file mode 100644
index 00000000..cde08b81
--- /dev/null
+++ b/Grid/qcd/action/fermion/TwoSpinWilsonImpl.h
@@ -0,0 +1,222 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/FermionOperatorImpl.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+  
+/////////////////////////////////////////////////////////////////////////////
+// Single flavour four spinors with colour index
+/////////////////////////////////////////////////////////////////////////////
+template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal >
+class TwoSpinWilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
+public:
+  
+  static const int Dimension = Representation::Dimension;
+  static const bool isFundamental = Representation::isFundamental;
+
+  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
+  INHERIT_GIMPL_TYPES(Gimpl);
+      
+  //Necessary?
+  constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
+    
+  typedef typename Options::_Coeff_t Coeff_t;
+      
+  template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplPropagator        = iScalar<iMatrix<iMatrix<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplHalfSpinor        = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplHalfCommSpinor    = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
+  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
+    
+  typedef iImplSpinor<Simd>            SiteSpinor;
+  typedef iImplPropagator<Simd>        SitePropagator;
+  typedef iImplHalfSpinor<Simd>        SiteHalfSpinor;
+  typedef iImplHalfCommSpinor<Simd>    SiteHalfCommSpinor;
+  typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
+    
+  typedef Lattice<SiteSpinor>            FermionField;
+  typedef Lattice<SitePropagator>        PropagatorField;
+  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+    
+  typedef SimpleCompressor<SiteSpinor> Compressor;
+  typedef WilsonImplParams ImplParams;
+  typedef CartesianStencil<SiteSpinor, SiteSpinor, ImplParams> StencilImpl;
+  typedef const typename StencilImpl::View_type StencilView;
+    
+  ImplParams Params;
+
+  TwoSpinWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){
+  };
+
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi,
+					  const SiteDoubledGaugeField &U,
+					  const _Spinor &chi,
+					  int mu) 
+  {
+    auto UU = coalescedRead(U(mu));
+    mult(&phi(), &UU, &chi());
+  }
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi,
+					  const SiteDoubledGaugeField &U,
+					  const _Spinor &chi,
+					  int mu,
+					  StencilEntry *SE,
+					  StencilView &St) 
+  {
+    multLink(phi,U,chi,mu);
+  }
+
+  template<class _SpinorField> 
+  inline void multLinkField(_SpinorField & out,
+			    const DoubledGaugeField &Umu,
+			    const _SpinorField & phi,
+			    int mu)
+  {
+    const int Nsimd = SiteHalfSpinor::Nsimd();
+    autoView( out_v, out, AcceleratorWrite);
+    autoView( phi_v, phi, AcceleratorRead);
+    autoView( Umu_v, Umu, AcceleratorRead);
+    typedef decltype(coalescedRead(out_v[0]))   calcSpinor;
+    accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
+	calcSpinor tmp;
+	multLink(tmp,Umu_v[sss],phi_v(sss),mu);
+	coalescedWrite(out_v[sss],tmp);
+    });
+  }
+					   
+  template <class ref>
+  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
+  {
+    reg = memory;
+  }
+      
+  inline void DoubleStore(GridBase *GaugeGrid,
+			  DoubledGaugeField &Uds,
+			  const GaugeField &Umu) 
+  {
+    typedef typename Simd::scalar_type scalar_type;
+
+    conformable(Uds.Grid(), GaugeGrid);
+    conformable(Umu.Grid(), GaugeGrid);
+
+    GaugeLinkField U(GaugeGrid);
+    GaugeLinkField tmp(GaugeGrid);
+
+    Lattice<iScalar<vInteger> > coor(GaugeGrid);
+      ////////////////////////////////////////////////////
+      // apply any boundary phase or twists
+      ////////////////////////////////////////////////////
+    for (int mu = 0; mu < Nd; mu++) {
+
+	////////// boundary phase /////////////
+      auto pha = Params.boundary_phases[mu];
+      scalar_type phase( real(pha),imag(pha) );
+
+	int L   = GaugeGrid->GlobalDimensions()[mu];
+        int Lmu = L - 1;
+
+      LatticeCoordinate(coor, mu);
+
+      U = PeekIndex<LorentzIndex>(Umu, mu);
+
+	// apply any twists
+	RealD theta = Params.twist_n_2pi_L[mu] * 2*M_PI / L;
+	if ( theta != 0.0) { 
+	  scalar_type twphase(::cos(theta),::sin(theta));
+	  U = twphase*U;
+	  std::cout << GridLogMessage << " Twist ["<<mu<<"] "<< Params.twist_n_2pi_L[mu]<< " phase"<<phase <<std::endl;
+	}
+
+      tmp = where(coor == Lmu, phase * U, U);
+      PokeIndex<LorentzIndex>(Uds, tmp, mu);
+
+      U = adj(Cshift(U, mu, -1));
+      U = where(coor == 0, conjugate(phase) * U, U); 
+      PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
+    }
+  }
+
+  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+    GaugeLinkField link(mat.Grid());
+    link = TraceIndex<SpinIndex>(outerProduct(Btilde,A)); 
+    PokeIndex<LorentzIndex>(mat,link,mu);
+  }   
+      
+    inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){
+      mat = outerProduct(B,A); 
+    }  
+
+    inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
+      mat = TraceIndex<SpinIndex>(P); 
+    }
+      
+    inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds)
+    {
+      for (int mu = 0; mu < Nd; mu++)
+      mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
+    }
+
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu)
+  {
+    int Ls=Btilde.Grid()->_fdimensions[0];
+    autoView( mat_v , mat, AcceleratorWrite);
+    {
+      const int Nsimd = SiteSpinor::Nsimd();
+      autoView( Btilde_v , Btilde, AcceleratorRead);
+      autoView( Atilde_v , Atilde, AcceleratorRead);
+      accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
+	  int sU=sss;
+  	  typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
+  	  ColorMatrixType sum;
+	  zeroit(sum);  
+	  for(int s=0;s<Ls;s++){
+	    int sF = s+Ls*sU;
+  	    for(int spn=0;spn<Ns;spn++){ //sum over spin
+  	      auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
+  	      auto aa = coalescedRead(Atilde_v[sF]()(spn) );
+	      auto op = outerProduct(bb,aa);
+  	      sum = sum + op;
+	    }
+	  }
+  	  coalescedWrite(mat_v[sU](mu)(), sum);
+      });
+    }
+  }
+};
+
+
+typedef TwoSpinWilsonImpl<vComplex,  FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplR;  // Real.. whichever prec
+typedef TwoSpinWilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplF;  // Float
+typedef TwoSpinWilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplD;  // Double
+typedef TwoSpinWilsonImpl<vComplexD2, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplD2;  // Double
+
+NAMESPACE_END(Grid);
diff --git a/Grid/qcd/action/fermion/TwoSpinWilsonKernels.h b/Grid/qcd/action/fermion/TwoSpinWilsonKernels.h
new file mode 100644
index 00000000..5f843a81
--- /dev/null
+++ b/Grid/qcd/action/fermion/TwoSpinWilsonKernels.h
@@ -0,0 +1,84 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Helper routines that implement Wilson stencil for a single site.
+// Common to both the WilsonFermion and WilsonFermion5D
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ 
+template<class Impl> class TwoSpinWilsonKernels : public FermionOperator<Impl>  { 
+public:
+
+  INHERIT_IMPL_TYPES(Impl);
+  typedef FermionOperator<Impl> Base;
+  typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;   
+public:
+
+  static void DhopKernel(StencilImpl &st,  DoubledGaugeField &U, SiteSpinor * buf,
+			 int Ls, int Nsite, const FermionField &in, FermionField &out,
+			 int interior=1,int exterior=1) ;
+
+  static void DhopKernel(StencilImpl &st,  DoubledGaugeField &U, SiteSpinor * buf,
+			 int Ls, int Nsite, const FermionField &in, FermionField &out,
+			 uint64_t *ids);
+  
+  static void DhopDagKernel(StencilImpl &st,  DoubledGaugeField &U, SiteSpinor * buf,
+			    int Ls, int Nsite, const FermionField &in, FermionField &out,
+			    int interior=1,int exterior=1) ;
+
+  static void DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
+			  int Nsite, const FermionField &in, std::vector<FermionField> &out) ;
+
+  static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteSpinor * buf,
+			    int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);
+
+private:
+
+  static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteSpinor * buf,
+				   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma);
+
+  static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+  static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
+
+ public:
+  TwoSpinWilsonKernels(const ImplParams &p = ImplParams()) : Base(p){};
+};
+    
+NAMESPACE_END(Grid);
+
+
diff --git a/Grid/qcd/action/fermion/implementation/TwoSpinWilsonFermion3plus1DImplementation.h b/Grid/qcd/action/fermion/implementation/TwoSpinWilsonFermion3plus1DImplementation.h
new file mode 100644
index 00000000..ef499ce6
--- /dev/null
+++ b/Grid/qcd/action/fermion/implementation/TwoSpinWilsonFermion3plus1DImplementation.h
@@ -0,0 +1,486 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/TwoSpinWilsonFermion2plus1D.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
+#include <Grid/perfmon/PerfCount.h>
+
+NAMESPACE_BEGIN(Grid);
+  
+  // 5d lattice for DWF.
+template<class Impl>
+TwoSpinWilsonFermion3plus15D<Impl>::TwoSpinWilsonFermion3plus1D(GaugeField &_Umu,
+								GridCartesian         &FourDimGrid,
+								GridRedBlackCartesian &FourDimRedBlackGrid,
+								GridCartesian         &ThreeDimGrid,
+								GridRedBlackCartesian &ThreeDimRedBlackGrid,
+               RealD _M5,const ImplParams &p) :
+  Kernels(p),
+  _FourDimGrid        (&FourDimGrid),
+  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
+  _ThreeDimGrid        (&ThreeDimGrid),
+  _ThreeDimRedBlackGrid(&ThreeDimRedBlackGrid),
+  Stencil    (_FourDimGrid,npoint,Even,directions,displacements,p),
+  StencilEven(_FourDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
+  StencilOdd (_FourDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
+  M5(_M5),
+  Umu(_ThreeDimGrid),
+  UmuEven(_ThreeDimRedBlackGrid),
+  UmuOdd (_ThreeDimRedBlackGrid),
+  _tmp(&FourDimRedBlackGrid),
+  Dirichlet(0)
+{
+  // some assertions
+  assert(FourDimGrid._ndimension==Nd+1);
+  assert(ThreeDimGrid._ndimension==Nd);
+  assert(ThreeDimRedBlackGrid._ndimension==Nd);
+  assert(FourDimRedBlackGrid._ndimension==Nd+1);
+  assert(FourDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
+
+  // extent of fifth dim and not spread out
+  Ls=FourDimGrid._fdimensions[0];
+  assert(FourDimRedBlackGrid._fdimensions[0]==Ls);
+  assert(FourDimGrid._processors[0]         ==1);
+  assert(FourDimRedBlackGrid._processors[0] ==1);
+
+  // Other dimensions must match the decomposition of the four-D fields 
+  for(int d=0;d<Nd;d++){
+
+    assert(FourDimGrid._processors[d+1]         ==ThreeDimGrid._processors[d]);
+    assert(FourDimRedBlackGrid._processors[d+1] ==ThreeDimGrid._processors[d]);
+    assert(ThreeDimRedBlackGrid._processors[d]   ==ThreeDimGrid._processors[d]);
+
+    assert(FourDimGrid._fdimensions[d+1]        ==ThreeDimGrid._fdimensions[d]);
+    assert(FourDimRedBlackGrid._fdimensions[d+1]==ThreeDimGrid._fdimensions[d]);
+    assert(ThreeDimRedBlackGrid._fdimensions[d]  ==ThreeDimGrid._fdimensions[d]);
+
+    assert(FourDimGrid._simd_layout[d+1]        ==ThreeDimGrid._simd_layout[d]);
+    assert(FourDimRedBlackGrid._simd_layout[d+1]==ThreeDimGrid._simd_layout[d]);
+    assert(ThreeDimRedBlackGrid._simd_layout[d]  ==ThreeDimGrid._simd_layout[d]);
+  }
+
+  if ( p.dirichlet.size() == Nd+1) {
+    Coordinate block = p.dirichlet;
+    for(int d=0;d<Nd+1;d++) {
+      if ( block[d] ){
+	Dirichlet = 1;
+	std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
+	std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
+	Block = block;
+      }
+    }
+  } else {
+    Coordinate block(Nd+1,0);
+    Block = block;
+  }
+
+  // Dimension zero of the five-d is the Ls direction
+  assert(FourDimRedBlackGrid._simd_layout[0]==1);
+  assert(FourDimGrid._simd_layout[0]        ==1);
+    
+  // Allocate the required comms buffer
+  ImportGauge(_Umu);
+  // Build lists of exterior only nodes
+  int LLs = FourDimGrid._rdimensions[0];
+  int vol3;
+  vol3=ThreeDimGrid.oSites();
+  Stencil.BuildSurfaceList(LLs,vol3);
+
+  vol3=ThreeDimRedBlackGrid.oSites();
+  StencilEven.BuildSurfaceList(LLs,vol3);
+   StencilOdd.BuildSurfaceList(LLs,vol3);
+
+}
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::ImportGauge(const GaugeField &_Umu)
+{
+  GaugeField HUmu(_Umu.Grid());
+  HUmu = _Umu*(-0.5);
+  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
+  pickCheckerboard(Even,UmuEven,Umu);
+  pickCheckerboard(Odd ,UmuOdd,Umu);
+}
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
+{
+  int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil
+                    // we drop off the innermost fifth dimension
+  //  assert( (disp==1)||(disp==-1) );
+  //  assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
+
+  int skip = (disp==1) ? 0 : 1;
+  int dirdisp = dir+skip*Nd;
+  int gamma   = dir+(1-skip)*Nd;
+
+  Compressor compressor(DaggerNo);
+  Stencil.HaloExchange(in,compressor);
+  
+  uint64_t Nsite = Umu.Grid()->oSites();
+  Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma);
+
+};
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
+{
+  Compressor compressor(DaggerNo);
+  Stencil.HaloExchange(in,compressor);
+  uint64_t Nsite = Umu.Grid()->oSites();
+  Kernels::DhopDirAll(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out);
+};
+
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DerivInternal(StencilImpl & st,
+					  DoubledGaugeField & U,
+					  GaugeField &mat,
+					  const FermionField &A,
+					  const FermionField &B,
+					  int dag)
+{
+  assert((dag==DaggerNo) ||(dag==DaggerYes));
+
+  conformable(st.Grid(),A.Grid());
+  conformable(st.Grid(),B.Grid());
+
+  Compressor compressor(dag);
+  
+  FermionField Btilde(B.Grid());
+  FermionField Atilde(B.Grid());
+
+  st.HaloExchange(B,compressor);
+
+  Atilde=A;
+  int LLs = B.Grid()->_rdimensions[0];
+
+
+  for (int mu = 0; mu < Nd; mu++) {
+    ////////////////////////////////////////////////////////////////////////
+    // Flip gamma if dag
+    ////////////////////////////////////////////////////////////////////////
+    int gamma = mu;
+    if (!dag) gamma += Nd;
+
+    ////////////////////////
+    // Call the single hop
+    ////////////////////////
+
+    int Usites = U.Grid()->oSites();
+
+    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);
+
+    ////////////////////////////
+    // spin trace outer product
+    ////////////////////////////
+    Impl::InsertForce5D(mat, Btilde, Atilde, mu);
+  }
+}
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopDeriv(GaugeField &mat,
+                                      const FermionField &A,
+                                      const FermionField &B,
+                                      int dag)
+{
+  conformable(A.Grid(),FermionGrid());  
+  conformable(A.Grid(),B.Grid());
+
+  //conformable(GaugeGrid(),mat.Grid());// this is not general! leaving as a comment
+
+  mat.Checkerboard() = A.Checkerboard();
+  //  mat.checkerboard = A.checkerboard;
+
+  DerivInternal(Stencil,Umu,mat,A,B,dag);
+}
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopDerivEO(GaugeField &mat,
+                                        const FermionField &A,
+                                        const FermionField &B,
+                                        int dag)
+{
+  conformable(A.Grid(),FermionRedBlackGrid());
+  conformable(A.Grid(),B.Grid());
+
+  assert(B.Checkerboard()==Odd);
+  assert(A.Checkerboard()==Even);
+  mat.Checkerboard() = Even;
+
+  DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
+}
+
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopDerivOE(GaugeField &mat,
+                                        const FermionField &A,
+                                        const FermionField &B,
+                                        int dag)
+{
+  conformable(A.Grid(),FermionRedBlackGrid());
+  conformable(A.Grid(),B.Grid());
+
+  assert(B.Checkerboard()==Even);
+  assert(A.Checkerboard()==Odd);
+  mat.Checkerboard() = Odd;
+
+  DerivInternal(StencilEven,UmuOdd,mat,A,B,dag);
+}
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternal(StencilImpl & st,
+                                         DoubledGaugeField & U,
+                                         const FermionField &in, FermionField &out,int dag)
+{
+  DhopInternalSerialComms(st,U,in,out,dag);
+}
+
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
+							DoubledGaugeField & U,
+							const FermionField &in, FermionField &out,int dag)
+{
+  GRID_TRACE("DhopInternalOverlappedComms");
+  Compressor compressor(dag);
+
+  int LLs = in.Grid()->_rdimensions[0];
+  int len =  U.Grid()->oSites();
+
+  /////////////////////////////
+  // Start comms  // Gather intranode and extra node differentiated??
+  /////////////////////////////
+  {
+    //    std::cout << " TwoSpinWilsonFermion3plus1D gather " <<std::endl;
+    GRID_TRACE("Gather");
+    st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
+  }
+  
+  //  std::cout << " TwoSpinWilsonFermion3plus1D Communicate Begin " <<std::endl;
+  std::vector<std::vector<CommsRequest_t> > requests;
+
+#if 1
+  /////////////////////////////
+  // Overlap with comms
+  /////////////////////////////
+  st.CommunicateBegin(requests);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms 
+#endif
+
+  /////////////////////////////
+  // do the compute interior
+  /////////////////////////////
+  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDagInterior");
+    Kernels::DhopDagKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
+  } else {
+    GRID_TRACE("DhopInterior");
+    Kernels::DhopKernel   (st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
+  }
+  
+  //ifdef GRID_ACCELERATED
+#if 0
+  /////////////////////////////
+  // Overlap with comms -- on GPU the interior kernel call is nonblocking
+  /////////////////////////////
+  st.CommunicateBegin(requests);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+#endif
+  
+  
+  /////////////////////////////
+  // Complete comms
+  /////////////////////////////
+  //  std::cout << " TwoSpinWilsonFermion3plus1D Comms Complete " <<std::endl;
+  st.CommunicateComplete(requests);
+  //  traceStop(id);
+
+  /////////////////////////////
+  // do the compute exterior
+  /////////////////////////////
+  {
+    //    std::cout << " TwoSpinWilsonFermion3plus1D Comms Merge " <<std::endl;
+    GRID_TRACE("Merge");
+    st.CommsMerge(compressor);
+  }
+  
+
+  //  std::cout << " TwoSpinWilsonFermion3plus1D Exterior " <<std::endl;
+  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDagExterior");
+    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
+  } else {
+    GRID_TRACE("DhopExterior");
+    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
+  }
+  //  std::cout << " TwoSpinWilsonFermion3plus1D Done " <<std::endl;
+}
+
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternalSerialComms(StencilImpl & st, 
+						    DoubledGaugeField & U,
+						    const FermionField &in, 
+						    FermionField &out,int dag)
+{
+  GRID_TRACE("DhopInternalSerialComms");
+  Compressor compressor(dag);
+
+  int LLs = in.Grid()->_rdimensions[0];
+
+  //  std::cout << " TwoSpinWilsonFermion3plus1D Halo exch " <<std::endl;
+  {
+    GRID_TRACE("HaloExchange");
+    st.HaloExchangeOpt(in,compressor);
+  }
+  
+  //  std::cout << " TwoSpinWilsonFermion3plus1D Dhop " <<std::endl;
+  if (dag == DaggerYes) {
+    GRID_TRACE("DhopDag");
+    Kernels::DhopDagKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out);
+  } else {
+    GRID_TRACE("Dhop");
+    Kernels::DhopKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out);
+  }
+  //  std::cout << " TwoSpinWilsonFermion3plus1D Done " <<std::endl;
+}
+
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
+{
+  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
+  conformable(in.Grid(),out.Grid()); // drops the cb check
+
+  assert(in.Checkerboard()==Even);
+  out.Checkerboard() = Odd;
+
+  DhopInternal(StencilEven,UmuOdd,in,out,dag);
+}
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
+{
+  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
+  conformable(in.Grid(),out.Grid()); // drops the cb check
+
+  assert(in.Checkerboard()==Odd);
+  out.Checkerboard() = Even;
+
+  DhopInternal(StencilOdd,UmuEven,in,out,dag);
+}
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopComms(const FermionField &in, FermionField &out)
+{
+  int dag =0 ;
+  conformable(in.Grid(),FermionGrid()); // verifies full grid
+  conformable(in.Grid(),out.Grid());
+  out.Checkerboard() = in.Checkerboard();
+  Compressor compressor(dag);
+  Stencil.HaloExchangeOpt(in,compressor);
+}
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids)
+{
+  conformable(in.Grid(),FermionGrid()); // verifies full grid
+  conformable(in.Grid(),out.Grid());
+
+  out.Checkerboard() = in.Checkerboard();
+
+  int LLs = in.Grid()->_rdimensions[0];
+  Kernels::DhopKernel(Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids);
+}
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
+{
+  conformable(in.Grid(),FermionGrid()); // verifies full grid
+  conformable(in.Grid(),out.Grid());
+
+  out.Checkerboard() = in.Checkerboard();
+
+  DhopInternal(Stencil,Umu,in,out,dag);
+}
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
+{
+  out.Checkerboard()=in.Checkerboard();
+  Dhop(in,out,dag); // -0.5 is included
+  axpy(out,Nd*1.0-M5,in,out);
+}
+template <class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::Meooe(const FermionField &in, FermionField &out)
+{
+  if (in.Checkerboard() == Odd) {
+    DhopEO(in, out, DaggerNo);
+  } else {
+    DhopOE(in, out, DaggerNo);
+  }
+}
+
+template <class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
+{
+  if (in.Checkerboard() == Odd) {
+    DhopEO(in, out, DaggerYes);
+  } else {
+    DhopOE(in, out, DaggerYes);
+  }
+}
+
+template <class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::Mooee(const FermionField &in, FermionField &out)
+{
+  out.Checkerboard() = in.Checkerboard();
+  typename FermionField::scalar_type scal(Nd*1.0 + M5);
+  out = scal * in;
+}
+
+template <class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
+{
+  out.Checkerboard() = in.Checkerboard();
+  Mooee(in, out);
+}
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
+{
+  out.Checkerboard() = in.Checkerboard();
+  out = (1.0/(Nd*1.0 + M5))*in;
+}
+
+template<class Impl>
+void TwoSpinWilsonFermion3plus1D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
+{
+  out.Checkerboard() = in.Checkerboard();
+  MooeeInv(in,out);
+}
+  
+NAMESPACE_END(Grid);
+
+
+
+
diff --git a/Grid/qcd/action/fermion/implementation/TwoSpinWilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/TwoSpinWilsonKernelsImplementation.h
new file mode 100644
index 00000000..6957cac6
--- /dev/null
+++ b/Grid/qcd/action/fermion/implementation/TwoSpinWilsonKernelsImplementation.h
@@ -0,0 +1,441 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/TwoSpinWilsonKernels.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+NAMESPACE_BEGIN(Grid);
+
+
+////////////////////////////////////////////
+// Generic implementation; move to different file?
+////////////////////////////////////////////
+
+#define GENERIC_STENCIL_LEG(Dir,spProj,Recon)			\
+  SE = st.GetEntry(ptype, Dir, sF);				\
+  if (SE->_is_local) {						\
+    int perm= SE->_permute;					\
+    auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
+    spProj(chi,tmp);						\
+  } else {							\
+    chi = coalescedRead(buf[SE->_offset],lane);			\
+  }								\
+  acceleratorSynchronise();					\
+  Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
+  Recon(result, Uchi);
+
+#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon)		\
+  SE = st.GetEntry(ptype, Dir, sF);				\
+  if (SE->_is_local) {						\
+    int perm= SE->_permute;					\
+    auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
+    spProj(chi,tmp);							\
+    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);			\
+    Recon(result, Uchi);						\
+  }									\
+  acceleratorSynchronise();
+
+#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)		\
+  SE = st.GetEntry(ptype, Dir, sF);				\
+  if (!SE->_is_local ) {		\
+    auto chi = coalescedRead(buf[SE->_offset],lane);		\
+    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);		\
+    Recon(result, Uchi);					\
+    nmu++;							\
+  }								\
+  acceleratorSynchronise();
+
+#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon)		\
+    if (SE->_is_local ) {					\
+      int perm= SE->_permute;					\
+      auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);	\
+      spProj(chi,tmp);						\
+    } else {							\
+      chi = coalescedRead(buf[SE->_offset],lane);		\
+    }								\
+    acceleratorSynchronise();					\
+    Impl::multLink(Uchi, U[sU], chi, dir, SE, st);		\
+    Recon(result, Uchi);
+
+#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)			\
+  if (gamma == Dir) {						\
+    GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon);			\
+  }
+
+////////////////////////////////////////////////////////////////////
+// All legs kernels ; comms then compute
+////////////////////////////////////////////////////////////////////
+template <class Impl> accelerator_inline
+void TwoSpinWilsonKernels<Impl>::DhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
+					     SiteSpinor *buf, int sF,
+					     int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef decltype(coalescedRead(in[0])) calcSpinor;
+  calcSpinor chi;
+  calcSpinor Uchi;
+  calcSpinor result;
+  StencilEntry *SE;
+  int ptype;
+  const int Nsimd = SiteSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  GENERIC_STENCIL_LEG(Xp,pauliProjXp,pauliAssign);
+  GENERIC_STENCIL_LEG(Yp,pauliProjYp,pauliAdd);
+  GENERIC_STENCIL_LEG(Zp,pauliProjZp,pauliAdd);
+  GENERIC_STENCIL_LEG(Xm,pauliProjXm,pauliAdd);
+  GENERIC_STENCIL_LEG(Ym,pauliProjYm,pauliAdd);
+  GENERIC_STENCIL_LEG(Zm,pauliProjZm,pauliAdd);
+  coalescedWrite(out[sF],result,lane);
+};
+
+template <class Impl> accelerator_inline
+void TwoSpinWilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
+					  SiteSpinor *buf, int sF,
+					  int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;
+  calcSpinor chi;
+  //  calcSpinor *chi_p;
+  calcSpinor Uchi;
+  calcSpinor result;
+  StencilEntry *SE;
+  int ptype;
+
+  const int Nsimd = SiteSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  GENERIC_STENCIL_LEG(Xm,pauliProjXp,pauliAssign);
+  GENERIC_STENCIL_LEG(Ym,pauliProjYp,pauliAdd);
+  GENERIC_STENCIL_LEG(Zm,pauliProjZp,pauliAdd);
+  GENERIC_STENCIL_LEG(Xp,pauliProjXm,pauliAdd);
+  GENERIC_STENCIL_LEG(Yp,pauliProjYm,pauliAdd);
+  GENERIC_STENCIL_LEG(Zp,pauliProjZm,pauliAdd);
+  coalescedWrite(out[sF], result,lane);
+};
+  ////////////////////////////////////////////////////////////////////
+  // Interior kernels
+  ////////////////////////////////////////////////////////////////////
+template <class Impl> accelerator_inline
+void TwoSpinWilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U,
+						       SiteSpinor *buf, int sF,
+						       int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;
+  calcSpinor chi;
+  //  calcSpinor *chi_p;
+  calcSpinor Uchi;
+  calcSpinor result;
+  StencilEntry *SE;
+  int ptype;
+  const int Nsimd = SiteSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  result=Zero();
+  GENERIC_STENCIL_LEG_INT(Xp,pauliProjXp,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Yp,pauliProjYp,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Zp,pauliProjZp,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Xm,pauliProjXm,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Ym,pauliProjYm,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Zm,pauliProjZm,pauliAdd);
+  coalescedWrite(out[sF], result,lane);
+};
+
+template <class Impl> accelerator_inline
+void TwoSpinWilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U,
+						    SiteSpinor *buf, int sF,
+						    int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;
+  const int Nsimd = SiteSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  calcSpinor chi;
+  //  calcSpinor *chi_p;
+  calcSpinor Uchi;
+  calcSpinor result;
+  StencilEntry *SE;
+  int ptype;
+  result=Zero();
+  GENERIC_STENCIL_LEG_INT(Xm,pauliProjXp,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Ym,pauliProjYp,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Zm,pauliProjZp,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Xp,pauliProjXm,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Yp,pauliProjYm,pauliAdd);
+  GENERIC_STENCIL_LEG_INT(Zp,pauliProjZm,pauliAdd);
+  coalescedWrite(out[sF], result,lane);
+};
+////////////////////////////////////////////////////////////////////
+// Exterior kernels
+////////////////////////////////////////////////////////////////////
+template <class Impl> accelerator_inline
+void TwoSpinWilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U,
+						SiteSpinor *buf, int sF,
+						int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;
+  //  calcSpinor *chi_p;
+  calcSpinor Uchi;
+  calcSpinor result;
+  StencilEntry *SE;
+  int ptype;
+  int nmu=0;
+  const int Nsimd = SiteSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  result=Zero();
+  GENERIC_STENCIL_LEG_EXT(Xp,pauliProjXp,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Yp,pauliProjYp,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Zp,pauliProjZp,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Xm,pauliProjXm,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Ym,pauliProjYm,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Zm,pauliProjZm,pauliAdd);
+  if ( nmu ) {
+    auto out_t = coalescedRead(out[sF],lane);
+    out_t = out_t + result;
+    coalescedWrite(out[sF],out_t,lane);
+  }
+};
+
+template <class Impl> accelerator_inline
+void TwoSpinWilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U,
+					     SiteSpinor *buf, int sF,
+					     int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;
+  //  calcSpinor *chi_p;
+  calcSpinor Uchi;
+  calcSpinor result;
+  StencilEntry *SE;
+  int ptype;
+  int nmu=0;
+  const int Nsimd = SiteSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+  result=Zero();
+  GENERIC_STENCIL_LEG_EXT(Xm,pauliProjXp,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Ym,pauliProjYp,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Zm,pauliProjZp,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Xp,pauliProjXm,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Yp,pauliProjYm,pauliAdd);
+  GENERIC_STENCIL_LEG_EXT(Zp,pauliProjZm,pauliAdd);
+  if ( nmu ) {
+    auto out_t = coalescedRead(out[sF],lane);
+    out_t = out_t + result;
+    coalescedWrite(out[sF],out_t,lane);
+  }
+};
+
+#define DhopDirMacro(Dir,spProj,spRecon)	\
+  template <class Impl> accelerator_inline				\
+  void TwoSpinWilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteSpinor *buf, int sF, \
+					 int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
+  {									\
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;			\
+  calcSpinor chi;							\
+  calcSpinor result;							\
+  calcSpinor Uchi;							\
+  StencilEntry *SE;							\
+  int ptype;								\
+  const int Nsimd = SiteSpinor::Nsimd();				\
+  const int lane=acceleratorSIMTlane(Nsimd);					\
+									\
+  SE = st.GetEntry(ptype, dir, sF);					\
+  GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon);				\
+  coalescedWrite(out[sF], result,lane);					\
+  }
+
+DhopDirMacro(Xp,pauliProjXp,pauliAssign);
+DhopDirMacro(Yp,pauliProjYp,pauliAssign);
+DhopDirMacro(Zp,pauliProjZp,pauliAssign);
+DhopDirMacro(Xm,pauliProjXm,pauliAssign);
+DhopDirMacro(Ym,pauliProjYm,pauliAssign);
+DhopDirMacro(Zm,pauliProjZm,pauliAssign);
+
+template <class Impl> accelerator_inline
+void TwoSpinWilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteSpinor *buf, int sF,
+				    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
+{
+  typedef decltype(coalescedRead(in[0]))  calcSpinor;
+  calcSpinor chi;
+  calcSpinor result;
+  calcSpinor Uchi;
+  StencilEntry *SE;
+  int ptype;
+  const int Nsimd = SiteSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
+  SE = st.GetEntry(ptype, dir, sF);
+  GENERIC_DHOPDIR_LEG(Xp,pauliProjXp,pauliAssign);
+  GENERIC_DHOPDIR_LEG(Yp,pauliProjYp,pauliAssign);
+  GENERIC_DHOPDIR_LEG(Zp,pauliProjZp,pauliAssign);
+  GENERIC_DHOPDIR_LEG(Xm,pauliProjXm,pauliAssign);
+  GENERIC_DHOPDIR_LEG(Ym,pauliProjYm,pauliAssign);
+  GENERIC_DHOPDIR_LEG(Zm,pauliProjZm,pauliAssign);
+  coalescedWrite(out[sF], result,lane);
+}
+
+template <class Impl>
+void TwoSpinWilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
+				      int Nsite, const FermionField &in, std::vector<FermionField> &out)
+{
+   autoView(U_v  ,U,AcceleratorRead);
+   autoView(in_v ,in,AcceleratorRead);
+   autoView(st_v ,st,AcceleratorRead);
+
+   autoView(out_Xm,out[0],AcceleratorWrite);
+   autoView(out_Ym,out[1],AcceleratorWrite);
+   autoView(out_Zm,out[2],AcceleratorWrite);
+   autoView(out_Xp,out[4],AcceleratorWrite);
+   autoView(out_Yp,out[5],AcceleratorWrite);
+   autoView(out_Zp,out[6],AcceleratorWrite);
+   auto CBp=st.CommBuf();
+   accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{
+      int sU=sss/Ls;
+      int sF =sss;
+      DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
+      DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
+      DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
+      DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,3);
+      DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,4);
+      DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,5);
+   });
+}
+
+
+template <class Impl>
+void TwoSpinWilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
+					 int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma)
+{
+  assert(dirdisp<=5);
+  assert(dirdisp>=0);
+
+   autoView(U_v  ,U  ,AcceleratorRead);
+   autoView(in_v ,in ,AcceleratorRead);
+   autoView(out_v,out,AcceleratorWrite);
+   autoView(st_v ,st ,AcceleratorRead);
+   auto CBp=st.CommBuf();
+#define LoopBody(Dir)				\
+   case Dir :					\
+     accelerator_for(ss,Nsite,Simd::Nsimd(),{	\
+       for(int s=0;s<Ls;s++){			\
+	 int sU=ss;				\
+	 int sF = s+Ls*sU;						\
+	 DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
+       }							       \
+       });							       \
+     break;
+
+   switch(gamma){
+   LoopBody(Xp);
+   LoopBody(Yp);
+   LoopBody(Zp);
+
+   LoopBody(Xm);
+   LoopBody(Ym);
+   LoopBody(Zm);
+   default:
+     assert(0);
+     break;
+   }
+#undef LoopBody
+}
+
+
+#define KERNEL_CALLNB(A)						\
+  const uint64_t    NN = Nsite*Ls;					\
+  accelerator_forNB( ss, NN, Simd::Nsimd(), {				\
+      int sF = ss;							\
+      int sU = ss/Ls;							\
+      TwoSpinWilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
+    });
+
+#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
+
+#define KERNEL_CALL_EXT(A)						\
+  const uint64_t    sz = st.surface_list.size();			\
+  auto ptr = &st.surface_list[0];					\
+  accelerator_forNB( ss, sz, Simd::Nsimd(), {				\
+      int sF = ptr[ss];							\
+      int sU = sF/Ls;							\
+      TwoSpinWilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\
+    });									\
+  accelerator_barrier();
+
+
+template <class Impl>
+void TwoSpinWilsonKernels<Impl>::DhopKernel(StencilImpl &st,  DoubledGaugeField &U, SiteSpinor * buf,
+					    int Ls, int Nsite, const FermionField &in, FermionField &out,
+					    int interior,int exterior)
+{
+  autoView(U_v  ,  U,AcceleratorRead);
+  autoView(in_v , in,AcceleratorRead);
+  autoView(out_v,out,AcceleratorWrite);
+  autoView(st_v , st,AcceleratorRead);
+  
+  if( interior && exterior ) {
+    acceleratorFenceComputeStream();
+    KERNEL_CALL(GenericDhopSite);
+    return;
+  } else if( interior ) {
+    KERNEL_CALLNB(GenericDhopSiteInt);
+    return;
+  } else if( exterior ) {
+    //     // dependent on result of merge
+    acceleratorFenceComputeStream();
+    KERNEL_CALL_EXT(GenericDhopSiteExt);
+    return;
+  }
+  assert(0 && " Kernel optimisation case not covered ");
+}
+
+template <class Impl>
+void TwoSpinWilsonKernels<Impl>::DhopDagKernel(StencilImpl &st,  DoubledGaugeField &U, SiteSpinor * buf,
+					       int Ls, int Nsite, const FermionField &in, FermionField &out,
+					       int interior,int exterior)
+{
+  autoView(U_v  ,U,AcceleratorRead);
+  autoView(in_v ,in,AcceleratorRead);
+  autoView(out_v,out,AcceleratorWrite);
+  autoView(st_v ,st,AcceleratorRead);
+  
+  if( interior && exterior ) {
+    acceleratorFenceComputeStream();
+    KERNEL_CALL(GenericDhopSiteDag);
+    return;
+  } else if( interior ) {
+    KERNEL_CALLNB(GenericDhopSiteDagInt); return;
+  } else if( exterior ) {
+    // Dependent on result of merge
+    acceleratorFenceComputeStream();
+    KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;
+  }
+  assert(0 && " Kernel optimisation case not covered ");
+}
+
+#undef KERNEL_CALLNB
+#undef KERNEL_CALL
+
+NAMESPACE_END(Grid);
diff --git a/Grid/qcd/action/fermion/instantiation/TwoSpinWilsonFermion3plus1DInstantiation.cc b/Grid/qcd/action/fermion/instantiation/TwoSpinWilsonFermion3plus1DInstantiation.cc
new file mode 100644
index 00000000..fc388914
--- /dev/null
+++ b/Grid/qcd/action/fermion/instantiation/TwoSpinWilsonFermion3plus1DInstantiation.cc
@@ -0,0 +1,61 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h>
+
+NAMESPACE_BEGIN(Grid);
+
+// S-direction is INNERMOST and takes no part in the parity.
+
+const std::vector<int> TwoSpinWilsonFermion3plus1DStatic::directions   (TwoSpinWilsonFermion3plus1DStatic::MakeDirections());
+const std::vector<int> TwoSpinWilsonFermion3plus1DStatic::displacements(TwoSpinWilsonFermion3plus1DStatic::MakeDisplacements());
+
+std::vector<int> TwoSpinWilsonFermion3plus1DStatic::MakeDirections (void)
+{
+  std::vector<int> directions(2*Nd);
+  for(int d=0;d<Nd;d++){
+    directions[d]    = d+1;
+    directions[d+Nd] = d+1;
+  }
+  return directions;
+}
+std::vector<int> TwoSpinWilsonFermion3plus1DStatic::MakeDisplacements(void)
+{
+  std::vector<int> displacements(2*Nd);
+  for(int d=0;d<Nd;d++){
+    displacements[d]    = +1;
+    displacements[d+Nd] = -1;
+  }
+  return displacements;
+}
+
+NAMESPACE_END(Grid);
+
diff --git a/Grid/qcd/action/fermion/instantiation/TwoSpinWilsonFermion3plus1DInstantiation.cc.master b/Grid/qcd/action/fermion/instantiation/TwoSpinWilsonFermion3plus1DInstantiation.cc.master
new file mode 100644
index 00000000..0e97a7a7
--- /dev/null
+++ b/Grid/qcd/action/fermion/instantiation/TwoSpinWilsonFermion3plus1DInstantiation.cc.master
@@ -0,0 +1,40 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/TwoSpinWilsonFermion3plus1DImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h"
+template class TwoSpinWilsonFermion3plus1D<IMPLEMENTATION>; 
+
+NAMESPACE_END(Grid);
+
diff --git a/Grid/qcd/action/fermion/instantiation/TwoSpinWilsonKernelsInstantiation.cc.master b/Grid/qcd/action/fermion/instantiation/TwoSpinWilsonKernelsInstantiation.cc.master
new file mode 100644
index 00000000..630b5c4a
--- /dev/null
+++ b/Grid/qcd/action/fermion/instantiation/TwoSpinWilsonKernelsInstantiation.cc.master
@@ -0,0 +1,40 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
+
+Copyright (C) 2015, 2020
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/qcd/action/fermion/FermionCore.h>
+#include <Grid/qcd/action/fermion/implementation/TwoSpinWilsonKernelsImplementation.h>
+
+NAMESPACE_BEGIN(Grid);
+
+#include "impl.h"
+template class TwoSpinWilsonKernels<IMPLEMENTATION>;
+
+NAMESPACE_END(Grid);
diff --git a/Grid/qcd/spin/Pauli.h b/Grid/qcd/spin/Pauli.h
new file mode 100644
index 00000000..fac66375
--- /dev/null
+++ b/Grid/qcd/spin/Pauli.h
@@ -0,0 +1,220 @@
+#ifndef GRID_QCD_PAULI_H
+#define GRID_QCD_PAULI_H
+
+#include <array>
+
+NAMESPACE_BEGIN(Grid);
+//
+/*
+ * Pauli basis
+ * sx        sy       sz       ident
+ * (0 1)  , (0 -i) , ( 1 0 )
+ * (1 0)    (i  0)   ( 0 -1)
+ *
+ * These are hermitian.
+ *
+ * Also supply wilson "projectors" (1+/-sx), (1+/-sy), (1+/-sz)
+ *
+ * spPauliProjXm
+ * spPauliProjYm etc...
+ */
+class Pauli {
+  public:
+    GRID_SERIALIZABLE_ENUM(Algebra, undef,
+                           SigmaX           , 0,
+                           MinusSigmaX      , 1,
+                           SigmaY           , 2,
+                           MinusSigmaY      , 3,
+                           SigmaZ           , 4,
+                           MinusSigmaZ      , 5,
+                           Identity         , 6,
+			   MinusIdentity    , 7);
+  
+    static constexpr unsigned int nPauli = 8;
+    static const std::array<const char *, nPauli>                name;
+    static const std::array<std::array<Algebra, nPauli>, nPauli> mul;
+    static const std::array<Algebra, nPauli>                     adj;
+    static const std::array<const Pauli, 4>                      gmu;
+    static const std::array<const Pauli, 16>                     gall;
+    Algebra                                                      g;
+  public:
+  accelerator Pauli(Algebra initg): g(initg) {}  
+};
+
+#define CopyImplementation(iTemplate,multPauli,multFlavour)	\
+  template<class vtype>							\
+  accelerator_inline void multPauli(iTemplate<vtype, Nhs> &ret, const iTemplate<vtype, Nhs> &rhs) {	\
+    multFlavour(ret,rhs);						\
+}
+
+CopyImplementation(iVector,multPauliSigmaX,multFlavourSigmaX);
+CopyImplementation(iMatrix,lmultPauliSigmaX,lmultFlavourSigmaX);
+CopyImplementation(iMatrix,rmultPauliSigmaX,rmultFlavourSigmaX);
+
+CopyImplementation(iVector,multPauliMinusSigmaX ,multFlavourMinusSigmaX);
+CopyImplementation(iMatrix,lmultPauliMinusSigmaX,lmultFlavourMinusSigmaX);
+CopyImplementation(iMatrix,rmultPauliMinusSigmaX,rmultFlavourMinusSigmaX);
+
+CopyImplementation(iVector,multPauliSigmaY,multFlavourSigmaY);
+CopyImplementation(iMatrix,lmultPauliSigmaY,lmultFlavourSigmaY);
+CopyImplementation(iMatrix,rmultPauliSigmaY,rmultFlavourSigmaY);
+
+CopyImplementation(iVector,multPauliMinusSigmaY ,multFlavourMinusSigmaY);
+CopyImplementation(iMatrix,lmultPauliMinusSigmaY,lmultFlavourMinusSigmaY);
+CopyImplementation(iMatrix,rmultPauliMinusSigmaY,rmultFlavourMinusSigmaY);
+
+CopyImplementation(iVector,multPauliSigmaZ,multFlavourSigmaZ);
+CopyImplementation(iMatrix,lmultPauliSigmaZ,lmultFlavourSigmaZ);
+CopyImplementation(iMatrix,rmultPauliSigmaZ,rmultFlavourSigmaZ);
+
+CopyImplementation(iVector,multPauliMinusSigmaZ ,multFlavourMinusSigmaZ);
+CopyImplementation(iMatrix,lmultPauliMinusSigmaZ,lmultFlavourMinusSigmaZ);
+CopyImplementation(iMatrix,rmultPauliMinusSigmaZ,rmultFlavourMinusSigmaZ);
+
+CopyImplementation(iVector,multPauliIdentity,multFlavourIdentity);
+CopyImplementation(iMatrix,lmultPauliIdentity,lmultFlavourIdentity);
+CopyImplementation(iMatrix,rmultPauliIdentity,rmultFlavourIdentity);
+
+CopyImplementation(iVector,multPauliMinusIdentity ,multFlavourMinusIdentity);
+CopyImplementation(iMatrix,lmultPauliMinusIdentity,lmultFlavourMinusIdentity);
+CopyImplementation(iMatrix,rmultPauliMinusIdentity,rmultFlavourMinusIdentity);
+
+/*
+ * sx        sy       sz       ident
+ * (0 1)  , (0 -i) , ( 1 0 )
+ * (1 0)    (i  0)   ( 0 -1)
+ */
+template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjXp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
+{
+  hspin(0)=fspin(0)+fspin(1);
+  hspin(1)=fspin(1)+fspin(0);
+}
+template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjXm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
+{
+  hspin(0)=fspin(0)-fspin(1);
+  hspin(1)=fspin(1)-fspin(0);
+}
+
+template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjYp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
+{
+  hspin(0)=fspin(0)-timesI(fspin(1));
+  hspin(1)=fspin(1)+timesI(fspin(0));
+}
+template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjYm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
+{
+  hspin(0)=fspin(0)+timesI(fspin(1));
+  hspin(1)=fspin(1)-timesI(fspin(0));
+}
+template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjZp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
+{
+  hspin(0)=fspin(0)+fspin(0);
+  hspin(1)=Zero();
+}
+template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjZm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
+{
+  hspin(0)=Zero();
+  hspin(1)=fspin(1)+fspin(1);
+}
+template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliAssign(iVector<vtype,Nhs> &fspin,const iVector<vtype,Nhs> &hspin)
+{
+  fspin = hspin;
+}
+template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliAdd   (iVector<vtype,Nhs> &fspin,const iVector<vtype,Nhs> &hspin)
+{
+  fspin = fspin + hspin;
+}
+
+template<class vtype> 
+accelerator_inline auto operator*(const Pauli &G, const iVector<vtype, Nhs> &arg)
+->typename std::enable_if<matchGridTensorIndex<iVector<vtype, Nhs>, PauliIndex>::value, iVector<vtype, Nhs>>::type
+{
+  iVector<vtype, Nhs> ret;
+
+  switch (G.g) 
+  {
+  case Pauli::Algebra::SigmaX:
+    multPauliSigmaX(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaX:
+    multPauliMinusSigmaX(ret, arg); break;
+  case Pauli::Algebra::SigmaY:
+    multPauliSigmaY(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaY:
+    multPauliMinusSigmaY(ret, arg); break;
+  case Pauli::Algebra::SigmaZ:
+    multPauliSigmaZ(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaZ:
+    multPauliMinusSigmaZ(ret, arg); break;
+  case Pauli::Algebra::Identity:
+    multPauliIdentity(ret, arg); break;
+  case Pauli::Algebra::MinusIdentity:
+    multPauliMinusIdentity(ret, arg); break;
+  default: assert(0);
+  }
+ 
+  return ret;
+}
+
+template<class vtype> 
+accelerator_inline auto operator*(const Pauli &G, const iMatrix<vtype, Nhs> &arg)
+->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Nhs>, PauliIndex>::value, iMatrix<vtype, Nhs>>::type
+{
+  iMatrix<vtype, Nhs> ret;
+
+  switch (G.g) 
+  {
+  case Pauli::Algebra::SigmaX:
+    lmultPauliSigmaX(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaX:
+    lmultPauliMinusSigmaX(ret, arg); break;
+  case Pauli::Algebra::SigmaY:
+    lmultPauliSigmaY(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaY:
+    lmultPauliMinusSigmaY(ret, arg); break;
+  case Pauli::Algebra::SigmaZ:
+    lmultPauliSigmaZ(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaZ:
+    lmultPauliMinusSigmaZ(ret, arg); break;
+  case Pauli::Algebra::Identity:
+    lmultPauliIdentity(ret, arg); break;
+  case Pauli::Algebra::MinusIdentity:
+    lmultPauliMinusIdentity(ret, arg); break;
+  default: assert(0);
+  }
+  
+  return ret;
+}
+
+template<class vtype> 
+accelerator_inline auto operator*(const iMatrix<vtype, Nhs> &arg, const Pauli &G)
+->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Nhs>, PauliIndex>::value, iMatrix<vtype, Nhs>>::type
+{
+  iMatrix<vtype, Nhs> ret;
+
+  switch (G.g) 
+  {
+  case Pauli::Algebra::SigmaX:
+    rmultPauliSigmaX(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaX:
+    rmultPauliMinusSigmaX(ret, arg); break;
+  case Pauli::Algebra::SigmaY:
+    rmultPauliSigmaY(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaY:
+    rmultPauliMinusSigmaY(ret, arg); break;
+  case Pauli::Algebra::SigmaZ:
+    rmultPauliSigmaZ(ret, arg); break;
+  case Pauli::Algebra::MinusSigmaZ:
+    rmultPauliMinusSigmaZ(ret, arg); break;
+  case Pauli::Algebra::Identity:
+    rmultPauliIdentity(ret, arg); break;
+  case Pauli::Algebra::MinusIdentity:
+    rmultPauliMinusIdentity(ret, arg); break;
+  default: assert(0);
+  }
+
+  return ret;
+}
+
+
+NAMESPACE_END(Grid);
+
+#endif // GRID_QCD_GAMMA_H
diff --git a/TODO b/TODO
index d10d5793..12573208 100644
--- a/TODO
+++ b/TODO
@@ -1,3 +1,8 @@
+
+* Clean up the extract merge and replace with insertLane/extractLane
+
+-----
+
 i)    Refine subspace with HDCG & recompute
 ii)   Block Lanczos in coarse space
 iii)  Batched block project in the operator computation
diff --git a/tests/debug/Test_icosahedron.cc b/tests/debug/Test_icosahedron.cc
new file mode 100644
index 00000000..cab6a281
--- /dev/null
+++ b/tests/debug/Test_icosahedron.cc
@@ -0,0 +1,93 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/debug/Test_icosahedron.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+const int MyNd=3;
+template<typename vtype> using iIcosahedralLorentzComplex = iVector<iScalar<iScalar<vtype> >, MyNd+1 > ;
+
+typedef iIcosahedralLorentzComplex<ComplexD >  IcosahedralLorentzComplexD;
+typedef iIcosahedralLorentzComplex<vComplexD> vIcosahedralLorentzComplexD;
+typedef Lattice<vIcosahedralLorentzComplexD> LatticeIcosahedralLorentzComplexD;
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  const int L=8;
+  const int Npatch = num_icosahedron_tiles;
+
+  // Put SIMD all in time direction
+  Coordinate latt_size = GridDefaultLatt();
+  Coordinate simd_layout({1,1,vComplexD::Nsimd(),1});
+  Coordinate mpi_layout = GridDefaultMpi();
+
+  std::cout << GridLogMessage << " mpi "<<mpi_layout<<std::endl;
+  std::cout << GridLogMessage << " simd "<<simd_layout<<std::endl;
+  std::cout << GridLogMessage << " latt "<<latt_size<<std::endl;
+  GridCartesianCrossIcosahedron EdgeGrid(latt_size,simd_layout,mpi_layout,IcosahedralEdges);
+  std::cout << GridLogMessage << " Created edge grid "<<std::endl;
+  GridCartesianCrossIcosahedron VertexGrid(latt_size,simd_layout,mpi_layout,IcosahedralVertices);
+
+  std::cout << GridLogMessage << " Created vertex grid "<<std::endl;
+  LatticeIcosahedralLorentzComplexD Umu(&EdgeGrid);
+  LatticeComplex Phi(&VertexGrid);
+  std::cout << GridLogMessage << " Created two fields "<<std::endl;
+
+  Phi = Zero();
+  Umu = Zero();
+  std::cout << GridLogMessage << " Zeroed two fields "<<std::endl;
+
+  ComplexD one (1.0);
+  Phi = one;
+  Umu = one;
+  
+  std::cout << GridLogMessage << " V = "<<norm2(Phi)<<std::endl;
+  std::cout << GridLogMessage << " Expect "<<latt_size[0]*latt_size[1]*latt_size[2]*10+2*latt_size[2]<<std::endl;
+  
+  std::cout << GridLogMessage << " E = "<<norm2(Umu)<<std::endl;
+  std::cout << GridLogMessage << " Expect "<<latt_size[0]*latt_size[1]*latt_size[2]*10*4<<std::endl;
+
+  //  std::cout << " Umu "<<Umu<<std::endl;
+  //  std::cout << " Phi "<<Phi<<std::endl;
+  LatticePole(Phi,South);
+  std::cout << " Phi South Pole set\n"<<Phi<<std::endl;
+
+  LatticePole(Phi,North);
+  std::cout << " Phi North Pole set\n"<<Phi<<std::endl;
+
+  for(int mu=0;mu<VertexGrid._ndimension;mu++){
+    std::cout << " Calling lattice coordinate mu="<<mu<<std::endl;
+    LatticeCoordinate(Phi,mu);
+    std::cout << " Phi coor mu="<<mu<<"\n"<<Phi<<std::endl;
+  }
+
+  Grid_finalize();
+}