From 1eee664092b1d981770532a5917e364a5efebc6a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 14 Apr 2015 20:22:04 +0100
Subject: [PATCH] Stencil code pretty much shaken out. Beginning of inner
 product and norm2.

---
 Grid.h                 |   1 +
 Grid_Lattice.h         |  12 +
 Grid_config.h          |   4 +-
 Grid_cshift_common.h   |   2 +
 Grid_main.cc           |  15 +-
 Grid_math_types.h      |   3 +-
 Grid_simd.h            |  53 +++-
 Grid_stencil.h         | 572 ++++++++++++++++++++---------------------
 Grid_stencil_common.cc | 258 +++++++++++++++++++
 Grid_vComplexD.h       |   7 +-
 Grid_vComplexF.h       |  38 +--
 Makefile.am            |  33 ++-
 Makefile.in            |  53 ++--
 TODO                   |   2 +-
 test_Grid_jacobi.cc    | 206 +++++++++++++++
 test_Grid_stencil.cc   | 154 +++++++++++
 16 files changed, 1050 insertions(+), 363 deletions(-)
 create mode 100644 Grid_stencil_common.cc
 create mode 100644 test_Grid_jacobi.cc
 create mode 100644 test_Grid_stencil.cc
diff --git a/Grid.h b/Grid.h
index a0989e0b..2a1072ea 100644
--- a/Grid.h
+++ b/Grid.h
@@ -47,6 +47,7 @@
 #include <Grid_Cartesian.h>
 #include <Grid_Lattice.h>
 #include <Grid_comparison.h>
+#include <Grid_stencil.h>
 #include <Grid_QCD.h>
 
 namespace Grid {
diff --git a/Grid_Lattice.h b/Grid_Lattice.h
index 119d6181..1e2c820e 100644
--- a/Grid_Lattice.h
+++ b/Grid_Lattice.h
@@ -407,6 +407,18 @@ public:
         return ret;
     }
 
+    template<class vobj>
+    inline RealD norm2(const Lattice<vobj> &arg){
 
+      typedef typename vobj::scalar_type scalar;
+      decltype(localInnerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
+
+      scalar nrm;
+      for(int ss=0;ss<arg._grid->oSites(); ss++){
+	vnrm = vnrm + localInnerProduct(arg._odata[ss],arg._odata[ss]);
+      }
+      nrm = Reduce(TensorRemove(vnrm));
+      return real(nrm);
+    }
 }
 #endif
diff --git a/Grid_config.h b/Grid_config.h
index deb6c74c..bb885708 100644
--- a/Grid_config.h
+++ b/Grid_config.h
@@ -11,10 +11,10 @@
 /* #undef AVX512 */
 
 /* GRID_COMMS_MPI */
-/* #undef GRID_COMMS_MPI */
+#define GRID_COMMS_MPI 1
 
 /* GRID_COMMS_NONE */
-#define GRID_COMMS_NONE 1
+/* #undef GRID_COMMS_NONE */
 
 /* Define to 1 if you have the `gettimeofday' function. */
 #define HAVE_GETTIMEOFDAY 1
diff --git a/Grid_cshift_common.h b/Grid_cshift_common.h
index 2910151c..e962f6ea 100644
--- a/Grid_cshift_common.h
+++ b/Grid_cshift_common.h
@@ -8,6 +8,7 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
 {
   int rd = rhs._grid->_rdimensions[dimension];
 
+  printf("Gather_plane_simple buf size %d rhs size %d \n",buffer.size(),rhs._odata.size());fflush(stdout);
   if ( !rhs._grid->CheckerBoarded(dimension) ) {
 
     int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
@@ -18,6 +19,7 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
 #pragma omp parallel for collapse(2)
     for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
       for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+	printf("Gather_plane_simple %d ; %d %d %d\n",bo,so,o,b);fflush(stdout);
 	buffer[bo++]=rhs._odata[so+o+b];
       }
       o +=rhs._grid->_slice_stride[dimension];
diff --git a/Grid_main.cc b/Grid_main.cc
index 8cf0dae3..13c3d9ee 100644
--- a/Grid_main.cc
+++ b/Grid_main.cc
@@ -1,10 +1,4 @@
 #include "Grid.h"
-#include "Grid_vRealD.h"
-#include "Grid_vRealF.h"
-#include "Grid_vComplexD.h"
-#include "Grid_vComplexF.h"
-#include "Grid_Cartesian.h"
-#include "Grid_Lattice.h"
 
 using namespace std;
 using namespace Grid;
@@ -16,13 +10,14 @@ int main (int argc, char ** argv)
   Grid_init(&argc,&argv);
 
   std::vector<int> latt_size(4);
+
   std::vector<int> simd_layout(4);
   
   std::vector<int> mpi_layout(4);
-  mpi_layout[0]=1;
-  mpi_layout[1]=1;
-  mpi_layout[2]=1;
-  mpi_layout[3]=1;
+  mpi_layout[0]=2;
+  mpi_layout[1]=2;
+  mpi_layout[2]=2;
+  mpi_layout[3]=2;
 
 #ifdef AVX512
  for(int omp=128;omp<236;omp+=16){
diff --git a/Grid_math_types.h b/Grid_math_types.h
index a2a3f97c..e3432645 100644
--- a/Grid_math_types.h
+++ b/Grid_math_types.h
@@ -922,9 +922,10 @@ template<class l,int N> inline iMatrix<l,N> operator - (Integer lhs,const iMatri
     {
         typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
         iScalar<ret_t> ret=zero;
+        iScalar<ret_t> tmp;
         for(int c1=0;c1<N;c1++){
         for(int c2=0;c2<N;c2++){
-            ret._internal += localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
+	  ret._internal+=localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
         }}
         return ret;
     }
diff --git a/Grid_simd.h b/Grid_simd.h
index 1229d328..23061632 100644
--- a/Grid_simd.h
+++ b/Grid_simd.h
@@ -26,11 +26,10 @@ namespace Grid {
 
   typedef  float  RealF;
   typedef  double RealD;
-  typedef  RealF  Real;
   
   typedef std::complex<RealF> ComplexF;
   typedef std::complex<RealD> ComplexD;
-  typedef std::complex<Real>  Complex;
+
 
   inline RealF adj(const RealF  & r){ return r; }
   inline RealF conj(const RealF  & r){ return r; }
@@ -54,7 +53,7 @@ namespace Grid {
     inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
     inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
     inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
-    inline Complex  adj(const Complex& r ){ return(conj(r)); }
+    inline ComplexF  adj(const ComplexF& r ){ return(conj(r)); }
     //conj already supported for complex
     
     inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){  *y = (*a) * (*x)+(*y);}
@@ -232,4 +231,52 @@ inline void Gpermute(vsimd &y,const vsimd &b,int perm){
 #include <Grid_vComplexF.h>
 #include <Grid_vComplexD.h>
 
+namespace Grid {
+
+  // NB: Template the following on "type Complex" and then implement *,+,- for 
+  // ComplexF, ComplexD, RealF, RealD above to
+  // get full generality of binops with scalars.
+   inline void mac (vComplexF *__restrict__ y,const ComplexF *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
+   inline void mult(vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
+   inline void sub (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
+   inline void add (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
+   inline void mac (vComplexF *__restrict__ y,const vComplexF *__restrict__ a,const ComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
+   inline void mult(vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
+   inline void sub (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
+   inline void add (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
+
+   inline void mac (vComplexD *__restrict__ y,const ComplexD *__restrict__ a,const vComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
+   inline void mult(vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) * (*r); }
+   inline void sub (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) - (*r); }
+   inline void add (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) + (*r); }
+   inline void mac (vComplexD *__restrict__ y,const vComplexD *__restrict__ a,const ComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
+   inline void mult(vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r); }
+   inline void sub (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r); }
+   inline void add (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r); }
+
+   inline void mac (vRealF *__restrict__ y,const RealF *__restrict__ a,const vRealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
+   inline void mult(vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) * (*r); }
+   inline void sub (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) - (*r); }
+   inline void add (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) + (*r); }
+   inline void mac (vRealF *__restrict__ y,const vRealF *__restrict__ a,const RealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
+   inline void mult(vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
+   inline void sub (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
+   inline void add (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
+
+   inline void mac (vRealD *__restrict__ y,const RealD *__restrict__ a,const vRealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
+   inline void mult(vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) * (*r); }
+   inline void sub (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) - (*r); }
+   inline void add (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) + (*r); }
+   inline void mac (vRealD *__restrict__ y,const vRealD *__restrict__ a,const RealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
+   inline void mult(vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r); }
+   inline void sub (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r); }
+   inline void add (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r); }
+
+  // Default precision
+  typedef RealD  Real;
+  typedef std::complex<Real>  Complex;
+
+  typedef vRealD vReal;
+  typedef vComplexD vComplex;
+}
 #endif
diff --git a/Grid_stencil.h b/Grid_stencil.h
index e10cabaa..d8debac0 100644
--- a/Grid_stencil.h
+++ b/Grid_stencil.h
@@ -1,11 +1,15 @@
 #ifndef GRID_STENCIL_H
 #define GRID_STENCIL_H
+
 //////////////////////////////////////////////////////////////////////////////////////////
 // Must not lose sight that goal is to be able to construct really efficient
-// gather to a point stencil code. CSHIFT is not the best way, so probably need
+// gather to a point stencil code. CSHIFT is not the best way, so need
 // additional stencil support.
 //
-// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
+// Stencil based code will pre-exchange haloes and use a table lookup for neighbours.
+// This will be done with generality to allow easier efficient implementations.
+// Overlap of comms and compute could be semi-automated by tabulating off-node connected,
+// and 
 //
 // Lattice <foo> could also allocate haloes which get used for stencil code.
 //
@@ -35,329 +39,313 @@
 
 namespace Grid {
 
-    class Stencil {
+  struct CommsRequest { 
+    int words;
+    int unified_buffer_offset;
+    int tag;
+    int to_rank;
+    int from_rank;
+  } ;
+
+  class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
   public:
 
-      Stencil(GridBase *grid,
-	      int npoints,
-	      int checkerboard,
-	      std::vector<int> directions,
-	      std::vector<int> distances);
-
-      void Stencil_local     (int dimension,int shift,int cbmask);
-      void Stencil_comms     (int dimension,int shift,int cbmask);
-      void Stencil_comms_simd(int dimension,int shift,int cbmask);
-
-      // Will need to implement actions for
-      Copy_plane;
-      Copy_plane_permute;
-      Gather_plane;
-
-      // The offsets to all neibours in stencil in each direction
       int                               _checkerboard;
       int                               _npoints; // Move to template param?
       GridBase *                        _grid;
-
-      // Store these as SIMD Integer needed
-      //
-      // std::vector< iVector<Integer, Npoint> > _offsets;
-      // std::vector< iVector<Integer, Npoint> > _local;
-      // std::vector< iVector<Integer, Npoint> > _comm_buf_size;
-      // std::vector< iVector<Integer, Npoint> > _permute;
-
-      std::vector<std::vector<int>    > _offsets;
-      std::vector<std::vector<int>    > _local;
+      
+      // npoints of these
+      std::vector<int>                  _directions;
+      std::vector<int>                  _distances;
       std::vector<int>                  _comm_buf_size;
-      std::vector<int>                  _permute;
+      std::vector<int>                  _permute_type;
 
-    };
+      // npoints x Osites() of these
+      std::vector<std::vector<int>    > _offsets;
+      std::vector<std::vector<int>    > _is_local;
+      std::vector<std::vector<int> >    _permute;
 
-    Stencil::Stencil(GridBase *grid,
-		     int npoints,
-		     int checkerboard,
-		     std::vector<int> directions,
-		     std::vector<int> distances){
-      
-      _npoints = npoints;
-      _grid    = grid;
-      
-      for(int i=0;i<npoints;i++){
+      int _unified_buffer_size;
+      int _request_count;
 
-	int dimension    = directions[i];
-	int displacement = distances[i];
+      std::vector<CommsRequest>  CommsRequests;
 
-	int fd = _grid->_fdimensions[dimension];
-	int rd = _grid->_rdimensions[dimension];
+      CartesianStencil(GridBase *grid,
+		       int npoints,
+		       int checkerboard,
+		       const std::vector<int> &directions,
+		       const std::vector<int> &distances);
 
-	_checkerboard = checkerboard;
 
-	// the permute type
-	int simd_layout     = _grid->_simd_layout[dimension];
-	int comm_dim        = _grid->_processors[dimension] >1 ;
-	int splice_dim      = _grid->_simd_layout[dimension]>1 && (comm_dim);
+      // Add to tables for various cases;  is this mistaken. only local if 1 proc in dim
+      // Can this be avoided with simpler coding of comms?
+      void Local     (int point, int dimension,int shift,int cbmask);
+      void Comms     (int point, int dimension,int shift,int cbmask);
+      void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute);
+      void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset);
 
-	int sshift[2];
+      // Could allow a functional munging of the halo to another type during the comms.
+      // this could implement the 16bit/32bit/64bit compression.
+      template<class vobj> void HaloExchange(Lattice<vobj> &source,
+					     std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf)
+      {
+	// conformable(source._grid,_grid);
+	assert(source._grid==_grid);
+	if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
+	int u_comm_offset=0;
 
-	if ( !comm_dim ) {
-	  sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
-	  sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
+	// Gather all comms buffers
+	typedef typename vobj::vector_type vector_type;
+	typedef typename vobj::scalar_type scalar_type;
 
-	  if ( sshift[0] == sshift[1] ) {
-	    Stencil_local(dimension,shift,0x3);
-	  } else {
-	    Stencil_local(dimension,shift,0x1);// if checkerboard is unfavourable take two passes
-	    Stencil_local(dimension,shift,0x2);// both with block stride loop iteration
-	  }
-	} else if ( splice_dim ) {
-	  sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
-	  sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
+	for(int point = 0 ; point < _npoints; point++) {
+
+	  printf("Point %d \n",point);fflush(stdout);
+	  int dimension    = _directions[point];
+	  int displacement = _distances[point];
 	  
-	  if ( sshift[0] == sshift[1] ) {
-	    Stencil_comms_simd(dimension,shift,0x3);
-	  } else {
-	    Stencil_comms_simd(dimension,shift,0x1);// if checkerboard is unfavourable take two passes
-	    Stencil_comms_simd(dimension,shift,0x2);// both with block stride loop iteration
-	  }
-	} else {
-	  //	  Cshift_comms(ret,rhs,dimension,shift);
-	  sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
-	  sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
-	  if ( sshift[0] == sshift[1] ) {
-	    Stencil_comms(dimension,shift,0x3);
-	  } else {
-	    Stencil_comms(dimension,shift,0x1);// if checkerboard is unfavourable take two passes
-	    Stencil_comms(dimension,shift,0x2);// both with block stride loop iteration
+	  int fd = _grid->_fdimensions[dimension];
+	  int rd = _grid->_rdimensions[dimension];
+
+
+	  // Map to always positive shift modulo global full dimension.
+	  int shift = (displacement+fd)%fd;
+	  
+     	  int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift);
+	  assert (checkerboard== _checkerboard);
+
+	  // the permute type
+	  int simd_layout     = _grid->_simd_layout[dimension];
+	  int comm_dim        = _grid->_processors[dimension] >1 ;
+	  int splice_dim      = _grid->_simd_layout[dimension]>1 && (comm_dim);
+
+	  // Gather phase
+	  int sshift [2];
+	  if ( comm_dim ) {
+	    sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
+	    sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
+	    if ( sshift[0] == sshift[1] ) {
+	      if (splice_dim) {
+		printf("splice 0x3 \n");fflush(stdout);
+		GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset);
+	      } else { 
+		printf("NO splice 0x3 \n");fflush(stdout);
+		GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset);
+	      }
+	    } else {
+	      if(splice_dim){
+		printf("splice 0x1,2 \n");fflush(stdout);
+		GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);// if checkerboard is unfavourable take two passes
+		GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);// both with block stride loop iteration
+	      } else {
+		printf("NO splice 0x1,2 \n");fflush(stdout);
+		GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);
+		GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);
+	      }
+	    }
 	  }
 	}
       }
-    }
 
-
-      void Stencil::Stencil_local     (int dimension,int shift,int cbmask)
-      {
-	int fd = _grid->_fdimensions[dimension];
-	int rd = _grid->_rdimensions[dimension];
-	int ld = _grid->_ldimensions[dimension];
-	int gd = _grid->_gdimensions[dimension];
-
-	// Map to always positive shift modulo global full dimension.
-	shift = (shift+fd)%fd;
-	
-	// the permute type
-	int permute_dim =_grid->PermuteDim(dimension);
-	int permute_type=_grid->PermuteType(dimension);
-	
-	for(int x=0;x<rd;x++){       
+      template<class vobj> void GatherStartComms(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
+						 std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf,
+						 int &u_comm_offset)
+	{
+	  typedef typename vobj::vector_type vector_type;
+	  typedef typename vobj::scalar_type scalar_type;
 	  
-	  int o   = 0;
-	  int bo  = x * _grid->_ostride[dimension];
+	  GridBase *grid=_grid;
+	  assert(rhs._grid==_grid);
+	  //	  conformable(_grid,rhs._grid);
+
+	  int fd              = _grid->_fdimensions[dimension];
+	  int rd              = _grid->_rdimensions[dimension];
+	  int simd_layout     = _grid->_simd_layout[dimension];
+	  int comm_dim        = _grid->_processors[dimension] >1 ;
+	  assert(simd_layout==1);
+	  assert(comm_dim==1);
+	  assert(shift>=0);
+	  assert(shift<fd);
+	  
+	  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
+	  
+	  std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size); // hmm...
+	  std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
 	  
 	  int cb= (cbmask==0x2)? 1 : 0;
+	  int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
 	  
-	  int sshift = _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
-	  int sx     = (x+sshift)%rd;
-	  
-	  int permute_slice=0;
-	  if(permute_dim){
-	    int wrap = sshift/rd;
-	    int  num = sshift%rd;
-	    if ( x< rd-num ) permute_slice=wrap;
-	    else permute_slice = 1-wrap;
-	  }
-
-	  if ( permute_slice ) Copy_plane_permute(dimension,x,sx,cbmask,permute_type);
-	  else                 Copy_plane        (dimension,x,sx,cbmask); 
-  
-	}
-      }
-
-      void Stencil::Stencil_comms     (int dimension,int shift,int cbmask)
-      {
-	typedef typename vobj::vector_type vector_type;
-	typedef typename vobj::scalar_type scalar_type;
-	
-	GridBase *grid=_grid;
-
-	int fd              = _grid->_fdimensions[dimension];
-	int rd              = _grid->_rdimensions[dimension];
-	int simd_layout     = _grid->_simd_layout[dimension];
-	int comm_dim        = _grid->_processors[dimension] >1 ;
-
-	assert(simd_layout==1);
-	assert(comm_dim==1);
-	assert(shift>=0);
-	assert(shift<fd);
-	
-	int buffer_size = _grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
-	// FIXME: Do something with buffer_size??
-
-	int cb= (cbmask==0x2)? 1 : 0;
-	int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
-	
-	for(int x=0;x<rd;x++){       
-	  
-	  int offnode = ( x+sshift >= rd );
-	  int sx        = (x+sshift)%rd;
-	  int comm_proc = (x+sshift)/rd;
-	  
-	  if (!offnode) {
+	  for(int x=0;x<rd;x++){       
 	    
-	    Copy_plane(dimension,x,sx,cbmask); 
+	    printf("GatherStartComms x %d/%d\n",x,rd);fflush(stdout);
+	    int offnode = ( x+sshift >= rd );
+	    int sx        = (x+sshift)%rd;
+	    int comm_proc = (x+sshift)/rd;
 	    
-	  } else {
-	    
-	    int words = send_buf.size();
-	    if (cbmask != 0x3) words=words>>1;
-	    
-	    int bytes = words * sizeof(vobj);
-	    
-	    Gather_plane_simple (dimension,sx,cbmask);
-	    
-	    int rank           = grid->_processor;
-	    int recv_from_rank;
-	    int xmit_to_rank;
-	    grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
-	    /*	    
-	    grid->SendToRecvFrom((void *)&send_buf[0],
-				 xmit_to_rank,
-				 (void *)&recv_buf[0],
-				 recv_from_rank,
-				 bytes);
-	    */
-	    Scatter_plane_simple (dimension,x,cbmask);
-	  }
-	}
-      }
-
-      void Stencil::Stencil_comms_simd(int dimension,int shift,int cbmask)
-      {
-	GridBase *grid=_grid;
-	const int Nsimd = _grid->Nsimd();
-	typedef typename vobj::vector_type vector_type;
-	typedef typename vobj::scalar_type scalar_type;
-	
-	int fd = _grid->_fdimensions[dimension];
-	int rd = _grid->_rdimensions[dimension];
-	int ld = _grid->_ldimensions[dimension];
-	int simd_layout     = _grid->_simd_layout[dimension];
-	int comm_dim        = _grid->_processors[dimension] >1 ;
-	
-	assert(comm_dim==1);
-	assert(simd_layout==2);
-	assert(shift>=0);
-	assert(shift<fd);
-	
-	int permute_type=_grid->PermuteType(dimension);
-	
-	///////////////////////////////////////////////
-	// Simd direction uses an extract/merge pair
-	///////////////////////////////////////////////
-	int buffer_size = _grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
-	// FIXME do something with buffer size
-	
-	std::vector<scalar_type *> pointers(Nsimd);  // 
-	std::vector<scalar_type *> rpointers(Nsimd); // received pointers
-	
-	///////////////////////////////////////////
-	// Work out what to send where
-	///////////////////////////////////////////
-	
-	int cb    = (cbmask==0x2)? 1 : 0;
-	int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
-	
-	std::vector<int> comm_offnode(simd_layout);
-	std::vector<int> comm_proc   (simd_layout);  //relative processor coord in dim=dimension
-	std::vector<int> icoor(grid->Nd());
-	
-	for(int x=0;x<rd;x++){       
-	  
-	  int comm_any = 0;
-	  for(int s=0;s<simd_layout;s++) {
-	    int shifted_x   = x+s*rd+sshift;
-	    comm_offnode[s] = shifted_x >= ld; 
-	    comm_any        = comm_any | comm_offnode[s];
-	    comm_proc[s]    = shifted_x/ld;     
-	  }
-	  
-	  int o    = 0;
-	  int bo   = x*grid->_ostride[dimension];
-	  int sx   = (x+sshift)%rd;
-	  
-	  if ( comm_any ) {
-	    
-	    for(int i=0;i<Nsimd;i++){
-	      pointers[i] = (scalar_type *)&send_buf_extract[i][0];
-	    }
-	    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
-	    
-	    for(int i=0;i<Nsimd;i++){
+	    if (offnode) {
 	      
-	      int s;
-	      grid->iCoorFromIindex(icoor,i);
-	      s = icoor[dimension];
+	      printf("GatherStartComms offnode x %d\n",x);fflush(stdout);
+	      int words = send_buf.size();
+	      if (cbmask != 0x3) words=words>>1;
+	    
+	      int bytes = words * sizeof(vobj);
+
+	      printf("Gather_plane_simple dimension %d sx %d cbmask %d\n",dimension,sx,cbmask);fflush(stdout);
+	      Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
+
+	      printf("GatherStartComms gathered offnode x %d\n",x);fflush(stdout);
+
+	      int rank           = _grid->_processor;
+	      int recv_from_rank;
+	      int xmit_to_rank;
+	      _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
 	      
-	      if(comm_offnode[s]){
-		
-		int rank           = grid->_processor;
-		int recv_from_rank;
-		int xmit_to_rank;
-		grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
-		
-		/*		
-		grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
-				     xmit_to_rank,
-				     (void *)&recv_buf_extract[i][0],
-				     recv_from_rank,
-				     bytes);
-		*/
-
-		rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
-		
-	      } else { 
-		
-		rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
+	      //      FIXME Implement asynchronous send & also avoid buffer copy
+	      _grid->SendToRecvFrom((void *)&send_buf[0],
+				   xmit_to_rank,
+				   (void *)&recv_buf[0],
+				   recv_from_rank,
+				   bytes);
+	      printf("GatherStartComms communicated offnode x %d\n",x);fflush(stdout);
 
+	      printf("GatherStartComms inserting %d buf size %d\n",u_comm_offset,buffer_size);fflush(stdout);
+	      for(int i=0;i<buffer_size;i++){
+		u_comm_buf[u_comm_offset+i]=recv_buf[i];
 	      }
-	      
+	      u_comm_offset+=buffer_size;
+	      printf("GatherStartComms inserted x %d\n",x);fflush(stdout);
 	    }
-	    
-	    // Permute by swizzling pointers in merge
-	    int permute_slice=0;
-	    int lshift=sshift%ld;
-	    int wrap  =lshift/rd;
-	    int  num  =lshift%rd;
-
-	    if ( x< rd-num ) permute_slice=wrap;
-	    else permute_slice = 1-wrap;
-	    
-	    int toggle_bit = (Nsimd>>(permute_type+1));
-	    int PermuteMap;
-	    for(int i=0;i<Nsimd;i++){
-	      if ( permute_slice ) {
-		PermuteMap=i^toggle_bit;
-		pointers[i] = rpointers[PermuteMap];
-	      } else {
-		pointers[i] = rpointers[i];
-	      }
-	    }
-
-	    Scatter_plane_merge(pointers,dimension,x,cbmask);
-	    
-	  } else { 
-
-	    int permute_slice=0;
-	    int wrap = sshift/rd;
-	    int  num = sshift%rd;
-	    if ( x< rd-num ) permute_slice=wrap;
-	    else permute_slice = 1-wrap;
-
-	    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
-	    else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
-
 	  }
 	}
-      }
 
 
-};
+      template<class vobj>
+	void  GatherStartCommsSimd(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
+				   std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf,
+				   int &u_comm_offset)
+	{
+	  const int Nsimd = _grid->Nsimd();
+	  typedef typename vobj::vector_type vector_type;
+	  typedef typename vobj::scalar_type scalar_type;
+	  
+	  int fd = _grid->_fdimensions[dimension];
+	  int rd = _grid->_rdimensions[dimension];
+	  int ld = _grid->_ldimensions[dimension];
+	  int simd_layout     = _grid->_simd_layout[dimension];
+	  int comm_dim        = _grid->_processors[dimension] >1 ;
+
+	  assert(comm_dim==1);
+	  assert(simd_layout==2);
+	  assert(shift>=0);
+	  assert(shift<fd);
+
+	  int permute_type=_grid->PermuteType(dimension);
+
+	  ///////////////////////////////////////////////
+	  // Simd direction uses an extract/merge pair
+	  ///////////////////////////////////////////////
+	  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
+	  int words = sizeof(vobj)/sizeof(vector_type);
+
+	  /*   FIXME ALTERNATE BUFFER DETERMINATION */
+	  std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) ); 
+	  std::vector<std::vector<scalar_type> > recv_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
+	  int bytes = buffer_size*words*sizeof(scalar_type);
+
+	  std::vector<scalar_type *> pointers(Nsimd);  //
+	  std::vector<scalar_type *> rpointers(Nsimd); // received pointers
+	  
+	  ///////////////////////////////////////////
+	  // Work out what to send where
+	  ///////////////////////////////////////////
+
+	  int cb    = (cbmask==0x2)? 1 : 0;
+	  int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
+	  
+	  std::vector<int> comm_offnode(simd_layout);
+	  std::vector<int> comm_proc   (simd_layout);  //relative processor coord in dim=dimension
+	  std::vector<int> icoor(_grid->Nd());
+	  
+	  for(int x=0;x<rd;x++){
+	    
+	    int comm_any = 0;
+	    for(int s=0;s<simd_layout;s++) {
+	      int shifted_x   = x+s*rd+sshift;
+	      comm_offnode[s] = shifted_x >= ld;
+	      comm_any        = comm_any | comm_offnode[s];
+	      comm_proc[s]    = shifted_x/ld;
+	    }
+    
+	    int o    = 0;
+	    int bo   = x*_grid->_ostride[dimension];
+	    int sx   = (x+sshift)%rd;
+	    
+	    if ( comm_any ) {
+
+	      for(int i=0;i<Nsimd;i++){
+		pointers[i] = (scalar_type *)&send_buf_extract[i][0];
+	      }
+	      Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
+	      
+	      for(int i=0;i<Nsimd;i++){
+		
+		int s;
+		_grid->iCoorFromIindex(icoor,i);
+		s = icoor[dimension];
+		
+		if(comm_offnode[s]){
+		  
+		  int rank           = _grid->_processor;
+		  int recv_from_rank;
+		  int xmit_to_rank;
+
+		  _grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
+	  
+
+		  _grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
+					xmit_to_rank,
+					(void *)&recv_buf_extract[i][0],
+					recv_from_rank,
+					bytes);
+
+		  rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
+		  
+		} else { 
+		  
+		  rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
+
+		}
+		
+	      }
+
+	      // Permute by swizzling pointers in merge
+	      int permute_slice=0;
+	      int lshift=sshift%ld;
+	      int wrap  =lshift/rd;
+	      int  num  =lshift%rd;
+
+	      if ( x< rd-num ) permute_slice=wrap;
+	      else permute_slice = 1-wrap;
+
+	      int toggle_bit = (Nsimd>>(permute_type+1));
+	      int PermuteMap;
+	      for(int i=0;i<Nsimd;i++){
+		if ( permute_slice ) {
+		  PermuteMap=i^toggle_bit;
+		  pointers[i] = rpointers[PermuteMap];
+		} else {
+		  pointers[i] = rpointers[i];
+		}
+	      }
+
+	      // Here we don't want to scatter, just place into a buffer.
+	      for(int i=0;i<buffer_size;i++){
+		merge(u_comm_buf[u_comm_offset+i],pointers);
+	      }
+
+	    }
+	  }
+	}
+  };
+}
 #endif
diff --git a/Grid_stencil_common.cc b/Grid_stencil_common.cc
new file mode 100644
index 00000000..01bdce5d
--- /dev/null
+++ b/Grid_stencil_common.cc
@@ -0,0 +1,258 @@
+#include "Grid.h"
+
+namespace Grid {
+
+  CartesianStencil::CartesianStencil(GridBase *grid,
+				     int npoints,
+				     int checkerboard,
+				     const std::vector<int> &directions,
+				     const std::vector<int> &distances) 
+    :   _offsets(npoints), 
+	_is_local(npoints), 
+	_comm_buf_size(npoints), 
+	_permute_type(npoints),
+	_permute(npoints)
+    {
+      _npoints = npoints;
+      _grid    = grid;
+      _directions = directions;
+      _distances  = distances;
+      _unified_buffer_size=0;
+      _request_count =0;
+      CommsRequests.resize(0);
+
+      int osites  = _grid->oSites();
+
+      for(int i=0;i<npoints;i++){
+
+	int point = i;
+
+	_offsets[i].resize( osites);
+	_is_local[i].resize(osites);
+	_permute[i].resize( osites);
+
+	int dimension    = directions[i];
+	int displacement = distances[i];
+	int shift = displacement;
+	
+	int fd = _grid->_fdimensions[dimension];
+	int rd = _grid->_rdimensions[dimension];
+	_permute_type[point]=_grid->PermuteType(dimension);
+
+	_checkerboard = checkerboard;
+
+	// the permute type
+	int simd_layout     = _grid->_simd_layout[dimension];
+	int comm_dim        = _grid->_processors[dimension] >1 ;
+	int splice_dim      = _grid->_simd_layout[dimension]>1 && (comm_dim);
+
+	int sshift[2];
+	
+	// Underlying approach. For each local site build
+	// up a table containing the npoint "neighbours" and whether they 
+	// live in lattice or a comms buffer.
+	if ( !comm_dim ) {
+	  sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
+	  sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
+
+	  if ( sshift[0] == sshift[1] ) {
+	    Local(point,dimension,shift,0x3);
+	  } else {
+	    Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
+	    Local(point,dimension,shift,0x2);// both with block stride loop iteration
+	  }
+	} else { // All permute extract done in comms phase prior to Stencil application
+	  //        So tables are the same whether comm_dim or splice_dim
+	  sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
+	  sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
+	  if ( sshift[0] == sshift[1] ) {
+	    Comms(point,dimension,shift,0x3);
+	  } else {
+	    Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
+	    Comms(point,dimension,shift,0x2);// both with block stride loop iteration
+	  }
+	}
+      }
+    }
+
+
+    void CartesianStencil::Local     (int point, int dimension,int shift,int cbmask)
+    {
+      int fd = _grid->_fdimensions[dimension];
+      int rd = _grid->_rdimensions[dimension];
+      int ld = _grid->_ldimensions[dimension];
+      int gd = _grid->_gdimensions[dimension];
+      
+      // Map to always positive shift modulo global full dimension.
+      shift = (shift+fd)%fd;
+      
+      // the permute type
+      int permute_dim =_grid->PermuteDim(dimension);
+      
+      for(int x=0;x<rd;x++){       
+	
+	int o   = 0;
+	int bo  = x * _grid->_ostride[dimension];
+	
+	int cb= (cbmask==0x2)? 1 : 0;
+	  
+	int sshift = _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
+	int sx     = (x+sshift)%rd;
+	  
+	int permute_slice=0;
+	if(permute_dim){
+	  int wrap = sshift/rd;
+	  int  num = sshift%rd;
+	  if ( x< rd-num ) permute_slice=wrap;
+	  else permute_slice = 1-wrap;
+	}
+
+  	CopyPlane(point,dimension,x,sx,cbmask,permute_slice);
+  
+      }
+    }
+
+    void CartesianStencil::Comms     (int point,int dimension,int shift,int cbmask)
+    {
+      GridBase *grid=_grid;
+      
+      int fd              = _grid->_fdimensions[dimension];
+      int rd              = _grid->_rdimensions[dimension];
+      int simd_layout     = _grid->_simd_layout[dimension];
+      int comm_dim        = _grid->_processors[dimension] >1 ;
+      
+      assert(simd_layout==1);
+      assert(comm_dim==1);
+      assert(shift>=0);
+      assert(shift<fd);
+      
+      int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
+      _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
+                                           // send to one or more remote nodes.
+
+      int cb= (cbmask==0x2)? 1 : 0;
+      int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
+      
+      for(int x=0;x<rd;x++){       
+	
+	int offnode = ( x+sshift >= rd );
+	int sx        = (x+sshift)%rd;
+	int comm_proc = (x+sshift)/rd;
+	  
+	if (!offnode) {
+	  
+	  int permute_slice=0;
+	  CopyPlane(point,dimension,x,sx,cbmask,permute_slice); 
+	  
+	} else {
+	  
+	  int words = buffer_size;
+	  if (cbmask != 0x3) words=words>>1;
+	  
+	  //	  GatherPlaneSimple (point,dimension,sx,cbmask);
+	  
+	  int rank           = grid->_processor;
+	  int recv_from_rank;
+	  int xmit_to_rank;
+
+	  CommsRequest cr;
+
+	  cr.tag                   = _request_count++;
+	  cr.words                 = words;
+	  cr.unified_buffer_offset = _unified_buffer_size;
+	  _unified_buffer_size    += words;
+	  grid->ShiftedRanks(dimension,comm_proc,cr.to_rank,cr.from_rank);
+
+	  CommsRequests.push_back(cr);
+
+	  ScatterPlane(point,dimension,x,cbmask,cr.unified_buffer_offset); // permute/extract/merge is done in comms phase
+	  
+	}
+      }
+    }
+  // Routine builds up integer table for each site in _offsets, _is_local, _permute
+  void CartesianStencil::CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute)
+    {
+      int rd = _grid->_rdimensions[dimension];
+      
+      if ( !_grid->CheckerBoarded(dimension) ) {
+	
+	int o   = 0;                                     // relative offset to base within plane
+	int ro  = rplane*_grid->_ostride[dimension]; // base offset for start of plane 
+	int lo  = lplane*_grid->_ostride[dimension]; // offset in buffer
+	
+	// Simple block stride gather of SIMD objects
+	for(int n=0;n<_grid->_slice_nblock[dimension];n++){
+	  for(int b=0;b<_grid->_slice_block[dimension];b++){
+	    _offsets [point][lo+o+b]=ro+o+b;
+	    _is_local[point][lo+o+b]=1;
+	    _permute [point][lo+o+b]=permute;
+	  }
+	  o +=_grid->_slice_stride[dimension];
+	}
+	
+      } else {
+	
+	int ro  = rplane*_grid->_ostride[dimension]; // base offset for start of plane 
+	int lo  = lplane*_grid->_ostride[dimension]; // base offset for start of plane 
+	int o   = 0;                                     // relative offset to base within plane
+	
+	for(int n=0;n<_grid->_slice_nblock[dimension];n++){
+	  for(int b=0;b<_grid->_slice_block[dimension];b++){
+	    
+	    int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
+	    
+	    if ( ocb&cbmask ) {
+	      _offsets [point][lo+o+b]=ro+o+b;
+	      _is_local[point][lo+o+b]=1;
+	      _permute [point][lo+o+b]=permute;
+	    }
+	    
+	    }
+	  o +=_grid->_slice_stride[dimension];
+	}
+	
+      }
+    }
+  // Routine builds up integer table for each site in _offsets, _is_local, _permute
+    void CartesianStencil::ScatterPlane (int point,int dimension,int plane,int cbmask,int offset)
+    {
+      int rd = _grid->_rdimensions[dimension];
+      
+      if ( !_grid->CheckerBoarded(dimension) ) {
+	
+	int so  = plane*_grid->_ostride[dimension]; // base offset for start of plane 
+	int o   = 0;                                    // relative offset to base within plane
+	int bo  = 0;                                    // offset in buffer
+
+	// Simple block stride gather of SIMD objects
+	for(int n=0;n<_grid->_slice_nblock[dimension];n++){
+	  for(int b=0;b<_grid->_slice_block[dimension];b++){
+	    _offsets [point][so+o+b]=offset+(bo++);
+	    _is_local[point][so+o+b]=0;
+	    _permute [point][so+o+b]=0;
+	  }
+	  o +=_grid->_slice_stride[dimension];
+	}
+
+      } else { 
+	
+	int so  = plane*_grid->_ostride[dimension]; // base offset for start of plane 
+	int o   = 0;                                      // relative offset to base within plane
+	int bo  = 0;                                      // offset in buffer
+    
+	for(int n=0;n<_grid->_slice_nblock[dimension];n++){
+	  for(int b=0;b<_grid->_slice_block[dimension];b++){
+
+	    int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
+	    if ( ocb & cbmask ) {
+	      _offsets [point][so+o+b]=offset+(bo++);
+	      _is_local[point][so+o+b]=0;
+	      _permute [point][so+o+b]=0;
+	    }
+	  }
+	  o +=_grid->_slice_stride[dimension];
+	}
+      }
+    }
+}
diff --git a/Grid_vComplexD.h b/Grid_vComplexD.h
index 3278b15e..f1c335e8 100644
--- a/Grid_vComplexD.h
+++ b/Grid_vComplexD.h
@@ -194,6 +194,8 @@ namespace Grid {
             float b= imag(c);
             vsplat(ret,a,b);
         }
+
+
         friend inline void vsplat(vComplexD &ret,double rl,double ig){
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_set_pd(ig,rl,ig,rl);
@@ -321,13 +323,14 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
     {
         return l*r;
     }
-    inline vComplex trace(const vComplex &arg){
+    inline vComplexD trace(const vComplexD &arg){
         return arg;
     }
 /////////////////////////////////////////////////////////////////////////
 //// Generic routine to promote object<complex> -> object<vcomplex>
 //// Supports the array reordering transformation that gives me SIMD utilisation
 ///////////////////////////////////////////////////////////////////////////
+/*
 template<template<class> class object>
 inline object<vComplex> splat(object<Complex >s){
       object<vComplex> ret;
@@ -338,6 +341,6 @@ inline object<vComplex> splat(object<Complex >s){
       }
       return ret;
     }
-
+*/
 }
 #endif
diff --git a/Grid_vComplexF.h b/Grid_vComplexF.h
index b6b7ebe9..18cf1749 100644
--- a/Grid_vComplexF.h
+++ b/Grid_vComplexF.h
@@ -3,6 +3,16 @@
 #include "Grid.h"
 
 namespace Grid {
+
+  /*
+  inline void Print(const char *A,cvec c) { 
+	  float *fp=(float *)&c; 
+	  printf(A); 
+	  printf(" %le %le %le %le %le %le %le %le\n",
+		 fp[0],fp[1],fp[2],fp[3],fp[4],fp[5],fp[6],fp[7]);
+	}
+  */
+
     class vComplexF {
       //    protected:
 
@@ -108,7 +118,7 @@ namespace Grid {
             ymm1 = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
             ymm2 = _mm256_shuffle_ps(a.v,a.v,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
             ymm1 = _mm256_mul_ps(ymm1,ymm2);       // ymm1 <- br ai, ai bi
-            ret.v= _mm256_addsub_ps(ymm0,ymm1);    // FIXME -- AVX2 could MAC
+            ret.v= _mm256_addsub_ps(ymm0,ymm1);    
 #endif
 #ifdef SSE2
             cvec ymm0,ymm1,ymm2;
@@ -142,7 +152,7 @@ namespace Grid {
 	////////////////////////////////////////////////////////////////////////
 	// FIXME:  gonna remove these load/store, get, set, prefetch
 	////////////////////////////////////////////////////////////////////////
-        friend inline void vset(vComplexF &ret, Complex *a){
+        friend inline void vset(vComplexF &ret, ComplexF *a){
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
 #endif
@@ -216,12 +226,12 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 #endif
         }
 
-        friend inline vComplexF operator * (const Complex &a, vComplexF b){
+        friend inline vComplexF operator * (const ComplexF &a, vComplexF b){
             vComplexF va;
             vsplat(va,a);
             return va*b;
         }
-        friend inline vComplexF operator * (vComplexF b,const Complex &a){
+        friend inline vComplexF operator * (vComplexF b,const ComplexF &a){
 	  return a*b;
         }
 
@@ -283,22 +293,11 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 	}
        */
        
-        // NB: Template the following on "type Complex" and then implement *,+,- for 
-	// ComplexF, ComplexD, RealF, RealD above to
-        // get full generality of binops with scalars.
-        friend inline void mac (vComplexF *__restrict__ y,const Complex *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
-        friend inline void mult(vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
-        friend inline void sub (vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
-        friend inline void add (vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
-        friend inline void mac (vComplexF *__restrict__ y,const vComplexF *__restrict__ a,const Complex *__restrict__ x){ *y = (*a)*(*x)+(*y); };
-        friend inline void mult(vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const Complex *__restrict__ r){ *y = (*l) * (*r); }
-        friend inline void sub (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const Complex *__restrict__ r){ *y = (*l) - (*r); }
-        friend inline void add (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const Complex *__restrict__ r){ *y = (*l) + (*r); }
-
 
         ///////////////////////
         // Conjugate
         ///////////////////////
+								     
         friend inline vComplexF conj(const vComplexF &in){
             vComplexF ret ; vzero(ret);
 #if defined (AVX1)|| defined (AVX2)
@@ -363,10 +362,11 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 
     };
 
-    inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; }
+    inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) 
+    {
+      return conj(l)*r; 
+    }
 
-    typedef  vComplexF vFComplex;
-    typedef  vComplexF vComplex;
     inline void zeroit(vComplexF &z){ vzero(z);}
 
     inline vComplexF outerProduct(const vComplexF &l, const vComplexF& r)
diff --git a/Makefile.am b/Makefile.am
index d11f28d5..e306eb56 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,11 +1,22 @@
 # additional include paths necessary to compile the C++ library
 AM_CXXFLAGS = -I$(top_srcdir)/
 
+
+extra_sources=
+if BUILD_COMMS_MPI
+  extra_sources+=Grid_communicator_mpi.cc
+  extra_sources+=Grid_stencil_common.cc
+endif
+if BUILD_COMMS_NONE
+  extra_sources+=Grid_communicator_fake.cc
+  extra_sources+=Grid_stencil_common.cc
+endif
+
 #
 # Libraries
 #
 lib_LIBRARIES = libGrid.a
-libGrid_a_SOURCES = Grid_init.cc
+libGrid_a_SOURCES = Grid_init.cc $(extra_sources)
 
 #
 # Include files
@@ -26,24 +37,18 @@ include_HEADERS = Grid_config.h\
 	Grid_cshift_common.h\
 	Grid_cshift_mpi.h\
 	Grid_cshift_none.h\
+	Grid_stencil.h\
 	Grid_math_types.h
 
 
 #
 # Test code
 #
-bin_PROGRAMS = Grid_main
-
-extra_sources=
-if BUILD_COMMS_MPI
-  extra_sources+=Grid_communicator_mpi.cc
-endif
-if BUILD_COMMS_NONE
-  extra_sources+=Grid_communicator_fake.cc
-endif
-
-Grid_main_SOURCES = \
-	Grid_main.cc\
-	$(extra_sources)
+bin_PROGRAMS = Grid_main test_Grid_stencil
 
+Grid_main_SOURCES = Grid_main.cc
 Grid_main_LDADD = libGrid.a
+
+test_Grid_stencil_SOURCES = test_Grid_Stencil.cc
+test_Grid_stencil_LDADD = libGrid.a
+
diff --git a/Makefile.in b/Makefile.in
index 16981e52..11b86688 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -88,9 +88,11 @@ POST_INSTALL = :
 NORMAL_UNINSTALL = :
 PRE_UNINSTALL = :
 POST_UNINSTALL = :
-bin_PROGRAMS = Grid_main$(EXEEXT)
-@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_communicator_mpi.cc
-@BUILD_COMMS_NONE_TRUE@am__append_2 = Grid_communicator_fake.cc
+@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_communicator_mpi.cc \
+@BUILD_COMMS_MPI_TRUE@	Grid_stencil_common.cc
+@BUILD_COMMS_NONE_TRUE@am__append_2 = Grid_communicator_fake.cc \
+@BUILD_COMMS_NONE_TRUE@	Grid_stencil_common.cc
+bin_PROGRAMS = Grid_main$(EXEEXT) test_Grid_stencil$(EXEEXT)
 subdir = .
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/configure.ac
@@ -142,18 +144,23 @@ am__v_AR_0 = @echo "  AR      " $@;
 am__v_AR_1 = 
 libGrid_a_AR = $(AR) $(ARFLAGS)
 libGrid_a_LIBADD =
-am_libGrid_a_OBJECTS = Grid_init.$(OBJEXT)
+am__libGrid_a_SOURCES_DIST = Grid_init.cc Grid_communicator_mpi.cc \
+	Grid_stencil_common.cc Grid_communicator_fake.cc
+@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_communicator_mpi.$(OBJEXT) \
+@BUILD_COMMS_MPI_TRUE@	Grid_stencil_common.$(OBJEXT)
+@BUILD_COMMS_NONE_TRUE@am__objects_2 =  \
+@BUILD_COMMS_NONE_TRUE@	Grid_communicator_fake.$(OBJEXT) \
+@BUILD_COMMS_NONE_TRUE@	Grid_stencil_common.$(OBJEXT)
+am__objects_3 = $(am__objects_1) $(am__objects_2)
+am_libGrid_a_OBJECTS = Grid_init.$(OBJEXT) $(am__objects_3)
 libGrid_a_OBJECTS = $(am_libGrid_a_OBJECTS)
 PROGRAMS = $(bin_PROGRAMS)
-am__Grid_main_SOURCES_DIST = Grid_main.cc Grid_communicator_mpi.cc \
-	Grid_communicator_fake.cc
-@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_communicator_mpi.$(OBJEXT)
-@BUILD_COMMS_NONE_TRUE@am__objects_2 =  \
-@BUILD_COMMS_NONE_TRUE@	Grid_communicator_fake.$(OBJEXT)
-am__objects_3 = $(am__objects_1) $(am__objects_2)
-am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) $(am__objects_3)
+am_Grid_main_OBJECTS = Grid_main.$(OBJEXT)
 Grid_main_OBJECTS = $(am_Grid_main_OBJECTS)
 Grid_main_DEPENDENCIES = libGrid.a
+am_test_Grid_stencil_OBJECTS = test_Grid_Stencil.$(OBJEXT)
+test_Grid_stencil_OBJECTS = $(am_test_Grid_stencil_OBJECTS)
+test_Grid_stencil_DEPENDENCIES = libGrid.a
 AM_V_P = $(am__v_P_@AM_V@)
 am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
 am__v_P_0 = false
@@ -183,8 +190,10 @@ AM_V_CXXLD = $(am__v_CXXLD_@AM_V@)
 am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@)
 am__v_CXXLD_0 = @echo "  CXXLD   " $@;
 am__v_CXXLD_1 = 
-SOURCES = $(libGrid_a_SOURCES) $(Grid_main_SOURCES)
-DIST_SOURCES = $(libGrid_a_SOURCES) $(am__Grid_main_SOURCES_DIST)
+SOURCES = $(libGrid_a_SOURCES) $(Grid_main_SOURCES) \
+	$(test_Grid_stencil_SOURCES)
+DIST_SOURCES = $(am__libGrid_a_SOURCES_DIST) $(Grid_main_SOURCES) \
+	$(test_Grid_stencil_SOURCES)
 am__can_run_installinfo = \
   case $$AM_UPDATE_INFO_DIR in \
     n|no|NO) false;; \
@@ -329,12 +338,13 @@ top_srcdir = @top_srcdir@
 
 # additional include paths necessary to compile the C++ library
 AM_CXXFLAGS = -I$(top_srcdir)/
+extra_sources = $(am__append_1) $(am__append_2)
 
 #
 # Libraries
 #
 lib_LIBRARIES = libGrid.a
-libGrid_a_SOURCES = Grid_init.cc
+libGrid_a_SOURCES = Grid_init.cc $(extra_sources)
 
 #
 # Include files
@@ -355,14 +365,13 @@ include_HEADERS = Grid_config.h\
 	Grid_cshift_common.h\
 	Grid_cshift_mpi.h\
 	Grid_cshift_none.h\
+	Grid_stencil.h\
 	Grid_math_types.h
 
-extra_sources = $(am__append_1) $(am__append_2)
-Grid_main_SOURCES = \
-	Grid_main.cc\
-	$(extra_sources)
-
+Grid_main_SOURCES = Grid_main.cc
 Grid_main_LDADD = libGrid.a
+test_Grid_stencil_SOURCES = test_Grid_Stencil.cc
+test_Grid_stencil_LDADD = libGrid.a
 all: Grid_config.h
 	$(MAKE) $(AM_MAKEFLAGS) all-am
 
@@ -499,6 +508,10 @@ Grid_main$(EXEEXT): $(Grid_main_OBJECTS) $(Grid_main_DEPENDENCIES) $(EXTRA_Grid_
 	@rm -f Grid_main$(EXEEXT)
 	$(AM_V_CXXLD)$(CXXLINK) $(Grid_main_OBJECTS) $(Grid_main_LDADD) $(LIBS)
 
+test_Grid_stencil$(EXEEXT): $(test_Grid_stencil_OBJECTS) $(test_Grid_stencil_DEPENDENCIES) $(EXTRA_test_Grid_stencil_DEPENDENCIES) 
+	@rm -f test_Grid_stencil$(EXEEXT)
+	$(AM_V_CXXLD)$(CXXLINK) $(test_Grid_stencil_OBJECTS) $(test_Grid_stencil_LDADD) $(LIBS)
+
 mostlyclean-compile:
 	-rm -f *.$(OBJEXT)
 
@@ -509,6 +522,8 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_communicator_mpi.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_init.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_main.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_stencil_common.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_Grid_Stencil.Po@am__quote@
 
 .cc.o:
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
diff --git a/TODO b/TODO
index 5297e213..665e3a55 100644
--- a/TODO
+++ b/TODO
@@ -3,7 +3,7 @@
 * Replace vset with a call to merge.; 
 * care in Gmerge,Gextract over set.
 * Const audit
-* extract / merge extra implementation removal
+* extract / merge extra implementation removal      
 
 
 * Conditional execution, where etc...                -----DONE, simple test
diff --git a/test_Grid_jacobi.cc b/test_Grid_jacobi.cc
new file mode 100644
index 00000000..95e808f1
--- /dev/null
+++ b/test_Grid_jacobi.cc
@@ -0,0 +1,206 @@
+#include "Grid.h"
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class vobj>
+class LinearOperator {
+public:
+  operator () (const Lattice<vobj>&src,Lattice<vobj> &result) {};
+};
+
+template<class vobj>
+class LinearOperatorJacobi : public LinearOperator<vobj>
+{
+  CartesianStencil *Stencil;
+  GridBase *_grid;
+  std::vector<vobj,alignedAllocator<vobj> >  comm_buf;
+  LinearOperatorJacobi(GridCartesian *grid)    
+  {
+    _grid = grid;
+    int npoint=9;
+    std::vector<int> directions(npoint);
+    std::vector<int> displacements(npoint);
+    for(int mu=0;mu<4;mu++){
+      for(int mp=0;mp<2;mp++){
+	int dir = 2*mu+mp;
+	directions[dir]   = mu;
+	displacements[dir]= -1+2*mp;
+      }
+    }
+    directions[8] = 0;
+    displacements[8] = 0;
+
+    Stencil = new CartesianStencil(grid,npoint,0,directions,displacements);  
+
+    comm_buf.resize(Stencil->_unified_buffer_size);
+
+  }
+  operator () (const Lattice<vobj>&src,Lattice<vobj> &result)
+  {
+    const int npoint=9;
+    printf("calling halo exchange\n");fflush(stdout);
+    myStencil.HaloExchange(Foo,comm_buf);
+    vobj tmp;
+    vobj
+    for(int i=0;i<_grid->oSites();i++){
+      for(int p=0;p<npoint;p++){
+
+	int offset = Stencil->_offsets [p][i];
+	int  local = Stencil->_is_local[p][i];
+	int ptype  = Stencil->_permute_type[p];
+	int perm   = Stencil->_permute[0][i];
+	
+	vobj *nbr;
+	if ( local && perm ){
+	  permute(tmp,src._odata[offset],ptype);
+	  nbr = &tmp;
+	} else if (local) {
+	  nbr = &src._odata[offset];
+	} else  {
+	  nbr = &comm_buf[offset];
+	}
+       	result[i] = result[i]+*nbr;
+      }
+    }	
+  }
+  ~LinearOperatorJacobi()
+  {
+    delete Stencil;
+  }
+}
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size  (4);
+  std::vector<int> simd_layout(4);
+  std::vector<int> mpi_layout (4);
+
+  int omp=1;
+  int lat=8;
+
+  mpi_layout[0]=1;
+  mpi_layout[1]=2;
+  mpi_layout[2]=1;
+  mpi_layout[3]=1;
+
+    latt_size[0] = lat;
+    latt_size[1] = lat;
+    latt_size[2] = lat;
+    latt_size[3] = lat;
+    double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    
+#ifdef AVX512
+    simd_layout[0] = 1;
+    simd_layout[1] = 2;
+    simd_layout[2] = 2;
+    simd_layout[3] = 2;
+#endif
+#if defined (AVX1)|| defined (AVX2)
+    simd_layout[0] = 1;
+    simd_layout[1] = 1;
+    simd_layout[2] = 2;
+    simd_layout[3] = 2;
+#endif
+#if defined (SSE2)
+    simd_layout[0] = 1;
+    simd_layout[1] = 1;
+    simd_layout[2] = 1;
+    simd_layout[3] = 2;
+#endif
+    
+    GridCartesian Fine(latt_size,simd_layout,mpi_layout);
+    GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
+    
+    LatticeColourMatrix Foo(&Fine);
+    LatticeColourMatrix Bar(&Fine);
+    LatticeColourMatrix Check(&Fine);
+    LatticeColourMatrix Diff(&Fine);
+    
+    random(Foo);
+    gaussian(Bar);
+
+
+    for(int dir=0;dir<4;dir++){
+      for(int disp=0;disp<Fine._rdimensions[dir];disp++){
+
+	// start to test the Cartesian npoint stencil infrastructure
+	int npoint=;
+	std::vector<int> directions(npoint,dir);
+	std::vector<int> displacements(npoint,disp);
+
+	CartesianStencil myStencil(&Fine,npoint,0,directions,displacements);
+
+	printf("STENCIL: osites %d %d dir %d disp %d\n",Fine.oSites(),(int)myStencil._offsets[0].size(),dir,disp);
+	std::vector<int> ocoor(4);
+	for(int o=0;o<Fine.oSites();o++){
+	  Fine.oCoorFromOindex(ocoor,o);
+	  ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
+	  int nbr = Fine.oIndexReduced(ocoor);
+	  int stcl= myStencil._offsets[0][o];
+	  if(nbr!=stcl){
+	    printf("STENCIL: nbr %d stencil._offset %d\n",nbr,stcl);
+	  }
+	}
+	
+	printf("allocating %d buffers\n",myStencil._unified_buffer_size);
+	fflush(stdout);
+	std::vector<vColourMatrix,alignedAllocator<vColourMatrix> >  comm_buf(myStencil._unified_buffer_size);
+	printf("calling halo exchange\n");fflush(stdout);
+	myStencil.HaloExchange(Foo,comm_buf);
+
+	Bar = Cshift(Foo,dir,disp);
+
+	// Implement a stencil code that should agree with cshift!
+	for(int i=0;i<Check._grid->oSites();i++){
+
+	  int offset = myStencil._offsets [0][i];
+	  int  local = myStencil._is_local[0][i];
+	  int permute_type = myStencil._permute_type[0];
+	  int perm =myStencil._permute[0][i];
+	  if ( local && perm )
+	    permute(Check._odata[i],Foo._odata[offset],permute_type);
+	  else if (local)
+	    Check._odata[i] = Foo._odata[offset];
+	  else 
+	    Check._odata[i] = comm_buf[offset];
+	  
+
+	}
+
+
+	std::vector<int> coor(4);
+	for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
+	for(coor[2]=0;coor[2]<latt_size[2]/mpi_layout[2];coor[2]++){
+	for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
+	for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){
+
+	  Complex diff;
+	  ColourMatrix check,bar;
+	  peekSite(check,Check,coor);
+	  peekSite(bar,Bar,coor);
+
+	  for(int r=0;r<3;r++){
+	  for(int c=0;c<3;c++){
+            diff =check._internal._internal[r][c]-bar._internal._internal[r][c];
+            double nn=real(conj(diff)*diff);
+            if ( nn > 0 ){
+	      printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n",
+		     coor[0],coor[1],coor[2],coor[3],r,c,
+		     nn,
+		     real(check._internal._internal[r][c]),
+		     real(bar._internal._internal[r][c])
+		     );
+	    }
+	  }}
+	 
+	}}}}
+
+      }
+    }
+
+ Grid_finalize();
+}
diff --git a/test_Grid_stencil.cc b/test_Grid_stencil.cc
new file mode 100644
index 00000000..8077562f
--- /dev/null
+++ b/test_Grid_stencil.cc
@@ -0,0 +1,154 @@
+#include "Grid.h"
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> latt_size  (4);
+  std::vector<int> simd_layout(4);
+  std::vector<int> mpi_layout (4);
+
+  int omp=1;
+  int lat=8;
+
+  mpi_layout[0]=1;
+  mpi_layout[1]=1;
+  mpi_layout[2]=1;
+  mpi_layout[3]=1;
+
+    latt_size[0] = lat;
+    latt_size[1] = lat;
+    latt_size[2] = lat;
+    latt_size[3] = lat;
+    double volume = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+    
+#ifdef AVX512
+    simd_layout[0] = 1;
+    simd_layout[1] = 1;
+    simd_layout[2] = 2;
+    simd_layout[3] = 2;
+#endif
+#if defined (AVX1)|| defined (AVX2)
+    simd_layout[0] = 1;
+    simd_layout[1] = 1;
+    simd_layout[2] = 1;
+    simd_layout[3] = 2;
+#endif
+#if defined (SSE2)
+    simd_layout[0] = 1;
+    simd_layout[1] = 1;
+    simd_layout[2] = 1;
+    simd_layout[3] = 2;
+#endif
+    
+    GridCartesian Fine(latt_size,simd_layout,mpi_layout);
+    GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
+    
+    LatticeColourMatrix Foo(&Fine);
+    LatticeColourMatrix Bar(&Fine);
+    LatticeColourMatrix Check(&Fine);
+    LatticeColourMatrix Diff(&Fine);
+    
+    random(Foo);
+    gaussian(Bar);
+
+
+    for(int dir=0;dir<4;dir++){
+      for(int disp=0;disp<Fine._rdimensions[dir];disp++){
+
+	// start to test the Cartesian npoint stencil infrastructure
+	int npoint=1;
+	std::vector<int> directions(npoint,dir);
+	std::vector<int> displacements(npoint,disp);
+
+	CartesianStencil myStencil(&Fine,npoint,0,directions,displacements);
+
+	printf("STENCIL: osites %d %d dir %d disp %d\n",Fine.oSites(),(int)myStencil._offsets[0].size(),dir,disp);
+	std::vector<int> ocoor(4);
+	for(int o=0;o<Fine.oSites();o++){
+	  Fine.oCoorFromOindex(ocoor,o);
+	  ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
+	  int nbr = Fine.oIndexReduced(ocoor);
+	  int stcl= myStencil._offsets[0][o];
+	  if(nbr!=stcl){
+	    printf("STENCIL: nbr %d stencil._offset %d\n",nbr,stcl);
+	  }
+	}
+	
+	printf("allocating %d buffers\n",myStencil._unified_buffer_size);
+	fflush(stdout);
+	std::vector<vColourMatrix,alignedAllocator<vColourMatrix> >  comm_buf(myStencil._unified_buffer_size);
+	printf("calling halo exchange\n");fflush(stdout);
+	myStencil.HaloExchange(Foo,comm_buf);
+
+	Bar = Cshift(Foo,dir,disp);
+
+	// Implement a stencil code that should agree with cshift!
+	for(int i=0;i<Check._grid->oSites();i++){
+
+	  int offset = myStencil._offsets [0][i];
+	  int  local = myStencil._is_local[0][i];
+	  int permute_type = myStencil._permute_type[0];
+	  int perm =myStencil._permute[0][i];
+	  if ( local && perm )
+	    permute(Check._odata[i],Foo._odata[offset],permute_type);
+	  else if (local)
+	    Check._odata[i] = Foo._odata[offset];
+	  else 
+	    Check._odata[i] = comm_buf[offset];
+	  
+
+	}
+
+	Real nrmC = norm2(Check);
+	Real nrmB = norm2(Bar);
+	Real nrm = norm2(Check-Bar);
+	printf("N2diff = %le (%le, %le) \n",nrm,nrmC,nrmB);fflush(stdout);
+
+	Real snrmC =0;
+	Real snrmB =0;
+	Real snrm  =0;
+
+	std::vector<int> coor(4);
+	for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
+	for(coor[2]=0;coor[2]<latt_size[2]/mpi_layout[2];coor[2]++){
+	for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
+	for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){
+
+	  Complex diff;
+	  ColourMatrix check,bar;
+	  peekSite(check,Check,coor);
+	  peekSite(bar,Bar,coor);
+
+	  for(int r=0;r<3;r++){
+	  for(int c=0;c<3;c++){
+            diff =check._internal._internal[r][c]-bar._internal._internal[r][c];
+            double nn=real(conj(diff)*diff);
+            if ( nn > 0){
+	      printf("Coor (%d %d %d %d) \t rc %d%d \t %le %le %le\n",
+		     coor[0],coor[1],coor[2],coor[3],r,c,
+		     nn,
+		     real(check._internal._internal[r][c]),
+		     real(bar._internal._internal[r][c])
+		     );
+	    }
+	    snrmC=snrmC+real(conj(check._internal._internal[r][c])*check._internal._internal[r][c]);
+	    snrmB=snrmB+real(conj(bar._internal._internal[r][c])*bar._internal._internal[r][c]);
+	    snrm=snrm+nn;
+	  }}
+	 
+	}}}}
+
+	printf("scalar N2diff = %le (%le, %le) \n",snrm,snrmC,snrmB);fflush(stdout);
+
+
+      }
+    }
+
+ Grid_finalize();
+}