From 982274e5a0e9f4433dd05be420e31fb98fbc4357 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 6 Apr 2015 11:26:24 +0100
Subject: [PATCH] Major rework of extract/merge/permute processing debugged and
 working.

---
 Grid.h                                    |   3 +-
 Grid_Cartesian.h                          |  58 +++---------
 Grid_Lattice.h                            |  20 +----
 Grid_QCD.h                                |  12 +--
 Grid_aligned_allocator.h                  |   3 +
 Grid_fake.cc => Grid_communicator_fake.cc |   0
 Grid_mpi.cc => Grid_communicator_mpi.cc   |   0
 Grid_config.h                             |   3 -
 Grid_config.h.in                          |   3 -
 Grid_cshift_common.h                      |  76 +++-------------
 Grid_cshift_mpi.h                         |  14 ++-
 Grid_main.cc                              |   4 +-
 Grid_simd.h                               |  74 ++++++----------
 Grid_stencil.h                            |  12 +++
 Grid_vComplexD.h                          |  81 +++++------------
 Grid_vComplexF.h                          |  92 +++++--------------
 Grid_vInteger.h                           |  63 +------------
 Grid_vRealD.h                             |  87 +++++-------------
 Grid_vRealF.h                             |  90 +++++--------------
 Makefile.am                               |   8 +-
 Makefile.in                               |  29 +++---
 TODO                                      | 103 ++++++++++++++++++++++
 configure                                 |  23 +----
 configure.ac                              |   7 +-
 24 files changed, 291 insertions(+), 574 deletions(-)
 rename Grid_fake.cc => Grid_communicator_fake.cc (100%)
 rename Grid_mpi.cc => Grid_communicator_mpi.cc (100%)
 create mode 100644 Grid_stencil.h

diff --git a/Grid.h b/Grid.h
index 184e7e36..0a4024bf 100644
--- a/Grid.h
+++ b/Grid.h
@@ -42,11 +42,10 @@
 #endif
 
 
+#include <Grid_aligned_allocator.h>
 #include <Grid_simd.h>
 #include <Grid_math_types.h>
 #include <Grid_Cartesian.h>
-#include <Grid_aligned_allocator.h>
-#include <Grid_aligned_allocator.h>
 #include <Grid_Lattice.h>
 #include <Grid_QCD.h>
 
diff --git a/Grid_Cartesian.h b/Grid_Cartesian.h
index 004c2fa5..c891edcc 100644
--- a/Grid_Cartesian.h
+++ b/Grid_Cartesian.h
@@ -8,48 +8,6 @@ namespace Grid{
 /////////////////////////////////////////////////////////////////////////////////////////
 // Grid Support.
 /////////////////////////////////////////////////////////////////////////////////////////
-//
-// Cartesian grid inheritance
-//            Grid::GridBase
-//                     |
-//           __________|___________
-//          |                      |
-// Grid::GridCartesian   Grid::GridCartesianRedBlack
-//
-// TODO: document the following as an API guaranteed public interface
-
-    /* 
-     *       Rough map of functionality against QDP++ Layout
-     *
-     *       Param     |     Grid                     |     QDP++             
-     *       -----------------------------------------
-     *                 |                              |
-     *        void     |     oSites, iSites, lSites   |  sitesOnNode 
-     *        void     |     gSites                   |  vol
-     *                 |                              |
-     *        gcoor    |     oIndex, iIndex           |  linearSiteIndex // no virtual node in QDP
-     *        lcoor    |                              |
-     * 
-     *        void     |     CheckerBoarded           |  -        // No checkerboarded in QDP
-     *        void     |     FullDimensions           |  lattSize
-     *        void     |     GlobalDimensions         |  lattSize // No checkerboarded in QDP
-     *        void     |     LocalDimensions          |  subgridLattSize
-     *        void     |     VirtualLocalDimensions   |  subgridLattSize // no virtual node in QDP
-     *                 |                              |
-     *       int x 3   |     oiSiteRankToGlobal       |  siteCoords
-     *                 |     ProcessorCoorLocalCoorToGlobalCoor | 
-     *                 |                              |
-     *     vector<int> |     GlobalCoorToRankIndex   |  nodeNumber(coord)
-     *     vector<int> |     GlobalCoorToProcessorCoorLocalCoor|  nodeCoord(coord)
-     *                 |                              |
-     *     void        |     Processors               |  logicalSize    // returns cart array shape
-     *     void        |     ThisRank        |  nodeNumber();  // returns this node rank
-     *     void        |     ThisProcessorCoor        |    // returns this node coor
-     *     void        |     isBoss(void)             |  primaryNode();
-     *                 |                              |
-     *                 |     RankFromProcessorCoor    |  getLogicalCoorFrom(node)
-     *                 |     ProcessorCoorFromRank    |  getNodeNumberFrom(logical_coord)
-     */
 
 class GridBase : public CartesianCommunicator {
 public:
@@ -60,7 +18,8 @@ public:
  GridBase(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
 
         
- //protected:
+ //FIXME 
+ // protected:
  // Lattice wide random support. not yet fully implemented. Need seed strategy
  // and one generator per site.
  // std::default_random_engine generator;
@@ -165,7 +124,16 @@ public:
 	lane    = lane / _simd_layout[d];
       }
     }
-    
+    inline int PermuteDim(int dimension){
+      return _simd_layout[dimension]>1;
+    }
+    inline int PermuteType(int dimension){
+      int permute_type=0;
+      for(int d=_ndimension-1;d>dimension;d--){
+	if (_simd_layout[d]>1 ) permute_type++;
+      }
+      return permute_type;
+    }
     ////////////////////////////////////////////////////////////////
     // Array sizing queries
     ////////////////////////////////////////////////////////////////
@@ -399,8 +367,6 @@ public:
             
         ////////////////////////////////////////////////////////////////////////////////////////////
         // subplane information
-        // It may be worth the investment of generating a more general subplane "iterator",
-        // and providing support for threads grabbing a unit of allocation.
         ////////////////////////////////////////////////////////////////////////////////////////////
         _slice_block.resize(_ndimension);
         _slice_stride.resize(_ndimension);
diff --git a/Grid_Lattice.h b/Grid_Lattice.h
index 2aa46b97..ebf68560 100644
--- a/Grid_Lattice.h
+++ b/Grid_Lattice.h
@@ -4,17 +4,9 @@
 #include "Grid.h"
 
 
-
 namespace Grid {
 
-// Permute the pointers 32bitx16 = 512
-static int permute_map[4][16] = { 
-  { 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14},
-  { 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13},
-  { 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11},
-  { 9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8}
-};
-
+  extern int GridCshiftPermuteMap[4][16];
 
 template<class vobj>
 class Lattice
@@ -37,11 +29,10 @@ public:
 
 #include <Grid_cshift.h>
     
-    // overloading Grid::conformable but no conformable in Grid ...?:w
     template<class obj1,class obj2>
     friend void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs);
 
-    // Performance difference between operator * and mult is troubling.
+    // FIXME Performance difference between operator * and mult is troubling.
     // Auto move constructor seems to lose surprisingly much.
 
     // Site wise binary operations
@@ -182,23 +173,20 @@ public:
         }}
     };
 
+    // FIXME Implement a consistent seed management strategy
     friend void gaussian(Lattice<vobj> &l){
         // Zero mean, unit variance.
         std::normal_distribution<double> distribution(0.0,1.0);
         Real *v_ptr = (Real *)&l._odata[0];
         size_t v_len = l._grid->oSites()*sizeof(vobj);
         size_t d_len = v_len/sizeof(Real);
-        
-        // Not a parallel RNG. Could make up some seed per 4d site, seed
-        // per hypercube type scheme.
+
         for(int i=0;i<d_len;i++){
 	  v_ptr[i]= drand48();
         }
     };
 
-
     // Unary functions and Unops
-    // Unary negation
     friend inline Lattice<vobj> operator -(const Lattice<vobj> &r) {
         Lattice<vobj> ret(r._grid);
 #pragma omp parallel for
diff --git a/Grid_QCD.h b/Grid_QCD.h
index a0f7801a..3e852633 100644
--- a/Grid_QCD.h
+++ b/Grid_QCD.h
@@ -24,10 +24,7 @@ namespace QCD {
     typedef iSinglet<Real >             TReal;    // This is painful. Tensor singlet complex type.
 
 
-    typedef iSinglet<vIntegerF >         vTIntegerF;
-    typedef iSinglet<vIntegerD >         vTIntegerD;
-    typedef iSinglet<vIntegerC >         vTIntegerC;
-    typedef iSinglet<vIntegerZ >         vTIntegerZ;
+    typedef iSinglet<vInteger >         vTInteger;
 
     typedef iSpinMatrix<Complex >       SpinMatrix;
     typedef iColourMatrix<Complex >     ColourMatrix;
@@ -46,12 +43,9 @@ namespace QCD {
     typedef iColourVector<vComplex >     vColourVector;
     typedef iSpinColourVector<vComplex > vSpinColourVector;
     
-    typedef Lattice<vTComplex>         LatticeComplex;
+    typedef Lattice<vTComplex>            LatticeComplex;
 
-    typedef Lattice<vTIntegerF>            LatticeIntegerF; // Predicates for "where"
-    typedef Lattice<vTIntegerD>            LatticeIntegerD; 
-    typedef Lattice<vTIntegerC>            LatticeIntegerC;
-    typedef Lattice<vTIntegerZ>            LatticeIntegerZ;
+    typedef Lattice<vTInteger>            LatticeInteger; // Predicates for "where"
     
     typedef Lattice<vColourMatrix>     LatticeColourMatrix;
     typedef Lattice<vSpinMatrix>       LatticeSpinMatrix;
diff --git a/Grid_aligned_allocator.h b/Grid_aligned_allocator.h
index dbaa0ba3..9008105a 100644
--- a/Grid_aligned_allocator.h
+++ b/Grid_aligned_allocator.h
@@ -1,5 +1,8 @@
 #ifndef GRID_ALIGNED_ALLOCATOR_H
 #define GRID_ALIGNED_ALLOCATOR_H
+
+#include <immintrin.h>
+
 namespace Grid {
 
 ////////////////////////////////////////////////////////////////////
diff --git a/Grid_fake.cc b/Grid_communicator_fake.cc
similarity index 100%
rename from Grid_fake.cc
rename to Grid_communicator_fake.cc
diff --git a/Grid_mpi.cc b/Grid_communicator_mpi.cc
similarity index 100%
rename from Grid_mpi.cc
rename to Grid_communicator_mpi.cc
diff --git a/Grid_config.h b/Grid_config.h
index 8a0fdc26..bb885708 100644
--- a/Grid_config.h
+++ b/Grid_config.h
@@ -10,9 +10,6 @@
 /* AVX512 */
 /* #undef AVX512 */
 
-/* GRID_COMMS_FAKE */
-/* #undef GRID_COMMS_FAKE */
-
 /* GRID_COMMS_MPI */
 #define GRID_COMMS_MPI 1
 
diff --git a/Grid_config.h.in b/Grid_config.h.in
index ff15a834..91948fa2 100644
--- a/Grid_config.h.in
+++ b/Grid_config.h.in
@@ -9,9 +9,6 @@
 /* AVX512 */
 #undef AVX512
 
-/* GRID_COMMS_FAKE */
-#undef GRID_COMMS_FAKE
-
 /* GRID_COMMS_MPI */
 #undef GRID_COMMS_MPI
 
diff --git a/Grid_cshift_common.h b/Grid_cshift_common.h
index 72518d7d..08e75cff 100644
--- a/Grid_cshift_common.h
+++ b/Grid_cshift_common.h
@@ -1,17 +1,5 @@
 #ifndef _GRID_CSHIFT_COMMON_H_
 #define _GRID_CSHIFT_COMMON_H_
-//////////////////////////////////////////////////////////////////////////////////////////
-// Must not lose sight that goal is to be able to construct really efficient
-// gather to a point stencil code. CSHIFT is not the best way, so probably need
-// additional stencil support.
-//
-// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
-//
-// Lattice <foo> could also allocate haloes which get used for stencil code.
-//
-// Grid could create a neighbour index table for a given stencil.
-// Could also implement CovariantCshift.
-//////////////////////////////////////////////////////////////////////////////////////////
 
 //////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split
@@ -57,7 +45,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
   }
 }
 
-
 //////////////////////////////////////////////////////
 // Gather for when there *is* need to SIMD split
 //////////////////////////////////////////////////////
@@ -101,8 +88,6 @@ friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> p
   }
 }
 
-
-
 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
@@ -146,7 +131,6 @@ friend void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAll
   }
 }
 
-
 //////////////////////////////////////////////////////
 // Scatter for when there *is* need to SIMD split
 //////////////////////////////////////////////////////
@@ -190,11 +174,9 @@ friend void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> po
   }
 }
 
-
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
-// if lhs is odd, rhs even??
 friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
 {
   int rd = rhs._grid->_rdimensions[dimension];
@@ -284,40 +266,6 @@ friend void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimens
 // Local to node Cshift
 //////////////////////////////////////////////////////
 
-  // Work out whether to permute 
-  // ABCDEFGH ->   AE BF CG DH       permute              wrap num
-  //
-  // Shift 0       AE BF CG DH       0 0 0 0    ABCDEFGH   0   0
-  // Shift 1       BF CG DH AE       0 0 0 1    BCDEFGHA   0   1
-  // Shift 2       CG DH AE BF       0 0 1 1    CDEFGHAB   0   2
-  // Shift 3       DH AE BF CG       0 1 1 1    DEFGHABC   0   3
-  // Shift 4       AE BF CG DH       1 1 1 1    EFGHABCD   1   0 
-  // Shift 5       BF CG DH AE       1 1 1 0    FGHACBDE   1   1 
-  // Shift 6       CG DH AE BF       1 1 0 0    GHABCDEF   1   2
-  // Shift 7       DH AE BF CG       1 0 0 0    HABCDEFG   1   3
-
-  // Suppose 4way simd in one dim.
-  // ABCDEFGH ->   AECG BFDH      permute              wrap num
-
-  // Shift 0       AECG BFDH      0,00 0,00 ABCDEFGH         0     0
-  // Shift 1       BFDH CGEA      0,00 1,01 BCDEFGHA         0     1
-  // Shift 2       CGEA DHFB      1,01 1,01 CDEFGHAB         1     0
-  // Shift 3       DHFB EAGC      1,01 1,11 DEFGHABC         1     1
-  // Shift 4       EAGC FBHD      1,11 1,11 EFGHABCD         2     0 
-  // Shift 5       FBHD GCAE      1,11 1,10 FGHABCDE         2     1
-  // Shift 6       GCAE HDBF      1,10 1,10 GHABCDEF         3     0
-  // Shift 7       HDBF AECG      1,10 0,00 HABCDEFG         3     1
-
-  // Generalisation to 8 way simd, 16 way simd required.
-  //
-  // Need log2 Nway masks. consisting of 
-  //	    1 bit  256 bit granule
-  //	    2 bit  128 bit granule
-  //        4 bits 64  bit granule
-  //        8 bits 32  bit granules
-  //
-  //        15 bits....
-
 friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
 {
   int sshift[2];
@@ -333,35 +281,31 @@ friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int
   }
 }
 
-
 friend Lattice<vobj> Cshift_local(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
-  int fd = rhs._grid->_fdimensions[dimension];
-  int rd = rhs._grid->_rdimensions[dimension];
-  int ld = rhs._grid->_ldimensions[dimension];
-  int gd = rhs._grid->_gdimensions[dimension];
-  
+  GridBase *grid = rhs._grid;
+  int fd = grid->_fdimensions[dimension];
+  int rd = grid->_rdimensions[dimension];
+  int ld = grid->_ldimensions[dimension];
+  int gd = grid->_gdimensions[dimension];
 
   // Map to always positive shift modulo global full dimension.
   shift = (shift+fd)%fd;
 
-  ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
+  ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift);
         
   // the permute type
-  int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
-  int permute_type=0;
-  for(int d=0;d<dimension;d++){
-    if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
-  }
+  int permute_dim =grid->PermuteDim(dimension);
+  int permute_type=grid->PermuteType(dimension);
 
   for(int x=0;x<rd;x++){       
 
     int o   = 0;
-    int bo  = x * rhs._grid->_ostride[dimension];
+    int bo  = x * grid->_ostride[dimension];
     
     int cb= (cbmask==0x2)? 1 : 0;
 
-    int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
+    int sshift = grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
     int sx     = (x+sshift)%rd;
 	
     int permute_slice=0;
diff --git a/Grid_cshift_mpi.h b/Grid_cshift_mpi.h
index 75a4dc96..4ca8f6b5 100644
--- a/Grid_cshift_mpi.h
+++ b/Grid_cshift_mpi.h
@@ -146,10 +146,7 @@ friend void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
   assert(shift>=0);
   assert(shift<fd);
 
-  int permute_type=0;
-  for(int d=0;d<dimension;d++){
-    if (grid->_simd_layout[d]>1 ) permute_type++;
-  }
+  int permute_type=grid->PermuteType(dimension);
 
   ///////////////////////////////////////////////
   // Simd direction uses an extract/merge pair
@@ -236,9 +233,12 @@ friend void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
       if ( x< rd-num ) permute_slice=wrap;
       else permute_slice = 1-wrap;
 
+      int toggle_bit = (Nsimd>>(permute_type+1));
+      int PermuteMap;
       for(int i=0;i<Nsimd;i++){
 	if ( permute_slice ) {
-	  pointers[i] = rpointers[permute_map[permute_type][i]];
+	  PermuteMap=i^toggle_bit;
+	  pointers[i] = rpointers[PermuteMap];
 	} else {
 	  pointers[i] = rpointers[i];
 	}
@@ -260,8 +260,4 @@ friend void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
     }
   }
 }
-
-
-
-
 #endif
diff --git a/Grid_main.cc b/Grid_main.cc
index cb45fd85..62356436 100644
--- a/Grid_main.cc
+++ b/Grid_main.cc
@@ -20,8 +20,8 @@ int main (int argc, char ** argv)
   
   std::vector<int> mpi_layout(4);
   mpi_layout[0]=2;
-  mpi_layout[1]=1;
-  mpi_layout[2]=1;
+  mpi_layout[1]=2;
+  mpi_layout[2]=2;
   mpi_layout[3]=2;
 
 #ifdef AVX512
diff --git a/Grid_simd.h b/Grid_simd.h
index ffcbff15..53cc8909 100644
--- a/Grid_simd.h
+++ b/Grid_simd.h
@@ -10,32 +10,6 @@
 //
 // Vector types are arch dependent
 ////////////////////////////////////////////////////////////////////////
-    // TODO
-    //
-    // Base class to share common code between vRealF, VComplexF etc...
-    //
-    // lattice Broad cast assignment
-    //
-    // where() support
-    // implement with masks, and/or? Type of the mask & boolean support?
-    //
-    // Unary functions
-    // cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar<vReal> only arg
-    // exp, log, sqrt, fabs
-    //
-    // transposeColor, transposeSpin,
-    // adjColor, adjSpin,
-    // traceColor, traceSpin.
-    // peekColor, peekSpin + pokeColor PokeSpin
-    //
-    // copyMask.
-    //
-    // localMaxAbs
-    //
-    // norm2,
-    // sumMulti equivalent.
-    // Fourier transform equivalent.
-    //
     
   ////////////////////////////////////////////////////////////
   // SIMD Alignment controls
@@ -71,9 +45,6 @@ namespace Grid {
   typedef std::complex<RealD> ComplexD;
   typedef std::complex<Real>  Complex;
 
-
-
-
   inline RealF adj(const RealF  & r){ return r; }
   inline RealF conj(const RealF  & r){ return r; }
   inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
@@ -122,7 +93,6 @@ namespace Grid {
   template<>            inline void ZeroIt(RealD &arg){ arg=0; };
 
 
-
 #if defined (SSE2)
     typedef __m128 fvec;
     typedef __m128d dvec;
@@ -162,31 +132,46 @@ namespace Grid {
     inline void v_prefetch0(int size, const char *ptr){};
 #endif
 
-};
-
 
 /////////////////////////////////////////////////////////////////
 // Generic extract/merge/permute
 /////////////////////////////////////////////////////////////////
-template<class vsimd,class scalar,int Nsimd>
+template<class vsimd,class scalar>
 inline void Gextract(vsimd &y,std::vector<scalar *> &extracted){
-  // Bounce off stack is painful
-  // temporary hack while I figure out the right interface
-  scalar buf[Nsimd]; 
-  vstore(y,buf);
-  for(int i=0;i<Nsimd;i++){
-    *extracted[i] = buf[i];
-    extracted[i]++;
+#if 1
+  // FIXME: bounce off stack is painful
+  // temporary hack while I figure out better way.
+  // There are intrinsics to do this work without the storage.
+  int Nsimd = extracted.size();
+  {
+    std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd); 
+    vstore(y,&buf[0]);
+    for(int i=0;i<Nsimd;i++){
+      *extracted[i] = buf[i];
+      extracted[i]++;
+    }
   }
+#else 
+  int NSo   = extracted.size();
+  int NSv   = vsimd::Nsimd();
+  int sparse= NSv/NSo;
+  for(int i=0;i<NSv;i+=sparse){
+    
+  }
+#endif
 };
-template<class vsimd,class scalar,int Nsimd>
+template<class vsimd,class scalar>
 inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
-  scalar buf[Nsimd]; 
+#if 1
+  int Nsimd = extracted.size();
+  std::vector<scalar> buf(Nsimd); 
   for(int i=0;i<Nsimd;i++){
     buf[i]=*extracted[i];
     extracted[i]++;
   }
-  vset(y,buf); 
+  vset(y,&buf[0]); 
+#else
+#endif
 };
 
 //////////////////////////////////////////////////////////
@@ -197,8 +182,6 @@ inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
 // Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
 // Permute 4 possible on half precision @512bit vectors.
 //////////////////////////////////////////////////////////
-// Should be able to make the permute/extract/merge independent of the
-// vector subtype and reduce the volume of code.
 template<class vsimd>
 inline void Gpermute(vsimd &y,vsimd b,int perm){
       switch (perm){
@@ -229,6 +212,7 @@ inline void Gpermute(vsimd &y,vsimd b,int perm){
       default: assert(0); break;
       }
     };
+};
 
 #include <Grid_vRealF.h>
 #include <Grid_vRealD.h>
diff --git a/Grid_stencil.h b/Grid_stencil.h
new file mode 100644
index 00000000..d93204fc
--- /dev/null
+++ b/Grid_stencil.h
@@ -0,0 +1,12 @@
+//////////////////////////////////////////////////////////////////////////////////////////
+// Must not lose sight that goal is to be able to construct really efficient
+// gather to a point stencil code. CSHIFT is not the best way, so probably need
+// additional stencil support.
+//
+// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
+//
+// Lattice <foo> could also allocate haloes which get used for stencil code.
+//
+// Grid could create a neighbour index table for a given stencil.
+// Could also implement CovariantCshift.
+//////////////////////////////////////////////////////////////////////////////////////////
diff --git a/Grid_vComplexD.h b/Grid_vComplexD.h
index d1caefb1..330de26e 100644
--- a/Grid_vComplexD.h
+++ b/Grid_vComplexD.h
@@ -5,7 +5,7 @@
 
 namespace Grid {
     class vComplexD {
-    protected:
+    public:
         zvec v;
     public:
 	typedef zvec     vector_type;
@@ -154,64 +154,27 @@ namespace Grid {
             return ret;
         };
 
-        /////////////////////////////////////////////////////////////////
-        // Extract
-        /////////////////////////////////////////////////////////////////
-        friend inline void extract(vComplexD &y,std::vector<ComplexD *> &extracted){
-	  // Bounce off stack is painful
-	  // temporary hack while I figure out the right interface
-	  const int Nsimd = vComplexD::Nsimd();
-	  std::vector<ComplexD> buf(Nsimd); 
+	////////////////////////////////////////////////////////////////////
+	// General permute; assumes vector length is same across 
+	// all subtypes; may not be a good assumption, but could
+	// add the vector width as a template param for BG/Q for example
+	////////////////////////////////////////////////////////////////////
+	friend inline void permute(vComplexD &y,vComplexD b,int perm)
+	{
+	  Gpermute<vComplexD>(y,b,perm);
+	}
+	friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted)
+	{
+	  Gmerge<vComplexD,ComplexD >(y,extracted);
+	}
+	friend inline void extract(vComplexD &y,std::vector<ComplexD *> &extracted)
+	{
+	  Gextract<vComplexD,ComplexD>(y,extracted);
+	}
 
-	  vstore(y,&buf[0]);
-
-	  for(int i=0;i<Nsimd;i++){
-	    *extracted[i] = buf[i];
-	    extracted[i]++;
-	  }
-        };
-
-        friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted){
-	  // Bounce off stack is painful
-	  // temporary hack while I figure out the right interface
-	  const int Nsimd = vComplexD::Nsimd();
-	  std::vector<ComplexD> buf(Nsimd); 
-
-	  for(int i=0;i<Nsimd;i++){
-	    buf[i]=*extracted[i];
-	    extracted[i]++;
-	  }
-	  vset(y,&buf[0]); 
-        };
-        
-        
-        /////////////////////////////////////////////////////////////////
-        // Permute
-        /////////////////////////////////////////////////////////////////
-        friend inline void permute(vComplexD &y,vComplexD b,int perm){
-            switch (perm){
-                    // 2 complex=>1 permute
-#if defined(AVX1)||defined(AVX2)
-                case 0: y.v = _mm256_permute2f128_pd(b.v,b.v,0x01); break;
-                // AB => BA i.e. ab cd =>cd ab
-#endif
-#ifdef SSE2
-		  break;
-#endif
-#ifdef AVX512
-                    // 4 complex=>2 permute
-                // ABCD => BADC i.e. abcd efgh => cdab ghef
-                // ABCD => CDAB i.e. abcd efgh => efgh abcd
-                case 0: y.v = _mm512_swizzle_pd(b.v,_MM_SWIZ_REG_BADC); break;
-                case 1: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); // permute for double is not implemented 
-
-#endif
-#ifdef QPX
-#error // Not implemented yet
-#endif
- 	        default: assert(0); break;
-            }
-        };
+	////////////////////////////////////////////////////////////////////////
+	// FIXME:  gonna remove these load/store, get, set, prefetch
+	////////////////////////////////////////////////////////////////////////
         void vload(zvec& a){
           this->v = a;
         }
@@ -296,7 +259,7 @@ friend inline void vstore(vComplexD &ret, ComplexD *a){
 #endif
             return ret;
         }
-// REDUCE
+// REDUCE FIXME must be a cleaner implementation
        friend inline ComplexD Reduce(const vComplexD & in)
        { 
 #if defined (AVX1) || defined(AVX2)
diff --git a/Grid_vComplexF.h b/Grid_vComplexF.h
index 578228a3..b7fb3d6a 100644
--- a/Grid_vComplexF.h
+++ b/Grid_vComplexF.h
@@ -4,7 +4,9 @@
 
 namespace Grid {
     class vComplexF {
-    protected:
+      //    protected:
+
+    public:
         cvec v;
         
     public:
@@ -129,75 +131,11 @@ namespace Grid {
 #endif
             return ret;
         };
+      
 
-        /////////////////////////////////////////////////////////////////
-        // Extract
-        /////////////////////////////////////////////////////////////////
-        friend inline void extract(vComplexF &y,std::vector<ComplexF *> &extracted){
-	  // Bounce off heap is painful
-	  // temporary hack while I figure out the right interface
-            vComplexF vbuf;
-            ComplexF *buf = (ComplexF *)&vbuf;
-            
-	  vstore(y,&buf[0]);
-	  for(int i=0;i<vComplexF::Nsimd();i++){
-	    *extracted[i] = buf[i];
-	    extracted[i]++;
-	  }
-
-        };
-
-        friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted){
-	  // Bounce off stack is painful
-	  // temporary hack while I figure out the right interface
-	  const int Nsimd = vComplexF::Nsimd();
-            vComplexF vbuf;
-            ComplexF *buf = (ComplexF *)&vbuf;
-
-	  for(int i=0;i<Nsimd;i++){
-	    buf[i]=*extracted[i];
-	    extracted[i]++;
-	  }
-	  vset(y,&buf[0]); 
-        };
-        
-
-
-        /////////////////////////////////////////////////////////////////
-        // Permute
-        /////////////////////////////////////////////////////////////////
-        friend inline void permute(vComplexF &y,vComplexF b,int perm){
-            switch (perm){
-#if defined(AVX1)||defined(AVX2)
-//HERE
-                    // 4 complex=>2 permutes
-                    // case 0 ABCD->BADC
-                    // case 1 ABCD->CDAB
-                case 0: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break;
-                case 1: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break;
-#endif
-#ifdef SSE2
-                case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break;
-#endif
-#ifdef AVX512
-//#error should permute for  512
-                    // 8 complex=>3 permutes
-                    // case 0 ABCD EFGH -> BADC FEHG
-                    // case 1 ABCD EFGH -> CDAB GHEF
-                    // case 2 ABCD EFGH -> EFGH ABCD
-                case 0: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_CDAB); break; // OK
-                case 1: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_BADC); break; // OK
-                case 2: y.v = _mm512_permute4f128_ps(b.v, (_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break; // OK
-
-#endif
-#ifdef QPX
-#error
-#endif
-	        default: assert(0); break;
-            }
-        };
-        
-
+	////////////////////////////////////////////////////////////////////////
+	// FIXME:  gonna remove these load/store, get, set, prefetch
+	////////////////////////////////////////////////////////////////////////
         friend inline void vset(vComplexF &ret, Complex *a){
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
@@ -358,6 +296,20 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){
             return *this;
         }
 
+      friend inline void permute(vComplexF &y,vComplexF b,int perm)
+      {
+	Gpermute<vComplexF>(y,b,perm);
+      }
+      friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted)
+      {
+	Gmerge<vComplexF,ComplexF >(y,extracted);
+      }
+      friend inline void extract(vComplexF &y,std::vector<ComplexF *> &extracted)
+      {
+	Gextract<vComplexF,ComplexF>(y,extracted);
+      }
+
+
     };
 
     inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; }
@@ -371,7 +323,5 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){
         return l*r;
     }
 
-
-
 }
 #endif
diff --git a/Grid_vInteger.h b/Grid_vInteger.h
index 7a0c1c4f..6ddce191 100644
--- a/Grid_vInteger.h
+++ b/Grid_vInteger.h
@@ -235,70 +235,11 @@ friend inline void vstore(vInteger &ret, Integer *a){
       }
       friend inline void merge(vIntegerF &y,std::vector<Integer *> &extracted)
       {
-	Gmerge<vIntegerF,Integer,sizeof(ivec)/sizeof(float) >(y,extracted);
+	Gmerge<vIntegerF,Integer>(y,extracted);
       }
       friend inline void extract(vIntegerF &y,std::vector<Integer *> &extracted)
       {
-	Gextract<vIntegerF,Integer,sizeof(ivec)/sizeof(float) >(y,extracted);
-      }
-    };
-
-
-    class vIntegerD : public vInteger
-    {
-    public:
-      static inline int Nsimd(void) { return sizeof(ivec)/sizeof(double);}
-      
-      friend inline void permute(vIntegerD &y,vIntegerD b,int perm)
-      {
-	Gpermute<vIntegerD>(y,b,perm);
-      }
-      friend inline void merge(vIntegerD &y,std::vector<Integer *> &extracted)
-      {
-	Gmerge<vIntegerD,Integer,sizeof(ivec)/sizeof(double) >(y,extracted);
-      }
-      friend inline void extract(vIntegerD &y,std::vector<Integer *> &extracted)
-      {
-	Gextract<vIntegerD,Integer,sizeof(ivec)/sizeof(double) >(y,extracted);
-      }
-    };
-
-
-    class vIntegerC : public vInteger
-    {
-    public:
-      static inline int Nsimd(void) { return sizeof(ivec)/sizeof(ComplexF);}
-      
-      friend inline void permute(vIntegerC &y,vIntegerC b,int perm)
-      {
-	Gpermute<vIntegerC>(y,b,perm);
-      }
-      friend inline void merge(vIntegerC &y,std::vector<Integer *> &extracted)
-      {
-	Gmerge<vIntegerC,Integer,sizeof(ivec)/sizeof(ComplexF) >(y,extracted);
-      }
-      friend inline void extract(vIntegerC &y,std::vector<Integer *> &extracted)
-      {
-	Gextract<vIntegerC,Integer,sizeof(ivec)/sizeof(ComplexF) >(y,extracted);
-      }
-    };
-
-    class vIntegerZ : public vInteger
-    {
-    public:
-      static inline int Nsimd(void) { return sizeof(ivec)/sizeof(ComplexD);}
-      
-      friend inline void permute(vIntegerZ &y,vIntegerZ b,int perm)
-      {
-	Gpermute<vIntegerZ>(y,b,perm);
-      }
-      friend inline void merge(vIntegerZ &y,std::vector<Integer *> &extracted)
-      {
-	Gmerge<vIntegerZ,Integer,sizeof(ivec)/sizeof(ComplexD) >(y,extracted);
-      }
-      friend inline void extract(vIntegerZ &y,std::vector<Integer *> &extracted)
-      {
-	Gextract<vIntegerZ,Integer,sizeof(ivec)/sizeof(ComplexD) >(y,extracted);
+	Gextract<vIntegerF,Integer>(y,extracted);
       }
     };
 
diff --git a/Grid_vRealD.h b/Grid_vRealD.h
index 1abc0804..13ceedbe 100644
--- a/Grid_vRealD.h
+++ b/Grid_vRealD.h
@@ -5,7 +5,7 @@
 
 namespace Grid {
     class vRealD  {
-    protected:
+    public:
         dvec v; // dvec is double precision vector
 
     public:
@@ -99,72 +99,27 @@ namespace Grid {
             return ret;
         };
 
-        /////////////////////////////////////////////////////////////////
-        // Extract
-        /////////////////////////////////////////////////////////////////
-        friend inline void extract(vRealD &y,std::vector<RealD *> &extracted){
-	  // Bounce off stack is painful
-	  // temporary hack while I figure out the right interface
-	  const int Nsimd = vRealD::Nsimd();
-	  RealD buf[Nsimd]; 
+	////////////////////////////////////////////////////////////////////
+	// General permute; assumes vector length is same across 
+	// all subtypes; may not be a good assumption, but could
+	// add the vector width as a template param for BG/Q for example
+	////////////////////////////////////////////////////////////////////
+	friend inline void permute(vRealD &y,vRealD b,int perm)
+	{
+	  Gpermute<vRealD>(y,b,perm);
+	}
+	friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
+	{
+	  Gmerge<vRealD,RealD >(y,extracted);
+	}
+	friend inline void extract(vRealD &y,std::vector<RealD *> &extracted)
+	{
+	  Gextract<vRealD,RealD>(y,extracted);
+	}
 
-	  vstore(y,buf);
-
-	  for(int i=0;i<Nsimd;i++){
-	    *extracted[i] = buf[i];
-	    extracted[i]++;
-	  }
-        };
-
-        friend inline void merge(vRealD &y,std::vector<RealD *> &extracted){
-	  // Bounce off stack is painful
-	  // temporary hack while I figure out the right interface
-	  const int Nsimd = vRealD::Nsimd();
-	  RealD buf[Nsimd]; 
-
-	  for(int i=0;i<Nsimd;i++){
-	    buf[i]=*extracted[i];
-	    extracted[i]++;
-	  }
-	  vset(y,buf); 
-        };
-
-        
-        // Permute plans
-        // Permute 0 every ABCDEFGH -> BA DC FE HG
-        // Permute 1 every ABCDEFGH -> CD AB GH EF
-        // Permute 2 every ABCDEFGH -> EFGH ABCD
-        // Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
-        // Permute 4 possible on half precision @512bit vectors.
-        friend inline void permute(vRealD &y,vRealD b,int perm){
-            switch (perm){
-                    // 4 doubles=>2 permutes
-#if defined(AVX1)||defined(AVX2)
-                case 0: y.v = _mm256_shuffle_pd(b.v,b.v,0x5); break;
-                case 1: y.v = _mm256_permute2f128_pd(b.v,b.v,0x01); break;
-#endif
-#ifdef SSE2
-                case 0: y.v = _mm_shuffle_pd(b.v,b.v,0x1); break;
-#endif
-#ifdef AVX512
-                    // 8 double => 3 permutes
-        // Permute 0 every abcd efgh -> badc fehg 
-        // Permute 1 every abcd efgh -> cdab ghef 
-        // Permute 2 every abcd efgh -> efgh abcd 
-        // NOTE: mm_512_permutex_pd not implemented
-        // NOTE: ignore warning
-                case 0: y.v = _mm512_swizzle_pd(b.v,_MM_SWIZ_REG_CDAB); break;
-                case 1: y.v = _mm512_swizzle_pd(b.v,_MM_SWIZ_REG_BADC); break;
-                case 2: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
-      
-#endif
-#ifdef QPX
-#error
-#endif
-	    default: assert(0);break;
-            }
-        };
-// gona be bye bye
+	////////////////////////////////////////////////////////////////////////
+	// FIXME:  gonna remove these load/store, get, set, prefetch
+	////////////////////////////////////////////////////////////////////////
         void vload(dvec& a){
           this->v = a;
         }
diff --git a/Grid_vRealF.h b/Grid_vRealF.h
index 22809b83..185f4da8 100644
--- a/Grid_vRealF.h
+++ b/Grid_vRealF.h
@@ -5,7 +5,7 @@
 
 namespace Grid {
     class vRealF  {
-    protected:
+    public:
         fvec v;
 
     public:
@@ -120,74 +120,25 @@ namespace Grid {
         friend inline void vzero(vRealF &ret){vsplat(ret,0.0);}
 
 
-        /////////////////////////////////////////////////////////////////
-        // Extract
-        /////////////////////////////////////////////////////////////////
-        friend inline void extract(vRealF &y,std::vector<RealF *> &extracted){
-	  // Bounce off stack is painful
-	  // temporary hack while I figure out the right interface
-	  const int Nsimd = vRealF::Nsimd();
-	  RealF buf[Nsimd]; 
+	////////////////////////////////////////////////////////////////////
+	// General permute; assumes vector length is same across 
+	// all subtypes; may not be a good assumption, but could
+	// add the vector width as a template param for BG/Q for example
+	////////////////////////////////////////////////////////////////////
+	friend inline void permute(vRealF &y,vRealF b,int perm)
+	{
+	  Gpermute<vRealF>(y,b,perm);
+	}
+	friend inline void merge(vRealF &y,std::vector<RealF *> &extracted)
+	{
+	  Gmerge<vRealF,RealF >(y,extracted);
+	}
+	friend inline void extract(vRealF &y,std::vector<RealF *> &extracted)
+	{
+	  Gextract<vRealF,RealF>(y,extracted);
+	}
 
-	  vstore(y,buf);
 
-	  for(int i=0;i<Nsimd;i++){
-	    *extracted[i] = buf[i];
-	    extracted[i]++;
-	  }
-        };
-
-        friend inline void merge(vRealF &y,std::vector<RealF *> &extracted){
-	  // Bounce off stack is painful
-	  // temporary hack while I figure out the right interface
-	  const int Nsimd = vRealF::Nsimd();
-	  RealF buf[Nsimd]; 
-
-	  for(int i=0;i<Nsimd;i++){
-	    buf[i]=*extracted[i];
-	    extracted[i]++;
-	  }
-	  vset(y,buf); 
-        };
-        
-        //////////////////////////////////////////////////////////
-        // Permute
-        // Permute 0 every ABCDEFGH -> BA DC FE HG
-        // Permute 1 every ABCDEFGH -> CD AB GH EF
-        // Permute 2 every ABCDEFGH -> EFGH ABCD
-        // Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
-        // Permute 4 possible on half precision @512bit vectors.
-        //////////////////////////////////////////////////////////
-        friend inline void permute(vRealF &y,vRealF b,int perm){
-            switch (perm){
-                    // 8 floats=>3 permutes
-#if defined(AVX1)||defined(AVX2)
-                case 0: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
-                case 1: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break;
-                case 2: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break;
-#endif
-#ifdef SSE2
-                case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
-                case 1: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break;
-#endif
-#ifdef AVX512
-                    // 16 floats=> permutes
-        // Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo 
-        // Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn 
-        // Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
-        // Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
-//#error not implemented should do something
-                case 0: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_CDAB); break;
-                case 1: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_BADC); break;
-                case 2: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
-                case 3: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
-#endif
-#ifdef QPX
-#error not implemented
-#endif
-	    default: assert(0); break;
-            }
-        };
         
         /////////////////////////////////////////////////////
         // Broadcast a value across Nsimd copies.
@@ -207,6 +158,8 @@ namespace Grid {
             ret.v = {a,a,a,a};
 #endif
         }
+
+
         friend inline void vset(vRealF &ret, float *a){
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
@@ -224,6 +177,9 @@ namespace Grid {
 #endif
 	}
 
+	////////////////////////////////////////////////////////////////////////
+	// FIXME:  gonna remove these load/store, get, set, prefetch
+	////////////////////////////////////////////////////////////////////////
 friend inline void vstore(vRealF &ret, float *a){
 #if defined (AVX1)|| defined (AVX2)
 	_mm256_store_ps(a,ret.v);
diff --git a/Makefile.am b/Makefile.am
index d3a7b329..d11f28d5 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -24,7 +24,6 @@ include_HEADERS = Grid_config.h\
 	Grid_aligned_allocator.h\
 	Grid_cshift.h\
 	Grid_cshift_common.h\
-	Grid_cshift_fake.h\
 	Grid_cshift_mpi.h\
 	Grid_cshift_none.h\
 	Grid_math_types.h
@@ -37,13 +36,10 @@ bin_PROGRAMS = Grid_main
 
 extra_sources=
 if BUILD_COMMS_MPI
-  extra_sources+=Grid_mpi.cc
-endif
-if BUILD_COMMS_FAKE
-  extra_sources+=Grid_fake.cc
+  extra_sources+=Grid_communicator_mpi.cc
 endif
 if BUILD_COMMS_NONE
-  extra_sources+=Grid_fake.cc
+  extra_sources+=Grid_communicator_fake.cc
 endif
 
 Grid_main_SOURCES = \
diff --git a/Makefile.in b/Makefile.in
index 6983b96a..16981e52 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -89,9 +89,8 @@ NORMAL_UNINSTALL = :
 PRE_UNINSTALL = :
 POST_UNINSTALL = :
 bin_PROGRAMS = Grid_main$(EXEEXT)
-@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_mpi.cc
-@BUILD_COMMS_FAKE_TRUE@am__append_2 = Grid_fake.cc
-@BUILD_COMMS_NONE_TRUE@am__append_3 = Grid_fake.cc
+@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_communicator_mpi.cc
+@BUILD_COMMS_NONE_TRUE@am__append_2 = Grid_communicator_fake.cc
 subdir = .
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/configure.ac
@@ -146,12 +145,13 @@ libGrid_a_LIBADD =
 am_libGrid_a_OBJECTS = Grid_init.$(OBJEXT)
 libGrid_a_OBJECTS = $(am_libGrid_a_OBJECTS)
 PROGRAMS = $(bin_PROGRAMS)
-am__Grid_main_SOURCES_DIST = Grid_main.cc Grid_mpi.cc Grid_fake.cc
-@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_mpi.$(OBJEXT)
-@BUILD_COMMS_FAKE_TRUE@am__objects_2 = Grid_fake.$(OBJEXT)
-@BUILD_COMMS_NONE_TRUE@am__objects_3 = Grid_fake.$(OBJEXT)
-am__objects_4 = $(am__objects_1) $(am__objects_2) $(am__objects_3)
-am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) $(am__objects_4)
+am__Grid_main_SOURCES_DIST = Grid_main.cc Grid_communicator_mpi.cc \
+	Grid_communicator_fake.cc
+@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_communicator_mpi.$(OBJEXT)
+@BUILD_COMMS_NONE_TRUE@am__objects_2 =  \
+@BUILD_COMMS_NONE_TRUE@	Grid_communicator_fake.$(OBJEXT)
+am__objects_3 = $(am__objects_1) $(am__objects_2)
+am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) $(am__objects_3)
 Grid_main_OBJECTS = $(am_Grid_main_OBJECTS)
 Grid_main_DEPENDENCIES = libGrid.a
 AM_V_P = $(am__v_P_@AM_V@)
@@ -214,8 +214,8 @@ CTAGS = ctags
 CSCOPE = cscope
 AM_RECURSIVE_TARGETS = cscope
 am__DIST_COMMON = $(srcdir)/Grid_config.h.in $(srcdir)/Makefile.in \
-	AUTHORS COPYING ChangeLog INSTALL NEWS README compile depcomp \
-	install-sh missing
+	AUTHORS COPYING ChangeLog INSTALL NEWS README TODO compile \
+	depcomp install-sh missing
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 distdir = $(PACKAGE)-$(VERSION)
 top_distdir = $(distdir)
@@ -353,12 +353,11 @@ include_HEADERS = Grid_config.h\
 	Grid_aligned_allocator.h\
 	Grid_cshift.h\
 	Grid_cshift_common.h\
-	Grid_cshift_fake.h\
 	Grid_cshift_mpi.h\
 	Grid_cshift_none.h\
 	Grid_math_types.h
 
-extra_sources = $(am__append_1) $(am__append_2) $(am__append_3)
+extra_sources = $(am__append_1) $(am__append_2)
 Grid_main_SOURCES = \
 	Grid_main.cc\
 	$(extra_sources)
@@ -506,10 +505,10 @@ mostlyclean-compile:
 distclean-compile:
 	-rm -f *.tab.c
 
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_fake.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_communicator_fake.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_communicator_mpi.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_init.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_main.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_mpi.Po@am__quote@
 
 .cc.o:
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
diff --git a/TODO b/TODO
index 9febd6c3..d2373ba1 100644
--- a/TODO
+++ b/TODO
@@ -1,4 +1,7 @@
 * FIXME audit
+* Remove vload/store etc..
+* Replace vset with a call to merge.
+* Replace vset with a call to merge.
 
 * Conditional execution Subset, where etc...
 * Coordinate information, integers etc...
@@ -27,3 +30,103 @@
   - BinaryWriter, TextWriter etc...
   - protocol buffers?
   - 
+// Cartesian grid inheritance
+//            Grid::GridBase
+//                     |
+//           __________|___________
+//          |                      |
+// Grid::GridCartesian   Grid::GridCartesianRedBlack
+//
+// TODO: document the following as an API guaranteed public interface
+
+    /* 
+     *       Rough map of functionality against QDP++ Layout
+     *
+     *       Param     |     Grid                     |     QDP++             
+     *       -----------------------------------------
+     *                 |                              |
+     *        void     |     oSites, iSites, lSites   |  sitesOnNode 
+     *        void     |     gSites                   |  vol
+     *                 |                              |
+     *        gcoor    |     oIndex, iIndex           |  linearSiteIndex // no virtual node in QDP
+     *        lcoor    |                              |
+     * 
+     *        void     |     CheckerBoarded           |  -        // No checkerboarded in QDP
+     *        void     |     FullDimensions           |  lattSize
+     *        void     |     GlobalDimensions         |  lattSize // No checkerboarded in QDP
+     *        void     |     LocalDimensions          |  subgridLattSize
+     *        void     |     VirtualLocalDimensions   |  subgridLattSize // no virtual node in QDP
+     *                 |                              |
+     *       int x 3   |     oiSiteRankToGlobal       |  siteCoords
+     *                 |     ProcessorCoorLocalCoorToGlobalCoor | 
+     *                 |                              |
+     *     vector<int> |     GlobalCoorToRankIndex   |  nodeNumber(coord)
+     *     vector<int> |     GlobalCoorToProcessorCoorLocalCoor|  nodeCoord(coord)
+     *                 |                              |
+     *     void        |     Processors               |  logicalSize    // returns cart array shape
+     *     void        |     ThisRank        |  nodeNumber();  // returns this node rank
+     *     void        |     ThisProcessorCoor        |    // returns this node coor
+     *     void        |     isBoss(void)             |  primaryNode();
+     *                 |                              |
+     *                 |     RankFromProcessorCoor    |  getLogicalCoorFrom(node)
+     *                 |     ProcessorCoorFromRank    |  getNodeNumberFrom(logical_coord)
+     */
+  // Work out whether to permute 
+  // ABCDEFGH ->   AE BF CG DH       permute              wrap num
+  //
+  // Shift 0       AE BF CG DH       0 0 0 0    ABCDEFGH   0   0
+  // Shift 1       BF CG DH AE       0 0 0 1    BCDEFGHA   0   1
+  // Shift 2       CG DH AE BF       0 0 1 1    CDEFGHAB   0   2
+  // Shift 3       DH AE BF CG       0 1 1 1    DEFGHABC   0   3
+  // Shift 4       AE BF CG DH       1 1 1 1    EFGHABCD   1   0 
+  // Shift 5       BF CG DH AE       1 1 1 0    FGHACBDE   1   1 
+  // Shift 6       CG DH AE BF       1 1 0 0    GHABCDEF   1   2
+  // Shift 7       DH AE BF CG       1 0 0 0    HABCDEFG   1   3
+
+  // Suppose 4way simd in one dim.
+  // ABCDEFGH ->   AECG BFDH      permute              wrap num
+
+  // Shift 0       AECG BFDH      0,00 0,00 ABCDEFGH         0     0
+  // Shift 1       BFDH CGEA      0,00 1,01 BCDEFGHA         0     1
+  // Shift 2       CGEA DHFB      1,01 1,01 CDEFGHAB         1     0
+  // Shift 3       DHFB EAGC      1,01 1,11 DEFGHABC         1     1
+  // Shift 4       EAGC FBHD      1,11 1,11 EFGHABCD         2     0 
+  // Shift 5       FBHD GCAE      1,11 1,10 FGHABCDE         2     1
+  // Shift 6       GCAE HDBF      1,10 1,10 GHABCDEF         3     0
+  // Shift 7       HDBF AECG      1,10 0,00 HABCDEFG         3     1
+
+  // Generalisation to 8 way simd, 16 way simd required.
+  //
+  // Need log2 Nway masks. consisting of 
+  //	    1 bit  256 bit granule
+  //	    2 bit  128 bit granule
+  //        4 bits 64  bit granule
+  //        8 bits 32  bit granules
+  //
+  //        15 bits....
+    // TODO
+    //
+    // Base class to share common code between vRealF, VComplexF etc...
+    //
+    // lattice Broad cast assignment
+    //
+    // where() support
+    // implement with masks, and/or? Type of the mask & boolean support?
+    //
+    // Unary functions
+    // cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar<vReal> only arg
+    // exp, log, sqrt, fabs
+    //
+    // transposeColor, transposeSpin,
+    // adjColor, adjSpin,
+    // traceColor, traceSpin.
+    // peekColor, peekSpin + pokeColor PokeSpin
+    //
+    // copyMask.
+    //
+    // localMaxAbs
+    //
+    // norm2,
+    // sumMulti equivalent.
+    // Fourier transform equivalent.
+    //
diff --git a/configure b/configure
index cb6bab86..c6f93abc 100755
--- a/configure
+++ b/configure
@@ -628,8 +628,6 @@ LTLIBOBJS
 LIBOBJS
 BUILD_COMMS_NONE_FALSE
 BUILD_COMMS_NONE_TRUE
-BUILD_COMMS_FAKE_FALSE
-BUILD_COMMS_FAKE_TRUE
 BUILD_COMMS_MPI_FALSE
 BUILD_COMMS_MPI_TRUE
 EGREP
@@ -1369,8 +1367,7 @@ Optional Features:
   --disable-openmp        do not use OpenMP
   --enable-simd=SSE|AVX|AVX2|AVX512
                           Select instructions
-  --enable-comms=none|fake|mpi
-                          Select communications
+  --enable-comms=none|mpi Select communications
 
 Some influential environment variables:
   CXX         C++ compiler command
@@ -5051,12 +5048,6 @@ fi
 
 
 case ${ac_COMMS} in
-     fake)
-       echo Configuring for FAKE communications
-
-$as_echo "#define GRID_COMMS_FAKE 1" >>confdefs.h
-
-     ;;
      none)
        echo Configuring for NO communications
 
@@ -5082,14 +5073,6 @@ else
   BUILD_COMMS_MPI_FALSE=
 fi
 
- if  test "X${ac_COMMS}X" == "XfakeX" ; then
-  BUILD_COMMS_FAKE_TRUE=
-  BUILD_COMMS_FAKE_FALSE='#'
-else
-  BUILD_COMMS_FAKE_TRUE='#'
-  BUILD_COMMS_FAKE_FALSE=
-fi
-
  if  test "X${ac_COMMS}X" == "XnoneX" ; then
   BUILD_COMMS_NONE_TRUE=
   BUILD_COMMS_NONE_FALSE='#'
@@ -5243,10 +5226,6 @@ if test -z "${BUILD_COMMS_MPI_TRUE}" && test -z "${BUILD_COMMS_MPI_FALSE}"; then
   as_fn_error $? "conditional \"BUILD_COMMS_MPI\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
-if test -z "${BUILD_COMMS_FAKE_TRUE}" && test -z "${BUILD_COMMS_FAKE_FALSE}"; then
-  as_fn_error $? "conditional \"BUILD_COMMS_FAKE\" was never defined.
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
-fi
 if test -z "${BUILD_COMMS_NONE_TRUE}" && test -z "${BUILD_COMMS_NONE_FALSE}"; then
   as_fn_error $? "conditional \"BUILD_COMMS_NONE\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
diff --git a/configure.ac b/configure.ac
index 22d0365e..bfe745e5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -51,13 +51,9 @@ case ${ac_SIMD} in
 esac
 
 
-AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|fake|mpi],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
+AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
 
 case ${ac_COMMS} in
-     fake)
-       echo Configuring for FAKE communications
-       AC_DEFINE([GRID_COMMS_FAKE],[1],[GRID_COMMS_FAKE] )
-     ;;
      none)
        echo Configuring for NO communications
        AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
@@ -72,7 +68,6 @@ case ${ac_COMMS} in
 esac
 
 AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
-AM_CONDITIONAL(BUILD_COMMS_FAKE,[ test "X${ac_COMMS}X" == "XfakeX" ])
 AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])