Major rework of extract/merge/permute processing; debugged and working.

2025-07-25 17:07:07 +01:00 · 2015-04-06 11:26:24 +01:00
parent 9e597ac50a
commit 982274e5a0
24 changed files with 291 additions and 574 deletions
--- a/Grid.h
+++ b/Grid.h
@@ -42,11 +42,10 @@
 #endif


+#include <Grid_aligned_allocator.h>
 #include <Grid_simd.h>
 #include <Grid_math_types.h>
 #include <Grid_Cartesian.h>
-#include <Grid_aligned_allocator.h>
-#include <Grid_aligned_allocator.h>
 #include <Grid_Lattice.h>
 #include <Grid_QCD.h>

--- a/Grid_Cartesian.h
+++ b/Grid_Cartesian.h
@@ -8,48 +8,6 @@ namespace Grid{
 /////////////////////////////////////////////////////////////////////////////////////////
 // Grid Support.
 /////////////////////////////////////////////////////////////////////////////////////////
-//
-// Cartesian grid inheritance
-//            Grid::GridBase
-//                     |
-//           __________|___________
-//          |                      |
-// Grid::GridCartesian   Grid::GridCartesianRedBlack
-//
-// TODO: document the following as an API guaranteed public interface
-
-    /* 
-     *       Rough map of functionality against QDP++ Layout
-     *
-     *       Param     |     Grid                     |     QDP++             
-     *       -----------------------------------------
-     *                 |                              |
-     *        void     |     oSites, iSites, lSites   |  sitesOnNode 
-     *        void     |     gSites                   |  vol
-     *                 |                              |
-     *        gcoor    |     oIndex, iIndex           |  linearSiteIndex // no virtual node in QDP
-     *        lcoor    |                              |
-     * 
-     *        void     |     CheckerBoarded           |  -        // No checkerboarded in QDP
-     *        void     |     FullDimensions           |  lattSize
-     *        void     |     GlobalDimensions         |  lattSize // No checkerboarded in QDP
-     *        void     |     LocalDimensions          |  subgridLattSize
-     *        void     |     VirtualLocalDimensions   |  subgridLattSize // no virtual node in QDP
-     *                 |                              |
-     *       int x 3   |     oiSiteRankToGlobal       |  siteCoords
-     *                 |     ProcessorCoorLocalCoorToGlobalCoor | 
-     *                 |                              |
-     *     vector<int> |     GlobalCoorToRankIndex   |  nodeNumber(coord)
-     *     vector<int> |     GlobalCoorToProcessorCoorLocalCoor|  nodeCoord(coord)
-     *                 |                              |
-     *     void        |     Processors               |  logicalSize    // returns cart array shape
-     *     void        |     ThisRank        |  nodeNumber();  // returns this node rank
-     *     void        |     ThisProcessorCoor        |    // returns this node coor
-     *     void        |     isBoss(void)             |  primaryNode();
-     *                 |                              |
-     *                 |     RankFromProcessorCoor    |  getLogicalCoorFrom(node)
-     *                 |     ProcessorCoorFromRank    |  getNodeNumberFrom(logical_coord)
-     */

 class GridBase : public CartesianCommunicator {
 public:
@@ -60,7 +18,8 @@ public:
 GridBase(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};

        
- //protected:
+ // FIXME: the access specifier below is commented out, so these members stay public
+ // protected:
 // Lattice wide random support. not yet fully implemented. Need seed strategy
 // and one generator per site.
 // std::default_random_engine generator;
@@ -165,7 +124,16 @@ public:
 	lane    = lane / _simd_layout[d];
      }
    }
-    
+    inline int PermuteDim(int dimension){
+      return _simd_layout[dimension]>1;
+    }
+    inline int PermuteType(int dimension){
+      int permute_type=0;
+      for(int d=_ndimension-1;d>dimension;d--){
+	if (_simd_layout[d]>1 ) permute_type++;
+      }
+      return permute_type;
+    }
    ////////////////////////////////////////////////////////////////
    // Array sizing queries
    ////////////////////////////////////////////////////////////////
@@ -399,8 +367,6 @@ public:
            
        ////////////////////////////////////////////////////////////////////////////////////////////
        // subplane information
-        // It may be worth the investment of generating a more general subplane "iterator",
-        // and providing support for threads grabbing a unit of allocation.
        ////////////////////////////////////////////////////////////////////////////////////////////
        _slice_block.resize(_ndimension);
        _slice_stride.resize(_ndimension);
--- a/Grid_Lattice.h
+++ b/Grid_Lattice.h
@@ -4,17 +4,9 @@
 #include "Grid.h"


-
 namespace Grid {

-// Permute the pointers 32bitx16 = 512
-static int permute_map[4][16] = { 
-  { 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14},
-  { 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13},
-  { 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11},
-  { 9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8}
-};
-
+  extern int GridCshiftPermuteMap[4][16];

 template<class vobj>
 class Lattice
@@ -37,11 +29,10 @@ public:

 #include <Grid_cshift.h>
    
-    // overloading Grid::conformable but no conformable in Grid ...?:w
    template<class obj1,class obj2>
    friend void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs);

-    // Performance difference between operator * and mult is troubling.
+    // FIXME Performance difference between operator * and mult is troubling.
    // Auto move constructor seems to lose surprisingly much.

    // Site wise binary operations
@@ -182,23 +173,20 @@ public:
        }}
    };

+    // FIXME Implement a consistent seed management strategy
    friend void gaussian(Lattice<vobj> &l){
        // Zero mean, unit variance.
        std::normal_distribution<double> distribution(0.0,1.0);
        Real *v_ptr = (Real *)&l._odata[0];
        size_t v_len = l._grid->oSites()*sizeof(vobj);
        size_t d_len = v_len/sizeof(Real);
-        
-        // Not a parallel RNG. Could make up some seed per 4d site, seed
-        // per hypercube type scheme.
+
        for(int i=0;i<d_len;i++){
 	  v_ptr[i]= drand48();
        }
    };

-
    // Unary functions and Unops
-    // Unary negation
    friend inline Lattice<vobj> operator -(const Lattice<vobj> &r) {
        Lattice<vobj> ret(r._grid);
 #pragma omp parallel for
--- a/Grid_QCD.h
+++ b/Grid_QCD.h
@@ -24,10 +24,7 @@ namespace QCD {
    typedef iSinglet<Real >             TReal;    // This is painful. Tensor singlet complex type.


-    typedef iSinglet<vIntegerF >         vTIntegerF;
-    typedef iSinglet<vIntegerD >         vTIntegerD;
-    typedef iSinglet<vIntegerC >         vTIntegerC;
-    typedef iSinglet<vIntegerZ >         vTIntegerZ;
+    typedef iSinglet<vInteger >         vTInteger;

    typedef iSpinMatrix<Complex >       SpinMatrix;
    typedef iColourMatrix<Complex >     ColourMatrix;
@@ -46,12 +43,9 @@ namespace QCD {
    typedef iColourVector<vComplex >     vColourVector;
    typedef iSpinColourVector<vComplex > vSpinColourVector;
    
-    typedef Lattice<vTComplex>         LatticeComplex;
+    typedef Lattice<vTComplex>            LatticeComplex;

-    typedef Lattice<vTIntegerF>            LatticeIntegerF; // Predicates for "where"
-    typedef Lattice<vTIntegerD>            LatticeIntegerD; 
-    typedef Lattice<vTIntegerC>            LatticeIntegerC;
-    typedef Lattice<vTIntegerZ>            LatticeIntegerZ;
+    typedef Lattice<vTInteger>            LatticeInteger; // Predicates for "where"
    
    typedef Lattice<vColourMatrix>     LatticeColourMatrix;
    typedef Lattice<vSpinMatrix>       LatticeSpinMatrix;
--- a/Grid_aligned_allocator.h
+++ b/Grid_aligned_allocator.h
@@ -1,5 +1,8 @@
 #ifndef GRID_ALIGNED_ALLOCATOR_H
 #define GRID_ALIGNED_ALLOCATOR_H
+
+#include <immintrin.h>
+
 namespace Grid {

 ////////////////////////////////////////////////////////////////////
--- a/Grid_communicator_fake.cc
+++ b/Grid_communicator_fake.cc
--- a/Grid_communicator_mpi.cc
+++ b/Grid_communicator_mpi.cc
--- a/Grid_config.h
+++ b/Grid_config.h
@@ -10,9 +10,6 @@
 /* AVX512 */
 /* #undef AVX512 */

-/* GRID_COMMS_FAKE */
-/* #undef GRID_COMMS_FAKE */
-
 /* GRID_COMMS_MPI */
 #define GRID_COMMS_MPI 1

--- a/Grid_config.h.in
+++ b/Grid_config.h.in
@@ -9,9 +9,6 @@
 /* AVX512 */
 #undef AVX512

-/* GRID_COMMS_FAKE */
-#undef GRID_COMMS_FAKE
-
 /* GRID_COMMS_MPI */
 #undef GRID_COMMS_MPI

--- a/Grid_cshift_common.h
+++ b/Grid_cshift_common.h
@@ -1,17 +1,5 @@
 #ifndef _GRID_CSHIFT_COMMON_H_
 #define _GRID_CSHIFT_COMMON_H_
-//////////////////////////////////////////////////////////////////////////////////////////
-// Must not lose sight that goal is to be able to construct really efficient
-// gather to a point stencil code. CSHIFT is not the best way, so probably need
-// additional stencil support.
-//
-// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
-//
-// Lattice <foo> could also allocate haloes which get used for stencil code.
-//
-// Grid could create a neighbour index table for a given stencil.
-// Could also implement CovariantCshift.
-//////////////////////////////////////////////////////////////////////////////////////////

 //////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split
@@ -57,7 +45,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
  }
 }

-
 //////////////////////////////////////////////////////
 // Gather for when there *is* need to SIMD split
 //////////////////////////////////////////////////////
@@ -101,8 +88,6 @@ friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> p
  }
 }

-
-
 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
@@ -146,7 +131,6 @@ friend void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAll
  }
 }

-
 //////////////////////////////////////////////////////
 // Scatter for when there *is* need to SIMD split
 //////////////////////////////////////////////////////
@@ -190,11 +174,9 @@ friend void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> po
  }
 }

-
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
-// if lhs is odd, rhs even??
 friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
 {
  int rd = rhs._grid->_rdimensions[dimension];
@@ -284,40 +266,6 @@ friend void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimens
 // Local to node Cshift
 //////////////////////////////////////////////////////

-  // Work out whether to permute 
-  // ABCDEFGH ->   AE BF CG DH       permute              wrap num
-  //
-  // Shift 0       AE BF CG DH       0 0 0 0    ABCDEFGH   0   0
-  // Shift 1       BF CG DH AE       0 0 0 1    BCDEFGHA   0   1
-  // Shift 2       CG DH AE BF       0 0 1 1    CDEFGHAB   0   2
-  // Shift 3       DH AE BF CG       0 1 1 1    DEFGHABC   0   3
-  // Shift 4       AE BF CG DH       1 1 1 1    EFGHABCD   1   0 
-  // Shift 5       BF CG DH AE       1 1 1 0    FGHACBDE   1   1 
-  // Shift 6       CG DH AE BF       1 1 0 0    GHABCDEF   1   2
-  // Shift 7       DH AE BF CG       1 0 0 0    HABCDEFG   1   3
-
-  // Suppose 4way simd in one dim.
-  // ABCDEFGH ->   AECG BFDH      permute              wrap num
-
-  // Shift 0       AECG BFDH      0,00 0,00 ABCDEFGH         0     0
-  // Shift 1       BFDH CGEA      0,00 1,01 BCDEFGHA         0     1
-  // Shift 2       CGEA DHFB      1,01 1,01 CDEFGHAB         1     0
-  // Shift 3       DHFB EAGC      1,01 1,11 DEFGHABC         1     1
-  // Shift 4       EAGC FBHD      1,11 1,11 EFGHABCD         2     0 
-  // Shift 5       FBHD GCAE      1,11 1,10 FGHABCDE         2     1
-  // Shift 6       GCAE HDBF      1,10 1,10 GHABCDEF         3     0
-  // Shift 7       HDBF AECG      1,10 0,00 HABCDEFG         3     1
-
-  // Generalisation to 8 way simd, 16 way simd required.
-  //
-  // Need log2 Nway masks. consisting of 
-  //	    1 bit  256 bit granule
-  //	    2 bit  128 bit granule
-  //        4 bits 64  bit granule
-  //        8 bits 32  bit granules
-  //
-  //        15 bits....
-
 friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
 {
  int sshift[2];
@@ -333,35 +281,31 @@ friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int
  }
 }

-
 friend Lattice<vobj> Cshift_local(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
-  int fd = rhs._grid->_fdimensions[dimension];
-  int rd = rhs._grid->_rdimensions[dimension];
-  int ld = rhs._grid->_ldimensions[dimension];
-  int gd = rhs._grid->_gdimensions[dimension];
-  
+  GridBase *grid = rhs._grid;
+  int fd = grid->_fdimensions[dimension];
+  int rd = grid->_rdimensions[dimension];
+  int ld = grid->_ldimensions[dimension];
+  int gd = grid->_gdimensions[dimension];

  // Map to always positive shift modulo global full dimension.
  shift = (shift+fd)%fd;

-  ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
+  ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift);
        
  // the permute type
-  int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
-  int permute_type=0;
-  for(int d=0;d<dimension;d++){
-    if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
-  }
+  int permute_dim =grid->PermuteDim(dimension);
+  int permute_type=grid->PermuteType(dimension);

  for(int x=0;x<rd;x++){       

    int o   = 0;
-    int bo  = x * rhs._grid->_ostride[dimension];
+    int bo  = x * grid->_ostride[dimension];
    
    int cb= (cbmask==0x2)? 1 : 0;

-    int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
+    int sshift = grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
    int sx     = (x+sshift)%rd;
 	
    int permute_slice=0;
--- a/Grid_cshift_mpi.h
+++ b/Grid_cshift_mpi.h
@@ -146,10 +146,7 @@ friend void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
  assert(shift>=0);
  assert(shift<fd);

-  int permute_type=0;
-  for(int d=0;d<dimension;d++){
-    if (grid->_simd_layout[d]>1 ) permute_type++;
-  }
+  int permute_type=grid->PermuteType(dimension);

  ///////////////////////////////////////////////
  // Simd direction uses an extract/merge pair
@@ -236,9 +233,12 @@ friend void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
      if ( x< rd-num ) permute_slice=wrap;
      else permute_slice = 1-wrap;

+      int toggle_bit = (Nsimd>>(permute_type+1));
+      int PermuteMap;
      for(int i=0;i<Nsimd;i++){
 	if ( permute_slice ) {
-	  pointers[i] = rpointers[permute_map[permute_type][i]];
+	  PermuteMap=i^toggle_bit;
+	  pointers[i] = rpointers[PermuteMap];
 	} else {
 	  pointers[i] = rpointers[i];
 	}
@@ -260,8 +260,4 @@ friend void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
    }
  }
 }
-
-
-
-
 #endif
--- a/Grid_main.cc
+++ b/Grid_main.cc
@@ -20,8 +20,8 @@ int main (int argc, char ** argv)
  
  std::vector<int> mpi_layout(4);
  mpi_layout[0]=2;
-  mpi_layout[1]=1;
-  mpi_layout[2]=1;
+  mpi_layout[1]=2;
+  mpi_layout[2]=2;
  mpi_layout[3]=2;

 #ifdef AVX512
--- a/Grid_simd.h
+++ b/Grid_simd.h
@@ -10,32 +10,6 @@
 //
 // Vector types are arch dependent
 ////////////////////////////////////////////////////////////////////////
-    // TODO
-    //
-    // Base class to share common code between vRealF, VComplexF etc...
-    //
-    // lattice Broad cast assignment
-    //
-    // where() support
-    // implement with masks, and/or? Type of the mask & boolean support?
-    //
-    // Unary functions
-    // cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar<vReal> only arg
-    // exp, log, sqrt, fabs
-    //
-    // transposeColor, transposeSpin,
-    // adjColor, adjSpin,
-    // traceColor, traceSpin.
-    // peekColor, peekSpin + pokeColor PokeSpin
-    //
-    // copyMask.
-    //
-    // localMaxAbs
-    //
-    // norm2,
-    // sumMulti equivalent.
-    // Fourier transform equivalent.
-    //
    
  ////////////////////////////////////////////////////////////
  // SIMD Alignment controls
@@ -71,9 +45,6 @@ namespace Grid {
  typedef std::complex<RealD> ComplexD;
  typedef std::complex<Real>  Complex;

-
-
-
  inline RealF adj(const RealF  & r){ return r; }
  inline RealF conj(const RealF  & r){ return r; }
  inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
@@ -122,7 +93,6 @@ namespace Grid {
  template<>            inline void ZeroIt(RealD &arg){ arg=0; };


-
 #if defined (SSE2)
    typedef __m128 fvec;
    typedef __m128d dvec;
@@ -162,31 +132,46 @@ namespace Grid {
    inline void v_prefetch0(int size, const char *ptr){};
 #endif

-};
-

 /////////////////////////////////////////////////////////////////
 // Generic extract/merge/permute
 /////////////////////////////////////////////////////////////////
-template<class vsimd,class scalar,int Nsimd>
+template<class vsimd,class scalar>
 inline void Gextract(vsimd &y,std::vector<scalar *> &extracted){
-  // Bounce off stack is painful
-  // temporary hack while I figure out the right interface
-  scalar buf[Nsimd]; 
-  vstore(y,buf);
-  for(int i=0;i<Nsimd;i++){
-    *extracted[i] = buf[i];
-    extracted[i]++;
+#if 1
+  // FIXME: bouncing the data through a temporary buffer is painful;
+  // this is a temporary hack while I figure out a better way.
+  // There are intrinsics to do this work without the storage.
+  int Nsimd = extracted.size();
+  {
+    std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd); 
+    vstore(y,&buf[0]);
+    for(int i=0;i<Nsimd;i++){
+      *extracted[i] = buf[i];
+      extracted[i]++;
+    }
  }
+#else 
+  int NSo   = extracted.size();
+  int NSv   = vsimd::Nsimd();
+  int sparse= NSv/NSo;
+  for(int i=0;i<NSv;i+=sparse){
+    
+  }
+#endif
 };
-template<class vsimd,class scalar,int Nsimd>
+template<class vsimd,class scalar>
 inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
-  scalar buf[Nsimd]; 
+#if 1
+  int Nsimd = extracted.size();
+  std::vector<scalar> buf(Nsimd); 
  for(int i=0;i<Nsimd;i++){
    buf[i]=*extracted[i];
    extracted[i]++;
  }
-  vset(y,buf); 
+  vset(y,&buf[0]); 
+#else
+#endif
 };

 //////////////////////////////////////////////////////////
@@ -197,8 +182,6 @@ inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
 // Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
 // Permute 4 possible on half precision @512bit vectors.
 //////////////////////////////////////////////////////////
-// Should be able to make the permute/extract/merge independent of the
-// vector subtype and reduce the volume of code.
 template<class vsimd>
 inline void Gpermute(vsimd &y,vsimd b,int perm){
      switch (perm){
@@ -229,6 +212,7 @@ inline void Gpermute(vsimd &y,vsimd b,int perm){
      default: assert(0); break;
      }
    };
+};

 #include <Grid_vRealF.h>
 #include <Grid_vRealD.h>
--- a/Grid_stencil.h
+++ b/Grid_stencil.h
@@ -0,0 +1,12 @@
+//////////////////////////////////////////////////////////////////////////////////////////
+// Must not lose sight of the goal: being able to construct really efficient
+// gather-to-a-point stencil code. CSHIFT is not the best way, so we probably need
+// additional stencil support.
+//
+// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
+//
+// Lattice <foo> could also allocate haloes which get used for stencil code.
+//
+// Grid could create a neighbour index table for a given stencil.
+// Could also implement CovariantCshift.
+//////////////////////////////////////////////////////////////////////////////////////////
--- a/Grid_vComplexD.h
+++ b/Grid_vComplexD.h
@@ -5,7 +5,7 @@

 namespace Grid {
    class vComplexD {
-    protected:
+    public:
        zvec v;
    public:
 	typedef zvec     vector_type;
@@ -154,64 +154,27 @@ namespace Grid {
            return ret;
        };

-        /////////////////////////////////////////////////////////////////
-        // Extract
-        /////////////////////////////////////////////////////////////////
-        friend inline void extract(vComplexD &y,std::vector<ComplexD *> &extracted){
-	  // Bounce off stack is painful
-	  // temporary hack while I figure out the right interface
-	  const int Nsimd = vComplexD::Nsimd();
-	  std::vector<ComplexD> buf(Nsimd); 
+	////////////////////////////////////////////////////////////////////
+	// General permute; assumes vector length is same across 
+	// all subtypes; may not be a good assumption, but could
+	// add the vector width as a template param for BG/Q for example
+	////////////////////////////////////////////////////////////////////
+	friend inline void permute(vComplexD &y,vComplexD b,int perm)
+	{
+	  Gpermute<vComplexD>(y,b,perm);
+	}
+	friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted)
+	{
+	  Gmerge<vComplexD,ComplexD >(y,extracted);
+	}
+	friend inline void extract(vComplexD &y,std::vector<ComplexD *> &extracted)
+	{
+	  Gextract<vComplexD,ComplexD>(y,extracted);
+	}

-	  vstore(y,&buf[0]);
-
-	  for(int i=0;i<Nsimd;i++){
-	    *extracted[i] = buf[i];
-	    extracted[i]++;
-	  }
-        };
-
-        friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted){
-	  // Bounce off stack is painful
-	  // temporary hack while I figure out the right interface
-	  const int Nsimd = vComplexD::Nsimd();
-	  std::vector<ComplexD> buf(Nsimd); 
-
-	  for(int i=0;i<Nsimd;i++){
-	    buf[i]=*extracted[i];
-	    extracted[i]++;
-	  }
-	  vset(y,&buf[0]); 
-        };
-        
-        
-        /////////////////////////////////////////////////////////////////
-        // Permute
-        /////////////////////////////////////////////////////////////////
-        friend inline void permute(vComplexD &y,vComplexD b,int perm){
-            switch (perm){
-                    // 2 complex=>1 permute
-#if defined(AVX1)||defined(AVX2)
-                case 0: y.v = _mm256_permute2f128_pd(b.v,b.v,0x01); break;
-                // AB => BA i.e. ab cd =>cd ab
-#endif
-#ifdef SSE2
-		  break;
-#endif
-#ifdef AVX512
-                    // 4 complex=>2 permute
-                // ABCD => BADC i.e. abcd efgh => cdab ghef
-                // ABCD => CDAB i.e. abcd efgh => efgh abcd
-                case 0: y.v = _mm512_swizzle_pd(b.v,_MM_SWIZ_REG_BADC); break;
-                case 1: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); // permute for double is not implemented 
-
-#endif
-#ifdef QPX
-#error // Not implemented yet
-#endif
- 	        default: assert(0); break;
-            }
-        };
+	////////////////////////////////////////////////////////////////////////
+	// FIXME: these load/store, get, set and prefetch helpers are slated for removal
+	////////////////////////////////////////////////////////////////////////
        void vload(zvec& a){
          this->v = a;
        }
@@ -296,7 +259,7 @@ friend inline void vstore(vComplexD &ret, ComplexD *a){
 #endif
            return ret;
        }
-// REDUCE
+// REDUCE. FIXME: there must be a cleaner implementation.
       friend inline ComplexD Reduce(const vComplexD & in)
       { 
 #if defined (AVX1) || defined(AVX2)
--- a/Grid_vComplexF.h
+++ b/Grid_vComplexF.h
@@ -4,7 +4,9 @@

 namespace Grid {
    class vComplexF {
-    protected:
+      //    protected:
+
+    public:
        cvec v;
        
    public:
@@ -129,75 +131,11 @@ namespace Grid {
 #endif
            return ret;
        };
+      

-        /////////////////////////////////////////////////////////////////
-        // Extract
-        /////////////////////////////////////////////////////////////////
-        friend inline void extract(vComplexF &y,std::vector<ComplexF *> &extracted){
-	  // Bounce off heap is painful
-	  // temporary hack while I figure out the right interface
-            vComplexF vbuf;
-            ComplexF *buf = (ComplexF *)&vbuf;
-            
-	  vstore(y,&buf[0]);
-	  for(int i=0;i<vComplexF::Nsimd();i++){
-	    *extracted[i] = buf[i];
-	    extracted[i]++;
-	  }
-
-        };
-
-        friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted){
-	  // Bounce off stack is painful
-	  // temporary hack while I figure out the right interface
-	  const int Nsimd = vComplexF::Nsimd();
-            vComplexF vbuf;
-            ComplexF *buf = (ComplexF *)&vbuf;
-
-	  for(int i=0;i<Nsimd;i++){
-	    buf[i]=*extracted[i];
-	    extracted[i]++;
-	  }
-	  vset(y,&buf[0]); 
-        };
-        
-
-
-        /////////////////////////////////////////////////////////////////
-        // Permute
-        /////////////////////////////////////////////////////////////////
-        friend inline void permute(vComplexF &y,vComplexF b,int perm){
-            switch (perm){
-#if defined(AVX1)||defined(AVX2)
-//HERE
-                    // 4 complex=>2 permutes
-                    // case 0 ABCD->BADC
-                    // case 1 ABCD->CDAB
-                case 0: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break;
-                case 1: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break;
-#endif
-#ifdef SSE2
-                case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break;
-#endif
-#ifdef AVX512
-//#error should permute for  512
-                    // 8 complex=>3 permutes
-                    // case 0 ABCD EFGH -> BADC FEHG
-                    // case 1 ABCD EFGH -> CDAB GHEF
-                    // case 2 ABCD EFGH -> EFGH ABCD
-                case 0: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_CDAB); break; // OK
-                case 1: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_BADC); break; // OK
-                case 2: y.v = _mm512_permute4f128_ps(b.v, (_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break; // OK
-
-#endif
-#ifdef QPX
-#error
-#endif
-	        default: assert(0); break;
-            }
-        };
-        
-
+	////////////////////////////////////////////////////////////////////////
+	// FIXME: these load/store, get, set and prefetch helpers are slated for removal
+	////////////////////////////////////////////////////////////////////////
        friend inline void vset(vComplexF &ret, Complex *a){
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
@@ -358,6 +296,20 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){
            return *this;
        }

+      friend inline void permute(vComplexF &y,vComplexF b,int perm)
+      {
+	Gpermute<vComplexF>(y,b,perm);
+      }
+      friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted)
+      {
+	Gmerge<vComplexF,ComplexF >(y,extracted);
+      }
+      friend inline void extract(vComplexF &y,std::vector<ComplexF *> &extracted)
+      {
+	Gextract<vComplexF,ComplexF>(y,extracted);
+      }
+
+
    };

    inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; }
@@ -371,7 +323,5 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){
        return l*r;
    }

-
-
 }
 #endif
--- a/Grid_vInteger.h
+++ b/Grid_vInteger.h
@@ -235,70 +235,11 @@ friend inline void vstore(vInteger &ret, Integer *a){
      }
      friend inline void merge(vIntegerF &y,std::vector<Integer *> &extracted)
      {
-	Gmerge<vIntegerF,Integer,sizeof(ivec)/sizeof(float) >(y,extracted);
+	Gmerge<vIntegerF,Integer>(y,extracted);
      }
      friend inline void extract(vIntegerF &y,std::vector<Integer *> &extracted)
      {
-	Gextract<vIntegerF,Integer,sizeof(ivec)/sizeof(float) >(y,extracted);
-      }
-    };
-
-
-    class vIntegerD : public vInteger
-    {
-    public:
-      static inline int Nsimd(void) { return sizeof(ivec)/sizeof(double);}
-      
-      friend inline void permute(vIntegerD &y,vIntegerD b,int perm)
-      {
-	Gpermute<vIntegerD>(y,b,perm);
-      }
-      friend inline void merge(vIntegerD &y,std::vector<Integer *> &extracted)
-      {
-	Gmerge<vIntegerD,Integer,sizeof(ivec)/sizeof(double) >(y,extracted);
-      }
-      friend inline void extract(vIntegerD &y,std::vector<Integer *> &extracted)
-      {
-	Gextract<vIntegerD,Integer,sizeof(ivec)/sizeof(double) >(y,extracted);
-      }
-    };
-
-
-    class vIntegerC : public vInteger
-    {
-    public:
-      static inline int Nsimd(void) { return sizeof(ivec)/sizeof(ComplexF);}
-      
-      friend inline void permute(vIntegerC &y,vIntegerC b,int perm)
-      {
-	Gpermute<vIntegerC>(y,b,perm);
-      }
-      friend inline void merge(vIntegerC &y,std::vector<Integer *> &extracted)
-      {
-	Gmerge<vIntegerC,Integer,sizeof(ivec)/sizeof(ComplexF) >(y,extracted);
-      }
-      friend inline void extract(vIntegerC &y,std::vector<Integer *> &extracted)
-      {
-	Gextract<vIntegerC,Integer,sizeof(ivec)/sizeof(ComplexF) >(y,extracted);
-      }
-    };
-
-    class vIntegerZ : public vInteger
-    {
-    public:
-      static inline int Nsimd(void) { return sizeof(ivec)/sizeof(ComplexD);}
-      
-      friend inline void permute(vIntegerZ &y,vIntegerZ b,int perm)
-      {
-	Gpermute<vIntegerZ>(y,b,perm);
-      }
-      friend inline void merge(vIntegerZ &y,std::vector<Integer *> &extracted)
-      {
-	Gmerge<vIntegerZ,Integer,sizeof(ivec)/sizeof(ComplexD) >(y,extracted);
-      }
-      friend inline void extract(vIntegerZ &y,std::vector<Integer *> &extracted)
-      {
-	Gextract<vIntegerZ,Integer,sizeof(ivec)/sizeof(ComplexD) >(y,extracted);
+	Gextract<vIntegerF,Integer>(y,extracted);
      }
    };

--- a/Grid_vRealD.h
+++ b/Grid_vRealD.h
@@ -5,7 +5,7 @@

 namespace Grid {
    class vRealD  {
-    protected:
+    public:
        dvec v; // dvec is double precision vector

    public:
@@ -99,72 +99,27 @@ namespace Grid {
            return ret;
        };

-        /////////////////////////////////////////////////////////////////
-        // Extract
-        /////////////////////////////////////////////////////////////////
-        friend inline void extract(vRealD &y,std::vector<RealD *> &extracted){
-	  // Bounce off stack is painful
-	  // temporary hack while I figure out the right interface
-	  const int Nsimd = vRealD::Nsimd();
-	  RealD buf[Nsimd]; 
+	////////////////////////////////////////////////////////////////////
+	// General permute; assumes vector length is same across 
+	// all subtypes; may not be a good assumption, but could
+	// add the vector width as a template param for BG/Q for example
+	////////////////////////////////////////////////////////////////////
+	friend inline void permute(vRealD &y,vRealD b,int perm)
+	{
+	  Gpermute<vRealD>(y,b,perm);
+	}
+	friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
+	{
+	  Gmerge<vRealD,RealD >(y,extracted);
+	}
+	friend inline void extract(vRealD &y,std::vector<RealD *> &extracted)
+	{
+	  Gextract<vRealD,RealD>(y,extracted);
+	}

-	  vstore(y,buf);
-
-	  for(int i=0;i<Nsimd;i++){
-	    *extracted[i] = buf[i];
-	    extracted[i]++;
-	  }
-        };
-
-        friend inline void merge(vRealD &y,std::vector<RealD *> &extracted){
-	  // Bounce off stack is painful
-	  // temporary hack while I figure out the right interface
-	  const int Nsimd = vRealD::Nsimd();
-	  RealD buf[Nsimd]; 
-
-	  for(int i=0;i<Nsimd;i++){
-	    buf[i]=*extracted[i];
-	    extracted[i]++;
-	  }
-	  vset(y,buf); 
-        };
-
-        
-        // Permute plans
-        // Permute 0 every ABCDEFGH -> BA DC FE HG
-        // Permute 1 every ABCDEFGH -> CD AB GH EF
-        // Permute 2 every ABCDEFGH -> EFGH ABCD
-        // Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
-        // Permute 4 possible on half precision @512bit vectors.
-        friend inline void permute(vRealD &y,vRealD b,int perm){
-            switch (perm){
-                    // 4 doubles=>2 permutes
-#if defined(AVX1)||defined(AVX2)
-                case 0: y.v = _mm256_shuffle_pd(b.v,b.v,0x5); break;
-                case 1: y.v = _mm256_permute2f128_pd(b.v,b.v,0x01); break;
-#endif
-#ifdef SSE2
-                case 0: y.v = _mm_shuffle_pd(b.v,b.v,0x1); break;
-#endif
-#ifdef AVX512
-                    // 8 double => 3 permutes
-        // Permute 0 every abcd efgh -> badc fehg 
-        // Permute 1 every abcd efgh -> cdab ghef 
-        // Permute 2 every abcd efgh -> efgh abcd 
-        // NOTE: mm_512_permutex_pd not implemented
-        // NOTE: ignore warning
-                case 0: y.v = _mm512_swizzle_pd(b.v,_MM_SWIZ_REG_CDAB); break;
-                case 1: y.v = _mm512_swizzle_pd(b.v,_MM_SWIZ_REG_BADC); break;
-                case 2: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
-      
-#endif
-#ifdef QPX
-#error
-#endif
-	    default: assert(0);break;
-            }
-        };
-// gona be bye bye
+	////////////////////////////////////////////////////////////////////////
+	// FIXME: these load/store, get, set and prefetch helpers are slated for removal
+	////////////////////////////////////////////////////////////////////////
        void vload(dvec& a){
          this->v = a;
        }
--- a/Grid_vRealF.h
+++ b/Grid_vRealF.h
@@ -5,7 +5,7 @@

 namespace Grid {
    class vRealF  {
-    protected:
+    public:
        fvec v;

    public:
@@ -120,74 +120,25 @@ namespace Grid {
        friend inline void vzero(vRealF &ret){vsplat(ret,0.0);}


-        /////////////////////////////////////////////////////////////////
-        // Extract
-        /////////////////////////////////////////////////////////////////
-        friend inline void extract(vRealF &y,std::vector<RealF *> &extracted){
-	  // Bounce off stack is painful
-	  // temporary hack while I figure out the right interface
-	  const int Nsimd = vRealF::Nsimd();
-	  RealF buf[Nsimd]; 
+	////////////////////////////////////////////////////////////////////
+	// General permute; assumes vector length is same across 
+	// all subtypes; may not be a good assumption, but could
+	// add the vector width as a template param for BG/Q for example
+	////////////////////////////////////////////////////////////////////
+	friend inline void permute(vRealF &y,vRealF b,int perm)
+	{
+	  Gpermute<vRealF>(y,b,perm);
+	}
+	friend inline void merge(vRealF &y,std::vector<RealF *> &extracted)
+	{
+	  Gmerge<vRealF,RealF >(y,extracted);
+	}
+	friend inline void extract(vRealF &y,std::vector<RealF *> &extracted)
+	{
+	  Gextract<vRealF,RealF>(y,extracted);
+	}

-	  vstore(y,buf);

-	  for(int i=0;i<Nsimd;i++){
-	    *extracted[i] = buf[i];
-	    extracted[i]++;
-	  }
-        };
-
-        friend inline void merge(vRealF &y,std::vector<RealF *> &extracted){
-	  // Bounce off stack is painful
-	  // temporary hack while I figure out the right interface
-	  const int Nsimd = vRealF::Nsimd();
-	  RealF buf[Nsimd]; 
-
-	  for(int i=0;i<Nsimd;i++){
-	    buf[i]=*extracted[i];
-	    extracted[i]++;
-	  }
-	  vset(y,buf); 
-        };
-        
-        //////////////////////////////////////////////////////////
-        // Permute
-        // Permute 0 every ABCDEFGH -> BA DC FE HG
-        // Permute 1 every ABCDEFGH -> CD AB GH EF
-        // Permute 2 every ABCDEFGH -> EFGH ABCD
-        // Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
-        // Permute 4 possible on half precision @512bit vectors.
-        //////////////////////////////////////////////////////////
-        friend inline void permute(vRealF &y,vRealF b,int perm){
-            switch (perm){
-                    // 8 floats=>3 permutes
-#if defined(AVX1)||defined(AVX2)
-                case 0: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
-                case 1: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break;
-                case 2: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break;
-#endif
-#ifdef SSE2
-                case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
-                case 1: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break;
-#endif
-#ifdef AVX512
-                    // 16 floats=> permutes
-        // Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo 
-        // Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn 
-        // Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
-        // Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
-//#error not implemented should do something
-                case 0: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_CDAB); break;
-                case 1: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_BADC); break;
-                case 2: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
-                case 3: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
-#endif
-#ifdef QPX
-#error not implemented
-#endif
-	    default: assert(0); break;
-            }
-        };
        
        /////////////////////////////////////////////////////
        // Broadcast a value across Nsimd copies.
@@ -207,6 +158,8 @@ namespace Grid {
            ret.v = {a,a,a,a};
 #endif
        }
+
+
        friend inline void vset(vRealF &ret, float *a){
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
@@ -224,6 +177,9 @@ namespace Grid {
 #endif
 	}

+	////////////////////////////////////////////////////////////////////////
+	// FIXME: these load/store, get, set, prefetch helpers are going to be removed
+	////////////////////////////////////////////////////////////////////////
 friend inline void vstore(vRealF &ret, float *a){
 #if defined (AVX1)|| defined (AVX2)
 	_mm256_store_ps(a,ret.v);
--- a/Makefile.am
+++ b/Makefile.am
@@ -24,7 +24,6 @@ include_HEADERS = Grid_config.h\
 	Grid_aligned_allocator.h\
 	Grid_cshift.h\
 	Grid_cshift_common.h\
-	Grid_cshift_fake.h\
 	Grid_cshift_mpi.h\
 	Grid_cshift_none.h\
 	Grid_math_types.h
@@ -37,13 +36,10 @@ bin_PROGRAMS = Grid_main

 extra_sources=
 if BUILD_COMMS_MPI
-  extra_sources+=Grid_mpi.cc
-endif
-if BUILD_COMMS_FAKE
-  extra_sources+=Grid_fake.cc
+  extra_sources+=Grid_communicator_mpi.cc
 endif
 if BUILD_COMMS_NONE
-  extra_sources+=Grid_fake.cc
+  extra_sources+=Grid_communicator_fake.cc
 endif

 Grid_main_SOURCES = \
--- a/Makefile.in
+++ b/Makefile.in
@@ -89,9 +89,8 @@ NORMAL_UNINSTALL = :
 PRE_UNINSTALL = :
 POST_UNINSTALL = :
 bin_PROGRAMS = Grid_main$(EXEEXT)
-@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_mpi.cc
-@BUILD_COMMS_FAKE_TRUE@am__append_2 = Grid_fake.cc
-@BUILD_COMMS_NONE_TRUE@am__append_3 = Grid_fake.cc
+@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_communicator_mpi.cc
+@BUILD_COMMS_NONE_TRUE@am__append_2 = Grid_communicator_fake.cc
 subdir = .
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/configure.ac
@@ -146,12 +145,13 @@ libGrid_a_LIBADD =
 am_libGrid_a_OBJECTS = Grid_init.$(OBJEXT)
 libGrid_a_OBJECTS = $(am_libGrid_a_OBJECTS)
 PROGRAMS = $(bin_PROGRAMS)
-am__Grid_main_SOURCES_DIST = Grid_main.cc Grid_mpi.cc Grid_fake.cc
-@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_mpi.$(OBJEXT)
-@BUILD_COMMS_FAKE_TRUE@am__objects_2 = Grid_fake.$(OBJEXT)
-@BUILD_COMMS_NONE_TRUE@am__objects_3 = Grid_fake.$(OBJEXT)
-am__objects_4 = $(am__objects_1) $(am__objects_2) $(am__objects_3)
-am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) $(am__objects_4)
+am__Grid_main_SOURCES_DIST = Grid_main.cc Grid_communicator_mpi.cc \
+	Grid_communicator_fake.cc
+@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_communicator_mpi.$(OBJEXT)
+@BUILD_COMMS_NONE_TRUE@am__objects_2 =  \
+@BUILD_COMMS_NONE_TRUE@	Grid_communicator_fake.$(OBJEXT)
+am__objects_3 = $(am__objects_1) $(am__objects_2)
+am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) $(am__objects_3)
 Grid_main_OBJECTS = $(am_Grid_main_OBJECTS)
 Grid_main_DEPENDENCIES = libGrid.a
 AM_V_P = $(am__v_P_@AM_V@)
@@ -214,8 +214,8 @@ CTAGS = ctags
 CSCOPE = cscope
 AM_RECURSIVE_TARGETS = cscope
 am__DIST_COMMON = $(srcdir)/Grid_config.h.in $(srcdir)/Makefile.in \
-	AUTHORS COPYING ChangeLog INSTALL NEWS README compile depcomp \
-	install-sh missing
+	AUTHORS COPYING ChangeLog INSTALL NEWS README TODO compile \
+	depcomp install-sh missing
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 distdir = $(PACKAGE)-$(VERSION)
 top_distdir = $(distdir)
@@ -353,12 +353,11 @@ include_HEADERS = Grid_config.h\
 	Grid_aligned_allocator.h\
 	Grid_cshift.h\
 	Grid_cshift_common.h\
-	Grid_cshift_fake.h\
 	Grid_cshift_mpi.h\
 	Grid_cshift_none.h\
 	Grid_math_types.h

-extra_sources = $(am__append_1) $(am__append_2) $(am__append_3)
+extra_sources = $(am__append_1) $(am__append_2)
 Grid_main_SOURCES = \
 	Grid_main.cc\
 	$(extra_sources)
@@ -506,10 +505,10 @@ mostlyclean-compile:
 distclean-compile:
 	-rm -f *.tab.c

-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_fake.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_communicator_fake.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_communicator_mpi.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_init.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_main.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_mpi.Po@am__quote@

 .cc.o:
@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
--- a/103
+++ b/103
@@ -1,4 +1,7 @@
 * FIXME audit
+* Remove vload/store etc..
+* Replace vset with a call to merge.
+* Replace vset with a call to merge.

 * Conditional execution Subset, where etc...
 * Coordinate information, integers etc...
@@ -27,3 +30,103 @@
  - BinaryWriter, TextWriter etc...
  - protocol buffers?
  - 
+// Cartesian grid inheritance
+//            Grid::GridBase
+//                     |
+//           __________|___________
+//          |                      |
+// Grid::GridCartesian   Grid::GridCartesianRedBlack
+//
+// TODO: document the following as an API guaranteed public interface
+
+    /* 
+     *       Rough map of functionality against QDP++ Layout
+     *
+     *       Param     |     Grid                     |     QDP++             
+     *       -----------------------------------------
+     *                 |                              |
+     *        void     |     oSites, iSites, lSites   |  sitesOnNode 
+     *        void     |     gSites                   |  vol
+     *                 |                              |
+     *        gcoor    |     oIndex, iIndex           |  linearSiteIndex // no virtual node in QDP
+     *        lcoor    |                              |
+     * 
+     *        void     |     CheckerBoarded           |  -        // No checkerboarded in QDP
+     *        void     |     FullDimensions           |  lattSize
+     *        void     |     GlobalDimensions         |  lattSize // No checkerboarded in QDP
+     *        void     |     LocalDimensions          |  subgridLattSize
+     *        void     |     VirtualLocalDimensions   |  subgridLattSize // no virtual node in QDP
+     *                 |                              |
+     *       int x 3   |     oiSiteRankToGlobal       |  siteCoords
+     *                 |     ProcessorCoorLocalCoorToGlobalCoor | 
+     *                 |                              |
+     *     vector<int> |     GlobalCoorToRankIndex   |  nodeNumber(coord)
+     *     vector<int> |     GlobalCoorToProcessorCoorLocalCoor|  nodeCoord(coord)
+     *                 |                              |
+     *     void        |     Processors               |  logicalSize    // returns cart array shape
+     *     void        |     ThisRank        |  nodeNumber();  // returns this node rank
+     *     void        |     ThisProcessorCoor        |    // returns this node coor
+     *     void        |     isBoss(void)             |  primaryNode();
+     *                 |                              |
+     *                 |     RankFromProcessorCoor    |  getNodeNumberFrom(logical_coord)
+     *                 |     ProcessorCoorFromRank    |  getLogicalCoorFrom(node)
+     */
+  // Work out whether to permute 
+  // ABCDEFGH ->   AE BF CG DH       permute              wrap num
+  //
+  // Shift 0       AE BF CG DH       0 0 0 0    ABCDEFGH   0   0
+  // Shift 1       BF CG DH AE       0 0 0 1    BCDEFGHA   0   1
+  // Shift 2       CG DH AE BF       0 0 1 1    CDEFGHAB   0   2
+  // Shift 3       DH AE BF CG       0 1 1 1    DEFGHABC   0   3
+  // Shift 4       AE BF CG DH       1 1 1 1    EFGHABCD   1   0 
+  // Shift 5       BF CG DH AE       1 1 1 0    FGHABCDE   1   1 
+  // Shift 6       CG DH AE BF       1 1 0 0    GHABCDEF   1   2
+  // Shift 7       DH AE BF CG       1 0 0 0    HABCDEFG   1   3
+
+  // Suppose 4way simd in one dim.
+  // ABCDEFGH ->   AECG BFDH      permute              wrap num
+
+  // Shift 0       AECG BFDH      0,00 0,00 ABCDEFGH         0     0
+  // Shift 1       BFDH CGEA      0,00 1,01 BCDEFGHA         0     1
+  // Shift 2       CGEA DHFB      1,01 1,01 CDEFGHAB         1     0
+  // Shift 3       DHFB EAGC      1,01 1,11 DEFGHABC         1     1
+  // Shift 4       EAGC FBHD      1,11 1,11 EFGHABCD         2     0 
+  // Shift 5       FBHD GCAE      1,11 1,10 FGHABCDE         2     1
+  // Shift 6       GCAE HDBF      1,10 1,10 GHABCDEF         3     0
+  // Shift 7       HDBF AECG      1,10 0,00 HABCDEFG         3     1
+
+  // Generalisation to 8 way simd, 16 way simd required.
+  //
+  // Need log2(Nway) masks, consisting of
+  //	    1 bit  256 bit granule
+  //	    2 bit  128 bit granule
+  //        4 bits 64  bit granule
+  //        8 bits 32  bit granules
+  //
+  //        15 bits....
+    // TODO
+    //
+    // Base class to share common code between vRealF, VComplexF etc...
+    //
+    // Lattice broadcast assignment
+    //
+    // where() support
+    // implement with masks, and/or? Type of the mask & boolean support?
+    //
+    // Unary functions
+    // cos,sin, tan, acos, asin, cosh, acosh, tanh, sinh, // Scalar<vReal> only arg
+    // exp, log, sqrt, fabs
+    //
+    // transposeColor, transposeSpin,
+    // adjColor, adjSpin,
+    // traceColor, traceSpin.
+    // peekColor, peekSpin + pokeColor, pokeSpin
+    //
+    // copyMask.
+    //
+    // localMaxAbs
+    //
+    // norm2,
+    // sumMulti equivalent.
+    // Fourier transform equivalent.
+    //
--- a/23
+++ b/23
@@ -628,8 +628,6 @@ LTLIBOBJS
 LIBOBJS
 BUILD_COMMS_NONE_FALSE
 BUILD_COMMS_NONE_TRUE
-BUILD_COMMS_FAKE_FALSE
-BUILD_COMMS_FAKE_TRUE
 BUILD_COMMS_MPI_FALSE
 BUILD_COMMS_MPI_TRUE
 EGREP
@@ -1369,8 +1367,7 @@ Optional Features:
  --disable-openmp        do not use OpenMP
  --enable-simd=SSE|AVX|AVX2|AVX512
                          Select instructions
-  --enable-comms=none|fake|mpi
-                          Select communications
+  --enable-comms=none|mpi Select communications

 Some influential environment variables:
  CXX         C++ compiler command
@@ -5051,12 +5048,6 @@ fi


 case ${ac_COMMS} in
-     fake)
-       echo Configuring for FAKE communications
-
-$as_echo "#define GRID_COMMS_FAKE 1" >>confdefs.h
-
-     ;;
     none)
       echo Configuring for NO communications

@@ -5082,14 +5073,6 @@ else
  BUILD_COMMS_MPI_FALSE=
 fi

- if  test "X${ac_COMMS}X" == "XfakeX" ; then
-  BUILD_COMMS_FAKE_TRUE=
-  BUILD_COMMS_FAKE_FALSE='#'
-else
-  BUILD_COMMS_FAKE_TRUE='#'
-  BUILD_COMMS_FAKE_FALSE=
-fi
-
 if  test "X${ac_COMMS}X" == "XnoneX" ; then
  BUILD_COMMS_NONE_TRUE=
  BUILD_COMMS_NONE_FALSE='#'
@@ -5243,10 +5226,6 @@ if test -z "${BUILD_COMMS_MPI_TRUE}" && test -z "${BUILD_COMMS_MPI_FALSE}"; then
  as_fn_error $? "conditional \"BUILD_COMMS_MPI\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
-if test -z "${BUILD_COMMS_FAKE_TRUE}" && test -z "${BUILD_COMMS_FAKE_FALSE}"; then
-  as_fn_error $? "conditional \"BUILD_COMMS_FAKE\" was never defined.
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
-fi
 if test -z "${BUILD_COMMS_NONE_TRUE}" && test -z "${BUILD_COMMS_NONE_FALSE}"; then
  as_fn_error $? "conditional \"BUILD_COMMS_NONE\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
--- a/configure.ac
+++ b/configure.ac
@@ -51,13 +51,9 @@ case ${ac_SIMD} in
 esac


-AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|fake|mpi],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
+AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])

 case ${ac_COMMS} in
-     fake)
-       echo Configuring for FAKE communications
-       AC_DEFINE([GRID_COMMS_FAKE],[1],[GRID_COMMS_FAKE] )
-     ;;
     none)
       echo Configuring for NO communications
       AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
@@ -72,7 +68,6 @@ case ${ac_COMMS} in
 esac

 AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
-AM_CONDITIONAL(BUILD_COMMS_FAKE,[ test "X${ac_COMMS}X" == "XfakeX" ])
 AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])