
Reorg of build structure

Peter Boyle
2015-04-18 14:55:00 +01:00
parent 57586c8e05
commit c656164015
33 changed files with 59 additions and 0 deletions

63
lib/Grid.h Normal file

@@ -0,0 +1,63 @@
//
// Grid.cpp
// simd
//
// Created by Peter Boyle on 09/05/2014.
// Copyright (c) 2014 University of Edinburgh. All rights reserved.
//
#ifndef GRID_V3_H
#define GRID_V3_H
#include <stdio.h>
#include <complex>
#include <vector>
#include <iostream>
#include <cassert>
#include <random>
#include <functional>
#include <stdlib.h>
#include <sys/time.h>
#include <stdio.h>
#include <signal.h>
#include <Grid_config.h>
////////////////////////////////////////////////////////////
// Tunable header includes
////////////////////////////////////////////////////////////
#ifdef HAVE_OPENMP
#define OMP
#include <omp.h>
#endif
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#include <Grid_aligned_allocator.h>
#include <Grid_simd.h>
#include <Grid_math_types.h>
#include <Grid_Cartesian.h>
#include <Grid_Lattice.h>
#include <Grid_comparison.h>
#include <Grid_stencil.h>
#include <Grid_QCD.h>
namespace Grid {
void Grid_init(int *argc,char ***argv);
void Grid_finalize(void);
double usecond(void);
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_debug_handler_init(void);
};
#endif
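The header above wires the tunable includes together and exposes the runtime entry points. A minimal driver built against this interface might look like the following sketch (the timing loop and output are illustrative, not part of the commit):

#include <Grid.h>

int main(int argc, char **argv)
{
  Grid::Grid_init(&argc, &argv);   // set up comms and the signal/debug handlers

  double t0 = Grid::usecond();     // microsecond wall clock declared above
  // ... work on Grid lattices goes here ...
  double t1 = Grid::usecond();
  printf("elapsed %f us\n", t1 - t0);

  Grid::Grid_finalize();           // tear down communications
  return 0;
}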

399
lib/Grid_Cartesian.h Normal file

@@ -0,0 +1,399 @@
#ifndef GRID_CARTESIAN_H
#define GRID_CARTESIAN_H
#include <Grid.h>
#include <Grid_Communicator.h>
namespace Grid{
/////////////////////////////////////////////////////////////////////////////////////////
// Grid Support.
/////////////////////////////////////////////////////////////////////////////////////////
class GridBase : public CartesianCommunicator {
public:
// Give Lattice access
template<class object> friend class Lattice;
GridBase(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
//FIXME
// protected:
// Lattice wide random support. not yet fully implemented. Need seed strategy
// and one generator per site.
// std::default_random_engine generator;
// static std::mt19937 generator( 9 );
//////////////////////////////////////////////////////////////////////
// Communicator provides information on the processor grid
//////////////////////////////////////////////////////////////////////
// unsigned long _ndimension;
// std::vector<int> _processors; // processor grid
// int _processor; // linear processor rank
// std::vector<int> _processor_coor; // linear processor coordinate
//////////////////////////////////////////////////////////////////////
// Physics Grid information.
std::vector<int> _simd_layout; // Which dimensions get spread out over simd lanes.
std::vector<int> _fdimensions;// Global dimensions of array prior to cb removal
std::vector<int> _gdimensions;// Global dimensions of array after cb removal
std::vector<int> _ldimensions;// local dimensions of array with processor images removed
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
std::vector<int> _ostride; // Outer stride for each dimension
std::vector<int> _istride; // Inner stride i.e. within simd lane
int _osites; // _isites*_osites = product(dimensions).
int _isites;
std::vector<int> _slice_block; // subslice information
std::vector<int> _slice_stride;
std::vector<int> _slice_nblock;
// Might need these at some point
// std::vector<int> _lstart; // local start of array in gcoors. _processor_coor[d]*_ldimensions[d]
// std::vector<int> _lend; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
public:
////////////////////////////////////////////////////////////////
// Checkerboarding interface is virtual and overridden by
// GridCartesian / GridRedBlackCartesian
////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(std::vector<int> site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
inline int CheckerBoardFromOindex (int Oindex){
std::vector<int> ocoor;
oCoorFromOindex(ocoor,Oindex);
int ss=0;
for(int d=0;d<_ndimension;d++){
ss=ss+ocoor[d];
}
return ss&0x1;
}
//////////////////////////////////////////////////////////////////////////////////////////////
// Local layout calculations
//////////////////////////////////////////////////////////////////////////////////////////////
// These routines are key. Subdivide the linearised cartesian index into
// "inner" index identifying which simd lane of object<vFcomplex> is associated with coord
// "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
//
// Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
// stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
// coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
// lanes are operated upon simultaneously.
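// Worked example (editorial illustration, not in the original header): one
// dimension of local extent _ldimensions[d]=8 split over _simd_layout[d]=2
// lanes gives _rdimensions[d]=4. A local coordinate x=6 then contributes
//   outer: x % _rdimensions[d] = 6 % 4 = 2   (which vobj in _odata)
//   inner: x / _rdimensions[d] = 6 / 4 = 1   (which SIMD lane within it)
// so the lanes hold sites {0..3} and {4..7} side by side, and one vector
// instruction advances both halves of the slice at once.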
virtual int oIndex(std::vector<int> &coor)
{
int idx=0;
// Works with either global or local coordinates
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
return idx;
}
inline int oIndexReduced(std::vector<int> &ocoor)
{
int idx=0;
// ocoor is already reduced so can eliminate the modulo operation
// for fast indexing and inline the routine
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
return idx;
}
inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
coor.resize(_ndimension);
for(int d=0;d<_ndimension;d++){
coor[d] = Oindex % _rdimensions[d];
Oindex = Oindex / _rdimensions[d];
}
}
//////////////////////////////////////////////////////////
// SIMD lane addressing
//////////////////////////////////////////////////////////
inline int iIndex(std::vector<int> &lcoor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
return idx;
}
inline void iCoorFromIindex(std::vector<int> &coor,int lane)
{
coor.resize(_ndimension);
for(int d=0;d<_ndimension;d++){
coor[d] = lane % _simd_layout[d];
lane = lane / _simd_layout[d];
}
}
inline int PermuteDim(int dimension){
return _simd_layout[dimension]>1;
}
inline int PermuteType(int dimension){
int permute_type=0;
for(int d=_ndimension-1;d>dimension;d--){
if (_simd_layout[d]>1 ) permute_type++;
}
return permute_type;
}
////////////////////////////////////////////////////////////////
// Array sizing queries
////////////////////////////////////////////////////////////////
inline int iSites(void) { return _isites; };
inline int Nsimd(void) { return _isites; };// Synonymous with iSites
inline int oSites(void) { return _osites; };
inline int lSites(void) { return _isites*_osites; };
inline int gSites(void) { return _isites*_osites*_Nprocessors; };
inline int Nd (void) { return _ndimension;};
inline const std::vector<int> &FullDimensions(void) { return _fdimensions;};
inline const std::vector<int> &GlobalDimensions(void) { return _gdimensions;};
inline const std::vector<int> &LocalDimensions(void) { return _ldimensions;};
inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};
////////////////////////////////////////////////////////////////
// Global addressing
////////////////////////////////////////////////////////////////
void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
{
gcoor.resize(_ndimension);
std::vector<int> coor(_ndimension);
ProcessorCoorFromRank(rank,coor);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu];
iCoorFromIindex(coor,i_idx);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]*coor[mu];
oCoorFromOindex (coor,o_idx);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu];
}
void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector<int> &fcoor)
{
RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
if(CheckerBoarded(0)){
fcoor[0] = fcoor[0]*2+cb;
}
}
void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)
{
gcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu];
}
void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
{
pcoor.resize(_ndimension);
lcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++){
pcoor[mu] = gcoor[mu]/_ldimensions[mu];
lcoor[mu] = gcoor[mu]%_ldimensions[mu];
}
}
void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
{
std::vector<int> pcoor;
std::vector<int> lcoor;
GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
rank = RankFromProcessorCoor(pcoor);
i_idx= iIndex(lcoor);
o_idx= oIndex(lcoor);
}
};
class GridCartesian: public GridBase {
public:
virtual int CheckerBoarded(int dim){
return 0;
}
virtual int CheckerBoard(std::vector<int> site){
return 0;
}
virtual int CheckerBoardDestination(int cb,int shift){
return 0;
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
return shift;
}
GridCartesian(std::vector<int> &dimensions,
std::vector<int> &simd_layout,
std::vector<int> &processor_grid
) : GridBase(processor_grid)
{
///////////////////////
// Grid information
///////////////////////
_ndimension = dimensions.size();
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_osites = 1;
_isites = 1;
for(int d=0;d<_ndimension;d++){
_fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d];
//FIXME check for exact division
// Use a reduced simd grid
_ldimensions[d]= _gdimensions[d]/_processors[d]; //local dimensions
_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if ( d==0 ) {
_ostride[d] = 1;
_istride[d] = 1;
} else {
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1];
}
}
///////////////////////
// subplane information
///////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block =1;
int nblock=1;
for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
for(int d=0;d<_ndimension;d++){
nblock/=_rdimensions[d];
_slice_block[d] =block;
_slice_stride[d]=_ostride[d]*_rdimensions[d];
_slice_nblock[d]=nblock;
block = block*_rdimensions[d];
}
};
};
// Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public GridBase
{
public:
virtual int CheckerBoarded(int dim){
if( dim==0) return 1;
else return 0;
}
virtual int CheckerBoard(std::vector<int> site){
return (site[0]+site[1]+site[2]+site[3])&0x1;
}
// Depending on the checkerboard of the site we toggle the source cb.
// A full-lattice shift along x maps to floor(shift/2) or floor((shift+1)/2)
// reduced sites, depending on which parity the row starts on.
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
if(dim != 0) return shift;
int fulldim =_fdimensions[0];
shift = (shift+fulldim)%fulldim;
// Probably faster with table lookup;
// or by looping over x,y,z and multiply rather than computing checkerboard.
int ocb=CheckerBoardFromOindex(osite);
if ( (source_cb+ocb)&1 ) {
return (shift)/2;
} else {
return (shift+1)/2;
}
}
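// Worked example (editorial, not in the original source): _fdimensions[0]=8
// stores x in half-checkerboards of extent 4. A full-lattice shift of 1 moves
// a site to the opposite checkerboard; in the stored half lattice that is
// (1)/2 = 0 reduced sites when (source_cb+ocb) is odd and (1+1)/2 = 1 when it
// is even, which is exactly the branch above.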
virtual int CheckerBoardDestination(int source_cb,int shift){
if ((shift+_fdimensions[0])&0x1) {
return 1-source_cb;
} else {
return source_cb;
}
};
GridRedBlackCartesian(std::vector<int> &dimensions,
std::vector<int> &simd_layout,
std::vector<int> &processor_grid) : GridBase(processor_grid)
{
///////////////////////
// Grid information
///////////////////////
_ndimension = dimensions.size();
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_osites = 1;
_isites = 1;
for(int d=0;d<_ndimension;d++){
_fdimensions[d] = dimensions[d];
_gdimensions[d] = _fdimensions[d];
if (d==0) _gdimensions[0] = _gdimensions[0]/2; // Remove a checkerboard
_ldimensions[d] = _gdimensions[d]/_processors[d];
// Use a reduced simd grid
_simd_layout[d] = simd_layout[d];
_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if ( d==0 ) {
_ostride[d] = 1;
_istride[d] = 1;
} else {
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1];
}
}
////////////////////////////////////////////////////////////////////////////////////////////
// subplane information
////////////////////////////////////////////////////////////////////////////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block =1;
int nblock=1;
for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
for(int d=0;d<_ndimension;d++){
nblock/=_rdimensions[d];
_slice_block[d] =block;
_slice_stride[d]=_ostride[d]*_rdimensions[d];
_slice_nblock[d]=nblock;
block = block*_rdimensions[d];
}
};
protected:
virtual int oIndex(std::vector<int> &coor)
{
int idx=_ostride[0]*((coor[0]/2)%_rdimensions[0]);
for(int d=1;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
return idx;
};
};
}
#endif
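To make the constructor arithmetic above concrete, here is a sketch that builds both grid types; the 8^4 lattice, the four-lane SIMD layout and the single-process grid are illustrative choices, not values from the commit:

#include <Grid.h>

using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  std::vector<int> latt({8,8,8,8});  // _fdimensions
  std::vector<int> simd({1,1,2,2});  // _isites = 1*1*2*2 = 4 SIMD lanes
  std::vector<int> mpi ({1,1,1,1});  // one rank

  GridCartesian         Fine(latt, simd, mpi);
  GridRedBlackCartesian Half(latt, simd, mpi);  // halves dimension 0

  // 8^4 = 4096 sites over 4 lanes: oSites = 1024, gSites = 4096
  printf("full: oSites %d iSites %d gSites %d\n",
         Fine.oSites(), Fine.iSites(), Fine.gSites());
  // one checkerboard stored: oSites = 512
  printf("half: oSites %d\n", Half.oSites());

  Grid_finalize();
  return 0;
}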

103
lib/Grid_Communicator.h Normal file

@@ -0,0 +1,103 @@
#ifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNICATOR_H
///////////////////////////////////
// Processor layout information
///////////////////////////////////
#ifdef GRID_COMMS_MPI
#include <mpi.h>
#endif
namespace Grid {
class CartesianCommunicator {
public:
// Communicator should know nothing of the physics grid, only processor grid.
int _Nprocessors; // How many in all
std::vector<int> _processors; // processor grid: number of ranks along each dimension
int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension;
#ifdef GRID_COMMS_MPI
MPI_Comm communicator;
#endif
// Constructor
CartesianCommunicator(std::vector<int> &pdimensions_in);
// Wraps MPI_Cart routines
void ShiftedRanks(int dim,int shift,int & source, int & dest);
int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
/////////////////////////////////
// Grid information queries
/////////////////////////////////
int IsBoss(void) { return _processor==0; };
int ThisRank(void) { return _processor; };
const std::vector<int> & ThisProcessorCoor(void) { return _processor_coor; };
const std::vector<int> & ProcessorGrid(void) { return _processors; };
int ProcessorCount(void) { return _Nprocessors; };
////////////////////////////////////////////////////////////
// Reduction
////////////////////////////////////////////////////////////
void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N);
void GlobalSum(RealD &);
void GlobalSumVector(RealD *,int N);
void GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);
}
template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type);
scalar_type * ptr = (scalar_type *)& o;
GlobalSum(ptr,words);
}
////////////////////////////////////////////////////////////
// Face exchange
////////////////////////////////////////////////////////////
void SendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
////////////////////////////////////////////////////////////
// Barrier
////////////////////////////////////////////////////////////
void Barrier(void);
////////////////////////////////////////////////////////////
// Broadcast a buffer; composite larger objects via the template below
////////////////////////////////////////////////////////////
void Broadcast(int root,void* data, int bytes);
template<class obj> void Broadcast(int root,obj &data)
{
Broadcast(root,(void *)&data,sizeof(data));
};
};
}
#endif
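As a usage sketch of the reduction interface above (assuming the MPI build and a run with two ranks; the values are illustrative):

#include <Grid.h>

using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  std::vector<int> mpi({2,1,1,1});
  CartesianCommunicator comms(mpi);

  RealD local = comms.ThisRank() + 1.0;  // rank 0 holds 1.0, rank 1 holds 2.0
  comms.GlobalSum(local);                // every rank now holds 3.0

  ComplexD z(1.0,2.0);
  comms.GlobalSum(z);                    // forwards to GlobalSumVector on 2 reals

  if ( comms.IsBoss() ) printf("sum = %f\n", local);

  Grid_finalize();
  return 0;
}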

611
lib/Grid_Lattice.h Normal file

@@ -0,0 +1,611 @@
#ifndef GRID_LATTICE_H
#define GRID_LATTICE_H
#include "Grid.h"
namespace Grid {
// TODO: Indexing ()
// mac,real,imag
//
// Functionality required:
// -=,+=,*=,()
// add,+,sub,-,mult,mac,*
// adj,conj
// real,imag
// transpose,transposeIndex
// trace,traceIndex
// peekIndex
// innerProduct,outerProduct,
// localNorm2
// localInnerProduct
//
extern int GridCshiftPermuteMap[4][16];
template<class vobj>
class Lattice
{
public:
GridBase *_grid;
int checkerboard;
std::vector<vobj,alignedAllocator<vobj> > _odata;
public:
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
Lattice(GridBase *grid) : _grid(grid) {
_odata.resize(_grid->oSites()); // resize, not reserve: sites are indexed directly below
assert((((uint64_t)&_odata[0])&0xF) ==0);
checkerboard=0;
}
#include <Grid_cshift.h>
template<class obj1,class obj2>
friend void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs);
// FIXME Performance difference between operator * and mult is troubling.
// Auto move constructor seems to lose surprisingly much.
// Site wise binary operations
// We eliminate a temporary object assignment if we use the mult,add,sub routines.
// For the operator versions we rely on move constructor to eliminate the
// vector copy back.
template<class obj1,class obj2,class obj3>
friend void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);
template<class obj1,class obj2,class obj3>
friend void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);
template<class obj1,class obj2,class obj3>
friend void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);
template<class obj1,class obj2,class obj3>
friend void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);
friend void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
}
}
friend void axpy(Lattice<vobj> &ret,std::complex<double> a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
}
}
inline friend Lattice<vobj> operator / (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
conformable(lhs,rhs);
Lattice<vobj> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = lhs._odata[ss]/rhs._odata[ss];
}
return ret;
};
template<class sobj>
inline Lattice<vobj> & operator = (const sobj & r){
#pragma omp parallel for
for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r;
}
return *this;
}
// Poke a scalar object into the SIMD array
template<class sobj>
friend void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
GridBase *grid=l._grid;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
int rank,odx,idx;
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
// Optional to broadcast from node 0.
grid->Broadcast(0,s);
// Only the rank owning the site updates its local data
if ( rank == grid->ThisRank() ) {
std::vector<sobj> buf(Nsimd);
std::vector<scalar_type *> pointers(Nsimd);
for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
// extract-modify-merge cycle is the easiest way and this is not perf critical
extract(l._odata[odx],pointers);
buf[idx] = s;
for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
merge(l._odata[odx],pointers);
}
return;
};
// Peek a scalar object from the SIMD array
template<class sobj>
friend void peekSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){
GridBase *grid=l._grid;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
int rank,odx,idx;
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
std::vector<sobj> buf(Nsimd);
std::vector<scalar_type *> pointers(Nsimd);
for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
extract(l._odata[odx],pointers);
s = buf[idx];
grid->Broadcast(rank,s);
return;
};
// FIXME Randomise; deprecate this
friend void random(Lattice<vobj> &l){
Real *v_ptr = (Real *)&l._odata[0];
size_t v_len = l._grid->oSites()*sizeof(vobj);
size_t d_len = v_len/sizeof(Real);
for(int i=0;i<d_len;i++){
v_ptr[i]=drand48();
}
};
// FIXME for debug; deprecate this; made obsolete by
// LatticeCoordinate();
friend void lex_sites(Lattice<vobj> &l){
Real *v_ptr = (Real *)&l._odata[0];
size_t o_len = l._grid->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
}
}}
}
// FIXME Implement a consistent seed management strategy
friend void gaussian(Lattice<vobj> &l){
// Zero mean, unit variance. Seeding strategy is still the FIXME above.
std::default_random_engine generator;
std::normal_distribution<double> distribution(0.0,1.0);
Real *v_ptr = (Real *)&l._odata[0];
size_t v_len = l._grid->oSites()*sizeof(vobj);
size_t d_len = v_len/sizeof(Real);
for(int i=0;i<d_len;i++){
v_ptr[i]= distribution(generator);
}
};
// Unary functions and Unops
friend inline Lattice<vobj> operator -(const Lattice<vobj> &r) {
Lattice<vobj> ret(r._grid);
#pragma omp parallel for
for(int ss=0;ss<r._grid->oSites();ss++){
ret._odata[ss]= -r._odata[ss];
}
return ret;
}
// *=,+=,-= operators inherit behaviour from the corresponding */+/- operation
template<class T>
inline Lattice<vobj> &operator *=(const T &r) {
*this = (*this)*r;
return *this;
}
template<class T>
inline Lattice<vobj> &operator -=(const T &r) {
*this = (*this)-r;
return *this;
}
template<class T>
inline Lattice<vobj> &operator +=(const T &r) {
*this = (*this)+r;
return *this;
}
inline friend Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = adj(lhs._odata[ss]);
}
return ret;
};
inline friend Lattice<vobj> transpose(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = transpose(lhs._odata[ss]);
}
return ret;
};
inline friend Lattice<vobj> conj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = conj(lhs._odata[ss]);
}
return ret;
};
// remove and insert a half checkerboard
friend void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
half.checkerboard = cb;
int ssh=0;
// NB: ssh carries a loop dependence, so this compaction must remain sequential
for(int ss=0;ss<full._grid->oSites();ss++){
std::vector<int> coor;
int cbos;
full._grid->oCoorFromOindex(coor,ss);
cbos=half._grid->CheckerBoard(coor);
if (cbos==cb) {
half._odata[ssh] = full._odata[ss];
ssh++;
}
}
}
friend void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
int cb = half.checkerboard;
int ssh=0;
// NB: ssh carries a loop dependence, so this compaction must remain sequential
for(int ss=0;ss<full._grid->oSites();ss++){
std::vector<int> coor;
int cbos;
full._grid->oCoorFromOindex(coor,ss);
cbos=half._grid->CheckerBoard(coor);
if (cbos==cb) {
full._odata[ss]=half._odata[ssh];
ssh++;
}
}
}
}; // class Lattice
template<class obj1,class obj2>
void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
{
assert(lhs._grid == rhs._grid);
assert(lhs.checkerboard == rhs.checkerboard);
}
template<class obj1,class obj2,class obj3>
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
uint32_t vec_len = lhs._grid->oSites();
#pragma omp parallel for
for(int ss=0;ss<vec_len;ss++){
mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
}
}
template<class obj1,class obj2,class obj3>
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
uint32_t vec_len = lhs._grid->oSites();
#pragma omp parallel for
for(int ss=0;ss<vec_len;ss++){
mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
}
}
template<class obj1,class obj2,class obj3>
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
}
}
template<class obj1,class obj2,class obj3>
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
}
}
// Lattice BinOp Lattice,
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
//NB mult performs conformable check. Do not reapply here for performance.
Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
mult(ret,lhs,rhs);
return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
//NB add performs conformable check. Do not reapply here for performance.
Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
add(ret,lhs,rhs);
return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
//NB sub performs conformable check. Do not reapply here for performance.
Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
sub(ret,lhs,rhs);
return ret;
}
// Scalar BinOp Lattice ;generate return type
template<class left,class right>
inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=lhs*rhs._odata[ss];
}
return ret;
}
template<class left,class right>
inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=lhs+rhs._odata[ss];
}
return ret;
}
template<class left,class right>
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=lhs-rhs._odata[ss];
}
return ret;
}
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites(); ss++){
ret._odata[ss]=lhs._odata[ss]*rhs;
}
return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites(); ss++){
ret._odata[ss]=lhs._odata[ss]+rhs;
}
return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites(); ss++){
ret._odata[ss]=lhs._odata[ss]-rhs;
}
return ret;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline auto trace(const Lattice<vobj> &lhs)
-> Lattice<decltype(trace(lhs._odata[0]))>
{
Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = trace(lhs._odata[ss]);
}
return ret;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Index level dependent operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj>
inline auto traceIndex(const Lattice<vobj> &lhs)
-> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
{
Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]);
}
return ret;
};
template<int Index,class vobj>
inline auto transposeIndex(const Lattice<vobj> &lhs)
-> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
{
Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
}
return ret;
};
// Fixme; this is problematic since the number of args is variable and
// may mismatch...
template<int Index,class vobj>
inline auto peekIndex(const Lattice<vobj> &lhs)
-> Lattice<decltype(peekIndex<Index>(lhs._odata[0]))>
{
Lattice<decltype(peekIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = peekIndex<Index>(lhs._odata[ss]);
}
return ret;
};
template<int Index,class vobj>
inline auto peekIndex(const Lattice<vobj> &lhs,int i)
-> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))>
{
Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
}
return ret;
};
template<int Index,class vobj>
inline auto peekIndex(const Lattice<vobj> &lhs,int i,int j)
-> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
{
Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
}
return ret;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline RealD norm2(const Lattice<vobj> &arg){
typedef typename vobj::scalar_type scalar;
typedef typename vobj::vector_type vector;
decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
scalar nrm;
//FIXME make this loop parallelisable
vnrm=zero;
for(int ss=0;ss<arg._grid->oSites(); ss++){
vnrm = vnrm + innerProduct(arg._odata[ss],arg._odata[ss]);
}
vector vvnrm =TensorRemove(vnrm) ;
nrm = Reduce(vvnrm);
arg._grid->GlobalSum(nrm);
return real(nrm);
}
template<class vobj>
inline auto innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) ->decltype(innerProduct(left._odata[0],right._odata[0]))
{
typedef typename vobj::scalar_type scalar;
decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero;
scalar nrm;
//FIXME make this loop parallelisable
for(int ss=0;ss<left._grid->oSites(); ss++){
vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
}
nrm = Reduce(vnrm);
right._grid->GlobalSum(nrm);
return nrm;
}
/////////////////////////////////////////////////////
// Non site reduced routines
/////////////////////////////////////////////////////
// localNorm2,
template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
template<class vobj>
inline auto real(const Lattice<vobj> &z) -> Lattice<decltype(real(z._odata[0]))>
{
Lattice<decltype(real(z._odata[0]))> ret(z._grid);
#pragma omp parallel for
for(int ss=0;ss<z._grid->oSites();ss++){
ret._odata[ss] = real(z._odata[ss]);
}
return ret;
}
template<class vobj>
inline auto imag(const Lattice<vobj> &z) -> Lattice<decltype(imag(z._odata[0]))>
{
Lattice<decltype(imag(z._odata[0]))> ret(z._grid);
#pragma omp parallel for
for(int ss=0;ss<z._grid->oSites();ss++){
ret._odata[ss] = imag(z._odata[ss]);
}
return ret;
}
// localInnerProduct
template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
-> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
// outerProduct Scalar x Scalar -> Scalar
// Vector x Vector -> Matrix
template<class ll,class rr>
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
{
Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
}
#endif
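Putting the expression interface above together, a sketch (the lattice sizes and the use of the QCD typedefs from the next file are illustrative):

#include <Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  std::vector<int> latt({8,8,8,8}), simd({1,1,2,2}), mpi({1,1,1,1});
  GridCartesian grid(latt, simd, mpi);

  LatticeComplex x(&grid), y(&grid), z(&grid);
  random(x); random(y);

  z = x*y + y;           // operator forms; move construction avoids the copy back
  axpy(z, 2.0, x, y);    // named form: z = 2*x + y with no temporary
  RealD nrm = norm2(z);  // site-wise innerProduct, then a GlobalSum across ranks

  printf("norm2 %f\n", nrm);
  Grid_finalize();
  return 0;
}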

106
lib/Grid_QCD.h Normal file

@@ -0,0 +1,106 @@
#ifndef GRID_QCD_H
#define GRID_QCD_H
namespace Grid{
namespace QCD {
static const int Nc=3;
static const int Ns=4;
static const int Nd=4;
static const int CbRed =0;
static const int CbBlack=1;
//////////////////////////////////////////////////////////////////////////////
// QCD iMatrix types
// Index conventions: Lorentz x Spin x Colour
//
// ChrisK very keen to add extra space for Gparity doubling.
//
// Also add domain wall index, in a way where the Wilson operator
// naturally distributes across the 5th dimension.
//////////////////////////////////////////////////////////////////////////////
template<typename vtype> using iSinglet = iScalar<iScalar<iScalar<vtype> > >;
template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iScalar<vtype>, Ns> >;
template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
template<typename vtype> using iSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
typedef iSpinMatrix<Complex > SpinMatrix;
typedef iColourMatrix<Complex > ColourMatrix;
typedef iSpinColourMatrix<Complex > SpinColourMatrix;
typedef iLorentzColourMatrix<Complex > LorentzColourMatrix;
typedef iSpinVector<Complex > SpinVector;
typedef iColourVector<Complex > ColourVector;
typedef iSpinColourVector<Complex > SpinColourVector;
typedef iSpinMatrix<vComplex > vSpinMatrix;
typedef iColourMatrix<vComplex > vColourMatrix;
typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
typedef iSpinVector<vComplex > vSpinVector;
typedef iColourVector<vComplex > vColourVector;
typedef iSpinColourVector<vComplex > vSpinColourVector;
typedef iSinglet<Complex > TComplex; // This is painful. Tensor singlet complex type.
typedef iSinglet<vComplex > vTComplex; // what if we don't know the tensor structure
typedef iSinglet<Real > TReal; // Shouldn't need these; can I make it work without?
typedef iSinglet<vReal > vTReal;
typedef iSinglet<vInteger > vTInteger;
typedef iSinglet<Integer > TInteger;
typedef Lattice<vTReal> LatticeReal;
typedef Lattice<vTComplex> LatticeComplex;
typedef Lattice<vInteger> LatticeInteger; // Predicates for "where"
typedef Lattice<vColourMatrix> LatticeColourMatrix;
typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
typedef Lattice<vSpinColourMatrix> LatticeSpinColourMatrix;
typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
typedef Lattice<vSpinVector> LatticeSpinVector;
typedef Lattice<vColourVector> LatticeColourVector;
///////////////////////////////////////////
// Physical names for things
///////////////////////////////////////////
typedef Lattice<vSpinColourVector> LatticeFermion;
typedef Lattice<vSpinColourMatrix> LatticePropagator;
typedef Lattice<vLorentzColourMatrix> LatticeGaugeField;
inline void LatticeCoordinate(LatticeInteger &l,int mu){
GridBase *grid = l._grid;
int Nsimd = grid->iSites();
std::vector<int> gcoor;
std::vector<Integer> mergebuf(Nsimd);
std::vector<Integer *> mergeptr(Nsimd);
for(int o=0;o<grid->oSites();o++){
for(int i=0;i<grid->iSites();i++){
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
// grid->RankIndexToGlobalCoor(0,o,i,gcoor);
mergebuf[i]=gcoor[mu];
mergeptr[i]=&mergebuf[i];
}
merge(l._odata[o],mergeptr);
}
};
#include <Grid_predicated.h>
} //namespace QCD
} // Grid
#endif
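A sketch of the typedefs and the coordinate helper above in use (the grid parameters are illustrative):

#include <Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  std::vector<int> latt({8,8,8,8}), simd({1,1,2,2}), mpi({1,1,1,1});
  GridCartesian grid(latt, simd, mpi);

  LatticeFermion    psi(&grid);  // physical name for Lattice<vSpinColourVector>
  LatticeGaugeField U(&grid);    // Lattice<vLorentzColourMatrix>

  LatticeInteger t(&grid);
  LatticeCoordinate(t, Nd-1);    // global time coordinate at every site

  Grid_finalize();
  return 0;
}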

59
lib/Grid_aligned_allocator.h Normal file

@@ -0,0 +1,59 @@
#ifndef GRID_ALIGNED_ALLOCATOR_H
#define GRID_ALIGNED_ALLOCATOR_H
#include <immintrin.h>
namespace Grid {
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
template<typename _Tp>
class alignedAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef alignedAllocator<_Tp1> other; };
alignedAllocator() throw() { }
alignedAllocator(const alignedAllocator&) throw() { }
template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
~alignedAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
const_pointer address(const_reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
// Should override allocate and deallocate
pointer allocate(size_type __n, const void* = 0)
{
//_Tp * ptr = (_Tp *) memalign(sizeof(_Tp),__n*sizeof(_Tp));
// _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
#ifdef AVX512
_Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
#else
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
#endif
return ptr;
}
void deallocate(pointer __p, size_type) {
#ifdef AVX512
free(__p);      // matches memalign above
#else
_mm_free(__p);  // matches _mm_malloc above
#endif
}
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool
operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool
operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
}; // namespace Grid
#endif
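A self-contained check of the allocator's contract (a sketch; note that construct() is a deliberate no-op, so elements start uninitialised):

#include <cstdio>
#include <cstdint>
#include <vector>
#include <Grid_aligned_allocator.h>

int main(void)
{
  std::vector<double, Grid::alignedAllocator<double> > v(1024);

  // Both branches of allocate() request 128-byte aligned storage
  uintptr_t base = (uintptr_t)&v[0];
  printf("base %p aligned128 %d\n", (void *)base, (int)(base % 128 == 0));
  return 0;
}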


@@ -0,0 +1,56 @@
#include "Grid.h"
namespace Grid {
CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
{
_ndimension = processors.size();
_processor_coor.resize(_ndimension);
_processors = processors;
_Nprocessors = 1; // single fake rank
// Require 1^N processor grid for fake
for(int d=0;d<_ndimension;d++) if(_processors[d]!=1) exit(-1);
_processor = 0;// I am the one. The only one..
for(int d=0;d<_ndimension;d++) _processor_coor[d] = 0;
}
void CartesianCommunicator::GlobalSum(float &){}
void CartesianCommunicator::GlobalSumVector(float *,int N){}
void CartesianCommunicator::GlobalSum(double &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
exit(-1);
}
void CartesianCommunicator::Barrier(void)
{
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
source =0; // single rank: our only neighbour is ourself
dest =0;
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
return 0;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
}
}


@@ -0,0 +1,93 @@
#include "Grid.h"
#include <mpi.h>
namespace Grid {
// Should error check all MPI calls.
CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
{
_ndimension = processors.size();
std::vector<int> periodic(_ndimension,1);
_Nprocessors=1;
_processors = processors;
_processor_coor.resize(_ndimension);
MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
int Size;
MPI_Comm_size(communicator,&Size);
assert(Size==_Nprocessors);
}
void CartesianCommunicator::GlobalSum(float &f){
MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSum(double &d)
{
MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
MPI_Cart_shift(communicator,dim,shift,&source,&dest);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
MPI_Cart_rank (communicator, &coor[0], &rank);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
coor.resize(_ndimension);
MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
MPI_Request reqs[2];
MPI_Status OkeyDokey[2];
int rank = _processor;
MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&reqs[0]);
MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&reqs[1]);
MPI_Waitall(2,reqs,OkeyDokey);
}
void CartesianCommunicator::Barrier(void)
{
MPI_Barrier(communicator);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator);
}
}
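The ShiftedRanks/SendToRecvFrom pair above is the halo primitive the Cshift machinery builds on. A minimal ring shift along one processor dimension might look like this sketch (assumes the MPI build launched on a 2x1x1x1 process grid):

#include <Grid.h>

using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  std::vector<int> mpi({2,1,1,1});
  CartesianCommunicator comms(mpi);

  int source, dest;
  comms.ShiftedRanks(0, 1, source, dest);  // neighbours one step along dim 0

  double xmit = comms.ThisRank();
  double recv = -1.0;
  comms.SendToRecvFrom(&xmit, dest, &recv, source, sizeof(double));

  printf("rank %d received %f\n", comms.ThisRank(), recv);

  Grid_finalize();
  return 0;
}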

264
lib/Grid_comparison.h Normal file

@@ -0,0 +1,264 @@
#ifndef GRID_COMPARISON_H
#define GRID_COMPARISON_H
namespace Grid {
// Generic list of functors
template<class lobj,class robj> class veq {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs == rhs;
}
};
template<class lobj,class robj> class vne {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs != rhs;
}
};
template<class lobj,class robj> class vlt {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs < rhs;
}
};
template<class lobj,class robj> class vle {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs <= rhs;
}
};
template<class lobj,class robj> class vgt {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs > rhs;
}
};
template<class lobj,class robj> class vge {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs >= rhs;
}
};
// Generic list of functors
template<class lobj,class robj> class seq {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs == rhs;
}
};
template<class lobj,class robj> class sne {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs != rhs;
}
};
template<class lobj,class robj> class slt {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs < rhs;
}
};
template<class lobj,class robj> class sle {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs <= rhs;
}
};
template<class lobj,class robj> class sgt {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs > rhs;
}
};
template<class lobj,class robj> class sge {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs >= rhs;
}
};
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Integer gets extra relational functions. Could also implement these for RealF, RealD etc..
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class sfunctor>
inline vInteger Comparison(sfunctor sop,const vInteger & lhs, const vInteger & rhs)
{
std::vector<Integer> vlhs(vInteger::Nsimd()); // Use functors to reduce this to single implementation
std::vector<Integer> vrhs(vInteger::Nsimd());
vInteger ret;
extract(lhs,vlhs);
extract(rhs,vrhs);
for(int s=0;s<vInteger::Nsimd();s++){
vlhs[s] = sop(vlhs[s],vrhs[s]);
}
merge(ret,vlhs);
return ret;
}
inline vInteger operator < (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(slt<Integer,Integer>(),lhs,rhs);
}
inline vInteger operator <= (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(sle<Integer,Integer>(),lhs,rhs);
}
inline vInteger operator > (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(sgt<Integer,Integer>(),lhs,rhs);
}
inline vInteger operator >= (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(sge<Integer,Integer>(),lhs,rhs);
}
inline vInteger operator == (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(seq<Integer,Integer>(),lhs,rhs);
}
inline vInteger operator != (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(sne<Integer,Integer>(),lhs,rhs);
}
//////////////////////////////////////////////////////////////////////////
// relational operators
//
// Support <,>,<=,>=,==,!=
//
//Query supporting bitwise &, |, ^, !
//Query supporting logical &&, ||,
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{
Lattice<vInteger> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{
Lattice<vInteger> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs);
}
return ret;
}
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{
Lattice<vInteger> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs,rhs._odata[ss]);
}
return ret;
}
// Less than
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vlt<lobj,robj>(),lhs,rhs);
}
// Less than equal
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vle<lobj,robj>(),lhs,rhs);
}
// Greater than
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vgt<lobj,robj>(),lhs,rhs);
}
// Greater than equal
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vge<lobj,robj>(),lhs,rhs);
}
// equal
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(veq<lobj,robj>(),lhs,rhs);
}
// not equal
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vne<lobj,robj>(),lhs,rhs);
}
}
#endif
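In use, the lattice comparisons above turn coordinate fields into integer masks, which is what the "where" predicates are for (a sketch; the grid parameters are illustrative):

#include <Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  std::vector<int> latt({8,8,8,8}), simd({1,1,2,2}), mpi({1,1,1,1});
  GridCartesian grid(latt, simd, mpi);

  LatticeInteger t(&grid), x(&grid);
  LatticeCoordinate(t, 3);  // time coordinate of each site
  LatticeCoordinate(x, 0);  // x coordinate of each site

  // Site-wise predicates: each element is 1 where the relation holds, else 0
  LatticeInteger diag  = (t == x);
  LatticeInteger upper = (t >  x);

  Grid_finalize();
  return 0;
}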

104
lib/Grid_config.h Normal file

@@ -0,0 +1,104 @@
/* Grid_config.h. Generated from Grid_config.h.in by configure. */
/* Grid_config.h.in. Generated from configure.ac by autoheader. */
/* AVX */
#define AVX1 1
/* AVX2 */
/* #undef AVX2 */
/* AVX512 */
/* #undef AVX512 */
/* GRID_COMMS_MPI */
#define GRID_COMMS_MPI 1
/* GRID_COMMS_NONE */
/* #undef GRID_COMMS_NONE */
/* Define to 1 if you have the `gettimeofday' function. */
#define HAVE_GETTIMEOFDAY 1
/* Define to 1 if you have the <inttypes.h> header file. */
#define HAVE_INTTYPES_H 1
/* Define to 1 if you have the <malloc.h> header file. */
/* #undef HAVE_MALLOC_H */
/* Define to 1 if you have the <malloc/malloc.h> header file. */
#define HAVE_MALLOC_MALLOC_H 1
/* Define to 1 if you have the <memory.h> header file. */
#define HAVE_MEMORY_H 1
/* Define to 1 if you have the <stdint.h> header file. */
#define HAVE_STDINT_H 1
/* Define to 1 if you have the <stdlib.h> header file. */
#define HAVE_STDLIB_H 1
/* Define to 1 if you have the <strings.h> header file. */
#define HAVE_STRINGS_H 1
/* Define to 1 if you have the <string.h> header file. */
#define HAVE_STRING_H 1
/* Define to 1 if you have the <sys/stat.h> header file. */
#define HAVE_SYS_STAT_H 1
/* Define to 1 if you have the <sys/types.h> header file. */
#define HAVE_SYS_TYPES_H 1
/* Define to 1 if you have the <unistd.h> header file. */
#define HAVE_UNISTD_H 1
/* Name of package */
#define PACKAGE "grid"
/* Define to the address where bug reports for this package should be sent. */
#define PACKAGE_BUGREPORT "paboyle@ph.ed.ac.uk"
/* Define to the full name of this package. */
#define PACKAGE_NAME "Grid"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "Grid 1.0"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "grid"
/* Define to the home page for this package. */
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "1.0"
/* SSE4 */
/* #undef SSE4 */
/* Define to 1 if you have the ANSI C header files. */
#define STDC_HEADERS 1
/* Version number of package */
#define VERSION "1.0"
/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
<pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
#define below would cause a syntax error. */
/* #undef _UINT32_T */
/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
<pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
#define below would cause a syntax error. */
/* #undef _UINT64_T */
/* Define to `unsigned int' if <sys/types.h> does not define. */
/* #undef size_t */
/* Define to the type of an unsigned integer type of width exactly 32 bits if
such a type exists and the standard includes do not define it. */
/* #undef uint32_t */
/* Define to the type of an unsigned integer type of width exactly 64 bits if
such a type exists and the standard includes do not define it. */
/* #undef uint64_t */

103
lib/Grid_config.h.in Normal file

@@ -0,0 +1,103 @@
/* Grid_config.h.in. Generated from configure.ac by autoheader. */
/* AVX */
#undef AVX1
/* AVX2 */
#undef AVX2
/* AVX512 */
#undef AVX512
/* GRID_COMMS_MPI */
#undef GRID_COMMS_MPI
/* GRID_COMMS_NONE */
#undef GRID_COMMS_NONE
/* Define to 1 if you have the `gettimeofday' function. */
#undef HAVE_GETTIMEOFDAY
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
/* Define to 1 if you have the <malloc.h> header file. */
#undef HAVE_MALLOC_H
/* Define to 1 if you have the <malloc/malloc.h> header file. */
#undef HAVE_MALLOC_MALLOC_H
/* Define to 1 if you have the <memory.h> header file. */
#undef HAVE_MEMORY_H
/* Define to 1 if you have the <stdint.h> header file. */
#undef HAVE_STDINT_H
/* Define to 1 if you have the <stdlib.h> header file. */
#undef HAVE_STDLIB_H
/* Define to 1 if you have the <strings.h> header file. */
#undef HAVE_STRINGS_H
/* Define to 1 if you have the <string.h> header file. */
#undef HAVE_STRING_H
/* Define to 1 if you have the <sys/stat.h> header file. */
#undef HAVE_SYS_STAT_H
/* Define to 1 if you have the <sys/types.h> header file. */
#undef HAVE_SYS_TYPES_H
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
/* Name of package */
#undef PACKAGE
/* Define to the address where bug reports for this package should be sent. */
#undef PACKAGE_BUGREPORT
/* Define to the full name of this package. */
#undef PACKAGE_NAME
/* Define to the full name and version of this package. */
#undef PACKAGE_STRING
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
/* Define to the home page for this package. */
#undef PACKAGE_URL
/* Define to the version of this package. */
#undef PACKAGE_VERSION
/* SSE4 */
#undef SSE4
/* Define to 1 if you have the ANSI C header files. */
#undef STDC_HEADERS
/* Version number of package */
#undef VERSION
/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
<pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
#define below would cause a syntax error. */
#undef _UINT32_T
/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
<pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
#define below would cause a syntax error. */
#undef _UINT64_T
/* Define to `unsigned int' if <sys/types.h> does not define. */
#undef size_t
/* Define to the type of an unsigned integer type of width exactly 32 bits if
such a type exists and the standard includes do not define it. */
#undef uint32_t
/* Define to the type of an unsigned integer type of width exactly 64 bits if
such a type exists and the standard includes do not define it. */
#undef uint64_t

16
lib/Grid_cshift.h Normal file

@@ -0,0 +1,16 @@
#ifndef _GRID_CSHIFT_H_
#define _GRID_CSHIFT_H_
#include <Grid_cshift_common.h>
#ifdef GRID_COMMS_NONE
#include <Grid_cshift_none.h>
#endif
#ifdef GRID_COMMS_FAKE
#include <Grid_cshift_fake.h>
#endif
#ifdef GRID_COMMS_MPI
#include <Grid_cshift_mpi.h>
#endif
#endif

326
lib/Grid_cshift_common.h Normal file

@@ -0,0 +1,326 @@
#ifndef _GRID_CSHIFT_COMMON_H_
#define _GRID_CSHIFT_COMMON_H_
//////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
//////////////////////////////////////////////////////
friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
// Simple block stride gather of SIMD objects.
// Offsets are computed from (n,b) so the collapsed loop nest is race free.
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o  = n*rhs._grid->_slice_stride[dimension]; // relative offset to base within plane
int bo = n*rhs._grid->_slice_block[dimension];  // offset in buffer
buffer[bo+b]=rhs._odata[so+o+b];
}
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Conditional packing advances bo data-dependently, so this branch stays sequential.
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) {
buffer[bo]=rhs._odata[so+o+b];
bo++;
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split
//////////////////////////////////////////////////////
friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
// Sequential: extract() advances the destination pointers as it goes.
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
extract(rhs._odata[so+o+b],pointers);
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb & cbmask ) {
extract(rhs._odata[so+o+b],pointers);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split
//////////////////////////////////////////////////////
friend void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
// Simple block stride scatter of SIMD objects.
// Offsets are computed from (n,b) so the collapsed loop nest is race free.
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o  = n*rhs._grid->_slice_stride[dimension]; // relative offset to base within plane
int bo = n*rhs._grid->_slice_block[dimension];  // offset in buffer
rhs._odata[so+o+b]=buffer[bo+b];
}
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Conditional unpacking advances bo data-dependently, so this branch stays sequential.
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
rhs._odata[so+o+b]=buffer[bo++];
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//////////////////////////////////////////////////////
// Scatter for when there *is* need to SIMD split
//////////////////////////////////////////////////////
friend void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
// Sequential: merge() advances the source pointers as it goes.
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
merge(rhs._odata[so+o+b],pointers);
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
merge(rhs._odata[so+o+b],pointers);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//////////////////////////////////////////////////////
// local to node block strided copies
//////////////////////////////////////////////////////
friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
// Simple block stride copy of SIMD objects; o computed from n keeps the nest race free
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o = n*rhs._grid->_slice_stride[dimension]; // relative offset to base within plane
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
}
}
} else {
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o = n*rhs._grid->_slice_stride[dimension];
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
}
}
}
}
}
friend void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
// Simple block stride permute-copy of SIMD objects; o computed from n keeps the nest race free
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o = n*rhs._grid->_slice_stride[dimension]; // relative offset to base within plane
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
}
}
} else {
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int o = n*rhs._grid->_slice_stride[dimension];
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
}
}
}
}
}
//////////////////////////////////////////////////////
// Local to node Cshift
//////////////////////////////////////////////////////
friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Cshift_local(ret,rhs,dimension,shift,0x3);
} else {
Cshift_local(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_local(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
friend Lattice<vobj> Cshift_local(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid = rhs._grid;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int gd = grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift);
// the permute type
int permute_dim =grid->PermuteDim(dimension);
int permute_type=grid->PermuteType(dimension);
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * grid->_ostride[dimension];
int cb= (cbmask==0x2)? 1 : 0;
int sshift = grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
}
return ret;
}
#endif
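
The wrap/num arithmetic in Cshift_local decides which destination slices pick up a SIMD-lane permute. A standalone model of just that arithmetic, with illustrative values:

  #include <cstdio>
  int main(void){
    int rd=4, sshift=5; // reduced dimension and checkerboarded shift (example values)
    for(int x=0;x<rd;x++){
      int wrap = sshift/rd; // number of times the shift wraps the reduced dimension
      int num  = sshift%rd;
      int permute_slice = (x < rd-num) ? wrap : 1-wrap;
      printf("x=%d permute_slice=%d\n",x,permute_slice);
    }
    return 0;
  }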

263
lib/Grid_cshift_mpi.h Normal file
View File

@ -0,0 +1,263 @@
#ifndef _GRID_MPI_CSHIFT_H_
#define _GRID_MPI_CSHIFT_H_
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)>(y)?(y):(x))
#endif
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
Lattice<vobj> ret(rhs._grid);
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
// the permute type
int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ;
int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);
if ( !comm_dim ) {
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
} else if ( splice_dim ) {
Cshift_comms_simd(ret,rhs,dimension,shift);
} else {
Cshift_comms(ret,rhs,dimension,shift);
}
return ret;
}
friend void Cshift_comms(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Cshift_comms(ret,rhs,dimension,shift,0x3);
} else {
Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
friend void Cshift_comms_simd(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
} else {
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
friend void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs._grid;
Lattice<vobj> temp(rhs._grid);
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
int cb= (cbmask==0x2)? 1 : 0;
int sshift= rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int offnode = ( x+sshift >= rd );
int sx = (x+sshift)%rd;
int comm_proc = (x+sshift)/rd;
if (!offnode) {
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
} else {
int words = send_buf.size();
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
}
}
}
friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs._grid;
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
std::vector<std::vector<scalar_type> > recv_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
int bytes = buffer_size*words*sizeof(scalar_type);
std::vector<scalar_type *> pointers(Nsimd); //
std::vector<scalar_type *> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? 1 : 0;
int sshift= grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
std::vector<int> comm_offnode(simd_layout);
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
std::vector<int> icoor(grid->Nd());
for(int x=0;x<rd;x++){
int comm_any = 0;
for(int s=0;s<simd_layout;s++) {
int shifted_x = x+s*rd+sshift;
comm_offnode[s] = shifted_x >= ld;
comm_any = comm_any | comm_offnode[s];
comm_proc[s] = shifted_x/ld;
}
int o = 0;
int bo = x*grid->_ostride[dimension];
int sx = (x+sshift)%rd;
if ( comm_any ) {
for(int i=0;i<Nsimd;i++){
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int s;
grid->iCoorFromIindex(icoor,i);
s = icoor[dimension];
if(comm_offnode[s]){
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
} else {
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
}
// Permute by swizzling pointers in merge
int permute_slice=0;
int lshift=sshift%ld;
int wrap =lshift/rd;
int num =lshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
int toggle_bit = (Nsimd>>(permute_type+1));
int PermuteMap;
for(int i=0;i<Nsimd;i++){
if ( permute_slice ) {
PermuteMap=i^toggle_bit;
pointers[i] = rpointers[PermuteMap];
} else {
pointers[i] = rpointers[i];
}
}
Scatter_plane_merge(ret,pointers,dimension,x,cbmask);
} else {
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
}
}
}
#endif
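
The offnode test in Cshift_comms is pure integer arithmetic on slice indices. A standalone model with illustrative values (rd slices per rank, sshift already checkerboard-adjusted):

  #include <cstdio>
  int main(void){
    int rd=4, sshift=6;
    for(int x=0;x<rd;x++){
      int offnode   = ( x+sshift >= rd ); // source plane lives on another rank
      int sx        = ( x+sshift ) % rd;  // slice index on the owning rank
      int comm_proc = ( x+sshift ) / rd;  // how many ranks away the owner sits
      printf("x=%d offnode=%d sx=%d comm_proc=%d\n",x,offnode,sx,comm_proc);
    }
    return 0;
  }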

12
lib/Grid_cshift_none.h Normal file
View File

@ -0,0 +1,12 @@
#ifndef _GRID_NONE_CSHIFT_H_
#define _GRID_NONE_CSHIFT_H_
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
Cshift_local(ret,rhs,dimension,shift);
return ret;
}
#endif

91
lib/Grid_init.cc Executable file
View File

@ -0,0 +1,91 @@
/****************************************************************************/
/* PAB: Signal magic. Processor state dump is x86-64 specific */
/****************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <signal.h>
#include "Grid.h"
#undef __X86_64
namespace Grid {
void Grid_init(int *argc,char ***argv)
{
#ifdef GRID_COMMS_MPI
MPI_Init(argc,argv);
#endif
Grid_debug_handler_init();
}
void Grid_finalize(void)
{
#ifdef GRID_COMMS_MPI
MPI_Finalize();
#endif
}
double usecond(void) {
struct timeval tv;
gettimeofday(&tv,NULL);
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
{
printf("Caught signal %d\n",si->si_signo);
printf(" mem address %llx\n",(uint64_t)si->si_addr);
printf(" code %d\n",si->si_code);
#ifdef __X86_64
ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
printf(" instruction %llx\n",(uint64_t)sc->rip);
#define REG(A) printf(" %s %lx\n",#A, sc-> A);
REG(rdi);
REG(rsi);
REG(rbp);
REG(rbx);
REG(rdx);
REG(rax);
REG(rcx);
REG(rsp);
REG(rip);
REG(r8);
REG(r9);
REG(r10);
REG(r11);
REG(r12);
REG(r13);
REG(r14);
REG(r15);
#endif
fflush(stdout);
if ( si->si_signo == SIGSEGV ) {
printf("Grid_sa_signal_handler: Oops... this was a sigsegv you naughty naughty programmer. Goodbye\n");
fflush(stdout);
exit(-1);
}
return;
};
void Grid_debug_handler_init(void)
{
struct sigaction sa,osa;
sigemptyset (&sa.sa_mask);
sa.sa_sigaction= Grid_sa_signal_handler;
sa.sa_flags = SA_SIGINFO;
sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL);
}
}
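
A minimal driver for the services defined here, as a sketch (the timing use of usecond() is illustrative):

  #include <Grid.h>
  #include <cstdio>
  int main(int argc,char **argv)
  {
    Grid::Grid_init(&argc,&argv); // MPI_Init when GRID_COMMS_MPI, plus signal handlers
    double t0 = Grid::usecond();
    // ... lattice work goes here ...
    double t1 = Grid::usecond();
    printf("elapsed %f us\n",t1-t0);
    Grid::Grid_finalize();
    return 0;
  }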

100
lib/Grid_math_type_mapper.h Normal file
View File

@ -0,0 +1,100 @@
#ifndef GRID_MATH_TYPE_MAPPER_H
#define GRID_MATH_TYPE_MAPPER_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////////
// Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
// Use of a helper class like this allows us to template specialise and "dress"
// other classes such as RealD == double, ComplexD == std::complex<double> with these
// traits.
//
// It is possible that we could do this more elegantly if I introduced a
// queryable trait in iScalar, iMatrix and iVector and used the query on vtype in
// place of the type mapper?
//
// Not sure how to do this, but probably could be done with a research effort
// to study C++11's type_traits.h file. (std::enable_if<isGridTensorType<vtype> >)
//
//////////////////////////////////////////////////////////////////////////////////
template <class T> class GridTypeMapper {
public:
typedef typename T::scalar_type scalar_type;
typedef typename T::vector_type vector_type;
typedef typename T::tensor_reduced tensor_reduced;
enum { TensorLevel = T::TensorLevel };
};
//////////////////////////////////////////////////////////////////////////////////
// Recursion stops with these template specialisations
//////////////////////////////////////////////////////////////////////////////////
template<> class GridTypeMapper<RealF> {
public:
typedef RealF scalar_type;
typedef RealF vector_type;
typedef RealF tensor_reduced ;
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<RealD> {
public:
typedef RealD scalar_type;
typedef RealD vector_type;
typedef RealD tensor_reduced;
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<ComplexF> {
public:
typedef ComplexF scalar_type;
typedef ComplexF vector_type;
typedef ComplexF tensor_reduced;
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<ComplexD> {
public:
typedef ComplexD scalar_type;
typedef ComplexD vector_type;
typedef ComplexD tensor_reduced;
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vRealF> {
public:
typedef RealF scalar_type;
typedef vRealF vector_type;
typedef vRealF tensor_reduced;
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vRealD> {
public:
typedef RealD scalar_type;
typedef vRealD vector_type;
typedef vRealD tensor_reduced;
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vComplexF> {
public:
typedef ComplexF scalar_type;
typedef vComplexF vector_type;
typedef vComplexF tensor_reduced;
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vComplexD> {
public:
typedef ComplexD scalar_type;
typedef vComplexD vector_type;
typedef vComplexD tensor_reduced;
enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vInteger> {
public:
typedef Integer scalar_type;
typedef vInteger vector_type;
typedef vInteger tensor_reduced;
enum { TensorLevel = 0 };
};
}
#endif
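
The contract of the specialisations can be stated as compile-time checks; a sketch one could drop into a test (using the C++11 type_traits contemplated in the comment above):

  #include <type_traits>
  static_assert(std::is_same<Grid::GridTypeMapper<Grid::vComplexD>::scalar_type,
                             Grid::ComplexD>::value, "vComplexD reduces to ComplexD");
  static_assert(std::is_same<Grid::GridTypeMapper<Grid::vRealF>::vector_type,
                             Grid::vRealF>::value, "vRealF is its own vector_type");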

1542
lib/Grid_math_types.h Normal file

File diff suppressed because it is too large Load Diff

62
lib/Grid_predicated.h Normal file
View File

@ -0,0 +1,62 @@
#ifndef GRID_PREDICATED_H
#define GRID_PREDICATED_H
// Must implement the predicate gating the conditional assignment.
// Must be able to reduce the predicate down to a single vInteger per site.
// Must be able to require the type be iScalar x iScalar x ....
// give a GetVtype method in iScalar
// and blow away the tensor structures.
//
template<class vobj>
inline void where(Lattice<vobj> &ret,const LatticeInteger &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
{
conformable(iftrue,iffalse);
conformable(iftrue,predicate);
conformable(iftrue,ret);
GridBase *grid=iftrue._grid;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int Nsimd = grid->Nsimd();
const int words = sizeof(vobj)/sizeof(vector_type);
std::vector<Integer> mask(Nsimd);
std::vector<std::vector<scalar_type> > truevals (Nsimd,std::vector<scalar_type>(words) );
std::vector<std::vector<scalar_type> > falsevals(Nsimd,std::vector<scalar_type>(words) );
std::vector<scalar_type *> pointers(Nsimd);
#pragma omp parallel for
for(int ss=0;ss<iftrue._grid->oSites(); ss++){
for(int s=0;s<Nsimd;s++) pointers[s] = & truevals[s][0];
extract(iftrue._odata[ss] ,pointers);
for(int s=0;s<Nsimd;s++) pointers[s] = & falsevals[s][0];
extract(iffalse._odata[ss] ,pointers);
extract(predicate._odata[ss],mask);
for(int s=0;s<Nsimd;s++){
if (mask[s]) pointers[s]=&truevals[s][0];
else pointers[s]=&falsevals[s][0];
}
merge(ret._odata[ss],pointers);
}
}
template<class vobj>
inline Lattice<vobj> where(const LatticeInteger &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
{
conformable(iftrue,iffalse);
conformable(iftrue,predicate);
Lattice<vobj> ret(iftrue._grid);
where(ret,predicate,iftrue,iffalse);
return ret;
}
#endif
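
Usage mirrors a per-site ternary, ret = mask ? a : b. A sketch (the grid, the mask construction and the field type are illustrative):

  LatticeInteger mask(grid);            // nonzero where the "true" branch should win
  Lattice<vComplex> a(grid), b(grid), ret(grid);
  ret = where(mask,a,b);                // allocating form
  where(ret,mask,a,b);                  // in-place form, avoids the temporary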

293
lib/Grid_simd.h Normal file
View File

@ -0,0 +1,293 @@
#ifndef GRID_SIMD_H
#define GRID_SIMD_H
////////////////////////////////////////////////////////////////////////
// Define scalar and vector floating point types
//
// Scalar: RealF, RealD, ComplexF, ComplexD
//
// Vector: vRealF, vRealD, vComplexF, vComplexD
//
// Vector types are arch dependent
////////////////////////////////////////////////////////////////////////
#ifdef SSE4
#include <pmmintrin.h>
#endif
#if defined(AVX1) || defined (AVX2)
#include <immintrin.h>
#endif
#ifdef AVX512
#include <immintrin.h>
#endif
namespace Grid {
typedef float RealF;
typedef double RealD;
typedef std::complex<RealF> ComplexF;
typedef std::complex<RealD> ComplexD;
inline RealF adj(const RealF & r){ return r; }
inline RealF conj(const RealF & r){ return r; }
inline RealF real(const RealF & r){ return r; }
inline RealD adj(const RealD & r){ return r; }
inline RealD conj(const RealD & r){ return r; }
inline RealD real(const RealD & r){ return r; }
inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
////////////////////////////////////////////////////////////////////////////////
//Provide support functions for basic real and complex data types required by Grid
//Single and double precision versions. Should be able to template this once only.
////////////////////////////////////////////////////////////////////////////////
inline void mac (ComplexD * __restrict__ y,const ComplexD * __restrict__ a,const ComplexD *__restrict__ x){ *y = (*a) * (*x)+(*y); };
inline void mult(ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (ComplexD * __restrict__ y,const ComplexD * __restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r);}
inline ComplexD adj(const ComplexD& r){ return(conj(r)); }
// conj already supported for complex
inline void mac (ComplexF * __restrict__ y,const ComplexF * __restrict__ a,const ComplexF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
inline ComplexF adj(const ComplexF& r ){ return(conj(r)); }
//conj already supported for complex
inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
inline void add (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r);}
inline void mac (RealF * __restrict__ y,const RealF * __restrict__ a,const RealF *__restrict__ x){ *y = (*a) * (*x)+(*y); }
inline void mult(RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (RealF * __restrict__ y,const RealF * __restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
class Zero{};
static Zero zero;
template<class itype> inline void zeroit(itype &arg){ arg=zero;};
template<> inline void zeroit(ComplexF &arg){ arg=0; };
template<> inline void zeroit(ComplexD &arg){ arg=0; };
template<> inline void zeroit(RealF &arg){ arg=0; };
template<> inline void zeroit(RealD &arg){ arg=0; };
#if defined (SSE4)
typedef __m128 fvec;
typedef __m128d dvec;
typedef __m128 cvec;
typedef __m128d zvec;
typedef __m128i ivec;
#endif
#if defined (AVX1) || defined (AVX2)
typedef __m256 fvec;
typedef __m256d dvec;
typedef __m256 cvec;
typedef __m256d zvec;
typedef __m256i ivec;
#endif
#if defined (AVX512)
typedef __m512 fvec;
typedef __m512d dvec;
typedef __m512 cvec;
typedef __m512d zvec;
typedef __m512i ivec;
#endif
#if defined (QPX)
typedef float fvec __attribute__ ((vector_size (16))); // QPX has same SIMD width irrespective of precision
typedef float cvec __attribute__ ((vector_size (16)));
typedef vector4double dvec;
typedef vector4double zvec;
#endif
#if defined (AVX1) || defined (AVX2) || defined (AVX512)
inline void v_prefetch0(int size, const char *ptr){
for(int i=0;i<size;i+=64){ // Define L1 linesize above// What about SSE?
_mm_prefetch(ptr+i+4096,_MM_HINT_T1);
_mm_prefetch(ptr+i+512,_MM_HINT_T0);
}
}
#else
inline void v_prefetch0(int size, const char *ptr){};
#endif
/////////////////////////////////////////////////////////////////
// Generic extract/merge/permute
/////////////////////////////////////////////////////////////////
template<class vsimd,class scalar>
inline void Gextract(const vsimd &y,std::vector<scalar *> &extracted){
// FIXME: bounce off stack is painful
// temporary hack while I figure out better way.
// There are intrinsics to do this work without the storage.
int Nextr=extracted.size();
int Nsimd=vsimd::Nsimd();
int s=Nsimd/Nextr;
std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd);
vstore(y,&buf[0]);
for(int i=0;i<Nextr;i++){
*extracted[i] = buf[i*s];
extracted[i]++;
}
};
template<class vsimd,class scalar>
inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
int Nextr=extracted.size();
int Nsimd=vsimd::Nsimd();
int s=Nsimd/Nextr;
std::vector<scalar> buf(Nsimd);
for(int i=0;i<Nextr;i++){
for(int ii=0;ii<s;ii++){
buf[i*s+ii]=*extracted[i];
}
extracted[i]++;
}
vset(y,&buf[0]);
};
template<class vsimd,class scalar>
inline void Gextract(const vsimd &y,std::vector<scalar> &extracted){
// FIXME: bounce off stack is painful
// temporary hack while I figure out better way.
// There are intrinsics to do this work without the storage.
int Nextr=extracted.size();
int Nsimd=vsimd::Nsimd();
int s=Nsimd/Nextr;
std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd);
vstore(y,&buf[0]);
for(int i=0;i<Nextr;i++){
extracted[i] = buf[i*s];
}
};
template<class vsimd,class scalar>
inline void Gmerge(vsimd &y,std::vector<scalar> &extracted){
int Nextr=extracted.size();
int Nsimd=vsimd::Nsimd();
int s=Nsimd/Nextr;
std::vector<scalar> buf(Nsimd);
for(int i=0;i<Nextr;i++){
for(int ii=0;ii<s;ii++){
buf[i*s+ii]=extracted[i];
}
}
vset(y,&buf[0]);
};
//////////////////////////////////////////////////////////
// Permute
// Permute 0 every ABCDEFGH -> BA DC FE HG
// Permute 1 every ABCDEFGH -> CD AB GH EF
// Permute 2 every ABCDEFGH -> EFGH ABCD
// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
// Permute 4 possible on half precision @512bit vectors.
//////////////////////////////////////////////////////////
template<class vsimd>
inline void Gpermute(vsimd &y,const vsimd &b,int perm){
switch (perm){
#if defined(AVX1)||defined(AVX2)
// 8x32 bits=>3 permutes
case 2: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
case 1: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break;
case 0: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break;
#endif
#ifdef SSE4
case 1: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break;
#endif
#ifdef AVX512
// 16 floats=> permutes
// Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo
// Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn
// Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
// Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
case 3: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_CDAB); break;
case 2: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_BADC); break;
case 1: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
case 0: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
#endif
#ifdef QPX
#error not implemented
#endif
default: assert(0); break;
}
};
};
#include <Grid_vInteger.h>
#include <Grid_vRealF.h>
#include <Grid_vRealD.h>
#include <Grid_vComplexF.h>
#include <Grid_vComplexD.h>
namespace Grid {
// NB: Template the following on "type Complex" and then implement *,+,- for
// ComplexF, ComplexD, RealF, RealD above to
// get full generality of binops with scalars.
inline void mac (vComplexF *__restrict__ y,const ComplexF *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexF *__restrict__ y,const ComplexF *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vComplexF *__restrict__ y,const vComplexF *__restrict__ a,const ComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexF *__restrict__ y,const vComplexF *__restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vComplexD *__restrict__ y,const ComplexD *__restrict__ a,const vComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexD *__restrict__ y,const ComplexD *__restrict__ l,const vComplexD *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vComplexD *__restrict__ y,const vComplexD *__restrict__ a,const ComplexD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vComplexD *__restrict__ y,const vComplexD *__restrict__ l,const ComplexD *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealF *__restrict__ y,const RealF *__restrict__ a,const vRealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealF *__restrict__ y,const RealF *__restrict__ l,const vRealF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealF *__restrict__ y,const vRealF *__restrict__ a,const RealF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealF *__restrict__ y,const vRealF *__restrict__ l,const RealF *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealD *__restrict__ y,const RealD *__restrict__ a,const vRealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealD *__restrict__ y,const RealD *__restrict__ l,const vRealD *__restrict__ r){ *y = (*l) + (*r); }
inline void mac (vRealD *__restrict__ y,const vRealD *__restrict__ a,const RealD *__restrict__ x){ *y = (*a)*(*x)+(*y); };
inline void mult(vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r); }
inline void sub (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r); }
inline void add (vRealD *__restrict__ y,const vRealD *__restrict__ l,const RealD *__restrict__ r){ *y = (*l) + (*r); }
// Default precision
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
typedef RealD Real;
typedef vRealD vReal;
typedef vComplexD vComplex;
typedef std::complex<Real> Complex;
#else
typedef RealF Real;
typedef vRealF vReal;
typedef vComplexF vComplex;
typedef std::complex<Real> Complex;
#endif
}
#endif
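
extract and merge are inverses whenever the scalar list covers every lane, which is how the cshift and predication code round-trips vectors through scalar buffers. A sketch using the vComplexD interface (a static Nsimd() lane count is assumed):

  using namespace Grid;
  vComplexD v; vsplat(v,ComplexD(1.0,2.0));        // every lane = 1+2i
  std::vector<ComplexD> lanes(vComplexD::Nsimd());
  extract(v,lanes);                                // vector -> one scalar per lane
  lanes[0] = ComplexD(0.0,0.0);                    // edit a single lane scalar-side
  merge(v,lanes);                                  // scalars -> vector again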

351
lib/Grid_stencil.h Normal file
View File

@ -0,0 +1,351 @@
#ifndef GRID_STENCIL_H
#define GRID_STENCIL_H
//////////////////////////////////////////////////////////////////////////////////////////
// Must not lose sight that goal is to be able to construct really efficient
// gather to a point stencil code. CSHIFT is not the best way, so need
// additional stencil support.
//
// Stencil based code will pre-exchange haloes and use a table lookup for neighbours.
// This will be done with generality to allow easier efficient implementations.
// Overlap of comms and compute could be semi-automated by tabulating which sites are
// off-node connected and which are purely local.
//
// Lattice <foo> could also allocate haloes which get used for stencil code.
//
// Grid could create a neighbour index table for a given stencil.
//
// Could also implement CovariantCshift, to fuse the loops and enhance performance.
//
//
// General stencil computation:
//
// Generic services
// 0) Prebuild neighbour tables
// 1) Compute sizes of all haloes/comms buffers; allocate them.
//
// 2) Gather all faces, and communicate.
// 3) Loop over result sites, giving nbr index/offnode info for each
//
// Could take a staged approach:
// SpinProjectFaces
// start comms
// complete comms
// Reconstruct Umu
//
//
//////////////////////////////////////////////////////////////////////////////////////////
namespace Grid {
struct CommsRequest {
int words;
int unified_buffer_offset;
int tag;
int to_rank;
int from_rank;
} ;
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
public:
int _checkerboard;
int _npoints; // Move to template param?
GridBase * _grid;
// npoints of these
std::vector<int> _directions;
std::vector<int> _distances;
std::vector<int> _comm_buf_size;
std::vector<int> _permute_type;
// npoints x Osites() of these
std::vector<std::vector<int> > _offsets;
std::vector<std::vector<int> > _is_local;
std::vector<std::vector<int> > _permute;
int _unified_buffer_size;
int _request_count;
std::vector<CommsRequest> CommsRequests;
CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances);
// Add to tables for various cases; is this mistaken? Only local if 1 proc in dim.
// Can this be avoided with simpler coding of comms?
void Local (int point, int dimension,int shift,int cbmask);
void Comms (int point, int dimension,int shift,int cbmask);
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute);
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset);
// Could allow a functional munging of the halo to another type during the comms.
// this could implement the 16bit/32bit/64bit compression.
template<class vobj> void HaloExchange(Lattice<vobj> &source,
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf)
{
// conformable(source._grid,_grid);
assert(source._grid==_grid);
if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
int u_comm_offset=0;
// Gather all comms buffers
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
for(int point = 0 ; point < _npoints; point++) {
printf("Point %d \n",point);fflush(stdout);
int dimension = _directions[point];
int displacement = _distances[point];
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
// Map to always positive shift modulo global full dimension.
int shift = (displacement+fd)%fd;
int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift);
assert (checkerboard== _checkerboard);
// the permute type
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
// Gather phase
int sshift [2];
if ( comm_dim ) {
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
if (splice_dim) {
printf("splice 0x3 \n");fflush(stdout);
GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset);
} else {
printf("NO splice 0x3 \n");fflush(stdout);
GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset);
}
} else {
if(splice_dim){
printf("splice 0x1,2 \n");fflush(stdout);
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);// if checkerboard is unfavourable take two passes
GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);// both with block stride loop iteration
} else {
printf("NO splice 0x1,2 \n");fflush(stdout);
GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset);
GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset);
}
}
}
}
}
template<class vobj> void GatherStartComms(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf,
int &u_comm_offset)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=_grid;
assert(rhs._grid==_grid);
// conformable(_grid,rhs._grid);
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size); // hmm...
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
int cb= (cbmask==0x2)? 1 : 0;
int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
printf("GatherStartComms x %d/%d\n",x,rd);fflush(stdout);
int offnode = ( x+sshift >= rd );
int sx = (x+sshift)%rd;
int comm_proc = (x+sshift)/rd;
if (offnode) {
printf("GatherStartComms offnode x %d\n",x);fflush(stdout);
int words = send_buf.size();
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
printf("Gather_plane_simple dimension %d sx %d cbmask %d\n",dimension,sx,cbmask);fflush(stdout);
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
printf("GatherStartComms gathered offnode x %d\n",x);fflush(stdout);
int rank = _grid->_processor;
int recv_from_rank;
int xmit_to_rank;
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
// FIXME Implement asynchronous send & also avoid buffer copy
_grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
printf("GatherStartComms communicated offnode x %d\n",x);fflush(stdout);
printf("GatherStartComms inserting %d buf size %d\n",u_comm_offset,buffer_size);fflush(stdout);
for(int i=0;i<buffer_size;i++){
u_comm_buf[u_comm_offset+i]=recv_buf[i];
}
u_comm_offset+=buffer_size;
printf("GatherStartComms inserted x %d\n",x);fflush(stdout);
}
}
}
template<class vobj>
void GatherStartCommsSimd(Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<vobj,alignedAllocator<vobj> > &u_comm_buf,
int &u_comm_offset)
{
const int Nsimd = _grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=_grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
/* FIXME ALTERNATE BUFFER DETERMINATION */
std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
std::vector<std::vector<scalar_type> > recv_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
int bytes = buffer_size*words*sizeof(scalar_type);
std::vector<scalar_type *> pointers(Nsimd); //
std::vector<scalar_type *> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? 1 : 0;
int sshift= _grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
std::vector<int> comm_offnode(simd_layout);
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
std::vector<int> icoor(_grid->Nd());
for(int x=0;x<rd;x++){
int comm_any = 0;
for(int s=0;s<simd_layout;s++) {
int shifted_x = x+s*rd+sshift;
comm_offnode[s] = shifted_x >= ld;
comm_any = comm_any | comm_offnode[s];
comm_proc[s] = shifted_x/ld;
}
int o = 0;
int bo = x*_grid->_ostride[dimension];
int sx = (x+sshift)%rd;
if ( comm_any ) {
for(int i=0;i<Nsimd;i++){
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int s;
_grid->iCoorFromIindex(icoor,i);
s = icoor[dimension];
if(comm_offnode[s]){
int rank = _grid->_processor;
int recv_from_rank;
int xmit_to_rank;
_grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
_grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
} else {
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
}
// Permute by swizzling pointers in merge
int permute_slice=0;
int lshift=sshift%ld;
int wrap =lshift/rd;
int num =lshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
int toggle_bit = (Nsimd>>(permute_type+1));
int PermuteMap;
for(int i=0;i<Nsimd;i++){
if ( permute_slice ) {
PermuteMap=i^toggle_bit;
pointers[i] = rpointers[PermuteMap];
} else {
pointers[i] = rpointers[i];
}
}
// Here we don't want to scatter, just place into a buffer.
for(int i=0;i<buffer_size;i++){
merge(u_comm_buf[u_comm_offset+i],pointers);
}
u_comm_offset+=buffer_size; // advance past this plane in the unified buffer
}
}
}
};
}
#endif
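
A construction sketch for a nearest-neighbour stencil (the grid, source field, element type vobj and checkerboard are assumed to exist; only the constructor and HaloExchange signatures come from this header):

  // 8-point stencil in 4-d: displacement +1 and -1 in each dimension
  std::vector<int> directions({0,1,2,3, 0,1,2,3});
  std::vector<int> distances ({1,1,1,1, -1,-1,-1,-1});
  CartesianStencil stencil(grid,8,checkerboard,directions,distances);
  std::vector<vobj,alignedAllocator<vobj> > comm_buf;
  stencil.HaloExchange(source,comm_buf); // gathers faces; comm_buf is resized as needed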

258
lib/Grid_stencil_common.cc Normal file
View File

@ -0,0 +1,258 @@
#include "Grid.h"
namespace Grid {
CartesianStencil::CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances)
: _offsets(npoints),
_is_local(npoints),
_comm_buf_size(npoints),
_permute_type(npoints),
_permute(npoints)
{
_npoints = npoints;
_grid = grid;
_directions = directions;
_distances = distances;
_unified_buffer_size=0;
_request_count =0;
CommsRequests.resize(0);
int osites = _grid->oSites();
for(int i=0;i<npoints;i++){
int point = i;
_offsets[i].resize( osites);
_is_local[i].resize(osites);
_permute[i].resize( osites);
int dimension = directions[i];
int displacement = distances[i];
int shift = displacement;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
_permute_type[point]=_grid->PermuteType(dimension);
_checkerboard = checkerboard;
// the permute type
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
int sshift[2];
// Underlying approach. For each local site build
// up a table containing the npoint "neighbours" and whether they
// live in lattice or a comms buffer.
if ( !comm_dim ) {
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Local(point,dimension,shift,0x3);
} else {
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Local(point,dimension,shift,0x2);// both with block stride loop iteration
}
} else { // All permute extract done in comms phase prior to Stencil application
// So tables are the same whether comm_dim or splice_dim
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Comms(point,dimension,shift,0x3);
} else {
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
}
}
}
}
void CartesianStencil::Local (int point, int dimension,int shift,int cbmask)
{
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int gd = _grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
// the permute type
int permute_dim =_grid->PermuteDim(dimension);
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * _grid->_ostride[dimension];
int cb= (cbmask==0x2)? 1 : 0;
int sshift = _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
CopyPlane(point,dimension,x,sx,cbmask,permute_slice);
}
}
void CartesianStencil::Comms (int point,int dimension,int shift,int cbmask)
{
GridBase *grid=_grid;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
// sent to one or more remote nodes.
int cb= (cbmask==0x2)? 1 : 0;
int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int offnode = ( x+sshift >= rd );
int sx = (x+sshift)%rd;
int comm_proc = (x+sshift)/rd;
if (!offnode) {
int permute_slice=0;
CopyPlane(point,dimension,x,sx,cbmask,permute_slice);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
// GatherPlaneSimple (point,dimension,sx,cbmask);
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
CommsRequest cr;
cr.tag = _request_count++;
cr.words = words;
cr.unified_buffer_offset = _unified_buffer_size;
_unified_buffer_size += words;
grid->ShiftedRanks(dimension,comm_proc,cr.to_rank,cr.from_rank);
CommsRequests.push_back(cr);
ScatterPlane(point,dimension,x,cbmask,cr.unified_buffer_offset); // permute/extract/merge is done in comms phase
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CartesianStencil::CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int o = 0; // relative offset to base within plane
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_offsets [point][lo+o+b]=ro+o+b;
_is_local[point][lo+o+b]=1;
_permute [point][lo+o+b]=permute;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
_offsets [point][lo+o+b]=ro+o+b;
_is_local[point][lo+o+b]=1;
_permute [point][lo+o+b]=permute;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CartesianStencil::ScatterPlane (int point,int dimension,int plane,int cbmask,int offset)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_offsets [point][so+o+b]=offset+(bo++);
_is_local[point][so+o+b]=0;
_permute [point][so+o+b]=0;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
_offsets [point][so+o+b]=offset+(bo++);
_is_local[point][so+o+b]=0;
_permute [point][so+o+b]=0;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
}
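
The tables are consumed site by site: _offsets says where the neighbour lives, _is_local whether that index points into the lattice or the unified comms buffer, and _permute whether lanes must be reordered. A sketch of a hypothetical consuming kernel (not part of this commit):

  for(int ss=0;ss<grid->oSites();ss++){
    for(int point=0;point<stencil._npoints;point++){
      vobj nbr;
      if ( stencil._is_local[point][ss] ) {
        nbr = source._odata[ stencil._offsets[point][ss] ]; // on-node neighbour
        // stencil._permute[point][ss] would trigger a lane permute here
      } else {
        nbr = comm_buf[ stencil._offsets[point][ss] ];      // from the halo buffer
      }
      // ... accumulate nbr into the result at ss ...
    }
  }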

12
lib/Grid_summation.h Normal file
View File

@ -0,0 +1,12 @@
#ifndef GRID_SUMMATION_H
#define GRID_SUMMATION_H
template<class vobj>
inline void sumBlocks(Lattice<vobj> &coarseData,const Lattice<vobj &fineData)
{
GridBase * fine = findData._grid;
GridBase * coarse= findData._grid;
return;
}
#endif
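
sumBlocks is a stub at this point; judging by the coarse/fine naming, the intent is a block-wise reduction of the fine field into the coarse one (an assumption, not confirmed by this commit). A one-dimensional scalar model of that reduction:

  #include <vector>
  // Hypothetical model: fine sites fall into coarse blocks of 'block' sites each;
  // each coarse value is the sum over its block.
  std::vector<double> sumBlocks1d(const std::vector<double> &fine,int block){
    std::vector<double> coarse(fine.size()/block,0.0);
    for(std::size_t s=0;s<fine.size();s++) coarse[s/block]+=fine[s];
    return coarse;
  }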

346
lib/Grid_vComplexD.h Normal file
View File

@ -0,0 +1,346 @@
#ifndef VCOMPLEXD_H
#define VCOMPLEXD_H
#include "Grid.h"
#include "Grid_vComplexF.h"
namespace Grid {
class vComplexD {
public:
zvec v;
public:
typedef zvec vector_type;
typedef ComplexD scalar_type;
vComplexD & operator = ( Zero & z){
vzero(*this);
return (*this);
}
vComplexD(){};
vComplexD(ComplexD a){
vsplat(*this,a);
};
vComplexD(double a){
vsplat(*this,ComplexD(a));
};
///////////////////////////////////////////////
// mac, mult, sub, add, adj
// Should do an AVX2 version with mac.
///////////////////////////////////////////////
friend inline void mac (vComplexD * __restrict__ y,const vComplexD * __restrict__ a,const vComplexD *__restrict__ x) {*y = (*a)*(*x)+(*y);};
friend inline void mult(vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) * (*r);}
friend inline void sub (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) - (*r);}
friend inline void add (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) + (*r);}
friend inline vComplexD adj(const vComplexD &in){ return conj(in); }
//////////////////////////////////
// Initialise to 1,0,i
//////////////////////////////////
friend inline void vone (vComplexD &ret){ vsplat(ret,1.0,0.0);}
friend inline void vzero (vComplexD &ret){ vsplat(ret,0.0,0.0);}
friend inline void vcomplex_i(vComplexD &ret){ vsplat(ret,0.0,1.0);}
////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
friend inline vComplexD operator + (vComplexD a, vComplexD b)
{
vComplexD ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_add_pd(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_add_pd(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_add_pd(a.v,b.v);
#endif
#ifdef QPX
ret.v = vec_add(a.v,b.v);
#endif
return ret;
};
friend inline vComplexD operator - (vComplexD a, vComplexD b)
{
vComplexD ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_sub_pd(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_sub_pd(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_sub_pd(a.v,b.v);
#endif
#ifdef QPX
ret.v = vec_sub(a.v,b.v);
#endif
return ret;
};
friend inline vComplexD operator * (vComplexD a, vComplexD b)
{
vComplexD ret;
// Multiplication of (ak+ibk)*(ck+idk)
// a + i b can be stored as a data structure
// From the Intel optimisation reference guide:
/*
movsldup xmm0, Src1; load real parts into the destination,
; a1, a1, a0, a0
movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0
mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0
shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0
movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0
mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0
addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d
VSHUFPD (VEX.256 encoded version)
IF IMM0[0] = 0
THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI;
IF IMM0[1] = 0
THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI;
IF IMM0[2] = 0
THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
IF IMM0[3] = 0
THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI;
*/
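// Worked lane example of the AVX path below (a sketch; lane 0 holds the
// real part, as vsplat stores it):
//   ymm0 = (ar,ar) * (br,bi)   = (ar*br, ar*bi)
//   ymm1 = (bi,br), ymm2 = (ai,ai), ymm1*ymm2 = (ai*bi, ai*br)
//   addsub(ymm0,ymm1)          = (ar*br - ai*bi, ar*bi + ai*br) == a*b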
#if defined (AVX1)|| defined (AVX2)
zvec ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_pd(a.v,a.v,0x0); // ymm0 <- ar ar, ar,ar b'00,00
ymm0 = _mm256_mul_pd(ymm0,b.v); // ymm0 <- ar bi, ar br
ymm1 = _mm256_shuffle_pd(b.v,b.v,0x5); // ymm1 <- br,bi b'01,01
ymm2 = _mm256_shuffle_pd(a.v,a.v,0xF); // ymm2 <- ai,ai b'11,11
ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
ret.v= _mm256_addsub_pd(ymm0,ymm1);
#endif
#ifdef SSE4
zvec ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_pd(a.v,a.v,0x0); // ymm0 <- ar ar,
ymm0 = _mm_mul_pd(ymm0,b.v); // ymm0 <- ar bi, ar br
ymm1 = _mm_shuffle_pd(b.v,b.v,0x1); // ymm1 <- br,bi b01
ymm2 = _mm_shuffle_pd(a.v,a.v,0x3); // ymm2 <- ai,ai b11
ymm1 = _mm_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
ret.v= _mm_addsub_pd(ymm0,ymm1);
#endif
#ifdef AVX512
/* This is from
* Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets
* @inproceedings{McFarlin:2011:ASV:1995896.1995938,
* author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
* title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
* booktitle = {Proceedings of the International Conference on Supercomputing},
* series = {ICS '11},
* year = {2011},
* isbn = {978-1-4503-0102-2},
* location = {Tucson, Arizona, USA},
* pages = {265--274},
* numpages = {10},
* url = {http://doi.acm.org/10.1145/1995896.1995938},
* doi = {10.1145/1995896.1995938},
* acmid = {1995938},
* publisher = {ACM},
* address = {New York, NY, USA},
* keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
* }
*/
zvec vzero,ymm0,ymm1,real,imag;
vzero = _mm512_setzero_pd();
ymm0 = _mm512_swizzle_pd(a.v, _MM_SWIZ_REG_CDAB); //
real = _mm512_mask_or_epi64(a.v, 0xAA,vzero, ymm0); // 8 double lanes -> 8-bit mask
imag = _mm512_mask_sub_pd(a.v, 0x55,vzero, ymm0);
ymm1 = _mm512_mul_pd(real, b.v);
ymm0 = _mm512_swizzle_pd(b.v, _MM_SWIZ_REG_CDAB); // OK
ret.v= _mm512_fmadd_pd(ymm0,imag,ymm1);
/* Imag OK */
#endif
#ifdef QPX
ret.v = vec_mul(a.v,b.v);
#endif
return ret;
};
////////////////////////////////////////////////////////////////////
// General permute; assumes vector length is same across
// all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example
////////////////////////////////////////////////////////////////////
friend inline void permute(vComplexD &y,vComplexD b,int perm)
{
Gpermute<vComplexD>(y,b,perm);
}
friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted)
{
Gmerge<vComplexD,ComplexD >(y,extracted);
}
friend inline void extract(const vComplexD &y,std::vector<ComplexD *> &extracted)
{
Gextract<vComplexD,ComplexD>(y,extracted);
}
friend inline void merge(vComplexD &y,std::vector<ComplexD > &extracted)
{
Gmerge<vComplexD,ComplexD >(y,extracted);
}
friend inline void extract(const vComplexD &y,std::vector<ComplexD > &extracted)
{
Gextract<vComplexD,ComplexD>(y,extracted);
}
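// extract/merge are intended as inverses (sketch, hypothetical values):
//   std::vector<ComplexD> lanes(vComplexD::Nsimd());
//   extract(z,lanes);  // lanes[i] <- i-th complex lane of z
//   merge(z,lanes);    // reassemble; the round trip leaves z unchanged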
///////////////////////
// Splat
///////////////////////
friend inline void vsplat(vComplexD &ret,ComplexD c){
double a= real(c);
double b= imag(c);
vsplat(ret,a,b);
}
friend inline void vsplat(vComplexD &ret,double rl,double ig){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_pd(ig,rl,ig,rl);
#endif
#ifdef SSE4
ret.v = _mm_set_pd(ig,rl);
#endif
#ifdef AVX512
ret.v = _mm512_set_pd(ig,rl,ig,rl,ig,rl,ig,rl);
#endif
#ifdef QPX
ret.v = {rl,ig,rl,ig}; // lane 0 holds the real part, matching the AVX/SSE paths
#endif
}
friend inline void vset(vComplexD &ret,ComplexD *a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
#endif
#ifdef SSE4
ret.v = _mm_set_pd(a[0].imag(),a[0].real());
#endif
#ifdef AVX512
ret.v = _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
// Note v has a0 a1 a2 a3
#endif
#ifdef QPX
ret.v = {a[0].real(),a[0].imag(),a[1].real(),a[1].imag()};
#endif
}
friend inline void vstore(const vComplexD &ret, ComplexD *a){
#if defined (AVX1)|| defined (AVX2)
_mm256_store_pd((double *)a,ret.v);
#endif
#ifdef SSE4
_mm_store_pd((double *)a,ret.v);
#endif
#ifdef AVX512
_mm512_store_pd((double *)a,ret.v);
//Note v has a3 a2 a1 a0
#endif
#ifdef QPX
assert(0);
#endif
}
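// vset/vstore round trip (sketch, AVX case where Nsimd()==2; the target
// array must be suitably aligned, 32 bytes for AVX):
//   ComplexD in[2] = { ComplexD(1,2), ComplexD(3,4) };
//   vComplexD v; vset(v,in);
//   ComplexD out[2] __attribute__ ((aligned(32))); vstore(v,out); // out[i]==in[i]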
friend inline void vprefetch(const vComplexD &v)
{
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
}
////////////////////////
// Conjugate
////////////////////////
friend inline vComplexD conj(const vComplexD &in){
vComplexD ret ; vzero(ret);
#if defined (AVX1)|| defined (AVX2)
// addsub_pd against zero: 0+in.v[3], 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
__m256d tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5));
ret.v=_mm256_shuffle_pd(tmp,tmp,0x5);
#endif
#ifdef SSE4
ret.v = _mm_xor_pd(in.v,_mm_set_pd(-0.0,0.0)); // flip the sign bit of the imaginary lane only
#endif
#ifdef AVX512
// Xeon does not have fmaddsub or addsub
// with mask 0xa (1010), v[0] -v[1] v[2] -v[3] ....
ret.v = _mm512_mask_sub_pd(in.v, 0xaa,ret.v, in.v); // 8 double lanes -> 8-bit mask
#endif
#ifdef QPX
assert(0);
#endif
return ret;
}
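// conj negates the imaginary lane only: conj(ar + i*ai) = ar - i*ai.
// The AVX path has no subadd instruction, hence the shuffle sandwich:
//   shuffle(in) = (ai,ar); addsub(0,.) = (-ai,ar); shuffle(.) = (ar,-ai)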
// REDUCE FIXME must be a cleaner implementation
friend inline ComplexD Reduce(const vComplexD & in)
{
#if defined (AVX1) || defined(AVX2)
// return std::complex<double>(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v));
__attribute__ ((aligned(32))) double c_[4];
_mm256_store_pd(c_,in.v);
return ComplexD(c_[0]+c_[2],c_[1]+c_[3]);
#endif
#ifdef AVX512
return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
#endif
#ifdef QPX
#endif
}
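// Reduce folds the complex lanes into one scalar; for AVX (two lanes)
//   Reduce( (1+2i, 3+4i) ) == (1+3) + (2+4)i == 4+6i.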
// Unary negation
friend inline vComplexD operator -(const vComplexD &r) {
vComplexD ret;
vzero(ret);
ret = ret - r;
return ret;
}
// *=,+=,-= operators
inline vComplexD &operator *=(const vComplexD &r) {
*this = (*this)*r;
return *this;
}
inline vComplexD &operator +=(const vComplexD &r) {
*this = *this+r;
return *this;
}
inline vComplexD &operator -=(const vComplexD &r) {
*this = *this-r;
return *this;
}
public:
static int Nsimd(void) { return sizeof(zvec)/sizeof(double)/2;}
};
inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
typedef vComplexD vDComplex;
inline void zeroit(vComplexD &z){ vzero(z);}
inline vComplexD outerProduct(const vComplexD &l, const vComplexD& r)
{
return l*r;
}
inline vComplexD trace(const vComplexD &arg){
return arg;
}
/////////////////////////////////////////////////////////////////////////
//// Generic routine to promote object<complex> -> object<vcomplex>
//// Supports the array reordering transformation that gives me SIMD utilisation
///////////////////////////////////////////////////////////////////////////
/*
template<template<class> class object>
inline object<vComplex> splat(object<Complex >s){
object<vComplex> ret;
vComplex * v_ptr = (vComplex *)& ret;
Complex * s_ptr = (Complex *) &s;
for(int i=0;i<sizeof(ret);i+=sizeof(vComplex)){
vsplat(*(v_ptr++),*(s_ptr++));
}
return ret;
}
*/
}
#endif

384
lib/Grid_vComplexF.h Normal file
View File

@ -0,0 +1,384 @@
#ifndef VCOMPLEXF
#define VCOMPLEXF
#include "Grid.h"
namespace Grid {
/*
inline void Print(const char *A,cvec c) {
float *fp=(float *)&c;
printf(A);
printf(" %le %le %le %le %le %le %le %le\n",
fp[0],fp[1],fp[2],fp[3],fp[4],fp[5],fp[6],fp[7]);
}
*/
class vComplexF {
// protected:
public:
cvec v;
public:
static inline int Nsimd(void) { return sizeof(cvec)/sizeof(float)/2;}
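// e.g. AVX: sizeof(cvec)==32, so 32/sizeof(float)/2 == 4 complex floats
// per register; SSE4 gives 2, AVX512 gives 8.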
public:
typedef cvec vector_type;
typedef ComplexF scalar_type;
vComplexF & operator = ( Zero & z){
vzero(*this);
return (*this);
}
vComplexF(){};
vComplexF(ComplexF a){
vsplat(*this,a);
};
vComplexF(double a){
vsplat(*this,ComplexF(a));
};
///////////////////////////////////////////////
// mac, mult, sub, add, adj
// Should do an AVX2 version with mac.
///////////////////////////////////////////////
friend inline void mac (vComplexF * __restrict__ y,const vComplexF * __restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
friend inline void mult(vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
friend inline void sub (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
friend inline void add (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
friend inline vComplexF adj(const vComplexF &in){ return conj(in); }
//////////////////////////////////
// Initialise to 1,0,i
//////////////////////////////////
friend inline void vone(vComplexF &ret) { vsplat(ret,1.0,0.0); }
friend inline void vzero(vComplexF &ret) { vsplat(ret,0.0,0.0); }
friend inline void vcomplex_i(vComplexF &ret){ vsplat(ret,0.0,1.0);}
////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
friend inline vComplexF operator + (vComplexF a, vComplexF b)
{
vComplexF ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_add_ps(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_add_ps(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_add_ps(a.v,b.v);
#endif
#ifdef QPX
#error
#endif
return ret;
};
friend inline vComplexF operator - (vComplexF a, vComplexF b)
{
vComplexF ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_sub_ps(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_sub_ps(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_sub_ps(a.v,b.v);
#endif
#ifdef QPX
#error
#endif
return ret;
};
friend inline vComplexF operator * (vComplexF a, vComplexF b)
{
vComplexF ret;
// Multiplication of (ak+ibk)*(ck+idk)
// a + i b can be stored as a data structure
// From the Intel optimisation reference guide:
/*
movsldup xmm0, Src1; load real parts into the destination,
; a1, a1, a0, a0
movaps xmm1, src2; load the 2nd pair of complex values, ; i.e. d1, c1, d0, c0
mulps xmm0, xmm1; temporary results, a1d1, a1c1, a0d0, ; a0c0
shufps xmm1, xmm1, b1; reorder the real and imaginary ; parts, c1, d1, c0, d0
movshdup xmm2, Src1; load the imaginary parts into the ; destination, b1, b1, b0, b0
mulps xmm2, xmm1; temporary results, b1c1, b1d1, b0c0, ; b0d0
addsubps xmm0, xmm2; b1c1+a1d1, a1c1 -b1d1, b0c0+a0d
*/
#if defined (AVX1)|| defined (AVX2)
cvec ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_ps(a.v,a.v,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm256_mul_ps(ymm0,b.v); // ymm0 <- ar bi, ar br
// FIXME AVX2 could MAC
ymm1 = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm256_shuffle_ps(a.v,a.v,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
ret.v= _mm256_addsub_ps(ymm0,ymm1);
#endif
#ifdef SSE4
cvec ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_ps(a.v,a.v,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm_mul_ps(ymm0,b.v); // ymm0 <- ar bi, ar br
ymm1 = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm_shuffle_ps(a.v,a.v,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
ret.v= _mm_addsub_ps(ymm0,ymm1);
#endif
#ifdef AVX512
//
cvec vzero,ymm0,ymm1,real, imag;
vzero = _mm512_setzero_ps();
ymm0 = _mm512_swizzle_ps(a.v, _MM_SWIZ_REG_CDAB); //
real = _mm512_mask_or_epi32(a.v, 0xAAAA,vzero, ymm0);
imag = _mm512_mask_sub_ps(a.v, 0x5555,vzero, ymm0);
ymm1 = _mm512_mul_ps(real, b.v);
ymm0 = _mm512_swizzle_ps(b.v, _MM_SWIZ_REG_CDAB); // OK
ret.v = _mm512_fmadd_ps(ymm0,imag,ymm1);
#endif
#ifdef QPX
ret.v = vec_mul(a.v,b.v);
#endif
return ret;
};
////////////////////////////////////////////////////////////////////////
// FIXME: gonna remove these load/store, get, set, prefetch
////////////////////////////////////////////////////////////////////////
friend inline void vset(vComplexF &ret, ComplexF *a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_ps(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
#endif
#ifdef SSE4
ret.v = _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real());
#endif
#ifdef AVX512
ret.v = _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
// Note v has a0 a1 a2 a3 a4 a5 a6 a7
#endif
#ifdef QPX
ret.v = {a[0].real(),a[0].imag(),a[1].real(),a[1].imag(),a[2].real(),a[2].imag(),a[3].real(),a[3].imag()};
#endif
}
///////////////////////
// Splat
///////////////////////
friend inline void vsplat(vComplexF &ret,ComplexF c){
float a= real(c);
float b= imag(c);
vsplat(ret,a,b);
}
friend inline void vstore(const vComplexF &ret, ComplexF *a){
#if defined (AVX1)|| defined (AVX2)
_mm256_store_ps((float *)a,ret.v);
#endif
#ifdef SSE4
_mm_store_ps((float *)a,ret.v);
#endif
#ifdef AVX512
_mm512_store_ps((float *)a,ret.v);
//Note v has a3 a2 a1 a0
#endif
#ifdef QPX
assert(0);
#endif
}
friend inline void vprefetch(const vComplexF &v)
{
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
}
friend inline void vsplat(vComplexF &ret,float a,float b){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_ps(b,a,b,a,b,a,b,a);
#endif
#ifdef SSE4
ret.v = _mm_set_ps(b,a,b,a); // set lists the highest lane first; the real part lands in lane 0
#endif
#ifdef AVX512
ret.v = _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
#endif
#ifdef QPX
ret.v = {a,b,a,b};
#endif
}
friend inline ComplexF Reduce(const vComplexF & in)
{
#ifdef SSE4
#error
#endif
#if defined (AVX1) || defined(AVX2)
// FIXME this is inefficient; should use a hadd-based horizontal reduction
__attribute__ ((aligned(32))) float c_[8];
_mm256_store_ps(c_,in.v);
return ComplexF(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]);
#endif
#ifdef AVX512
return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
#endif
#ifdef QPX
#endif
}
friend inline vComplexF operator * (const ComplexF &a, vComplexF b){
vComplexF va;
vsplat(va,a);
return va*b;
}
friend inline vComplexF operator * (vComplexF b,const ComplexF &a){
return a*b;
}
/*
template<class real>
friend inline vComplexF operator * (vComplexF b,const real &a){
vComplexF va;
Complex ca(a,0);
vsplat(va,ca);
return va*b;
}
template<class real>
friend inline vComplexF operator * (const real &a,vComplexF b){
return a*b;
}
friend inline vComplexF operator + (const Complex &a, vComplexF b){
vComplexF va;
vsplat(va,a);
return va+b;
}
friend inline vComplexF operator + (vComplexF b,const Complex &a){
return a+b;
}
template<class real>
friend inline vComplexF operator + (vComplexF b,const real &a){
vComplexF va;
Complex ca(a,0);
vsplat(va,ca);
return va+b;
}
template<class real>
friend inline vComplexF operator + (const real &a,vComplexF b){
return a+b;
}
friend inline vComplexF operator - (const Complex &a, vComplexF b){
vComplexF va;
vsplat(va,a);
return va-b;
}
friend inline vComplexF operator - (vComplexF b,const Complex &a){
vComplexF va;
vsplat(va,a);
return b-va;
}
template<class real>
friend inline vComplexF operator - (vComplexF b,const real &a){
vComplexF va;
Complex ca(a,0);
vsplat(va,ca);
return b-va;
}
template<class real>
friend inline vComplexF operator - (const real &a,vComplexF b){
vComplexF va;
Complex ca(a,0);
vsplat(va,ca);
return va-b;
}
*/
///////////////////////
// Conjugate
///////////////////////
friend inline vComplexF conj(const vComplexF &in){
vComplexF ret ; vzero(ret);
#if defined (AVX1)|| defined (AVX2)
cvec tmp;
tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
#endif
#ifdef SSE4
ret.v = _mm_xor_ps(in.v,_mm_set_ps(-0.f,0.f,-0.f,0.f)); // flip the sign bits of the imaginary lanes only
#endif
#ifdef AVX512
ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // Zero out 0+real 0-imag
#endif
#ifdef QPX
assert(0);
#endif
return ret;
}
// Unary negation
friend inline vComplexF operator -(const vComplexF &r) {
vComplexF ret;
vzero(ret);
ret = ret - r;
return ret;
}
// *=,+=,-= operators
inline vComplexF &operator *=(const vComplexF &r) {
*this = (*this)*r;
return *this;
}
inline vComplexF &operator +=(const vComplexF &r) {
*this = *this+r;
return *this;
}
inline vComplexF &operator -=(const vComplexF &r) {
*this = *this-r;
return *this;
}
friend inline void permute(vComplexF &y,vComplexF b,int perm)
{
Gpermute<vComplexF>(y,b,perm);
}
friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted)
{
Gmerge<vComplexF,ComplexF >(y,extracted);
}
friend inline void extract(const vComplexF &y,std::vector<ComplexF *> &extracted)
{
Gextract<vComplexF,ComplexF>(y,extracted);
}
friend inline void merge(vComplexF &y,std::vector<ComplexF > &extracted)
{
Gmerge<vComplexF,ComplexF >(y,extracted);
}
friend inline void extract(const vComplexF &y,std::vector<ComplexF > &extracted)
{
Gextract<vComplexF,ComplexF>(y,extracted);
}
};
inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r)
{
return conj(l)*r;
}
inline void zeroit(vComplexF &z){ vzero(z);}
inline vComplexF outerProduct(const vComplexF &l, const vComplexF& r)
{
return l*r;
}
inline vComplexF trace(const vComplexF &arg){
return arg;
}
}
#endif

259
lib/Grid_vInteger.h Normal file
View File

@ -0,0 +1,259 @@
#ifndef VINTEGER_H
#define VINTEGER_H
#include "Grid.h"
namespace Grid {
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
typedef uint32_t Integer;
class vInteger {
protected:
public:
ivec v;
typedef ivec vector_type;
typedef Integer scalar_type;
vInteger(){};
vInteger & operator = (const Zero & z){
vzero(*this);
return (*this);
}
vInteger(Integer a){
vsplat(*this,a);
};
////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
friend inline vInteger operator + ( vInteger a, vInteger b)
{
vInteger ret;
#if defined (AVX1)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a.v,0);
b0 = _mm256_extractf128_si256(b.v,0);
a1 = _mm256_extractf128_si256(a.v,1);
b1 = _mm256_extractf128_si256(b.v,1);
a0 = _mm_add_epi32(a0,b0);
a1 = _mm_add_epi32(a1,b1);
ret.v = _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
ret.v = _mm256_add_epi32(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_add_epi32(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_add_epi32(a.v,b.v);
#endif
#ifdef QPX
// Implement as array of ints is only option
#error
#endif
return ret;
};
friend inline vInteger operator - ( vInteger a, vInteger b)
{
vInteger ret;
#if defined (AVX1)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a.v,0);
b0 = _mm256_extractf128_si256(b.v,0);
a1 = _mm256_extractf128_si256(a.v,1);
b1 = _mm256_extractf128_si256(b.v,1);
a0 = _mm_sub_epi32(a0,b0);
a1 = _mm_sub_epi32(a1,b1);
ret.v = _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
ret.v = _mm256_sub_epi32(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_sub_epi32(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_sub_epi32(a.v,b.v);
#endif
#ifdef QPX
// Implement as array of ints is only option
#error
#endif
return ret;
};
friend inline vInteger operator * ( vInteger a, vInteger b)
{
vInteger ret;
#if defined (AVX1)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a.v,0);
b0 = _mm256_extractf128_si256(b.v,0);
a1 = _mm256_extractf128_si256(a.v,1);
b1 = _mm256_extractf128_si256(b.v,1);
// mullo keeps the low 32 bits of each element-wise product;
// mul_epi32 would widen the even lanes to 64-bit results instead
a0 = _mm_mullo_epi32(a0,b0);
a1 = _mm_mullo_epi32(a1,b1);
ret.v = _mm256_set_m128i(a1,a0);
#endif
#if defined (AVX2)
ret.v = _mm256_mullo_epi32(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_mullo_epi32(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_mullo_epi32(a.v,b.v);
#endif
#ifdef QPX
// Implement as array of ints is only option
#error
#endif
return ret;
};
///////////////////////////////////////////////
// mult, sub, add, adj,conj, mac functions
///////////////////////////////////////////////
friend inline void mult(vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) * (*r);}
friend inline void sub (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) - (*r);}
friend inline void add (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) + (*r);}
friend inline void mac (vInteger &y,const vInteger a,const vInteger x){
y = a*x+y;
}
//////////////////////////////////
// Initialise to 1,0,i
//////////////////////////////////
friend inline void vone (vInteger &ret){vsplat(ret,1);}
friend inline void vzero(vInteger &ret){vsplat(ret,0);}
friend inline void vtrue (vInteger &ret){vsplat(ret,0xFFFFFFFF);}
friend inline void vfalse(vInteger &ret){vsplat(ret,0);}
/////////////////////////////////////////////////////
// Broadcast a value across Nsimd copies.
/////////////////////////////////////////////////////
friend inline void vsplat(vInteger &ret,scalar_type a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set1_epi32(a);
#endif
#ifdef SSE4
ret.v = _mm_set1_epi32(a);
#endif
#ifdef AVX512
ret.v = _mm512_set1_epi32(a);
#endif
#ifdef QPX
#error
#endif
}
friend inline void vset(vInteger &ret,scalar_type *a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
#endif
#ifdef SSE4
ret.v = _mm_set_epi32(a[3],a[2],a[1],a[0]);
#endif
#ifdef AVX512
ret.v = _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
#endif
#ifdef QPX
#error
#endif
}
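// The _mm*_set_epi32 intrinsics list elements from the highest lane down,
// so each path above leaves lane i holding a[i] (sketch):
//   Integer in[4] = {0,1,2,3};
//   vInteger v; vset(v,in);   // lane i == i on SSE4, likewise on wider targets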
friend inline void vstore(const vInteger &ret, Integer *a){
#if defined (AVX1)|| defined (AVX2)
_mm256_store_si256((__m256i*)a,ret.v);
#endif
#ifdef SSE4
_mm_store_si128((__m128i *)a,ret.v);
#endif
#ifdef AVX512
_mm512_store_si512(a,ret.v);
#endif
#ifdef QPX
assert(0);
#endif
}
friend inline void vprefetch(const vInteger &v)
{
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
}
// Unary negation
friend inline vInteger operator -(const vInteger &r) {
vInteger ret;
vzero(ret);
ret = ret - r;
return ret;
}
friend inline Integer Reduce(const vInteger & in)
{
// unimplemented
assert(0);
}
// *=,+=,-= operators
inline vInteger &operator *=(const vInteger &r) {
*this = (*this)*r;
return *this;
}
inline vInteger &operator +=(const vInteger &r) {
*this = *this+r;
return *this;
}
inline vInteger &operator -=(const vInteger &r) {
*this = *this-r;
return *this;
}
friend inline void permute(vInteger &y,const vInteger b,int perm)
{
Gpermute<vInteger>(y,b,perm);
}
friend inline void merge(vInteger &y,std::vector<Integer *> &extracted)
{
Gmerge<vInteger,Integer>(y,extracted);
}
friend inline void extract(const vInteger &y,std::vector<Integer *> &extracted)
{
Gextract<vInteger,Integer>(y,extracted);
}
friend inline void merge(vInteger &y,std::vector<Integer> &extracted)
{
Gmerge<vInteger,Integer>(y,extracted);
}
friend inline void extract(const vInteger &y,std::vector<Integer> &extracted)
{
Gextract<vInteger,Integer>(y,extracted);
}
public:
static inline int Nsimd(void) { return sizeof(ivec)/sizeof(Integer);}
};
inline vInteger localInnerProduct(const vInteger & l, const vInteger & r) { return l*r; }
inline void zeroit(vInteger &z){ vzero(z);}
inline vInteger outerProduct(const vInteger &l, const vInteger& r)
{
return l*r;
}
}
#endif

259
lib/Grid_vRealD.h Normal file
View File

@ -0,0 +1,259 @@
#ifndef VREALD_H
#define VREALD_H
#include "Grid.h"
namespace Grid {
class vRealD {
public:
dvec v; // dvec is double precision vector
public:
typedef dvec vector_type;
typedef RealD scalar_type;
vRealD(){};
vRealD(RealD a){
vsplat(*this,a);
};
friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}
friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);}
friend inline vRealD adj(const vRealD &in) { return in; }
friend inline vRealD conj(const vRealD &in){ return in; }
friend inline void mac (vRealD &y,const vRealD a,const vRealD x){
#if defined (AVX1) || defined (SSE4)
y = a*x+y;
#endif
#ifdef AVX2 // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
// accelerates multiply accumulate, but not general multiply add
y.v = _mm256_fmadd_pd(a.v,x.v,y.v);
#endif
#ifdef AVX512
// fused multiply-add on the full double-precision vector
y.v = _mm512_fmadd_pd(a.v,x.v,y.v);
#endif
#ifdef QPX
y.v = vec_madd(a.v,x.v,y.v);
#endif
}
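// mac() is a lane-wise fused multiply-add, y <- a*x + y; on AVX2/AVX512
// it maps onto a single FMA3 instruction. Sketch (hypothetical values):
//   vRealD y(1.0), a(2.0), x(3.0);
//   mac(y,a,x);   // every lane of y now holds 7.0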
//////////////////////////////////
// Initialise to 1,0
//////////////////////////////////
friend inline void vone (vRealD &ret){ vsplat(ret,1.0);}
friend inline void vzero(vRealD &ret){ vsplat(ret,0.0);}
////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
friend inline vRealD operator + (vRealD a, vRealD b)
{
vRealD ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_add_pd(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_add_pd(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_add_pd(a.v,b.v);
#endif
#ifdef QPX
ret.v = vec_add(a.v,b.v);
#endif
return ret;
};
friend inline vRealD operator - (vRealD a, vRealD b)
{
vRealD ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_sub_pd(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_sub_pd(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_sub_pd(a.v,b.v);
#endif
#ifdef QPX
ret.v = vec_sub(a.v,b.v);
#endif
return ret;
};
friend inline vRealD operator * (vRealD a, vRealD b)
{
vRealD ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_mul_pd(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_mul_pd(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_mul_pd(a.v,b.v);
#endif
#ifdef QPX
ret.v = vec_mul(a.v,b.v);
#endif
return ret;
};
////////////////////////////////////////////////////////////////////
// General permute; assumes vector length is same across
// all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example
////////////////////////////////////////////////////////////////////
friend inline void permute(vRealD &y,vRealD b,int perm)
{
Gpermute<vRealD>(y,b,perm);
}
friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
{
Gmerge<vRealD,RealD >(y,extracted);
}
friend inline void extract(const vRealD &y,std::vector<RealD *> &extracted)
{
Gextract<vRealD,RealD>(y,extracted);
}
friend inline void merge(vRealD &y,std::vector<RealD > &extracted)
{
Gmerge<vRealD,RealD >(y,extracted);
}
friend inline void extract(const vRealD &y,std::vector<RealD > &extracted)
{
Gextract<vRealD,RealD>(y,extracted);
}
friend inline void vsplat(vRealD &ret,double a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_pd(a,a,a,a);
#endif
#ifdef SSE4
ret.v = _mm_set_pd(a,a);
#endif
#ifdef AVX512
ret.v = _mm512_set1_pd(a);
#endif
#ifdef QPX
ret.v = {a,a,a,a};
#endif
}
friend inline void vset(vRealD &ret, double *a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_pd(a[3],a[2],a[1],a[0]);
#endif
#ifdef SSE4
ret.v = _mm_set_pd(a[1],a[0]);
#endif
#ifdef AVX512
ret.v = _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
// Note v has a0 a1 a2 a3 a4 a5 a6 a7
#endif
#ifdef QPX
ret.v = {a[0],a[1],a[2],a[3]};
#endif
}
friend inline void vstore(const vRealD &ret, double *a){
#if defined (AVX1)|| defined (AVX2)
_mm256_store_pd(a,ret.v);
#endif
#ifdef SSE4
_mm_store_pd(a,ret.v);
#endif
#ifdef AVX512
_mm512_store_pd(a,ret.v);
// Note v has a7 a6 a5 a4 a3 a2 a1 a0
#endif
#ifdef QPX
assert(0);
#endif
}
friend inline void vprefetch(const vRealD &v)
{
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
}
// Unary negation
friend inline vRealD operator -(const vRealD &r) {
vRealD ret;
vzero(ret);
ret = ret - r;
return ret;
}
friend inline RealD Reduce(const vRealD & in)
{
#if defined (AVX1) || defined(AVX2)
typedef union {
uint64_t l;
double d;
} my_conv_t;
my_conv_t converter;
// alternative hadd-based reduce_add (kept for reference):
/*
__attribute__ ((aligned(32))) double c_[16];
__m256d tmp = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
__m256d hadd = _mm256_hadd_pd(in.v,tmp); // hadd = 1+0,3+2,3+2,1+0
tmp = _mm256_permute2f128_pd(hadd,hadd,0x01);// tmp = 3+2,1+0,1+0,3+2
hadd = _mm256_hadd_pd(tmp,tmp); // tmp = 3+2+1+0,3+2+1+0,1+0+3+2,1+0+3+2
_mm256_store_pd(c_,hadd);
return c_[0];
*/
__m256d tmp = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
__m256d hadd = _mm256_hadd_pd(in.v,tmp); // hadd = 1+0,3+2,3+2,1+0
hadd = _mm256_hadd_pd(hadd,hadd); // hadd = 1+0+3+2...
converter.l = _mm256_extract_epi64(_mm256_castpd_si256(hadd),0);
return converter.d;
#endif
#ifdef AVX512
return _mm512_reduce_add_pd(in.v);
/*
__attribute__ ((aligned(32))) double c_[8];
_mm512_store_pd(c_,in.v);
return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7];
*/
#endif
#ifdef QPX
#endif
}
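// Worked AVX example of the permute/hadd reduction above, with lanes
// (v3,v2,v1,v0):
//   tmp  = (v1,v0,v3,v2)                 ; swapped 128-bit halves
//   hadd = (v0+v1, v2+v3, v2+v3, v0+v1)  ; pairwise sums
//   hadd = full sum in every lane; lane 0 comes back through the union.
// e.g. Reduce( (1,2,3,4) ) == 10.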
// *=,+=,-= operators
inline vRealD &operator *=(const vRealD &r) {
*this = (*this)*r;
return *this;
}
inline vRealD &operator +=(const vRealD &r) {
*this = *this+r;
return *this;
}
inline vRealD &operator -=(const vRealD &r) {
*this = *this-r;
return *this;
}
public:
static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
};
inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
inline void zeroit(vRealD &z){ vzero(z);}
inline vRealD outerProduct(const vRealD &l, const vRealD& r)
{
return l*r;
}
inline vRealD trace(const vRealD &arg){
return arg;
}
inline vRealD real(const vRealD &arg){
return arg;
}
}
#endif

279
lib/Grid_vRealF.h Normal file
View File

@ -0,0 +1,279 @@
#ifndef VREALF_H
#define VREALF_H
#include "Grid.h"
namespace Grid {
class vRealF {
public:
fvec v;
public:
typedef fvec vector_type;
typedef RealF scalar_type;
vRealF(){};
vRealF(RealF a){
vsplat(*this,a);
};
////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
friend inline vRealF operator + ( vRealF a, vRealF b)
{
vRealF ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_add_ps(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_add_ps(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_add_ps(a.v,b.v);
#endif
#ifdef QPX
vector4double aa,bb,cc;
aa = vec_lda(0,(float *)&a);
bb = vec_lda(0,(float *)&b);
cc = vec_add(aa,bb);
vec_sta(cc,0,(float *)&ret.v);
#endif
return ret;
};
friend inline vRealF operator - ( vRealF a, vRealF b)
{
vRealF ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_sub_ps(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_sub_ps(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_sub_ps(a.v,b.v);
#endif
#ifdef QPX
vector4double aa,bb,cc;
aa = vec_lda(0,(float *)&a);
bb = vec_lda(0,(float *)&b);
cc = vec_sub(aa,bb);
vec_sta(cc,0,(float *)&ret.v);
#endif
return ret;
};
friend inline vRealF operator * ( vRealF a, vRealF b)
{
vRealF ret;
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_mul_ps(a.v,b.v);
#endif
#ifdef SSE4
ret.v = _mm_mul_ps(a.v,b.v);
#endif
#ifdef AVX512
ret.v = _mm512_mul_ps(a.v,b.v);
#endif
#ifdef QPX
vector4double aa,bb,cc; // QPX single precision: explicit loads are required, since they promote single-precision memory operands to double-precision registers.
aa = vec_lda(0,(float *)&a);
bb = vec_lda(0,(float *)&b);
cc = vec_mul(aa,bb);
vec_sta(cc,0,(float *)&ret.v);
#endif
return ret;
};
///////////////////////////////////////////////
// mult, sub, add, adj,conj, mac functions
///////////////////////////////////////////////
friend inline void mult(vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) * (*r);}
friend inline void sub (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) - (*r);}
friend inline void add (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) + (*r);}
friend inline vRealF adj(const vRealF &in) { return in; }
friend inline vRealF conj(const vRealF &in){ return in; }
friend inline void mac (vRealF &y,const vRealF a,const vRealF x){
#if defined (AVX1) || defined (SSE4)
y = a*x+y;
#endif
#ifdef AVX2 // AVX 2 introduced FMA support. FMA4 eliminates a copy, but AVX only has FMA3
// accelerates multiply accumulate, but not general multiply add
y.v = _mm256_fmadd_ps(a.v,x.v,y.v);
#endif
#ifdef AVX512
y.v = _mm512_fmadd_ps(a.v,x.v,y.v);
#endif
#ifdef QPX
vector4double aa,xx,yy; // QPX single precision: explicit loads promote single-precision memory operands to double-precision registers.
aa = vec_lda(0,(float *)&a.v);
xx = vec_lda(0,(float *)&x.v);
yy = vec_lda(0,(float *)&y.v);
yy = vec_madd(aa,xx,yy);
vec_sta(yy,0,(float *)&y.v);
#endif
}
//////////////////////////////////
// Initialise to 1,0,i
//////////////////////////////////
friend inline void vone (vRealF &ret){vsplat(ret,1.0);}
friend inline void vzero(vRealF &ret){vsplat(ret,0.0);}
////////////////////////////////////////////////////////////////////
// General permute; assumes vector length is same across
// all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example
////////////////////////////////////////////////////////////////////
friend inline void permute(vRealF &y,vRealF b,int perm)
{
Gpermute<vRealF>(y,b,perm);
}
friend inline void merge(vRealF &y,std::vector<RealF *> &extracted)
{
Gmerge<vRealF,RealF >(y,extracted);
}
friend inline void extract(const vRealF &y,std::vector<RealF *> &extracted)
{
Gextract<vRealF,RealF>(y,extracted);
}
friend inline void merge(vRealF &y,std::vector<RealF> &extracted)
{
Gmerge<vRealF,RealF >(y,extracted);
}
friend inline void extract(const vRealF &y,std::vector<RealF> &extracted)
{
Gextract<vRealF,RealF>(y,extracted);
}
/////////////////////////////////////////////////////
// Broadcast a value across Nsimd copies.
/////////////////////////////////////////////////////
friend inline void vsplat(vRealF &ret,float a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_ps(a,a,a,a,a,a,a,a);
#endif
#ifdef SSE4
ret.v = _mm_set_ps(a,a,a,a);
#endif
#ifdef AVX512
//ret.v = _mm512_set_ps(a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a);
ret.v = _mm512_set1_ps(a);
#endif
#ifdef QPX
ret.v = {a,a,a,a};
#endif
}
friend inline void vset(vRealF &ret, float *a){
#if defined (AVX1)|| defined (AVX2)
ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
#endif
#ifdef SSE4
ret.v = _mm_set_ps(a[3],a[2],a[1],a[0]);
#endif
#ifdef AVX512
ret.v = _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
// Note v has a0 a1 a2 a3 a4 a5 a6 a7
#endif
#ifdef QPX
ret.v = {a[0],a[1],a[2],a[3],a[4],a[5],a[6],a[7]};
#endif
}
////////////////////////////////////////////////////////////////////////
// FIXME: gonna remove these load/store, get, set, prefetch
////////////////////////////////////////////////////////////////////////
friend inline void vstore(const vRealF &ret, float *a){
#if defined (AVX1)|| defined (AVX2)
_mm256_store_ps(a,ret.v);
#endif
#ifdef SSE4
_mm_store_ps(a,ret.v);
#endif
#ifdef AVX512
_mm512_store_ps(a,ret.v);
// Note v has a7 a6 a5 a4 a3 a2 a1 a0
#endif
#ifdef QPX
assert(0);
#endif
}
friend inline void vprefetch(const vRealF &v)
{
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
}
// Unary negation
friend inline vRealF operator -(const vRealF &r) {
vRealF ret;
vzero(ret);
ret = ret - r;
return ret;
}
friend inline RealF Reduce(const vRealF & in)
{
#if defined (AVX1) || defined(AVX2)
__attribute__ ((aligned(32))) float c_[16];
__m256 tmp = _mm256_permute2f128_ps(in.v,in.v,0x01);
__m256 hadd = _mm256_hadd_ps(in.v,tmp);
tmp = _mm256_permute2f128_ps(hadd,hadd,0x01);
hadd = _mm256_hadd_ps(tmp,tmp);
_mm256_store_ps(c_,hadd);
return (float)c_[0];
#endif
#ifdef AVX512
return _mm512_reduce_add_ps(in.v);
/*
__attribute__ ((aligned(64))) float c_[16];
_mm512_store_ps(c_,in.v);
return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7]
+c_[8]+c_[9]+c_[10]+c_[11]+c_[12]+c_[13]+c_[14]+c_[15];
*/
#endif
#ifdef QPX
#endif
}
// *=,+=,-= operators
inline vRealF &operator *=(const vRealF &r) {
*this = (*this)*r;
return *this;
}
inline vRealF &operator +=(const vRealF &r) {
*this = *this+r;
return *this;
}
inline vRealF &operator -=(const vRealF &r) {
*this = *this-r;
return *this;
}
public:
static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
};
inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
inline void zeroit(vRealF &z){ vzero(z);}
inline vRealF outerProduct(const vRealF &l, const vRealF& r)
{
return l*r;
}
inline vRealF trace(const vRealF &arg){
return arg;
}
inline vRealF real(const vRealF &arg){
return arg;
}
}
#endif

44
lib/Makefile.am Normal file
View File

@ -0,0 +1,44 @@
# additional include paths necessary to compile the C++ library
AM_CXXFLAGS = -I$(top_srcdir)/
extra_sources=
if BUILD_COMMS_MPI
extra_sources+=Grid_communicator_mpi.cc
extra_sources+=Grid_stencil_common.cc
endif
if BUILD_COMMS_NONE
extra_sources+=Grid_communicator_fake.cc
extra_sources+=Grid_stencil_common.cc
endif
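# The comms layer is fixed at configure time, so exactly one of the two
# conditionals above fires. Hypothetical invocations (flag name assumed;
# check ./configure --help):
#   ./configure --enable-comms=mpi    # compiles Grid_communicator_mpi.cc
#   ./configure --enable-comms=none   # compiles Grid_communicator_fake.cc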
#
# Libraries
#
lib_LIBRARIES = libGrid.a
libGrid_a_SOURCES = Grid_init.cc $(extra_sources)
#
# Include files
#
include_HEADERS = Grid_config.h\
Grid.h\
Grid_simd.h\
Grid_vComplexD.h\
Grid_vComplexF.h\
Grid_vRealD.h\
Grid_vRealF.h\
Grid_Cartesian.h\
Grid_Lattice.h\
Grid_Communicator.h\
Grid_QCD.h\
Grid_aligned_allocator.h\
Grid_cshift.h\
Grid_cshift_common.h\
Grid_cshift_mpi.h\
Grid_cshift_none.h\
Grid_stencil.h\
Grid_math_types.h