From e06a11ee5e7be701f37db24505844c85fe5ee61c Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 6 Apr 2015 06:30:48 +0100
Subject: [PATCH] Bringing in LatticeInteger with the idea of implemented
 predicated assignment, subsets etc. c.f the QDP++ "where" syntax

---
 Grid_Cartesian.h     | 283 ++++++++++++++++++++++++++-------------
 Grid_Communicator.h  |  54 +++++++-
 Grid_Lattice.h       |  93 +++++++------
 Grid_QCD.h           |  14 +-
 Grid_cshift_common.h |  12 +-
 Grid_cshift_fake.h   |   8 +-
 Grid_cshift_mpi.h    |  14 +-
 Grid_main.cc         |  51 ++++---
 Grid_math_types.h    |  22 +---
 Grid_mpi.cc          |  29 +++-
 Grid_simd.h          | 119 +++++++++++++----
 Grid_vInteger.h      | 307 +++++++++++++++++++++++++++++++++++++++++++
 TODO                 |   3 +-
 13 files changed, 788 insertions(+), 221 deletions(-)
 create mode 100644 Grid_vInteger.h
diff --git a/Grid_Cartesian.h b/Grid_Cartesian.h
index a983639c..004c2fa5 100644
--- a/Grid_Cartesian.h
+++ b/Grid_Cartesian.h
@@ -6,126 +6,236 @@
 namespace Grid{
     
 /////////////////////////////////////////////////////////////////////////////////////////
-// Grid Support. Following will go into Grid.h.
+// Grid Support.
 /////////////////////////////////////////////////////////////////////////////////////////
-    // Cartesian grids
-    // Grid::Grid
-    // Grid::GridCartesian
-    // Grid::GridCartesianRedBlack
+//
+// Cartesian grid inheritance
+//            Grid::GridBase
+//                     |
+//           __________|___________
+//          |                      |
+// Grid::GridCartesian   Grid::GridCartesianRedBlack
+//
+// TODO: document the following as an API guaranteed public interface
 
+    /* 
+     *       Rough map of functionality against QDP++ Layout
+     *
+     *       Param     |     Grid                     |     QDP++             
+     *       -----------------------------------------
+     *                 |                              |
+     *        void     |     oSites, iSites, lSites   |  sitesOnNode 
+     *        void     |     gSites                   |  vol
+     *                 |                              |
+     *        gcoor    |     oIndex, iIndex           |  linearSiteIndex // no virtual node in QDP
+     *        lcoor    |                              |
+     * 
+     *        void     |     CheckerBoarded           |  -        // No checkerboarded in QDP
+     *        void     |     FullDimensions           |  lattSize
+     *        void     |     GlobalDimensions         |  lattSize // No checkerboarded in QDP
+     *        void     |     LocalDimensions          |  subgridLattSize
+     *        void     |     VirtualLocalDimensions   |  subgridLattSize // no virtual node in QDP
+     *                 |                              |
+     *       int x 3   |     oiSiteRankToGlobal       |  siteCoords
+     *                 |     ProcessorCoorLocalCoorToGlobalCoor | 
+     *                 |                              |
+     *     vector<int> |     GlobalCoorToRankIndex   |  nodeNumber(coord)
+     *     vector<int> |     GlobalCoorToProcessorCoorLocalCoor|  nodeCoord(coord)
+     *                 |                              |
+     *     void        |     Processors               |  logicalSize    // returns cart array shape
+     *     void        |     ThisRank        |  nodeNumber();  // returns this node rank
+     *     void        |     ThisProcessorCoor        |    // returns this node coor
+     *     void        |     isBoss(void)             |  primaryNode();
+     *                 |                              |
+     *                 |     RankFromProcessorCoor    |  getLogicalCoorFrom(node)
+     *                 |     ProcessorCoorFromRank    |  getNodeNumberFrom(logical_coord)
+     */
 
-class SimdGrid : public CartesianCommunicator {
+class GridBase : public CartesianCommunicator {
 public:
 
+ // Give Lattice access
+ template<class object> friend class Lattice;
   
- SimdGrid(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
+ GridBase(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
 
-    // Give Lattice access
-    template<class object> friend class Lattice;
         
-//protected:
-        
-  // Lattice wide random support. not yet fully implemented. Need seed strategy
-  // and one generator per site.
-  //std::default_random_engine generator;
-  //    static std::mt19937  generator( 9 );
-        
-    // Grid information.
-
-    // Commicator provides
+ //protected:
+ // Lattice wide random support. not yet fully implemented. Need seed strategy
+ // and one generator per site.
+ // std::default_random_engine generator;
+ //    static std::mt19937  generator( 9 );
+ 
+    //////////////////////////////////////////////////////////////////////
+    // Commicator provides information on the processor grid
+    //////////////////////////////////////////////////////////////////////
     //    unsigned long _ndimension;
     //    std::vector<int> _processors; // processor grid
     //    int              _processor;  // linear processor rank
     //    std::vector<int> _processor_coor;  // linear processor rank
+    //////////////////////////////////////////////////////////////////////
 
+    // Physics Grid information.
     std::vector<int> _simd_layout;     // Which dimensions get relayed out over simd lanes.
-
-
     std::vector<int> _fdimensions;// Global dimensions of array prior to cb removal
     std::vector<int> _gdimensions;// Global dimensions of array after cb removal
     std::vector<int> _ldimensions;// local dimensions of array with processor images removed
     std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed 
+    std::vector<int> _ostride;    // Outer stride for each dimension
+    std::vector<int> _istride;    // Inner stride i.e. within simd lane
+    int _osites;                  // _isites*_osites = product(dimensions).
+    int _isites;
+    std::vector<int> _slice_block;   // subslice information
+    std::vector<int> _slice_stride;
+    std::vector<int> _slice_nblock;
 
+    // Might need these at some point
     //    std::vector<int> _lstart;     // local start of array in gcoors. _processor_coor[d]*_ldimensions[d]
     //    std::vector<int> _lend;       // local end of array in gcoors    _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
 
- 
-    std::vector<int> _ostride;    // Outer stride for each dimension
-    std::vector<int> _istride;    // Inner stride i.e. within simd lane
-
-    int _osites;                  // _isites*_osites = product(dimensions).
-    int _isites;
-        
-    // subslice information
-    std::vector<int> _slice_block;
-    std::vector<int> _slice_stride;
-    std::vector<int> _slice_nblock;
 public:
-    
-    // These routines are key. Subdivide the linearised cartesian index into
-    //      "inner" index identifying which simd lane of object<vFcomplex> is associated with coord
-    //      "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
-    // Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
-    // stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
-    // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
-    // lanes are operated upon simultaneously.
-    
-    inline int oIndexReduced(std::vector<int> &rcoor)
-    {
-        int idx=0;
-        for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*rcoor[d];
-        return idx;
-    }
-    virtual int oIndex(std::vector<int> &coor)
-    {
-        int idx=0;
-        for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
-        return idx;
-    }
-    inline int iIndex(std::vector<int> &rcoor)
-    {
-        int idx=0;
-        for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(rcoor[d]/_rdimensions[d]);
-        return idx;
-    }
-    inline int iCoordFromIsite(int lane,int mu)
-    {
-      std::vector<int> coor(_ndimension);
-      for(int d=0;d<_ndimension;d++){
-	coor[d] = lane % _simd_layout[d];
-	lane    = lane / _simd_layout[d];
-      }
-      return coor[mu];
-    }
-    
-    inline int oSites(void) { return _osites; };
-    inline int iSites(void) { return _isites; };
 
-    inline int CheckerBoardFromOsite (int Osite){
+    ////////////////////////////////////////////////////////////////
+    // Checkerboarding interface is virtual and overridden by 
+    // GridCartesian / GridRedBlackCartesian
+    ////////////////////////////////////////////////////////////////
+    virtual int CheckerBoarded(int dim)=0;
+    virtual int CheckerBoard(std::vector<int> site)=0;
+    virtual int CheckerBoardDestination(int source_cb,int shift)=0;
+    virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
+    inline int  CheckerBoardFromOindex (int Oindex){
       std::vector<int> ocoor;
-      CoordFromOsite(ocoor,Osite);
+      oCoorFromOindex(ocoor,Oindex); 
       int ss=0;
       for(int d=0;d<_ndimension;d++){
 	ss=ss+ocoor[d];
       }      
       return ss&0x1;
     }
-    inline void CoordFromOsite (std::vector<int>& coor,int Osite){
+
+    //////////////////////////////////////////////////////////////////////////////////////////////
+    // Local layout calculations
+    //////////////////////////////////////////////////////////////////////////////////////////////
+    // These routines are key. Subdivide the linearised cartesian index into
+    //      "inner" index identifying which simd lane of object<vFcomplex> is associated with coord
+    //      "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
+    //
+    // Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
+    // stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
+    // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
+    // lanes are operated upon simultaneously.
+  
+    virtual int oIndex(std::vector<int> &coor)
+    {
+        int idx=0;
+	// Works with either global or local coordinates
+        for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
+        return idx;
+    }
+    inline int oIndexReduced(std::vector<int> &ocoor)
+    {
+      int idx=0; 
+      // ocoor is already reduced so can eliminate the modulo operation
+      // for fast indexing and inline the routine
+      for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
+      return idx;
+    }
+    inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
       coor.resize(_ndimension);
       for(int d=0;d<_ndimension;d++){
-	coor[d] = Osite % _rdimensions[d];
-	Osite   = Osite / _rdimensions[d];
+	coor[d] = Oindex % _rdimensions[d];
+	Oindex  = Oindex / _rdimensions[d];
       }
     }
 
-    virtual int CheckerBoarded(int dim)=0;
-    virtual int CheckerBoard(std::vector<int> site)=0;
-    virtual int CheckerBoardDestination(int source_cb,int shift)=0;
-    virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
+    //////////////////////////////////////////////////////////
+    // SIMD lane addressing
+    //////////////////////////////////////////////////////////
+    inline int iIndex(std::vector<int> &lcoor)
+    {
+        int idx=0;
+        for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
+        return idx;
+    }
+    inline void iCoorFromIindex(std::vector<int> &coor,int lane)
+    {
+      coor.resize(_ndimension);
+      for(int d=0;d<_ndimension;d++){
+	coor[d] = lane % _simd_layout[d];
+	lane    = lane / _simd_layout[d];
+      }
+    }
+    
+    ////////////////////////////////////////////////////////////////
+    // Array sizing queries
+    ////////////////////////////////////////////////////////////////
+
+    inline int iSites(void) { return _isites; };
+    inline int Nsimd(void)  { return _isites; };// Synonymous with iSites
+    inline int oSites(void) { return _osites; };
+    inline int lSites(void) { return _isites*_osites; }; 
+    inline int gSites(void) { return _isites*_osites*_Nprocessors; }; 
+    inline int Nd    (void) { return _ndimension;};
+    inline const std::vector<int> &FullDimensions(void)         { return _fdimensions;};
+    inline const std::vector<int> &GlobalDimensions(void)       { return _gdimensions;};
+    inline const std::vector<int> &LocalDimensions(void)        { return _ldimensions;};
+    inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};
+
+    ////////////////////////////////////////////////////////////////
+    // Global addressing
+    ////////////////////////////////////////////////////////////////
+    void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
+    {
+      gcoor.resize(_ndimension);
+      std::vector<int> coor(_ndimension);
+
+      ProcessorCoorFromRank(rank,coor);
+      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]&coor[mu];
+
+      iCoorFromIindex(coor,i_idx);
+      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]&coor[mu];
+
+      oCoorFromOindex (coor,o_idx);
+      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu];
+      
+    }
+    void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector<int> &fcoor)
+    {
+      RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
+      if(CheckerBoarded(0)){
+	fcoor[0] = fcoor[0]*2+cb;
+      }
+    }
+    void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)
+    {
+      gcoor.resize(_ndimension);
+      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu];
+    }
+    void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
+    {
+      pcoor.resize(_ndimension);
+      lcoor.resize(_ndimension);
+      for(int mu=0;mu<_ndimension;mu++){
+	pcoor[mu] = gcoor[mu]/_ldimensions[mu];
+	lcoor[mu] = gcoor[mu]%_ldimensions[mu];
+      }
+    }
+    void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
+    {
+      std::vector<int> pcoor;
+      std::vector<int> lcoor;
+      GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
+      rank = RankFromProcessorCoor(pcoor);
+      i_idx= iIndex(lcoor);
+      o_idx= oIndex(lcoor);
+    }
+
 };
 
-class GridCartesian: public SimdGrid {
+class GridCartesian: public GridBase {
+
 public:
+
     virtual int CheckerBoarded(int dim){
       return 0;
     }
@@ -141,7 +251,7 @@ public:
     GridCartesian(std::vector<int> &dimensions,
 		  std::vector<int> &simd_layout,
 		  std::vector<int> &processor_grid
-		  ) : SimdGrid(processor_grid)
+		  ) : GridBase(processor_grid)
     {
         ///////////////////////
         // Grid information
@@ -200,14 +310,12 @@ public:
             _slice_nblock[d]=nblock;
             block = block*_rdimensions[d];
         }
-            
-        assert( _isites == vComplex::Nsimd());
 
     };
 };
  
 // Specialise this for red black grids storing half the data like a chess board.
-class GridRedBlackCartesian : public SimdGrid
+class GridRedBlackCartesian : public GridBase
 {
 public:
     virtual int CheckerBoarded(int dim){
@@ -230,7 +338,7 @@ public:
 
       // Probably faster with table lookup;
       // or by looping over x,y,z and multiply rather than computing checkerboard.
-      int ocb=CheckerBoardFromOsite(osite);
+      int ocb=CheckerBoardFromOindex(osite);
 	  
       if ( (source_cb+ocb)&1 ) {
 	return (shift)/2;
@@ -248,7 +356,7 @@ public:
     };
     GridRedBlackCartesian(std::vector<int> &dimensions,
 			  std::vector<int> &simd_layout,
-			  std::vector<int> &processor_grid) : SimdGrid(processor_grid)
+			  std::vector<int> &processor_grid) : GridBase(processor_grid)
     {
     ///////////////////////
     // Grid information
@@ -310,7 +418,6 @@ public:
             block = block*_rdimensions[d];
         }
             
-        assert ( _isites == vComplex::Nsimd());
     };
 protected:
     virtual int oIndex(std::vector<int> &coor)
diff --git a/Grid_Communicator.h b/Grid_Communicator.h
index 4f83da34..c7e71f12 100644
--- a/Grid_Communicator.h
+++ b/Grid_Communicator.h
@@ -22,23 +22,63 @@ class CartesianCommunicator {
     MPI_Comm communicator;
 #endif
 
+    // Constructor
     CartesianCommunicator(std::vector<int> &pdimensions_in);
 
+    // Wraps MPI_Cart routines
     void ShiftedRanks(int dim,int shift,int & source, int & dest);
-    int  Rank(std::vector<int> coor);
-    int  MyRank(void);
-    void GlobalSumF(float &);
-    void GlobalSumFVector(float *,int N);
+    int  RankFromProcessorCoor(std::vector<int> &coor);
+    void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
 
-    void GlobalSumF(double &);
-    void GlobalSumFVector(double *,int N);
+    /////////////////////////////////
+    // Grid information queries
+    /////////////////////////////////
+    int                      IsBoss(void)            { return _processor==0; };
+    int                      ThisRank(void)          { return _processor; };
+    const std::vector<int> & ThisProcessorCoor(void) { return _processor_coor; };
+    const std::vector<int> & ProcessorGrid(void)     { return _processors; };
+    int                      ProcessorCount(void)    { return _Nprocessors; };
 
+    ////////////////////////////////////////////////////////////
+    // Reduction
+    ////////////////////////////////////////////////////////////
+    void GlobalSum(float &);
+    void GlobalSumVector(float *,int N);
+
+    void GlobalSum(double &);
+    void GlobalSumVector(double *,int N);
+    
+    template<class obj> void GlobalSumObj(obj &o){
+
+      typedef typename obj::scalar_type scalar_type;
+      int words = sizeof(obj)/sizeof(scalar_type);
+
+      scalar_type * ptr = (scalar_type *)& o;
+      GlobalSum(ptr,words);
+    }
+    ////////////////////////////////////////////////////////////
+    // Face exchange
+    ////////////////////////////////////////////////////////////
     void SendToRecvFrom(void *xmit,
 			int xmit_to_rank,
 			void *recv,
 			int recv_from_rank,
 			int bytes);
-    
+
+    ////////////////////////////////////////////////////////////
+    // Barrier
+    ////////////////////////////////////////////////////////////
+    void Barrier(void);
+
+    ////////////////////////////////////////////////////////////
+    // Broadcast a buffer and composite larger
+    ////////////////////////////////////////////////////////////
+    void Broadcast(int root,void* data, int bytes);
+    template<class obj> void Broadcast(int root,obj &data)
+    {
+      Broadcast(root,(void *)&data,sizeof(data));
+    };
+
 }; 
 }
 
diff --git a/Grid_Lattice.h b/Grid_Lattice.h
index aab4fec9..2aa46b97 100644
--- a/Grid_Lattice.h
+++ b/Grid_Lattice.h
@@ -20,7 +20,7 @@ template<class vobj>
 class Lattice
 {
 public:
-    SimdGrid *_grid;
+    GridBase *_grid;
     int checkerboard;
     std::vector<vobj,alignedAllocator<vobj> > _odata;
     typedef typename vobj::scalar_type scalar_type;
@@ -28,7 +28,7 @@ public:
 public:
 
 
-    Lattice(SimdGrid *grid) : _grid(grid) {
+    Lattice(GridBase *grid) : _grid(grid) {
         _odata.reserve(_grid->oSites());
         assert((((uint64_t)&_odata[0])&0xF) ==0);
         checkerboard=0;
@@ -95,56 +95,67 @@ public:
     template<class sobj>
     friend void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
 
-      typedef typename vobj::scalar_type stype;
-      typedef typename vobj::vector_type vtype;
+      GridBase *grid=l._grid;
 
-      assert( l.checkerboard == l._grid->CheckerBoard(site));
+      typedef typename vobj::scalar_type scalar_type;
+      typedef typename vobj::vector_type vector_type;
 
-      int o_index = l._grid->oIndex(site);
-      int i_index = l._grid->iIndex(site);
+      int Nsimd = grid->Nsimd();
+
+      assert( l.checkerboard== l._grid->CheckerBoard(site));
+      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+
+      int rank,odx,idx;
+      grid->GlobalCoorToRankIndex(rank,odx,idx,site);
+
+      // Optional to broadcast from node 0.
+      grid->Broadcast(0,s);
+
+      std::vector<sobj> buf(Nsimd);
+      std::vector<scalar_type *> pointers(Nsimd);  
+      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
+
+      // extract-modify-merge cycle is easiest way and this is not perf critical
+      extract(l._odata[odx],pointers);
       
-      stype *v_ptr = (stype *)&l._odata[o_index];
-      stype *s_ptr = (stype *)&s;
-      v_ptr = v_ptr + 2*i_index;
-        
-      for(int i=0;i<sizeof(sobj);i+=2*sizeof(stype)){
-	v_ptr[0] = s_ptr[0];
-	v_ptr[1] = s_ptr[1];
-	v_ptr+=2*vtype::Nsimd();
-	s_ptr+=2;
-      }
+      buf[idx] = s;
+
+      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
+      merge(l._odata[odx],pointers);
+
       return;
     };
     
-    
     // Peek a scalar object from the SIMD array
     template<class sobj>
-    friend void peekSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
+    friend void peekSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){
         
-      typedef typename vobj::scalar_type stype;
-      typedef typename vobj::vector_type vtype;
+      GridBase *grid=l._grid;
+
+      typedef typename vobj::scalar_type scalar_type;
+      typedef typename vobj::vector_type vector_type;
+
+      int Nsimd = grid->Nsimd();
 
       assert( l.checkerboard== l._grid->CheckerBoard(site));
+      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
 
-      int o_index = l._grid->oIndex(site);
-      int i_index = l._grid->iIndex(site);
+      int rank,odx,idx;
+      grid->GlobalCoorToRankIndex(rank,odx,idx,site);
+      std::vector<sobj> buf(Nsimd);
+      std::vector<scalar_type *> pointers(Nsimd);  
+      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
+
+      extract(l._odata[odx],pointers);
       
-      stype *v_ptr = (stype *)&l._odata[o_index];
-      stype *s_ptr = (stype *)&s;
-      v_ptr = v_ptr + 2*i_index;
-      
-      for(int i=0;i<sizeof(sobj);i+=2*sizeof(stype)){
-	s_ptr[0] = v_ptr[0];
-	s_ptr[1] = v_ptr[1];
-	v_ptr+=2*vtype::Nsimd();
-	s_ptr+=2;
-      }
+      s = buf[idx];
+      grid->Broadcast(rank,s);
+
       return;
     };
     
-    // Randomise
+    // FIXME Randomise; deprecate this
     friend void random(Lattice<vobj> &l){
-
         Real *v_ptr = (Real *)&l._odata[0];
         size_t v_len = l._grid->oSites()*sizeof(vobj);
         size_t d_len = v_len/sizeof(Real);
@@ -154,9 +165,9 @@ public:
             v_ptr[i]=drand48();
         }
     };
-
+    
+    // FIXME for debug; deprecate this
     friend void lex_sites(Lattice<vobj> &l){
-
         Real *v_ptr = (Real *)&l._odata[0];
 	size_t o_len = l._grid->oSites();
         size_t v_len = sizeof(vobj)/sizeof(vRealF);
@@ -183,7 +194,6 @@ public:
         for(int i=0;i<d_len;i++){
 	  v_ptr[i]= drand48();
         }
-        
     };
 
 
@@ -241,6 +251,7 @@ public:
         }
         return ret;
     };
+
     inline friend Lattice<vobj> conj(const Lattice<vobj> &lhs){
         Lattice<vobj> ret(lhs._grid);
 #pragma omp parallel for
@@ -259,7 +270,7 @@ public:
 	std::vector<int> coor;
 	int cbos;
 	
-	full._grid->CoordFromOsite(coor,ss);
+	full._grid->oCoorFromOindex(coor,ss);
 	cbos=half._grid->CheckerBoard(coor);
 	
 	if (cbos==cb) {
@@ -277,7 +288,7 @@ public:
 	std::vector<int> coor;
 	int cbos;
 	
-	full._grid->CoordFromOsite(coor,ss);
+	full._grid->oCoorFromOindex(coor,ss);
 	cbos=half._grid->CheckerBoard(coor);
 
 	if (cbos==cb) {
@@ -351,7 +362,6 @@ public:
     template<class left,class right>
     inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
     {
-	std::cerr <<"Oscalar * Lattice calling mult"<<std::endl;
         Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
 
 #pragma omp parallel for
@@ -383,7 +393,6 @@ public:
     template<class left,class right>
     inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
     {
-	std::cerr <<"Lattice * Oscalar calling mult"<<std::endl;
         Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
 #pragma omp parallel for
         for(int ss=0;ss<rhs._grid->oSites(); ss++){
diff --git a/Grid_QCD.h b/Grid_QCD.h
index f6cc6087..a0f7801a 100644
--- a/Grid_QCD.h
+++ b/Grid_QCD.h
@@ -21,7 +21,13 @@ namespace QCD {
 
     typedef iSinglet<Complex >          TComplex;    // This is painful. Tensor singlet complex type.
     typedef iSinglet<vComplex >         vTComplex;
-    typedef iSinglet<Real >          TReal;    // This is painful. Tensor singlet complex type.
+    typedef iSinglet<Real >             TReal;    // This is painful. Tensor singlet complex type.
+
+
+    typedef iSinglet<vIntegerF >         vTIntegerF;
+    typedef iSinglet<vIntegerD >         vTIntegerD;
+    typedef iSinglet<vIntegerC >         vTIntegerC;
+    typedef iSinglet<vIntegerZ >         vTIntegerZ;
 
     typedef iSpinMatrix<Complex >       SpinMatrix;
     typedef iColourMatrix<Complex >     ColourMatrix;
@@ -39,9 +45,13 @@ namespace QCD {
     typedef iSpinVector<vComplex >       vSpinVector;
     typedef iColourVector<vComplex >     vColourVector;
     typedef iSpinColourVector<vComplex > vSpinColourVector;
-
     
     typedef Lattice<vTComplex>         LatticeComplex;
+
+    typedef Lattice<vTIntegerF>            LatticeIntegerF; // Predicates for "where"
+    typedef Lattice<vTIntegerD>            LatticeIntegerD; 
+    typedef Lattice<vTIntegerC>            LatticeIntegerC;
+    typedef Lattice<vTIntegerZ>            LatticeIntegerZ;
     
     typedef Lattice<vColourMatrix>     LatticeColourMatrix;
     typedef Lattice<vSpinMatrix>       LatticeSpinMatrix;
diff --git a/Grid_cshift_common.h b/Grid_cshift_common.h
index 203bf2f5..72518d7d 100644
--- a/Grid_cshift_common.h
+++ b/Grid_cshift_common.h
@@ -45,7 +45,7 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
     for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
       for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 
-	int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);// Could easily be a table lookup
+	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 	if ( ocb &cbmask ) {
 	  buffer[bo]=rhs._odata[so+o+b];
 	  bo++;
@@ -90,7 +90,7 @@ friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> p
     for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
       for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 
-	int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);
+	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 	if ( ocb & cbmask ) {
 	  extract(rhs._odata[so+o+b],pointers);
 	}
@@ -135,7 +135,7 @@ friend void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAll
     for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
       for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 
-	int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);// Could easily be a table lookup
+	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 	if ( ocb & cbmask ) {
 	  rhs._odata[so+o+b]=buffer[bo++];
 	}
@@ -179,7 +179,7 @@ friend void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> po
     for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
       for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 
-	int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);
+	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 	if ( ocb&cbmask ) {
 	  merge(rhs._odata[so+o+b],pointers);
 	}
@@ -224,7 +224,7 @@ friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int
     for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
       for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 
-	int ocb=1<<lhs._grid->CheckerBoardFromOsite(o+b);
+	int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
 
 	if ( ocb&cbmask ) {
 	  lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
@@ -267,7 +267,7 @@ friend void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimens
     for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
       for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
 
-	int ocb=1<<lhs._grid->CheckerBoardFromOsite(o+b);
+	int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
 
 	if ( ocb&cbmask ) {
 	  permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
diff --git a/Grid_cshift_fake.h b/Grid_cshift_fake.h
index 0f306b1b..8b6c293b 100644
--- a/Grid_cshift_fake.h
+++ b/Grid_cshift_fake.h
@@ -7,9 +7,11 @@ friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
 {
   typedef typename vobj::vector_type vector_type;
   typedef typename vobj::scalar_type scalar_type;
-  const int Nsimd = vector_type::Nsimd();
 
   Lattice<vobj> ret(rhs._grid);
+
+  GridBase *grid=rhs._grid;
+  const int Nsimd = grid->Nsimd();
   
   int fd = rhs._grid->_fdimensions[dimension];
   int rd = rhs._grid->_rdimensions[dimension];
@@ -43,7 +45,7 @@ friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
 
   for(int x=0;x<rd;x++){       
 
-    for(int i=0;i<vobj::vector_type::Nsimd();i++){
+    for(int i=0;i<Nsimd;i++){
       pointers[i] = (scalar_type *)&comm_buf_extract[i][0];
     }
 
@@ -79,7 +81,7 @@ friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
 	o +=rhs._grid->_slice_stride[dimension];
       }
 
-      for(int i=0;i<vobj::vector_type::Nsimd();i++){
+      for(int i=0;i<Nsimd;i++){
 	pointers[i] = (scalar_type *)&comm_buf_extract[permute_map[permute_type][i]][0];
       }
 
diff --git a/Grid_cshift_mpi.h b/Grid_cshift_mpi.h
index 17aff2f4..75a4dc96 100644
--- a/Grid_cshift_mpi.h
+++ b/Grid_cshift_mpi.h
@@ -73,7 +73,7 @@ friend void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int
   typedef typename vobj::vector_type vector_type;
   typedef typename vobj::scalar_type scalar_type;
 
-  SimdGrid *grid=rhs._grid;
+  GridBase *grid=rhs._grid;
   Lattice<vobj> temp(rhs._grid);
 
   int fd              = rhs._grid->_fdimensions[dimension];
@@ -130,8 +130,8 @@ friend void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int
 
 friend void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
-  const int Nsimd = vector_type::Nsimd();
-  SimdGrid *grid=rhs._grid;
+  GridBase *grid=rhs._grid;
+  const int Nsimd = grid->Nsimd();
   typedef typename vobj::vector_type vector_type;
   typedef typename vobj::scalar_type scalar_type;
    
@@ -173,6 +173,7 @@ friend void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
   
   std::vector<int> comm_offnode(simd_layout);
   std::vector<int> comm_proc   (simd_layout);  //relative processor coord in dim=dimension
+  std::vector<int> icoor(grid->Nd());
 
   for(int x=0;x<rd;x++){       
 
@@ -197,7 +198,10 @@ friend void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
 
       for(int i=0;i<Nsimd;i++){
 
-	int s = grid->iCoordFromIsite(i,dimension);
+
+	int s;
+	grid->iCoorFromIindex(icoor,i);
+	s = icoor[dimension];
 
 	if(comm_offnode[s]){
 
@@ -232,7 +236,7 @@ friend void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
       if ( x< rd-num ) permute_slice=wrap;
       else permute_slice = 1-wrap;
 
-      for(int i=0;i<vobj::vector_type::Nsimd();i++){
+      for(int i=0;i<Nsimd;i++){
 	if ( permute_slice ) {
 	  pointers[i] = rpointers[permute_map[permute_type][i]];
 	} else {
diff --git a/Grid_main.cc b/Grid_main.cc
index 504f7778..cb45fd85 100644
--- a/Grid_main.cc
+++ b/Grid_main.cc
@@ -19,9 +19,9 @@ int main (int argc, char ** argv)
   std::vector<int> simd_layout(4);
   
   std::vector<int> mpi_layout(4);
-  mpi_layout[0]=4;
-  mpi_layout[1]=2;
-  mpi_layout[2]=4;
+  mpi_layout[0]=2;
+  mpi_layout[1]=1;
+  mpi_layout[2]=1;
   mpi_layout[3]=2;
 
 #ifdef AVX512
@@ -34,7 +34,7 @@ int main (int argc, char ** argv)
    omp_set_num_threads(omp);
 #endif 
 
-  for(int lat=16;lat<=16;lat+=40){
+  for(int lat=8;lat<=16;lat+=40){
     latt_size[0] = lat;
     latt_size[1] = lat;
     latt_size[2] = lat;
@@ -166,6 +166,7 @@ int main (int argc, char ** argv)
 
 
     // Lattice SU(3) x SU(3)
+    Fine.Barrier();
     FooBar = Foo * Bar;
     
     // Lattice 12x12 GEMM
@@ -179,37 +180,47 @@ int main (int argc, char ** argv)
 
     flops = ncall*1.0*volume*(8*Nc*Nc*Nc);
     bytes = ncall*1.0*volume*Nc*Nc    *2*3*sizeof(Grid::Real);
-    printf("%f flop and %f bytes\n",flops,bytes/ncall);
+    if ( Fine.IsBoss() ) {
+      printf("%f flop and %f bytes\n",flops,bytes/ncall);
+    }
         FooBar = Foo * Bar;
+    Fine.Barrier();
     t0=usecond();
     for(int i=0;i<ncall;i++){
-        mult(FooBar,Foo,Bar); // this is better
+      Fine.Barrier();
+      mult(FooBar,Foo,Bar); // this is better
     }
     t1=usecond();
+    Fine.Barrier();
+    if ( Fine.IsBoss() ) {
 #ifdef OMP
-    printf("mult NumThread %d , Lattice size %d , %f us per call\n",omp_get_max_threads(),lat,(t1-t0)/ncall);
+      printf("mult NumThread %d , Lattice size %d , %f us per call\n",omp_get_max_threads(),lat,(t1-t0)/ncall);
 #endif
-    printf("mult NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
-    printf("mult NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));
-
+      printf("mult NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
+      printf("mult NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));
+    }
     mult(FooBar,Foo,Bar);
     FooBar = Foo * Bar;
 
     bytes = ncall*1.0*volume*Nc*Nc    *2*5*sizeof(Grid::Real);
+    Fine.Barrier();
     t0=usecond();
     for(int i=0;i<ncall;i++){
-      mult(FooBar,Foo,Cshift(Bar,1,-2));
+      Fine.Barrier();
+      mult(FooBar,Foo,Cshift(Bar,1,-1));
       //mult(FooBar,Foo,Bar);
       //FooBar = Foo * Bar; // this is bad
     }
     t1=usecond();
+    Fine.Barrier();
 
     FooBar = Foo * Bar;
     
-    printf("Cshift Mult: NumThread %d , Lattice size %d , %f us per call\n",omp,lat,(t1-t0)/ncall);
-    printf("Cshift Mult: NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
-    printf("Cshift Mult: NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));
-
+    if ( Fine.IsBoss() ) {
+      printf("Cshift Mult: NumThread %d , Lattice size %d , %f us per call\n",omp,lat,(t1-t0)/ncall);
+      printf("Cshift Mult: NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
+      printf("Cshift Mult: NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));
+    }
     //    pickCheckerboard(0,rFoo,FooBar);
     //    pickCheckerboard(1,bFoo,FooBar);
     //    setCheckerboard(FooBar,rFoo);
@@ -225,12 +236,12 @@ int main (int argc, char ** argv)
 	pickCheckerboard(0,rFoo,Foo);    // Pick out red or black checkerboards
 	pickCheckerboard(1,bFoo,Foo);
     
-	std::cout << "Shifting both parities by "<< shift <<" direction "<< dir <<std::endl;
+	if ( Fine.IsBoss() ) {
+	  std::cout << "Shifting both parities by "<< shift <<" direction "<< dir <<std::endl;
+	}
 	Shifted  = Cshift(Foo,dir,shift);    // Shift everything
 
-	std::cout << "Shifting even source parities to odd result"<<std::endl;
 	bShifted = Cshift(rFoo,dir,shift);   // Shift red->black
-	std::cout << "Shifting odd parities to even result"<<std::endl;
 	rShifted = Cshift(bFoo,dir,shift);   // Shift black->red
     
 	ShiftedCheck=zero;
@@ -332,8 +343,10 @@ int main (int argc, char ** argv)
             nrm = nrm + real(conj(diff)*diff);
         }}
     }}}}
+	if( Fine.IsBoss() ){
+	  std::cout << "LatticeColorMatrix * LatticeColorMatrix nrm diff = "<<nrm<<std::endl;
+	}
       }}
-    std::cout << "LatticeColorMatrix * LatticeColorMatrix nrm diff = "<<nrm<<std::endl;
 
    } // loop for lat
  } // loop for omp
diff --git a/Grid_math_types.h b/Grid_math_types.h
index 3f7714d5..25524966 100644
--- a/Grid_math_types.h
+++ b/Grid_math_types.h
@@ -66,7 +66,7 @@ namespace Grid {
 template<class vtype> class iScalar
 {
 public:
-  SIMDalign vtype _internal;
+  vtype _internal;
 
   typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
   typedef typename GridTypeMapper<vtype>::vector_type vector_type;
@@ -113,7 +113,7 @@ public:
 template<class vtype,int N> class iVector
 {
 public:
-  SIMDalign vtype _internal[N];
+  vtype _internal[N];
 
   typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
   typedef typename GridTypeMapper<vtype>::vector_type vector_type;
@@ -170,7 +170,7 @@ public:
 template<class vtype,int N> class iMatrix
 {
 public:
-  SIMDalign    vtype _internal[N][N];
+  vtype _internal[N][N];
 
   typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
   typedef typename GridTypeMapper<vtype>::vector_type vector_type;
@@ -908,21 +908,5 @@ inline auto trace(const iScalar<vtype> &arg) -> iScalar<decltype(trace(arg._inte
     return ret;
 }
 };
-/////////////////////////////////////////////////////////////////////////
-// Generic routine to promote object<complex> -> object<vcomplex>
-// Supports the array reordering transformation that gives me SIMD utilisation
-/////////////////////////////////////////////////////////////////////////
-/*
-template<template<class> class object>
-inline object<vComplex> splat(object<Complex >s){
-    object<vComplex> ret;
-    vComplex * v_ptr = (vComplex *)& ret;
-    Complex * s_ptr = (Complex *) &s;
-    for(int i=0;i<sizeof(ret);i+=sizeof(vComplex)){
-        vsplat(*(v_ptr++),*(s_ptr++));
-    }
-    return ret;
-}
-*/
     
 #endif
diff --git a/Grid_mpi.cc b/Grid_mpi.cc
index c9b91210..a4943478 100644
--- a/Grid_mpi.cc
+++ b/Grid_mpi.cc
@@ -28,18 +28,18 @@ CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
   assert(Size==_Nprocessors);
 }
 
-void CartesianCommunicator::GlobalSumF(float &f){
+void CartesianCommunicator::GlobalSum(float &f){
   MPI_Allreduce(&f,&f,1,MPI_FLOAT,MPI_SUM,communicator);
 }
-void CartesianCommunicator::GlobalSumFVector(float *f,int N)
+void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
   MPI_Allreduce(f,f,N,MPI_FLOAT,MPI_SUM,communicator);
 }
-void CartesianCommunicator::GlobalSumF(double &d)
+void CartesianCommunicator::GlobalSum(double &d)
 {
   MPI_Allreduce(&d,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
 }
-void CartesianCommunicator::GlobalSumFVector(double *d,int N)
+void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
   MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator);
 }
@@ -48,12 +48,17 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
 {
   MPI_Cart_shift(communicator,dim,shift,&source,&dest);
 }
-int CartesianCommunicator::Rank(std::vector<int> coor)
+int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
   int rank;
   MPI_Cart_rank  (communicator, &coor[0], &rank);
   return rank;
 }
+void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
+{
+  coor.resize(_ndimension);
+  MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
+}
 
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
@@ -71,5 +76,19 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 
 }
 
+void CartesianCommunicator::Barrier(void)
+{
+  MPI_Barrier(communicator);
+}
+
+void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
+{
+  MPI_Bcast(data,
+	    bytes,
+	    MPI_BYTE,
+	    root,
+	    communicator);
+}
+
 }
 
diff --git a/Grid_simd.h b/Grid_simd.h
index b5d34d4a..ffcbff15 100644
--- a/Grid_simd.h
+++ b/Grid_simd.h
@@ -37,6 +37,29 @@
     // Fourier transform equivalent.
     //
     
+  ////////////////////////////////////////////////////////////
+  // SIMD Alignment controls
+  ////////////////////////////////////////////////////////////
+#ifdef HAVE_VAR_ATTRIBUTE_ALIGNED
+#define ALIGN_DIRECTIVE(A) __attribute__ ((aligned(A)))
+#else
+#define ALIGN_DIRECTIVE(A) __declspec(align(A))
+#endif
+
+#ifdef SSE2
+#include <pmmintrin.h>
+#define SIMDalign ALIGN_DIRECTIVE(16)
+#endif
+
+#if defined(AVX1) || defined (AVX2)
+#include <immintrin.h>
+#define SIMDalign ALIGN_DIRECTIVE(32)
+#endif
+
+#ifdef AVX512
+#include <immintrin.h>
+#define SIMDalign ALIGN_DIRECTIVE(64)
+#endif
 
 namespace Grid {
 
@@ -49,6 +72,8 @@ namespace Grid {
   typedef std::complex<Real>  Complex;
 
 
+
+
   inline RealF adj(const RealF  & r){ return r; }
   inline RealF conj(const RealF  & r){ return r; }
   inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
@@ -97,47 +122,27 @@ namespace Grid {
   template<>            inline void ZeroIt(RealD &arg){ arg=0; };
 
 
-  ////////////////////////////////////////////////////////////
-  // SIMD Alignment controls
-  ////////////////////////////////////////////////////////////
-#ifdef HAVE_VAR_ATTRIBUTE_ALIGNED
-#define ALIGN_DIRECTIVE(A) __attribute__ ((aligned(A)))
-#else
-#define ALIGN_DIRECTIVE(A) __declspec(align(A))
-#endif
-
-#ifdef SSE2
-#include <pmmintrin.h>
-#define SIMDalign ALIGN_DIRECTIVE(16)
-#endif
-
-#if defined(AVX1) || defined (AVX2)
-#include <immintrin.h>
-#define SIMDalign ALIGN_DIRECTIVE(32)
-#endif
-
-#ifdef AVX512
-#include <immintrin.h>
-#define SIMDalign ALIGN_DIRECTIVE(64)
-#endif
 
 #if defined (SSE2)
     typedef __m128 fvec;
     typedef __m128d dvec;
     typedef __m128 cvec;
     typedef __m128d zvec;
+    typedef __m128i ivec;
 #endif
 #if defined (AVX1) || defined (AVX2)
     typedef __m256 fvec;
     typedef __m256d dvec;
     typedef __m256  cvec;
     typedef __m256d zvec;
+    typedef __m256i ivec;
 #endif
 #if defined (AVX512)
     typedef __m512  fvec;
     typedef __m512d dvec;
     typedef __m512  cvec;
     typedef __m512d zvec;
+    typedef __m512i ivec;
 #endif
 #if defined (QPX)
     typedef float  fvec __attribute__ ((vector_size (16))); // QPX has same SIMD width irrespective of precision
@@ -159,10 +164,76 @@ namespace Grid {
 
 };
 
+
+/////////////////////////////////////////////////////////////////
+// Generic extract/merge/permute
+/////////////////////////////////////////////////////////////////
+template<class vsimd,class scalar,int Nsimd>
+inline void Gextract(vsimd &y,std::vector<scalar *> &extracted){
+  // Bounce off stack is painful
+  // temporary hack while I figure out the right interface
+  scalar buf[Nsimd]; 
+  vstore(y,buf);
+  for(int i=0;i<Nsimd;i++){
+    *extracted[i] = buf[i];
+    extracted[i]++;
+  }
+};
+template<class vsimd,class scalar,int Nsimd>
+inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
+  scalar buf[Nsimd]; 
+  for(int i=0;i<Nsimd;i++){
+    buf[i]=*extracted[i];
+    extracted[i]++;
+  }
+  vset(y,buf); 
+};
+
+//////////////////////////////////////////////////////////
+// Permute
+// Permute 0 every ABCDEFGH -> BA DC FE HG
+// Permute 1 every ABCDEFGH -> CD AB GH EF
+// Permute 2 every ABCDEFGH -> EFGH ABCD
+// Permute 3 possible on longer iVector lengths (512bit = 8 double = 16 single)
+// Permute 4 possible on half precision @512bit vectors.
+//////////////////////////////////////////////////////////
+// Should be able to make the permute/extract/merge independent of the
+// vector subtype and reduce the volume of code.
+template<class vsimd>
+inline void Gpermute(vsimd &y,vsimd b,int perm){
+      switch (perm){
+#if defined(AVX1)||defined(AVX2)
+      // 8x32 bits=>3 permutes
+      case 2: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
+      case 1: y.v = _mm256_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2)); break;
+      case 0: y.v = _mm256_permute2f128_ps(b.v,b.v,0x01); break;
+#endif
+#ifdef SSE2
+      case 1: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(2,3,0,1)); break;
+      case 0: y.v = _mm_shuffle_ps(b.v,b.v,_MM_SHUFFLE(1,0,3,2));break;
+#endif
+#ifdef AVX512
+	// 16 floats=> permutes
+        // Permute 0 every abcd efgh ijkl mnop -> badc fehg jilk nmpo 
+        // Permute 1 every abcd efgh ijkl mnop -> cdab ghef jkij opmn 
+        // Permute 2 every abcd efgh ijkl mnop -> efgh abcd mnop ijkl
+        // Permute 3 every abcd efgh ijkl mnop -> ijkl mnop abcd efgh
+      case 3: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_CDAB); break;
+      case 2: y.v = _mm512_swizzle_ps(b.v,_MM_SWIZ_REG_BADC); break;
+      case 1: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
+      case 0: y.v = _mm512_permute4f128_ps(b.v,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
+#endif
+#ifdef QPX
+#error not implemented
+#endif
+      default: assert(0); break;
+      }
+    };
+
 #include <Grid_vRealF.h>
 #include <Grid_vRealD.h>
 #include <Grid_vComplexF.h>
 #include <Grid_vComplexD.h>
-
+#include <Grid_vInteger.h>
 
 #endif
diff --git a/Grid_vInteger.h b/Grid_vInteger.h
new file mode 100644
index 00000000..7a0c1c4f
--- /dev/null
+++ b/Grid_vInteger.h
@@ -0,0 +1,307 @@
+#ifndef VINTEGER_H
+#define VINTEGER_H
+
+#include "Grid.h"
+
+namespace Grid {
+
+#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
+// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
+
+  typedef uint32_t Integer;
+
+    class vInteger {
+    protected:
+
+    public:
+
+      ivec v;
+
+	typedef ivec     vector_type;
+	typedef Integer scalar_type;
+
+        vInteger(){};
+        ////////////////////////////////////
+        // Arithmetic operator overloads +,-,*
+        ////////////////////////////////////
+        friend inline vInteger operator + ( vInteger a,  vInteger b)
+        {
+	  vInteger ret;
+#if defined (AVX1) 
+	  __m128i a0,a1;
+	  __m128i b0,b1;
+	  a0 = _mm256_extractf128_si256(a.v,0);
+	  b0 = _mm256_extractf128_si256(b.v,0);
+	  a1 = _mm256_extractf128_si256(a.v,1);
+	  b1 = _mm256_extractf128_si256(b.v,1);
+	  a0 = _mm_add_epi32(a0,b0);
+	  a1 = _mm_add_epi32(a1,b1);
+	  ret.v = _mm256_set_m128i(a1,a0);
+#endif
+#if defined (AVX2)
+            ret.v = _mm256_add_epi32(a.v,b.v);
+#endif
+#ifdef SSE2
+            ret.v = _mm_add_epi32(a.v,b.v);
+#endif
+#ifdef AVX512
+            ret.v = _mm512_add_epi32(a.v,b.v);
+#endif
+#ifdef QPX
+            // Implement as array of ints is only option
+#error
+#endif
+            return ret;
+        };
+        
+        friend inline vInteger operator - ( vInteger a, vInteger b)
+        {
+            vInteger ret;
+#if defined (AVX1) 
+	  __m128i a0,a1;
+	  __m128i b0,b1;
+	  a0 = _mm256_extractf128_si256(a.v,0);
+	  b0 = _mm256_extractf128_si256(b.v,0);
+	  a1 = _mm256_extractf128_si256(a.v,1);
+	  b1 = _mm256_extractf128_si256(b.v,1);
+	  a0 = _mm_sub_epi32(a0,b0);
+	  a1 = _mm_sub_epi32(a1,b1);
+	  ret.v = _mm256_set_m128i(a1,a0);
+#endif
+#if defined (AVX2)
+            ret.v = _mm256_sub_epi32(a.v,b.v);
+#endif
+#ifdef SSE2
+            ret.v = _mm_sub_epi32(a.v,b.v);
+#endif
+#ifdef AVX512
+            ret.v = _mm512_sub_epi32(a.v,b.v);
+#endif
+#ifdef QPX
+            // Implement as array of ints is only option
+#error
+#endif
+            return ret;
+        };
+
+        friend inline vInteger operator * ( vInteger a, vInteger b)
+        {
+            vInteger ret;
+#if defined (AVX1) 
+	  __m128i a0,a1;
+	  __m128i b0,b1;
+	  a0 = _mm256_extractf128_si256(a.v,0);
+	  b0 = _mm256_extractf128_si256(b.v,0);
+	  a1 = _mm256_extractf128_si256(a.v,1);
+	  b1 = _mm256_extractf128_si256(b.v,1);
+	  a0 = _mm_mul_epi32(a0,b0);
+	  a1 = _mm_mul_epi32(a1,b1);
+	  ret.v = _mm256_set_m128i(a1,a0);
+#endif
+#if defined (AVX2)
+            ret.v = _mm256_mul_epi32(a.v,b.v);
+#endif
+#ifdef SSE2
+            ret.v = _mm_mul_epi32(a.v,b.v);
+#endif
+#ifdef AVX512
+            ret.v = _mm512_mul_epi32(a.v,b.v);
+#endif
+#ifdef QPX
+            // Implement as array of ints is only option
+#error
+#endif
+            return ret;
+        };
+        
+        ///////////////////////////////////////////////
+        // mult, sub, add, adj,conj, mac functions
+        ///////////////////////////////////////////////
+        friend inline void mult(vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) * (*r);}
+        friend inline void sub (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) - (*r);}
+        friend inline void add (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) + (*r);}
+        friend inline void mac (vInteger &y,const vInteger a,const vInteger x){
+            y = a*x+y;
+	}
+        
+        //////////////////////////////////
+        // Initialise to 1,0,i
+        //////////////////////////////////
+        friend inline void vone (vInteger &ret){vsplat(ret,1);}
+        friend inline void vzero(vInteger &ret){vsplat(ret,0);}
+        friend inline void vtrue (vInteger &ret){vsplat(ret,0xFFFFFFFF);}
+        friend inline void vfalse(vInteger &ret){vsplat(ret,0);}
+
+        
+        /////////////////////////////////////////////////////
+        // Broadcast a value across Nsimd copies.
+        /////////////////////////////////////////////////////
+        friend inline void vsplat(vInteger &ret,scalar_type a){
+#if defined (AVX1)|| defined (AVX2)
+            ret.v = _mm256_set1_epi32(a);
+#endif
+#ifdef SSE2
+            ret.v = _mm_set1_epi32(a);
+#endif
+#ifdef AVX512
+            ret.v = _mm512_set1_epi32(a);
+#endif
+#ifdef QPX
+#error
+#endif
+        }
+        friend inline void vset(vInteger &ret,scalar_type *a){
+#if defined (AVX1)|| defined (AVX2)
+	  ret.v = _mm256_set_epi32(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
+#endif
+#ifdef SSE2
+	  ret.v = _mm_set_epi32(a[0],a[1],a[2],a[3]);
+#endif
+#ifdef AVX512
+	  ret.v = _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
+                                   a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
+#endif
+#ifdef QPX
+#error
+#endif
+	}
+
+friend inline void vstore(vInteger &ret, Integer *a){
+#if defined (AVX1)|| defined (AVX2)
+        _mm256_store_si256((__m256i*)a,ret.v);
+#endif
+#ifdef SSE2
+	_mm_store_si128(a,ret.v);
+#endif
+#ifdef AVX512
+	_mm512_store_si512(a,ret.v);
+#endif
+#ifdef QPX
+	assert(0);
+#endif
+        }
+
+        friend inline void vprefetch(const vInteger &v)
+        {
+            _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
+        }
+        // Unary negation
+        friend inline vInteger operator -(const vInteger &r) {
+            vInteger ret;
+            vzero(ret);
+            ret = ret - r;
+            return ret;
+        }
+       friend inline Integer Reduce(const vInteger & in)
+       {
+	 // unimplemented
+	 assert(0);
+       }
+        // *=,+=,-= operators
+        inline vInteger &operator *=(const vInteger &r) {
+            *this = (*this)*r;
+            return *this;
+        }
+        inline vInteger &operator +=(const vInteger &r) {
+            *this = *this+r;
+            return *this;
+        }
+        inline vInteger &operator -=(const vInteger &r) {
+            *this = *this-r;
+            return *this;
+        }
+    public:
+        static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
+    };
+
+    inline vInteger localInnerProduct(const vInteger & l, const vInteger & r) { return l*r; }
+
+    inline void  zeroit(vInteger &z){ vzero(z);}
+
+    inline vInteger outerProduct(const vInteger &l, const vInteger& r)
+    {
+        return l*r;
+    }
+ 
+
+    class vIntegerF : public vInteger
+    {
+    public:
+      static inline int Nsimd(void) { return sizeof(ivec)/sizeof(float);}
+      
+      friend inline void permute(vIntegerF &y,vIntegerF b,int perm)
+      {
+	Gpermute<vIntegerF>(y,b,perm);
+      }
+      friend inline void merge(vIntegerF &y,std::vector<Integer *> &extracted)
+      {
+	Gmerge<vIntegerF,Integer,sizeof(ivec)/sizeof(float) >(y,extracted);
+      }
+      friend inline void extract(vIntegerF &y,std::vector<Integer *> &extracted)
+      {
+	Gextract<vIntegerF,Integer,sizeof(ivec)/sizeof(float) >(y,extracted);
+      }
+    };
+
+
+    class vIntegerD : public vInteger
+    {
+    public:
+      static inline int Nsimd(void) { return sizeof(ivec)/sizeof(double);}
+      
+      friend inline void permute(vIntegerD &y,vIntegerD b,int perm)
+      {
+	Gpermute<vIntegerD>(y,b,perm);
+      }
+      friend inline void merge(vIntegerD &y,std::vector<Integer *> &extracted)
+      {
+	Gmerge<vIntegerD,Integer,sizeof(ivec)/sizeof(double) >(y,extracted);
+      }
+      friend inline void extract(vIntegerD &y,std::vector<Integer *> &extracted)
+      {
+	Gextract<vIntegerD,Integer,sizeof(ivec)/sizeof(double) >(y,extracted);
+      }
+    };
+
+
+    class vIntegerC : public vInteger
+    {
+    public:
+      static inline int Nsimd(void) { return sizeof(ivec)/sizeof(ComplexF);}
+      
+      friend inline void permute(vIntegerC &y,vIntegerC b,int perm)
+      {
+	Gpermute<vIntegerC>(y,b,perm);
+      }
+      friend inline void merge(vIntegerC &y,std::vector<Integer *> &extracted)
+      {
+	Gmerge<vIntegerC,Integer,sizeof(ivec)/sizeof(ComplexF) >(y,extracted);
+      }
+      friend inline void extract(vIntegerC &y,std::vector<Integer *> &extracted)
+      {
+	Gextract<vIntegerC,Integer,sizeof(ivec)/sizeof(ComplexF) >(y,extracted);
+      }
+    };
+
+    class vIntegerZ : public vInteger
+    {
+    public:
+      static inline int Nsimd(void) { return sizeof(ivec)/sizeof(ComplexD);}
+      
+      friend inline void permute(vIntegerZ &y,vIntegerZ b,int perm)
+      {
+	Gpermute<vIntegerZ>(y,b,perm);
+      }
+      friend inline void merge(vIntegerZ &y,std::vector<Integer *> &extracted)
+      {
+	Gmerge<vIntegerZ,Integer,sizeof(ivec)/sizeof(ComplexD) >(y,extracted);
+      }
+      friend inline void extract(vIntegerZ &y,std::vector<Integer *> &extracted)
+      {
+	Gextract<vIntegerZ,Integer,sizeof(ivec)/sizeof(ComplexD) >(y,extracted);
+      }
+    };
+
+}
+
+#endif
diff --git a/TODO b/TODO
index 8108fbc6..9febd6c3 100644
--- a/TODO
+++ b/TODO
@@ -1,8 +1,9 @@
-* Make peekSite, pokeSite work in an MPI context.
+* FIXME audit
 
 * Conditional execution Subset, where etc...
 * Coordinate information, integers etc...
 * Integer type padding/union to vector.
+* LatticeCoordinate[mu]
 
 * Optimise the extract/merge SIMD routines