Shrinking and organising the files

2025-07-21 07:07:07 +01:00 · 2015-04-18 20:44:19 +01:00
parent 0fce523792
commit d964d01d6a
18 changed files with 750 additions and 638 deletions
--- a/lib/Grid.h
+++ b/lib/Grid.h
@@ -1,5 +1,5 @@
 //
-//  Grid.cpp
+//  Grid.h
 //  simd
 //
 //  Created by Peter Boyle on 09/05/2014.
--- a/lib/Grid_QCD.h
+++ b/lib/Grid_QCD.h
@@ -139,30 +139,6 @@ namespace QCD {
      return peekIndex<LorentzIndex>(rhs,i,j);
    }

-    // FIXME this is rather generic and should find a way to place it earlier.
-     inline void LatticeCoordinate(LatticeInteger &l,int mu){
-      GridBase *grid = l._grid;
-      int Nsimd = grid->iSites();
-      std::vector<int> gcoor;
-      std::vector<Integer> mergebuf(Nsimd);
-      std::vector<Integer *> mergeptr(Nsimd);
-      for(int o=0;o<grid->oSites();o++){
-	for(int i=0;i<grid->iSites();i++){
-	  grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
-	  //	  grid->RankIndexToGlobalCoor(0,o,i,gcoor);
-	  mergebuf[i]=gcoor[mu];
-	  mergeptr[i]=&mergebuf[i];
-	}
-	merge(l._odata[o],mergeptr);
-      }
-    };
-
-#include <Grid_predicated.h>
-
-#if 0
-
-#endif
-
 }   //namespace QCD
 } // Grid
 #endif
--- a/lib/Grid_cshift_common.h
+++ b/lib/Grid_cshift_common.h
@@ -1,10 +1,11 @@
 #ifndef _GRID_CSHIFT_COMMON_H_
 #define _GRID_CSHIFT_COMMON_H_

+namespace Grid {
 //////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer,             int dimension,int plane,int cbmask)
+template<class vobj> void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer,             int dimension,int plane,int cbmask)
 {
  int rd = rhs._grid->_rdimensions[dimension];

@@ -48,7 +49,7 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
 //////////////////////////////////////////////////////
 // Gather for when there *is* need to SIMD split
 //////////////////////////////////////////////////////
-friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
+ template<class vobj,class scalar_type> void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
 {
  int rd = rhs._grid->_rdimensions[dimension];

@@ -91,7 +92,7 @@ friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> p
 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-friend void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer,             int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer,             int dimension,int plane,int cbmask)
 {
  int rd = rhs._grid->_rdimensions[dimension];

@@ -134,7 +135,7 @@ friend void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAll
 //////////////////////////////////////////////////////
 // Scatter for when there *is* need to SIMD split
 //////////////////////////////////////////////////////
-friend void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
+ template<class vobj,class scalar_type> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
 {
  int rd = rhs._grid->_rdimensions[dimension];

@@ -177,7 +178,7 @@ friend void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> po
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
-friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
+template<class vobj> void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
 {
  int rd = rhs._grid->_rdimensions[dimension];

@@ -219,7 +220,7 @@ friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int
  }
 }

-friend void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
+template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
 {
  int rd = rhs._grid->_rdimensions[dimension];

@@ -265,7 +266,7 @@ friend void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimens
 //////////////////////////////////////////////////////
 // Local to node Cshift
 //////////////////////////////////////////////////////
-friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
+template<class vobj> void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
 {
  int sshift[2];

@@ -280,7 +281,7 @@ friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int
  }
 }

-friend Lattice<vobj> Cshift_local(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
+template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  GridBase *grid = rhs._grid;
  int fd = grid->_fdimensions[dimension];
@@ -322,5 +323,5 @@ friend Lattice<vobj> Cshift_local(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dime
  }
  return ret;
 }
-
+}
 #endif
--- a/lib/Grid_cshift_mpi.h
+++ b/lib/Grid_cshift_mpi.h
@@ -6,7 +6,9 @@
 #define MIN(x,y) ((x)>(y)?(y):(x))
 #endif

-friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
+namespace Grid { 
+
+template<class vobj> Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
 {
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;
@@ -37,7 +39,7 @@ friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
  return ret;
 }

-friend void Cshift_comms(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
+template<class vobj> void Cshift_comms(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
 {
  int sshift[2];

@@ -52,7 +54,7 @@ friend void Cshift_comms(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int
  }
 }

-friend void Cshift_comms_simd(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
+template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
 {
  int sshift[2];

@@ -67,8 +69,7 @@ friend void Cshift_comms_simd(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimensio
  }
 }

-
-friend void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
+template<class vobj> void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;
@@ -127,8 +128,7 @@ friend void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int
  }
 }

-
-friend void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
+template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
  GridBase *grid=rhs._grid;
  const int Nsimd = grid->Nsimd();
@@ -260,4 +260,5 @@ friend void  Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimensi
    }
  }
 }
+}
 #endif
--- a/lib/Grid_cshift_none.h
+++ b/lib/Grid_cshift_none.h
@@ -1,12 +1,12 @@
 #ifndef _GRID_CSHIFT_NONE_H_
 #define _GRID_CSHIFT_NONE_H_
-
-friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
+namespace Grid {
+template<class vobj> Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
 {
  Lattice<vobj> ret(rhs._grid);
  ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
  Cshift_local(ret,rhs,dimension,shift);
  return ret;
 }
-        
+}
 #endif
--- a/lib/Grid_lattice.h
+++ b/lib/Grid_lattice.h
@@ -3,10 +3,10 @@

 namespace Grid {

-// TODO: Indexing ()
+// TODO: 
 //       mac,real,imag
-//
-// Functionality required:
+
+// Functionality:
 //     -=,+=,*=,()
 //     add,+,sub,-,mult,mac,*
 //     adj,conj
@@ -17,7 +17,6 @@ namespace Grid {
 //     innerProduct,outerProduct,
 //     localNorm2
 //     localInnerProduct
-//     

 extern int GridCshiftPermuteMap[4][16];

@@ -33,51 +32,39 @@ public:
    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;

+    //////////////////////////////////////////////////////////////////
+    // Constructor requires "grid" passed.
+    // what about a default grid?
+    //////////////////////////////////////////////////////////////////
    Lattice(GridBase *grid) : _grid(grid) {
        _odata.reserve(_grid->oSites());
        assert((((uint64_t)&_odata[0])&0xF) ==0);
        checkerboard=0;
    }

-#include <Grid_cshift.h>
-   
-    template<class obj1,class obj2>
-    friend void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs);
+    template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
+#pragma omp parallel for
+        for(int ss=0;ss<_grid->oSites();ss++){
+            this->_odata[ss]=r;
+        }
+        return *this;
+    }

-    // FIXME Performance difference between operator * and mult is troubling.
-    // Auto move constructor seems to lose surprisingly much.
-
-    // Site wise binary operations
-    // We eliminate a temporary object assignment if use the mult,add,sub routines.
-    // For the operator versions we rely on move constructor to eliminate the
-    // vector copy back.
-    template<class obj1,class obj2,class obj3>
-    friend void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);
-
-    template<class obj1,class obj2,class obj3>
-    friend void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);
-
-    template<class obj1,class obj2,class obj3>
-    friend void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);
-
-    template<class obj1,class obj2,class obj3>
-    friend void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs);
+    // *=,+=,-= operators inherit behvour from correspond */+/- operation
+    template<class T> inline Lattice<vobj> &operator *=(const T &r) {
+        *this = (*this)*r;
+        return *this;
+    }

+    template<class T> inline Lattice<vobj> &operator -=(const T &r) {
+        *this = (*this)-r;
+        return *this;
+    }
+    template<class T> inline Lattice<vobj> &operator +=(const T &r) {
+        *this = (*this)+r;
+        return *this;
+    }
    
-    friend void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
-        conformable(lhs,rhs);
-#pragma omp parallel for
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
-        }
-    }
-    friend void axpy(Lattice<vobj> &ret,std::complex<double> a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
-        conformable(lhs,rhs);
-#pragma omp parallel for
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
-        }
-    }
    inline friend Lattice<vobj> operator / (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
        conformable(lhs,rhs);
        Lattice<vobj> ret(lhs._grid);
@@ -87,550 +74,23 @@ public:
        }
        return ret;
    };
-
-    template<class sobj>
-    inline Lattice<vobj> & operator = (const sobj & r){
-#pragma omp parallel for
-        for(int ss=0;ss<_grid->oSites();ss++){
-            this->_odata[ss]=r;
-        }
-        return *this;
-    }
-    
-    // Poke a scalar object into the SIMD array
-    template<class sobj>
-    friend void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
-
-      GridBase *grid=l._grid;
-
-      typedef typename vobj::scalar_type scalar_type;
-      typedef typename vobj::vector_type vector_type;
-
-      int Nsimd = grid->Nsimd();
-
-      assert( l.checkerboard== l._grid->CheckerBoard(site));
-      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
-
-      int rank,odx,idx;
-      grid->GlobalCoorToRankIndex(rank,odx,idx,site);
-
-      // Optional to broadcast from node 0.
-      grid->Broadcast(0,s);
-
-      std::vector<sobj> buf(Nsimd);
-      std::vector<scalar_type *> pointers(Nsimd);  
-      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
-
-      // extract-modify-merge cycle is easiest way and this is not perf critical
-      extract(l._odata[odx],pointers);
-      
-      buf[idx] = s;
-
-      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
-      merge(l._odata[odx],pointers);
-
-      return;
-    };
-    
-    // Peek a scalar object from the SIMD array
-    template<class sobj>
-    friend void peekSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){
-        
-      GridBase *grid=l._grid;
-
-      typedef typename vobj::scalar_type scalar_type;
-      typedef typename vobj::vector_type vector_type;
-
-      int Nsimd = grid->Nsimd();
-
-      assert( l.checkerboard== l._grid->CheckerBoard(site));
-      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
-
-      int rank,odx,idx;
-      grid->GlobalCoorToRankIndex(rank,odx,idx,site);
-      std::vector<sobj> buf(Nsimd);
-      std::vector<scalar_type *> pointers(Nsimd);  
-      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
-
-      extract(l._odata[odx],pointers);
-      
-      s = buf[idx];
-      grid->Broadcast(rank,s);
-
-      return;
-    };
-    
-    // FIXME Randomise; deprecate this
-    friend void random(Lattice<vobj> &l){
-        Real *v_ptr = (Real *)&l._odata[0];
-        size_t v_len = l._grid->oSites()*sizeof(vobj);
-        size_t d_len = v_len/sizeof(Real);
-	
-        for(int i=0;i<d_len;i++){
-
-            v_ptr[i]=drand48();
-        }
-    };
-
-    // FIXME for debug; deprecate this; made obscelete by 
-    // LatticeCoordinate();
-    friend void lex_sites(Lattice<vobj> &l){
-      Real *v_ptr = (Real *)&l._odata[0];
-      size_t o_len = l._grid->oSites();
-      size_t v_len = sizeof(vobj)/sizeof(vRealF);
-      size_t vec_len = vRealF::Nsimd();
-
-      for(int i=0;i<o_len;i++){
-	for(int j=0;j<v_len;j++){
-          for(int vv=0;vv<vec_len;vv+=2){
-	    v_ptr[i*v_len*vec_len+j*vec_len+vv  ]= i+vv*500;
-	    v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
-	  }
-	}}
-    }
-    
-    // FIXME Implement a consistent seed management strategy
-    friend void gaussian(Lattice<vobj> &l){
-        // Zero mean, unit variance.
-        std::normal_distribution<double> distribution(0.0,1.0);
-        Real *v_ptr = (Real *)&l._odata[0];
-        size_t v_len = l._grid->oSites()*sizeof(vobj);
-        size_t d_len = v_len/sizeof(Real);
-
-        for(int i=0;i<d_len;i++){
-	  v_ptr[i]= drand48();
-        }
-    };
-
-    // Unary functions and Unops
-    friend inline Lattice<vobj> operator -(const Lattice<vobj> &r) {
-        Lattice<vobj> ret(r._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<r._grid->oSites();ss++){
-            ret._odata[ss]= -r._odata[ss];
-        }
-        return ret;
-    }
-    // *=,+=,-= operators inherit behvour from correspond */+/- operation
-    template<class T>
-    inline Lattice<vobj> &operator *=(const T &r) {
-        *this = (*this)*r;
-        return *this;
-    }
-    template<class T>
-    inline Lattice<vobj> &operator -=(const T &r) {
-        *this = (*this)-r;
-        return *this;
-    }
-    template<class T>
-    inline Lattice<vobj> &operator +=(const T &r) {
-        *this = (*this)+r;
-        return *this;
-    }
-    
-    inline friend Lattice<vobj> adj(const Lattice<vobj> &lhs){
-        Lattice<vobj> ret(lhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            ret._odata[ss] = adj(lhs._odata[ss]);
-        }
-        return ret;
-    };
-
-    inline friend Lattice<vobj> transpose(const Lattice<vobj> &lhs){
-        Lattice<vobj> ret(lhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            ret._odata[ss] = transpose(lhs._odata[ss]);
-        }
-        return ret;
-    };
-
-
-    inline friend Lattice<vobj> conj(const Lattice<vobj> &lhs){
-        Lattice<vobj> ret(lhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            ret._odata[ss] = conj(lhs._odata[ss]);
-        }
-        return ret;
-    };
-
-    // remove and insert a half checkerboard
-    friend void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
-      half.checkerboard = cb;
-      int ssh=0;
-#pragma omp parallel for
-      for(int ss=0;ss<full._grid->oSites();ss++){
-	std::vector<int> coor;
-	int cbos;
-	
-	full._grid->oCoorFromOindex(coor,ss);
-	cbos=half._grid->CheckerBoard(coor);
-	
-	if (cbos==cb) {
-
-	  half._odata[ssh] = full._odata[ss];
-	  ssh++;
-	}
-      }
-    }
-    friend void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
-      int cb = half.checkerboard;
-      int ssh=0;
-#pragma omp parallel for
-      for(int ss=0;ss<full._grid->oSites();ss++){
-	std::vector<int> coor;
-	int cbos;
-	
-	full._grid->oCoorFromOindex(coor,ss);
-	cbos=half._grid->CheckerBoard(coor);
-
-	if (cbos==cb) {
-	  full._odata[ss]=half._odata[ssh];
-	  ssh++;
-	}
-      }
-    }
-}; // class Lattice
-
-    template<class obj1,class obj2>
-    void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
-    {
-        assert(lhs._grid == rhs._grid);
-        assert(lhs.checkerboard == rhs.checkerboard);
-    }
-
-    template<class obj1,class obj2,class obj3>
-    void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
-        conformable(lhs,rhs);
-	uint32_t vec_len = lhs._grid->oSites();
-#pragma omp parallel for
-        for(int ss=0;ss<vec_len;ss++){
-	  mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
-        }
-    }
-
-    template<class obj1,class obj2,class obj3>
-    void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
-        conformable(lhs,rhs);
-	uint32_t vec_len = lhs._grid->oSites();
-#pragma omp parallel for
-        for(int ss=0;ss<vec_len;ss++){
-	  mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
-        }
-    }
-
-    template<class obj1,class obj2,class obj3>
-    void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
-        conformable(lhs,rhs);
-#pragma omp parallel for
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
-        }
-    }
-    template<class obj1,class obj2,class obj3>
-    void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
-        conformable(lhs,rhs);
-#pragma omp parallel for
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
-        }
-    }
-
-    // Lattice BinOp Lattice,
-    template<class left,class right>
-    inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
-    {
-        //NB mult performs conformable check. Do not reapply here for performance.
-      Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
-      mult(ret,lhs,rhs);
-      return ret;
-    }
-    template<class left,class right>
-    inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
-    {
-        //NB mult performs conformable check. Do not reapply here for performance.
-        Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
-        add(ret,lhs,rhs);
-        return ret;
-    }
-    template<class left,class right>
-    inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
-    {
-        //NB mult performs conformable check. Do not reapply here for performance.
-        Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
-        sub(ret,lhs,rhs);
-        return ret;
-    }
-
-    // Scalar BinOp Lattice ;generate return type
-    template<class left,class right>
-    inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
-    {
-        Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=lhs*rhs._odata[ss];
-        }
-        return ret;
-    }
-    template<class left,class right>
-    inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
-    {
-        Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=lhs+rhs._odata[ss];
-        }
-        return ret;
-    }
-    template<class left,class right>
-    inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
-    {
-        Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=lhs-rhs._odata[ss];
-        }
-        return ret;
-    }
-    template<class left,class right>
-    inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
-    {
-        Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<lhs._grid->oSites(); ss++){
-            ret._odata[ss]=lhs._odata[ss]*rhs;
-        }
-        return ret;
-    }
-    template<class left,class right>
-    inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
-    {
-        Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=lhs._odata[ss]+rhs;
-        }
-        return ret;
-    }
-    template<class left,class right>
-    inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
-    {
-        Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=lhs._odata[ss]-rhs;
-        }
-        return ret;
-    }
-
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Trace
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
-    template<class vobj>
-    inline auto trace(const Lattice<vobj> &lhs)
-      -> Lattice<decltype(trace(lhs._odata[0]))>
-    {
-      Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            ret._odata[ss] = trace(lhs._odata[ss]);
-        }
-        return ret;
-    };
-    
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Index level dependent operations
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
-    template<int Index,class vobj>
-    inline auto traceIndex(const Lattice<vobj> &lhs)
-      -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
-    {
-      Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
-#pragma omp parallel for
-      for(int ss=0;ss<lhs._grid->oSites();ss++){
-	ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]);
-      }
-      return ret;
-    };
-    template<int Index,class vobj>
-    inline auto transposeIndex(const Lattice<vobj> &lhs)
-      -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
-    {
-      Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
-        }
-        return ret;
-    };
-
-    // Fixme; this is problematic since the number of args is variable and 
-    // may mismatch...
-    template<int Index,class vobj>
-    inline auto peekIndex(const Lattice<vobj> &lhs)
-      -> Lattice<decltype(peekIndex<Index>(lhs._odata[0]))>
-    {
-      Lattice<decltype(peekIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            ret._odata[ss] = peekIndex<Index>(lhs._odata[ss]);
-        }
-        return ret;
-    };
-    template<int Index,class vobj>
-      inline auto peekIndex(const Lattice<vobj> &lhs,int i)
-      -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))>
-    {
-      Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
-        }
-        return ret;
-    };
-    template<int Index,class vobj>
-      inline auto peekIndex(const Lattice<vobj> &lhs,int i,int j)
-      -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
-    {
-      Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
-        }
-        return ret;
-    };
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Poke internal indices of a Lattice object
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
-    template<int Index,class vobj> inline
-    void pokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0]))> & rhs)
-    {
-#pragma omp parallel for
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss]);
-	}      
-    }
-    template<int Index,class vobj> inline
-    void pokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
-    {
-#pragma omp parallel for
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
-	}      
-    }
-    template<int Index,class vobj> inline
-    void pokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
-    {
-#pragma omp parallel for
-        for(int ss=0;ss<lhs._grid->oSites();ss++){
-	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
-	}      
-    }
-
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Reduction operations
-    ////////////////////////////////////////////////////////////////////////////////////////////////////
-    template<class vobj>
-    inline RealD norm2(const Lattice<vobj> &arg){
-
-      typedef typename vobj::scalar_type scalar;
-      typedef typename vobj::vector_type vector;
-      decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
-      scalar nrm;
-      //FIXME make this loop parallelisable
-      vnrm=zero;
-      for(int ss=0;ss<arg._grid->oSites(); ss++){
-	vnrm = vnrm + innerProduct(arg._odata[ss],arg._odata[ss]);
-      }
-      vector vvnrm =TensorRemove(vnrm) ;
-      nrm = Reduce(vvnrm);
-      arg._grid->GlobalSum(nrm);
-      return real(nrm);
-    }
-
-    template<class vobj>
-    inline auto innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) ->decltype(innerProduct(left._odata[0],right._odata[0]))
-    {
-      typedef typename vobj::scalar_type scalar;
-      decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero;
-
-      scalar nrm;
-      //FIXME make this loop parallelisable
-      for(int ss=0;ss<left._grid->oSites(); ss++){
-	vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
-      }
-      nrm = Reduce(vnrm);
-      right._grid->GlobalSum(nrm);
-      return nrm;
-    }
-
-    /////////////////////////////////////////////////////
-    // Non site reduced routines
-    /////////////////////////////////////////////////////
-
-    // localNorm2,
-    template<class vobj>
-    inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
-    {
-      Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-	  ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
-        }
-        return ret;
-    }
-    
-    template<class vobj>
-    inline auto real(const Lattice<vobj> &z) -> Lattice<decltype(real(z._odata[0]))>
-    {
-      Lattice<decltype(real(z._odata[0]))> ret(z._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<z._grid->oSites();ss++){
-            ret._odata[ss] = real(z._odata[ss]);
-        }
-      return ret;
-    }
-
-    template<class vobj>
-    inline auto imag(const Lattice<vobj> &z) -> Lattice<decltype(imag(z._odata[0]))>
-    {
-      Lattice<decltype(imag(z._odata[0]))> ret(z._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<z._grid->oSites();ss++){
-            ret._odata[ss] = imag(z._odata[ss]);
-        }
-      return ret;
-    }
-
-    // localInnerProduct
-    template<class vobj>
-    inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
-      -> Lattice<typename vobj::tensor_reduced>
-    {
-      Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
-#pragma omp parallel for
-      for(int ss=0;ss<rhs._grid->oSites(); ss++){
-	ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
-      }
-      return ret;
-    }
-    
-    // outerProduct Scalar x Scalar -> Scalar
-    //              Vector x Vector -> Matrix
-    template<class ll,class rr>
-    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
-    {
-        Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
-        }
-        return ret;
-     }
-
-
+ }; // class Lattice
 }
+
+#include <Grid_lattice_conformable.h>
+#include <Grid_lattice_arith.h>
+#include <Grid_lattice_trace.h>
+#include <Grid_lattice_transpose.h>
+#include <Grid_lattice_local.h>
+#include <Grid_lattice_reduction.h>
+#include <Grid_lattice_peekpoke.h>
+#include <Grid_lattice_reality.h>
+#include <Grid_cshift.h>
+#include <Grid_where.h>
+#include <Grid_lattice_coordinate.h>
+#include <Grid_lattice_rng.h>
+#include <Grid_lattice_transfer.h>
+
+
+
 #endif
--- a/lib/Grid_lattice_arith.h
+++ b/lib/Grid_lattice_arith.h
@@ -0,0 +1,160 @@
+#ifndef GRID_LATTICE_ARITH_H
+#define GRID_LATTICE_ARITH_H
+
+namespace Grid {
+
+  template<class vobj>
+  inline Lattice<vobj> operator -(const Lattice<vobj> &r)
+  {
+    Lattice<vobj> ret(r._grid);
+#pragma omp parallel for
+    for(int ss=0;ss<r._grid->oSites();ss++){
+      ret._odata[ss]= -r._odata[ss];
+    }
+    return ret;
+  }
+  
+  template<class vobj>
+  inline void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
+    conformable(lhs,rhs);
+#pragma omp parallel for
+    for(int ss=0;ss<lhs._grid->oSites();ss++){
+      axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
+    }
+  }
+  template<class vobj>
+  inline void axpy(Lattice<vobj> &ret,std::complex<double> a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
+    conformable(lhs,rhs);
+#pragma omp parallel for
+    for(int ss=0;ss<lhs._grid->oSites();ss++){
+      axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
+    }
+  }
+  
+  
+  template<class obj1,class obj2,class obj3>
+    void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    conformable(lhs,rhs);
+    uint32_t vec_len = lhs._grid->oSites();
+#pragma omp parallel for
+    for(int ss=0;ss<vec_len;ss++){
+      mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+    }
+  }
+  
+  template<class obj1,class obj2,class obj3>
+    void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    conformable(lhs,rhs);
+    uint32_t vec_len = lhs._grid->oSites();
+#pragma omp parallel for
+    for(int ss=0;ss<vec_len;ss++){
+      mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+    }
+  }
+  
+  template<class obj1,class obj2,class obj3>
+    void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    conformable(lhs,rhs);
+#pragma omp parallel for
+    for(int ss=0;ss<lhs._grid->oSites();ss++){
+      sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+    }
+  }
+  template<class obj1,class obj2,class obj3>
+    void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    conformable(lhs,rhs);
+#pragma omp parallel for
+    for(int ss=0;ss<lhs._grid->oSites();ss++){
+      add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+    }
+  }
+  
+  // Lattice BinOp Lattice,
+  template<class left,class right>
+    inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
+  {
+    //NB mult performs conformable check. Do not reapply here for performance.
+    Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
+    mult(ret,lhs,rhs);
+    return ret;
+  }
+  template<class left,class right>
+    inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
+  {
+    //NB mult performs conformable check. Do not reapply here for performance.
+    Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
+    add(ret,lhs,rhs);
+    return ret;
+  }
+  template<class left,class right>
+    inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
+  {
+    //NB mult performs conformable check. Do not reapply here for performance.
+    Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
+    sub(ret,lhs,rhs);
+    return ret;
+  }
+  
+  // Scalar BinOp Lattice ;generate return type
+  template<class left,class right>
+    inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
+  {
+    Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
+#pragma omp parallel for
+    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=lhs*rhs._odata[ss];
+    }
+        return ret;
+  }
+  template<class left,class right>
+    inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
+    {
+      Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
+#pragma omp parallel for
+      for(int ss=0;ss<rhs._grid->oSites(); ss++){
+	ret._odata[ss]=lhs+rhs._odata[ss];
+      }
+        return ret;
+    }
+  template<class left,class right>
+    inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
+  {
+    Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
+#pragma omp parallel for
+    for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=lhs-rhs._odata[ss];
+    }
+    return ret;
+  }
+    template<class left,class right>
+      inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
+    {
+      Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
+#pragma omp parallel for
+      for(int ss=0;ss<lhs._grid->oSites(); ss++){
+            ret._odata[ss]=lhs._odata[ss]*rhs;
+      }
+      return ret;
+    }
+    template<class left,class right>
+      inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
+    {
+        Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+	  ret._odata[ss]=lhs._odata[ss]+rhs;
+        }
+        return ret;
+    }
+    template<class left,class right>
+      inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
+    {
+      Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
+#pragma omp parallel for
+      for(int ss=0;ss<rhs._grid->oSites(); ss++){
+	ret._odata[ss]=lhs._odata[ss]-rhs;
+      }
+      return ret;
+    }
+}
+#endif
--- a/lib/Grid_lattice_conformable.h
+++ b/lib/Grid_lattice_conformable.h
@@ -0,0 +1,14 @@
+#ifndef GRID_LATTICE_CONFORMABLE_H
+#define GRID_LATTICE_CONFORMABLE_H
+
+namespace Grid {
+
+    template<class obj1,class obj2>
+    void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
+    {
+        assert(lhs._grid == rhs._grid);
+        assert(lhs.checkerboard == rhs.checkerboard);
+    }
+
+}
+#endif
--- a/lib/Grid_lattice_coordinate.h
+++ b/lib/Grid_lattice_coordinate.h
@@ -0,0 +1,45 @@
+#ifndef GRID_LATTICE_COORDINATE_H
+#define GRID_LATTICE_COORDINATE_H
+
+namespace Grid {
+
+    template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
+    {
+      GridBase *grid = l._grid;
+      int Nsimd = grid->iSites();
+      std::vector<int> gcoor;
+      std::vector<Integer> mergebuf(Nsimd);
+      std::vector<Integer *> mergeptr(Nsimd);
+      vInteger vI;
+      for(int o=0;o<grid->oSites();o++){
+	for(int i=0;i<grid->iSites();i++){
+	  grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
+	  //	  grid->RankIndexToGlobalCoor(0,o,i,gcoor);
+	  mergebuf[i]=gcoor[mu];
+	  mergeptr[i]=&mergebuf[i];
+	}
+	merge(vI,mergeptr);
+	l._odata[o]=vI;
+      }
+    };
+
+    // LatticeCoordinate();
+    // FIXME for debug; deprecate this; made obscelete by 
+    template<class vobj> void lex_sites(Lattice<vobj> &l){
+      Real *v_ptr = (Real *)&l._odata[0];
+      size_t o_len = l._grid->oSites();
+      size_t v_len = sizeof(vobj)/sizeof(vRealF);
+      size_t vec_len = vRealF::Nsimd();
+
+      for(int i=0;i<o_len;i++){
+	for(int j=0;j<v_len;j++){
+          for(int vv=0;vv<vec_len;vv+=2){
+	    v_ptr[i*v_len*vec_len+j*vec_len+vv  ]= i+vv*500;
+	    v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
+	  }
+	}}
+    }
+
+
+}
+#endif
--- a/lib/Grid_lattice_local.h
+++ b/lib/Grid_lattice_local.h
@@ -0,0 +1,54 @@
+#ifndef GRID_LATTICE_LOCALREDUCTION_H
+#define GRID_LATTICE_LOCALREDUCTION_H
+
+///////////////////////////////////////////////
+// localInner, localNorm, outerProduct
+///////////////////////////////////////////////
+
+namespace Grid {
+
+    /////////////////////////////////////////////////////
+    // Non site, reduced locally reduced routines
+    /////////////////////////////////////////////////////
+
+    // localNorm2,
+    template<class vobj>
+    inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
+    {
+      Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+	  ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
+        }
+        return ret;
+    }
+    
+    // localInnerProduct
+    template<class vobj>
+    inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
+      -> Lattice<typename vobj::tensor_reduced>
+    {
+      Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
+#pragma omp parallel for
+      for(int ss=0;ss<rhs._grid->oSites(); ss++){
+	ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
+      }
+      return ret;
+    }
+    
+    // outerProduct Scalar x Scalar -> Scalar
+    //              Vector x Vector -> Matrix
+    template<class ll,class rr>
+    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
+    {
+        Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+            ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
+        }
+        return ret;
+     }
+
+}
+
+#endif
--- a/lib/Grid_lattice_peekpoke.h
+++ b/lib/Grid_lattice_peekpoke.h
@@ -0,0 +1,143 @@
+#ifndef GRID_LATTICE_PEEK_H
+#define GRID_LATTICE_PEEK_H
+
+///////////////////////////////////////////////
+// Peeking and poking around
+///////////////////////////////////////////////
+
+namespace Grid {
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Peek internal indices of a Lattice object
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    template<int Index,class vobj>
+    inline auto peekIndex(const Lattice<vobj> &lhs)
+      -> Lattice<decltype(peekIndex<Index>(lhs._odata[0]))>
+    {
+      Lattice<decltype(peekIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<lhs._grid->oSites();ss++){
+            ret._odata[ss] = peekIndex<Index>(lhs._odata[ss]);
+        }
+        return ret;
+    };
+    template<int Index,class vobj>
+      inline auto peekIndex(const Lattice<vobj> &lhs,int i)
+      -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))>
+    {
+      Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<lhs._grid->oSites();ss++){
+	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
+        }
+        return ret;
+    };
+    template<int Index,class vobj>
+      inline auto peekIndex(const Lattice<vobj> &lhs,int i,int j)
+      -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
+    {
+      Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<lhs._grid->oSites();ss++){
+	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
+        }
+        return ret;
+    };
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Poke internal indices of a Lattice object
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    template<int Index,class vobj> inline
+    void pokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0]))> & rhs)
+    {
+#pragma omp parallel for
+        for(int ss=0;ss<lhs._grid->oSites();ss++){
+	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss]);
+	}      
+    }
+    template<int Index,class vobj> inline
+    void pokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
+    {
+#pragma omp parallel for
+        for(int ss=0;ss<lhs._grid->oSites();ss++){
+	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
+	}      
+    }
+    template<int Index,class vobj> inline
+    void pokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
+    {
+#pragma omp parallel for
+        for(int ss=0;ss<lhs._grid->oSites();ss++){
+	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
+	}      
+    }
+
+    //////////////////////////////////////////////////////
+    // Poke a scalar object into the SIMD array
+    //////////////////////////////////////////////////////
+    template<class vobj,class sobj>
+    void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
+
+      GridBase *grid=l._grid;
+
+      typedef typename vobj::scalar_type scalar_type;
+      typedef typename vobj::vector_type vector_type;
+
+      int Nsimd = grid->Nsimd();
+
+      assert( l.checkerboard== l._grid->CheckerBoard(site));
+      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+
+      int rank,odx,idx;
+      grid->GlobalCoorToRankIndex(rank,odx,idx,site);
+
+      // Optional to broadcast from node 0.
+      grid->Broadcast(0,s);
+
+      std::vector<sobj> buf(Nsimd);
+      std::vector<scalar_type *> pointers(Nsimd);  
+      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
+
+      // extract-modify-merge cycle is easiest way and this is not perf critical
+      extract(l._odata[odx],pointers);
+      
+      buf[idx] = s;
+
+      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
+      merge(l._odata[odx],pointers);
+
+      return;
+    };
+
+    //////////////////////////////////////////////////////////
+    // Peek a scalar object from the SIMD array
+    //////////////////////////////////////////////////////////
+    template<class vobj,class sobj>
+      void peekSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){
+        
+      GridBase *grid=l._grid;
+
+      typedef typename vobj::scalar_type scalar_type;
+      typedef typename vobj::vector_type vector_type;
+
+      int Nsimd = grid->Nsimd();
+
+      assert( l.checkerboard== l._grid->CheckerBoard(site));
+      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+
+      int rank,odx,idx;
+      grid->GlobalCoorToRankIndex(rank,odx,idx,site);
+      std::vector<sobj> buf(Nsimd);
+      std::vector<scalar_type *> pointers(Nsimd);  
+      for(int i=0;i<Nsimd;i++) pointers[i] = (scalar_type *)&buf[i];
+
+      extract(l._odata[odx],pointers);
+      
+      s = buf[idx];
+      grid->Broadcast(rank,s);
+
+      return;
+    };
+
+}
+#endif
+
--- a/lib/Grid_lattice_reality.h
+++ b/lib/Grid_lattice_reality.h
@@ -0,0 +1,52 @@
+#ifndef GRID_LATTICE_REALITY_H
+#define GRID_LATTICE_REALITY_H
+
+
+// FIXME .. this is the sector of the code 
+// I am most worried about the directions
+// The choice of burying complex in the SIMD
+// is making the use of "real" and "imag" very cumbersome
+
+namespace Grid {
+
+    template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
+        Lattice<vobj> ret(lhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<lhs._grid->oSites();ss++){
+            ret._odata[ss] = adj(lhs._odata[ss]);
+        }
+        return ret;
+    };
+
+    template<class vobj> inline Lattice<vobj> conj(const Lattice<vobj> &lhs){
+        Lattice<vobj> ret(lhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<lhs._grid->oSites();ss++){
+            ret._odata[ss] = conj(lhs._odata[ss]);
+        }
+        return ret;
+    };
+
+    template<class vobj> inline auto real(const Lattice<vobj> &z) -> Lattice<decltype(real(z._odata[0]))>
+    {
+      Lattice<decltype(real(z._odata[0]))> ret(z._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<z._grid->oSites();ss++){
+            ret._odata[ss] = real(z._odata[ss]);
+        }
+      return ret;
+    }
+
+    template<class vobj> inline auto imag(const Lattice<vobj> &z) -> Lattice<decltype(imag(z._odata[0]))>
+    {
+      Lattice<decltype(imag(z._odata[0]))> ret(z._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<z._grid->oSites();ss++){
+            ret._odata[ss] = imag(z._odata[ss]);
+        }
+      return ret;
+    }
+
+
+}
+#endif
--- a/lib/Grid_lattice_reduction.h
+++ b/lib/Grid_lattice_reduction.h
@@ -0,0 +1,45 @@
+#ifndef GRID_LATTICE_REDUCTION_H
+#define GRID_LATTICE_REDUCTION_H
+
+namespace Grid {
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Reduction operations
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    template<class vobj>
+    inline RealD norm2(const Lattice<vobj> &arg){
+
+      typedef typename vobj::scalar_type scalar;
+      typedef typename vobj::vector_type vector;
+      decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
+      scalar nrm;
+      //FIXME make this loop parallelisable
+      vnrm=zero;
+      for(int ss=0;ss<arg._grid->oSites(); ss++){
+	vnrm = vnrm + innerProduct(arg._odata[ss],arg._odata[ss]);
+      }
+      vector vvnrm =TensorRemove(vnrm) ;
+      nrm = Reduce(vvnrm);
+      arg._grid->GlobalSum(nrm);
+      return real(nrm);
+    }
+
+    template<class vobj>
+    inline auto innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) ->decltype(innerProduct(left._odata[0],right._odata[0]))
+    {
+      typedef typename vobj::scalar_type scalar;
+      decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero;
+
+      scalar nrm;
+      //FIXME make this loop parallelisable
+      for(int ss=0;ss<left._grid->oSites(); ss++){
+	vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
+      }
+      nrm = Reduce(vnrm);
+      right._grid->GlobalSum(nrm);
+      return nrm;
+    }
+
+}
+#endif
+
--- a/lib/Grid_lattice_rng.h
+++ b/lib/Grid_lattice_rng.h
@@ -0,0 +1,32 @@
+#ifndef GRID_LATTICE_RNG_H
+#define GRID_LATTICE_RNG_H
+
+namespace Grid {
+
+    // FIXME Randomise; deprecate this
+    template <class vobj> inline void random(Lattice<vobj> &l){
+        Real *v_ptr = (Real *)&l._odata[0];
+        size_t v_len = l._grid->oSites()*sizeof(vobj);
+        size_t d_len = v_len/sizeof(Real);
+	
+        for(int i=0;i<d_len;i++){
+
+            v_ptr[i]=drand48();
+        }
+    };
+    
+    // FIXME Implement a consistent seed management strategy
+    template <class vobj> inline void gaussian(Lattice<vobj> &l){
+        // Zero mean, unit variance.
+        std::normal_distribution<double> distribution(0.0,1.0);
+        Real *v_ptr = (Real *)&l._odata[0];
+        size_t v_len = l._grid->oSites()*sizeof(vobj);
+        size_t d_len = v_len/sizeof(Real);
+
+        for(int i=0;i<d_len;i++){
+	  v_ptr[i]= drand48();
+        }
+    };
+
+}
+#endif
--- a/lib/Grid_lattice_trace.h
+++ b/lib/Grid_lattice_trace.h
@@ -0,0 +1,43 @@
+#ifndef GRID_LATTICE_TRACE_H
+#define GRID_LATTICE_TRACE_H
+
+///////////////////////////////////////////////
+// Tracing, transposing, peeking, poking
+///////////////////////////////////////////////
+
+namespace Grid {
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Trace
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    template<class vobj>
+    inline auto trace(const Lattice<vobj> &lhs)
+      -> Lattice<decltype(trace(lhs._odata[0]))>
+    {
+      Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<lhs._grid->oSites();ss++){
+            ret._odata[ss] = trace(lhs._odata[ss]);
+        }
+        return ret;
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Trace Index level dependent operation
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    template<int Index,class vobj>
+    inline auto traceIndex(const Lattice<vobj> &lhs)
+      -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
+    {
+      Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
+#pragma omp parallel for
+      for(int ss=0;ss<lhs._grid->oSites();ss++){
+	ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]);
+      }
+      return ret;
+    };
+
+
+}
+#endif
+
--- a/lib/Grid_lattice_transfer.h
+++ b/lib/Grid_lattice_transfer.h
@@ -0,0 +1,46 @@
+#ifndef GRID_LATTICE_TRANSFER_H
+#define GRID_LATTICE_TRANSFER_H
+
+namespace Grid {
+
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  // remove and insert a half checkerboard
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
+    half.checkerboard = cb;
+    int ssh=0;
+#pragma omp parallel for
+    for(int ss=0;ss<full._grid->oSites();ss++){
+      std::vector<int> coor;
+      int cbos;
+      
+      full._grid->oCoorFromOindex(coor,ss);
+      cbos=half._grid->CheckerBoard(coor);
+      
+      if (cbos==cb) {
+	
+	half._odata[ssh] = full._odata[ss];
+	ssh++;
+      }
+    }
+  }
+  template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
+    int cb = half.checkerboard;
+    int ssh=0;
+#pragma omp parallel for
+    for(int ss=0;ss<full._grid->oSites();ss++){
+      std::vector<int> coor;
+      int cbos;
+      
+      full._grid->oCoorFromOindex(coor,ss);
+      cbos=half._grid->CheckerBoard(coor);
+      
+      if (cbos==cb) {
+	full._odata[ss]=half._odata[ssh];
+	ssh++;
+      }
+    }
+  }
+  
+}
+#endif
--- a/lib/Grid_lattice_transpose.h
+++ b/lib/Grid_lattice_transpose.h
@@ -0,0 +1,39 @@
+#ifndef GRID_LATTICE_TRANSPOSE_H
+#define GRID_LATTICE_TRANSPOSE_H
+
+///////////////////////////////////////////////
+// Transpose
+///////////////////////////////////////////////
+
+namespace Grid {
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Transpose
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class vobj>
+    inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
+        Lattice<vobj> ret(lhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<lhs._grid->oSites();ss++){
+            ret._odata[ss] = transpose(lhs._odata[ss]);
+        }
+        return ret;
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Index level dependent transpose
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    template<int Index,class vobj>
+    inline auto transposeIndex(const Lattice<vobj> &lhs)
+      -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
+    {
+      Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<lhs._grid->oSites();ss++){
+            ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
+        }
+        return ret;
+    };
+
+}
+#endif
--- a/lib/Grid_predicated.h
+++ b/lib/Grid_predicated.h
@@ -1,14 +1,14 @@
-#ifndef GRID_PREDICATED_H
-#define GRID_PREDICATED_H
-
+#ifndef GRID_WHERE_H
+#define GRID_WHERE_H
+namespace Grid {
 // Must implement the predicate gating the 
 // Must be able to reduce the predicate down to a single vInteger per site.
 // Must be able to require the type be iScalar x iScalar x ....
 //                              give a GetVtype method in iScalar
 //                              and blow away the tensor structures.
 //
-template<class vobj>
-inline void where(Lattice<vobj> &ret,const LatticeInteger &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
+template<class vobj,class iobj>
+inline void where(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
 {
  conformable(iftrue,iffalse);
  conformable(iftrue,predicate);
@@ -17,6 +17,7 @@ inline void where(Lattice<vobj> &ret,const LatticeInteger &predicate,Lattice<vob
  GridBase *grid=iftrue._grid;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
+  typedef typename iobj::vector_type mask_type;

  const int Nsimd = grid->Nsimd();
  const int words = sizeof(vobj)/sizeof(vector_type);
@@ -35,7 +36,7 @@ inline void where(Lattice<vobj> &ret,const LatticeInteger &predicate,Lattice<vob
    for(int s=0;s<Nsimd;s++) pointers[s] = & falsevals[s][0];
    extract(iffalse._odata[ss]  ,pointers);

-    extract(predicate._odata[ss],mask);
+    extract(TensorRemove(predicate._odata[ss]),mask);

    for(int s=0;s<Nsimd;s++){
      if (mask[s]) pointers[s]=&truevals[s][0];
@@ -46,8 +47,8 @@ inline void where(Lattice<vobj> &ret,const LatticeInteger &predicate,Lattice<vob
  }
 }

-template<class vobj>
-inline Lattice<vobj> where(const LatticeInteger &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
+template<class vobj,class iobj>
+inline Lattice<vobj> where(const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
 {
  conformable(iftrue,iffalse);
  conformable(iftrue,predicate);
@@ -58,5 +59,5 @@ inline Lattice<vobj> where(const LatticeInteger &predicate,Lattice<vobj> &iftrue

  return ret;
 }
-
+}
 #endif