diff --git a/Grid.h b/Grid.h index 0a4024bf..caa4ed5e 100644 --- a/Grid.h +++ b/Grid.h @@ -41,12 +41,12 @@ #include #endif - #include #include #include #include #include +#include #include namespace Grid { diff --git a/Grid_Lattice.h b/Grid_Lattice.h index ebf68560..01b95755 100644 --- a/Grid_Lattice.h +++ b/Grid_Lattice.h @@ -19,16 +19,14 @@ public: typedef typename vobj::vector_type vector_type; public: - Lattice(GridBase *grid) : _grid(grid) { _odata.reserve(_grid->oSites()); assert((((uint64_t)&_odata[0])&0xF) ==0); checkerboard=0; } - #include - + template friend void conformable(const Lattice &lhs,const Lattice &rhs); @@ -156,23 +154,23 @@ public: v_ptr[i]=drand48(); } }; - + // FIXME for debug; deprecate this friend void lex_sites(Lattice &l){ - Real *v_ptr = (Real *)&l._odata[0]; - size_t o_len = l._grid->oSites(); - size_t v_len = sizeof(vobj)/sizeof(vRealF); - size_t vec_len = vRealF::Nsimd(); + Real *v_ptr = (Real *)&l._odata[0]; + size_t o_len = l._grid->oSites(); + size_t v_len = sizeof(vobj)/sizeof(vRealF); + size_t vec_len = vRealF::Nsimd(); - for(int i=0;i &l){ // Zero mean, unit variance. @@ -195,7 +193,7 @@ public: } return ret; } - // *=,+=,-= operators + // *=,+=,-= operators inherit behvour from correspond */+/- operation template inline Lattice &operator *=(const T &r) { *this = (*this)*r; @@ -351,7 +349,6 @@ public: inline auto operator * (const left &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs._grid); - #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs*rhs._odata[ss]; @@ -383,7 +380,7 @@ public: { Lattice ret(lhs._grid); #pragma omp parallel for - for(int ss=0;ssoSites(); ss++){ + for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs._odata[ss]*rhs; } return ret; @@ -409,5 +406,6 @@ public: return ret; } + } #endif diff --git a/Grid_QCD.h b/Grid_QCD.h index 3e852633..1d52f911 100644 --- a/Grid_QCD.h +++ b/Grid_QCD.h @@ -45,7 +45,7 @@ namespace QCD { typedef Lattice LatticeComplex; - typedef Lattice LatticeInteger; // Predicates for "where" + typedef Lattice LatticeInteger; // Predicates for "where" typedef Lattice LatticeColourMatrix; typedef Lattice LatticeSpinMatrix; @@ -92,6 +92,31 @@ namespace QCD { } return ret; } + + // FIXME for debug; deprecate this + inline void LatticeCoordinate(LatticeInteger &l,int mu){ + GridBase *grid = l._grid; + int Nsimd = grid->iSites(); + std::vector gcoor; + std::vector mergebuf(Nsimd); + std::vector mergeptr(Nsimd); + for(int o=0;ooSites();o++){ + for(int i=0;iiSites();i++){ + // RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); + grid->RankIndexToGlobalCoor(0,o,i,gcoor); + mergebuf[i]=gcoor[mu]; + mergeptr[i]=&mergebuf[i]; + } + merge(l._odata[o],mergeptr); + } + }; + +#include + +#if 0 + +#endif + } //namespace QCD } // Grid #endif diff --git a/Grid_comparison.h b/Grid_comparison.h new file mode 100644 index 00000000..c890d342 --- /dev/null +++ b/Grid_comparison.h @@ -0,0 +1,264 @@ +#ifndef GRID_COMPARISON_H +#define GRID_COMPARISON_H +namespace Grid { + + // Generic list of functors + template class veq { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs == rhs; + } + }; + template class vne { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs != rhs; + } + }; + template class vlt { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs < rhs; + } + }; + template class vle { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs <= rhs; + } + }; + template class vgt { + 
public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs > rhs; + } + }; + template class vge { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs >= rhs; + } + }; + + // Generic list of functors + template class seq { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs == rhs; + } + }; + template class sne { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs != rhs; + } + }; + template class slt { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs < rhs; + } + }; + template class sle { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs <= rhs; + } + }; + template class sgt { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs > rhs; + } + }; + template class sge { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs >= rhs; + } + }; + + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Integer gets extra relational functions. Could also implement these for RealF, RealD etc.. + ////////////////////////////////////////////////////////////////////////////////////////////////////// + template + inline vInteger Comparison(sfunctor sop,const vInteger & lhs, const vInteger & rhs) + { + std::vector vlhs(vInteger::Nsimd()); // Use functors to reduce this to single implementation + std::vector vrhs(vInteger::Nsimd()); + vInteger ret; + extract(lhs,vlhs); + extract(rhs,vrhs); + for(int s=0;s(),lhs,rhs); + } + inline vInteger operator <= (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(sle(),lhs,rhs); + } + inline vInteger operator > (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(sgt(),lhs,rhs); + } + inline vInteger operator >= (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(sge(),lhs,rhs); + } + inline vInteger operator == (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(seq(),lhs,rhs); + } + inline vInteger operator != (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(sne(),lhs,rhs); + } + + ////////////////////////////////////////////////////////////////////////// + // relational operators + // + // Support <,>,<=,>=,==,!= + // + //Query supporting bitwise &, |, ^, ! 
+ //Query supporting logical &&, ||, + ////////////////////////////////////////////////////////////////////////// + template + inline Lattice LLComparison(vfunctor op,const Lattice &lhs,const Lattice &rhs) + { + Lattice ret(rhs._grid); +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]); + } + return ret; + } + template + inline Lattice LSComparison(vfunctor op,const Lattice &lhs,const robj &rhs) + { + Lattice ret(lhs._grid); +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + ret._odata[ss]=op(lhs._odata[ss],rhs); + } + return ret; + } + template + inline Lattice SLComparison(vfunctor op,const lobj &lhs,const Lattice &rhs) + { + Lattice ret(rhs._grid); +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + ret._odata[ss]=op(lhs._odata[ss],rhs); + } + return ret; + } + + // Less than + template + inline Lattice operator < (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vlt(),lhs,rhs); + } + template + inline Lattice operator < (const Lattice & lhs, const robj & rhs) { + return LSComparison(vlt(),lhs,rhs); + } + template + inline Lattice operator < (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vlt(),lhs,rhs); + } + + // Less than equal + template + inline Lattice operator <= (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vle(),lhs,rhs); + } + template + inline Lattice operator <= (const Lattice & lhs, const robj & rhs) { + return LSComparison(vle(),lhs,rhs); + } + template + inline Lattice operator <= (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vle(),lhs,rhs); + } + + // Greater than + template + inline Lattice operator > (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vgt(),lhs,rhs); + } + template + inline Lattice operator > (const Lattice & lhs, const robj & rhs) { + return LSComparison(vgt(),lhs,rhs); + } + template + inline Lattice operator > (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vgt(),lhs,rhs); + } + + + // Greater than equal + template + inline Lattice operator >= (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vge(),lhs,rhs); + } + template + inline Lattice operator >= (const Lattice & lhs, const robj & rhs) { + return LSComparison(vge(),lhs,rhs); + } + template + inline Lattice operator >= (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vge(),lhs,rhs); + } + + + // equal + template + inline Lattice operator == (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(veq(),lhs,rhs); + } + template + inline Lattice operator == (const Lattice & lhs, const robj & rhs) { + return LSComparison(veq(),lhs,rhs); + } + template + inline Lattice operator == (const lobj & lhs, const Lattice & rhs) { + return SLComparison(veq(),lhs,rhs); + } + + + // not equal + template + inline Lattice operator != (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vne(),lhs,rhs); + } + template + inline Lattice operator != (const Lattice & lhs, const robj & rhs) { + return LSComparison(vne(),lhs,rhs); + } + template + inline Lattice operator != (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vne(),lhs,rhs); + } + + +} +#endif diff --git a/Grid_cshift_common.h b/Grid_cshift_common.h index 08e75cff..2910151c 100644 --- a/Grid_cshift_common.h +++ b/Grid_cshift_common.h @@ -265,7 +265,6 @@ friend void Copy_plane_permute(Lattice& lhs,Lattice &rhs, int dimens ////////////////////////////////////////////////////// // Local to node Cshift 
////////////////////////////////////////////////////// - friend void Cshift_local(Lattice& ret,Lattice &rhs,int dimension,int shift) { int sshift[2]; diff --git a/Grid_main.cc b/Grid_main.cc index 62356436..1676a07b 100644 --- a/Grid_main.cc +++ b/Grid_main.cc @@ -19,10 +19,10 @@ int main (int argc, char ** argv) std::vector simd_layout(4); std::vector mpi_layout(4); - mpi_layout[0]=2; - mpi_layout[1]=2; - mpi_layout[2]=2; - mpi_layout[3]=2; + mpi_layout[0]=1; + mpi_layout[1]=1; + mpi_layout[2]=1; + mpi_layout[3]=1; #ifdef AVX512 for(int omp=128;omp<236;omp+=16){ @@ -121,12 +121,34 @@ int main (int argc, char ** argv) // Non-lattice (const objects) * Lattice ColourMatrix cm; SpinColourMatrix scm; - + vSpinColourMatrix vscm; + Complex cplx(1.0); + Integer myint=1; + double mydouble=1.0; + + // vSpinColourMatrix vscm; scMat = cMat*scMat; scm = cm * scm; // SpinColourMatrix = ColourMatrix * SpinColourMatrix scm = scm *cm; // SpinColourMatrix = SpinColourMartix * ColourMatrix scm = GammaFive * scm ; // SpinColourMatrix = SpinMatrix * SpinColourMatrix scm = scm* GammaFive ; // SpinColourMatrix = SpinColourMatrix * SpinMatrix + + scm = scm*cplx; + vscm = vscm*cplx; + scMat = scMat*cplx; + + scm = cplx*scm; + vscm = cplx*vscm; + scMat = cplx*scMat; + scm = myint*scm; + vscm = myint*vscm; + scMat = scMat*myint; + + scm = scm*mydouble; + vscm = vscm*mydouble; + scMat = scMat*mydouble; + scMat = mydouble*scMat; + cMat = mydouble*cMat; sMat = adj(sMat); // LatticeSpinMatrix adjoint sMat = iGammaFive*sMat; // SpinMatrix * LatticeSpinMatrix @@ -160,7 +182,35 @@ int main (int argc, char ** argv) */ lex_sites(Foo); + Integer mm[4]; + mm[0]=1; + mm[1]=Fine._rdimensions[0]; + mm[2]=Fine._ldimensions[0]*Fine._ldimensions[1]; + mm[3]=Fine._ldimensions[0]*Fine._ldimensions[1]*Fine._ldimensions[2]; + LatticeInteger lex(&Fine); + lex=zero; + for(int d=0;d<4;d++){ + LatticeInteger coor(&Fine); + LatticeCoordinate(coor,d); + lex = lex + coor*mm[d]; + } + Bar = zero; + Bar = where(lex<10,Foo,Bar); + { + std::vector coor(4); + for(coor[3]=0;coor[3] diff; @@ -305,7 +361,8 @@ int main (int argc, char ** argv) double nn=Ttr._internal._internal; if ( nn > 0 ) cout<<"Shift real trace fail "< >::scalar_type == ComplexD. 
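// A quick illustration of the recursion this header is after (a sketch only, not
// part of the patch; assumes C++11 static_assert and <type_traits> are available):
//
//   typedef iScalar<iScalar<vComplexD> > nested;
//   static_assert(std::is_same<GridTypeMapper<nested>::scalar_type, ComplexD>::value,
//                 "scalar_type recurses down to the fundamental ComplexD");
//   static_assert(std::is_same<GridTypeMapper<nested>::vector_type, vComplexD>::value,
//                 "vector_type recurses down to the SIMD type vComplexD");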
+////////////////////////////////////////////////////////////////////////////////// + + template class GridTypeMapper { + public: + typedef typename T::scalar_type scalar_type; + typedef typename T::vector_type vector_type; + typedef typename T::tensor_reduced tensor_reduced; + }; + +////////////////////////////////////////////////////////////////////////////////// +// Recursion stops with these template specialisations +////////////////////////////////////////////////////////////////////////////////// + template<> class GridTypeMapper { + public: + typedef RealF scalar_type; + typedef RealF vector_type; + typedef RealF tensor_reduced ; + }; + template<> class GridTypeMapper { + public: + typedef RealD scalar_type; + typedef RealD vector_type; + typedef RealD tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef ComplexF scalar_type; + typedef ComplexF vector_type; + typedef ComplexF tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef ComplexD scalar_type; + typedef ComplexD vector_type; + typedef ComplexD tensor_reduced; + }; + + template<> class GridTypeMapper { + public: + typedef RealF scalar_type; + typedef vRealF vector_type; + typedef vRealF tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef RealD scalar_type; + typedef vRealD vector_type; + typedef vRealD tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef ComplexF scalar_type; + typedef vComplexF vector_type; + typedef vComplexF tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef ComplexD scalar_type; + typedef vComplexD vector_type; + typedef vComplexD tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef Integer scalar_type; + typedef vInteger vector_type; + typedef vInteger tensor_reduced; + }; + + // Again terminate the recursion. + inline vRealD TensorRemove(vRealD arg){ return arg;} + inline vRealF TensorRemove(vRealF arg){ return arg;} + inline vComplexF TensorRemove(vComplexF arg){ return arg;} + inline vComplexD TensorRemove(vComplexD arg){ return arg;} + inline vInteger TensorRemove(vInteger arg){ return arg;} + +} + +#endif diff --git a/Grid_math_types.h b/Grid_math_types.h index 25524966..a2a3f97c 100644 --- a/Grid_math_types.h +++ b/Grid_math_types.h @@ -1,63 +1,11 @@ #ifndef GRID_MATH_TYPES_H #define GRID_MATH_TYPES_H + +#include + namespace Grid { - -////////////////////////////////////////////////////////////////////////////////// -// Want to recurse: GridTypeMapper >::scalar_type == ComplexD. 
-////////////////////////////////////////////////////////////////////////////////// - - template class GridTypeMapper { - public: - typedef typename T::scalar_type scalar_type; - typedef typename T::vector_type vector_type; - }; - - template<> class GridTypeMapper { - public: - typedef RealF scalar_type; - typedef RealF vector_type; - }; - template<> class GridTypeMapper { - public: - typedef RealD scalar_type; - typedef RealD vector_type; - }; - template<> class GridTypeMapper { - public: - typedef ComplexF scalar_type; - typedef ComplexF vector_type; - }; - template<> class GridTypeMapper { - public: - typedef ComplexD scalar_type; - typedef ComplexD vector_type; - }; - - template<> class GridTypeMapper { - public: - typedef RealF scalar_type; - typedef vRealF vector_type; - }; - template<> class GridTypeMapper { - public: - typedef RealD scalar_type; - typedef vRealD vector_type; - }; - template<> class GridTypeMapper { - public: - typedef ComplexF scalar_type; - typedef vComplexF vector_type; - }; - template<> class GridTypeMapper { - public: - typedef ComplexD scalar_type; - typedef vComplexD vector_type; - }; - - - /////////////////////////////////////////////////// // Scalar, Vector, Matrix objects. // These can be composed to form tensor products of internal indices. @@ -70,9 +18,16 @@ public: typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; + typedef iScalar tensor_reduced; + iScalar(){}; + + iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type + iScalar(Zero &z){ *this = zero; }; + iScalar & operator= (const Zero &hero){ zeroit(*this); return *this; @@ -80,22 +35,27 @@ public: friend void zeroit(iScalar &that){ zeroit(that._internal); } - friend void permute(iScalar &out,iScalar &in,int permutetype){ + friend void permute(iScalar &out,const iScalar &in,int permutetype){ permute(out._internal,in._internal,permutetype); } - friend void extract(iScalar &in,std::vector &out){ + friend void extract(const iScalar &in,std::vector &out){ extract(in._internal,out); // extract advances the pointers in out } friend void merge(iScalar &in,std::vector &out){ merge(in._internal,out); // extract advances the pointers in out } + friend inline iScalar::vector_type TensorRemove(iScalar arg) + { + return TensorRemove(arg._internal); + } + // Unary negation friend inline iScalar operator -(const iScalar &r) { iScalar ret; ret._internal= -r._internal; return ret; } - // *=,+=,-= operators + // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour inline iScalar &operator *=(const iScalar &r) { *this = (*this)*r; return *this; @@ -117,6 +77,9 @@ public: typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; + typedef iScalar tensor_reduced; + iVector(Zero &z){ *this = zero; }; iVector() {}; @@ -129,12 +92,12 @@ public: zeroit(that._internal[i]); } } - friend void permute(iVector &out,iVector &in,int permutetype){ + friend void permute(iVector &out,const iVector &in,int permutetype){ for(int i=0;i &in,std::vector &out){ + friend void extract(const iVector &in,std::vector &out){ for(int i=0;i &operator *=(const iScalar &r) { *this = (*this)*r; return *this; @@ -163,10 +126,8 @@ public: *this = (*this)+r; return *this; } - }; - template class iMatrix { public: @@ -174,62 +135,64 @@ public: 
typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; + typedef iScalar tensor_reduced; - iMatrix(Zero &z){ *this = zero; }; - iMatrix() {}; - iMatrix & operator= (Zero &hero){ - zeroit(*this); - return *this; - } - friend void zeroit(iMatrix &that){ - for(int i=0;i &out,iMatrix &in,int permutetype){ - for(int i=0;i & operator= (Zero &hero){ + zeroit(*this); + return *this; + } + friend void zeroit(iMatrix &that){ + for(int i=0;i &out,const iMatrix &in,int permutetype){ + for(int i=0;i &in,std::vector &out){ - for(int i=0;i &in,std::vector &out){ + for(int i=0;i &in,std::vector &out){ - for(int i=0;i &in,std::vector &out){ + for(int i=0;i operator -(const iMatrix &r) { - iMatrix ret; - for(int i=0;i - inline iMatrix &operator *=(const T &r) { - *this = (*this)*r; - return *this; - } - template - inline iMatrix &operator -=(const T &r) { - *this = (*this)-r; - return *this; - } - template - inline iMatrix &operator +=(const T &r) { - *this = (*this)+r; - return *this; - } + }} + } + // Unary negation + friend inline iMatrix operator -(const iMatrix &r) { + iMatrix ret; + for(int i=0;i + inline iMatrix &operator *=(const T &r) { + *this = (*this)*r; + return *this; + } + template + inline iMatrix &operator -=(const T &r) { + *this = (*this)-r; + return *this; + } + template + inline iMatrix &operator +=(const T &r) { + *this = (*this)+r; + return *this; + } }; @@ -642,7 +605,8 @@ iVector operator * (const iVector& lhs,const iScalar& r // mat x vec = vec // vec x scal = vec // scal x vec = vec - + // + // We can special case scalar_type ?? template inline auto operator * (const iScalar& lhs,const iScalar& rhs) -> iScalar { @@ -715,6 +679,229 @@ auto operator * (const iVector& lhs,const iScalar& rhs) -> iVector inline iScalar operator * (const iScalar& lhs,const typename iScalar::scalar_type rhs) +{ + typename iScalar::tensor_reduced srhs(rhs); + return lhs*srhs; +} +template inline iScalar operator * (const typename iScalar::scalar_type lhs,const iScalar& rhs) { return rhs*lhs; } + +template inline iVector operator * (const iVector& lhs,const typename iScalar::scalar_type rhs) +{ + typename iVector::tensor_reduced srhs(rhs); + return lhs*srhs; +} +template inline iVector operator * (const typename iScalar::scalar_type lhs,const iVector& rhs) { return rhs*lhs; } + +template inline iMatrix operator * (const iMatrix& lhs,const typename iScalar::scalar_type &rhs) +{ + typename iMatrix::tensor_reduced srhs(rhs); + return lhs*srhs; +} +template inline iMatrix operator * (const typename iScalar::scalar_type & lhs,const iMatrix& rhs) { return rhs*lhs; } + +//////////////////////////////////////////////////////////////////// +// Double support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator * (const iScalar& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iScalar operator * (double lhs,const iScalar& rhs) { return rhs*lhs; } + +template inline iVector operator * (const iVector& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iVector operator * (double lhs,const iVector& rhs) { return rhs*lhs; } + +template inline iMatrix operator * (const iMatrix& lhs,double rhs) +{ + typename 
iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iMatrix operator * (double lhs,const iMatrix& rhs) { return rhs*lhs; } + +//////////////////////////////////////////////////////////////////// +// Integer support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator * (const iScalar& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iScalar operator * (Integer lhs,const iScalar& rhs) { return rhs*lhs; } + +template inline iVector operator * (const iVector& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iVector operator * (Integer lhs,const iVector& rhs) { return rhs*lhs; } + +template inline iMatrix operator * (const iMatrix& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iMatrix operator * (Integer lhs,const iMatrix& rhs) { return rhs*lhs; } + + + +/////////////////////////////////////////////////////////////////////////////////////////////// +// addition by fundamental scalar type applies to matrix(down diag) and scalar +/////////////////////////////////////////////////////////////////////////////////////////////// +template inline iScalar operator + (const iScalar& lhs,const typename iScalar::scalar_type rhs) +{ + typename iScalar::tensor_reduced srhs(rhs); + return lhs+srhs; +} +template inline iScalar operator + (const typename iScalar::scalar_type lhs,const iScalar& rhs) { return rhs+lhs; } + +template inline iMatrix operator + (const iMatrix& lhs,const typename iScalar::scalar_type rhs) +{ + typename iMatrix::tensor_reduced srhs(rhs); + return lhs+srhs; +} +template inline iMatrix operator + (const typename iScalar::scalar_type lhs,const iMatrix& rhs) { return rhs+lhs; } + +//////////////////////////////////////////////////////////////////// +// Double support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator + (const iScalar& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs+srhs; +} +template inline iScalar operator + (double lhs,const iScalar& rhs) { return rhs+lhs; } + +template inline iMatrix operator + (const iMatrix& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs+srhs; +} +template inline iMatrix operator + (double lhs,const iMatrix& rhs) { return rhs+lhs; } + +//////////////////////////////////////////////////////////////////// +// Integer support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator + (const iScalar& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs+srhs; +} +template inline iScalar operator + (Integer lhs,const iScalar& rhs) { return rhs+lhs; } + +template inline iMatrix operator + (const iMatrix& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs+srhs; +} +template inline iMatrix operator + (Integer lhs,const iMatrix& rhs) { return rhs+lhs; } + + 
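// Usage sketch for the fundamental-scalar overloads above (mirrors the tests added
// in Grid_main.cc; illustrative only, not part of this header):
//
//   SpinColourMatrix scm;
//   Complex cplx(1.0);
//   Integer myint    = 1;
//   double  mydouble = 1.0;
//
//   scm = scm * cplx;       // scalar_type is wrapped into tensor_reduced, then tensor * tensor
//   scm = myint * scm;      // Integer is cast to scalar_type through its constructor
//   scm = scm * mydouble;   // double follows the same path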
+/////////////////////////////////////////////////////////////////////////////////////////////// +// subtraction of fundamental scalar type applies to matrix(down diag) and scalar +/////////////////////////////////////////////////////////////////////////////////////////////// +template inline iScalar operator - (const iScalar& lhs,const typename iScalar::scalar_type rhs) +{ + typename iScalar::tensor_reduced srhs(rhs); + return lhs-srhs; +} +template inline iScalar operator - (const typename iScalar::scalar_type lhs,const iScalar& rhs) +{ + typename iScalar::tensor_reduced slhs(lhs); + return slhs-rhs; +} + +template inline iMatrix operator - (const iMatrix& lhs,const typename iScalar::scalar_type rhs) +{ + typename iScalar::tensor_reduced srhs(rhs); + return lhs-srhs; +} +template inline iMatrix operator - (const typename iScalar::scalar_type lhs,const iMatrix& rhs) +{ + typename iScalar::tensor_reduced slhs(lhs); + return slhs-rhs; +} + +//////////////////////////////////////////////////////////////////// +// Double support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator - (const iScalar& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs-srhs; +} +template inline iScalar operator - (double lhs,const iScalar& rhs) +{ + typename iScalar::scalar_type t(lhs); + typename iScalar::tensor_reduced slhs(t); + return slhs-rhs; +} + +template inline iMatrix operator - (const iMatrix& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs-srhs; +} +template inline iMatrix operator - (double lhs,const iMatrix& rhs) +{ + typename iScalar::scalar_type t(lhs); + typename iScalar::tensor_reduced slhs(t); + return slhs-rhs; +} + +//////////////////////////////////////////////////////////////////// +// Integer support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator - (const iScalar& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs-srhs; +} +template inline iScalar operator - (Integer lhs,const iScalar& rhs) +{ + typename iScalar::scalar_type t(lhs); + typename iScalar::tensor_reduced slhs(t); + return slhs-rhs; +} +template inline iMatrix operator - (const iMatrix& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs-srhs; +} +template inline iMatrix operator - (Integer lhs,const iMatrix& rhs) +{ + typename iScalar::scalar_type t(lhs); + typename iScalar::tensor_reduced slhs(t); + return slhs-rhs; +} + + + + /////////////////////////////////////////////////////////////////////////////////////// // localInnerProduct Scalar x Scalar -> Scalar // localInnerProduct Vector x Vector -> Scalar @@ -907,6 +1094,7 @@ inline auto trace(const iScalar &arg) -> iScalar +inline void where(Lattice &ret,const LatticeInteger &predicate,Lattice &iftrue,Lattice &iffalse) +{ + conformable(iftrue,iffalse); + conformable(iftrue,predicate); + conformable(iftrue,ret); + + GridBase *grid=iftrue._grid; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + const int Nsimd = grid->Nsimd(); + const int words = sizeof(vobj)/sizeof(vector_type); + + std::vector mask(Nsimd); + std::vector > truevals (Nsimd,std::vector(words) ); + std::vector 
> falsevals(Nsimd,std::vector(words) ); + std::vector pointers(Nsimd); + +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + + for(int s=0;s +inline Lattice where(const LatticeInteger &predicate,Lattice &iftrue,Lattice &iffalse) +{ + conformable(iftrue,iffalse); + conformable(iftrue,predicate); + + Lattice ret(iftrue._grid); + + where(ret,predicate,iftrue,iffalse); + + return ret; +} + +#endif diff --git a/Grid_simd.h b/Grid_simd.h index 53cc8909..1229d328 100644 --- a/Grid_simd.h +++ b/Grid_simd.h @@ -11,28 +11,15 @@ // Vector types are arch dependent //////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////// - // SIMD Alignment controls - //////////////////////////////////////////////////////////// -#ifdef HAVE_VAR_ATTRIBUTE_ALIGNED -#define ALIGN_DIRECTIVE(A) __attribute__ ((aligned(A))) -#else -#define ALIGN_DIRECTIVE(A) __declspec(align(A)) -#endif #ifdef SSE2 #include -#define SIMDalign ALIGN_DIRECTIVE(16) #endif - #if defined(AVX1) || defined (AVX2) #include -#define SIMDalign ALIGN_DIRECTIVE(32) #endif - #ifdef AVX512 #include -#define SIMDalign ALIGN_DIRECTIVE(64) #endif namespace Grid { @@ -137,41 +124,66 @@ namespace Grid { // Generic extract/merge/permute ///////////////////////////////////////////////////////////////// template -inline void Gextract(vsimd &y,std::vector &extracted){ -#if 1 +inline void Gextract(const vsimd &y,std::vector &extracted){ // FIXME: bounce off stack is painful // temporary hack while I figure out better way. // There are intrinsics to do this work without the storage. - int Nsimd = extracted.size(); - { - std::vector > buf(Nsimd); - vstore(y,&buf[0]); - for(int i=0;i > buf(Nsimd); + vstore(y,&buf[0]); + for(int i=0;i inline void Gmerge(vsimd &y,std::vector &extracted){ -#if 1 - int Nsimd = extracted.size(); + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + std::vector buf(Nsimd); - for(int i=0;i +inline void Gextract(const vsimd &y,std::vector &extracted){ + // FIXME: bounce off stack is painful + // temporary hack while I figure out better way. + // There are intrinsics to do this work without the storage. + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + + std::vector > buf(Nsimd); + + vstore(y,&buf[0]); + + for(int i=0;i +inline void Gmerge(vsimd &y,std::vector &extracted){ + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + + std::vector buf(Nsimd); + for(int i=0;i &extracted){ // Permute 4 possible on half precision @512bit vectors. ////////////////////////////////////////////////////////// template -inline void Gpermute(vsimd &y,vsimd b,int perm){ +inline void Gpermute(vsimd &y,const vsimd &b,int perm){ switch (perm){ #if defined(AVX1)||defined(AVX2) // 8x32 bits=>3 permutes @@ -214,10 +226,10 @@ inline void Gpermute(vsimd &y,vsimd b,int perm){ }; }; +#include #include #include #include #include -#include #endif diff --git a/Grid_stencil.h b/Grid_stencil.h index d93204fc..4d73f436 100644 --- a/Grid_stencil.h +++ b/Grid_stencil.h @@ -8,5 +8,355 @@ // Lattice could also allocate haloes which get used for stencil code. // // Grid could create a neighbour index table for a given stencil. -// Could also implement CovariantCshift. +// +// Could also implement CovariantCshift, to fuse the loops and enhance performance. 
+// +// +// General stencil computation: +// +// Generic services +// 0) Prebuild neighbour tables +// 1) Compute sizes of all haloes/comms buffers; allocate them. +// +// 2) Gather all faces, and communicate. +// 3) Loop over result sites, giving nbr index/offnode info for each +// +// Could take a +// SpinProjectFaces +// start comms +// complete comms +// Reconstruct Umu +// +// Approach. +// ////////////////////////////////////////////////////////////////////////////////////////// + +namespace Grid { + + class Stencil { + public: + + Stencil(GridBase *grid, + int npoints, + int checkerboard, + std::vector directions, + std::vector distances); + + void Stencil_local (int dimension,int shift,int cbmask); + void Stencil_comms (int dimension,int shift,int cbmask); + void Stencil_comms_simd(int dimension,int shift,int cbmask); + // Will need to implement actions for + // + Copy_plane; + Copy_plane_permute; + Gather_plane; + + + + // The offsets to all neibours in stencil in each direction + int _checkerboard; + int _npoints; // Move to template param? + GridBase * _grid; + + // Store these as SIMD Integer needed + // + // std::vector< iVector > _offsets; + // std::vector< iVector > _local; + // std::vector< iVector > _comm_buf_size; + // std::vector< iVector > _permute; + + std::vector > _offsets; + std::vector > _local; + std::vector _comm_buf_size; + std::vector _permute; + + }; + + Stencil::Stencil(GridBase *grid, + int npoints, + int checkerboard, + std::vector directions, + std::vector distances){ + + _npoints = npoints; + _grid = grid; + + for(int i=0;i_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + + _checkerboard = checkerboard; + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + + int sshift[2]; + + if ( !comm_dim ) { + sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); + sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + + if ( sshift[0] == sshift[1] ) { + Stencil_local(dimension,shift,0x3); + } else { + Stencil_local(dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Stencil_local(dimension,shift,0x2);// both with block stride loop iteration + } + } else if ( splice_dim ) { + sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); + sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + + if ( sshift[0] == sshift[1] ) { + Stencil_comms_simd(dimension,shift,0x3); + } else { + Stencil_comms_simd(dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Stencil_comms_simd(dimension,shift,0x2);// both with block stride loop iteration + } + } else { + // Cshift_comms(ret,rhs,dimension,shift); + sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); + sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + if ( sshift[0] == sshift[1] ) { + Stencil_comms(dimension,shift,0x3); + } else { + Stencil_comms(dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Stencil_comms(dimension,shift,0x2);// both with block stride loop iteration + } + } + } + } + + + void Stencil::Stencil_local (int dimension,int shift,int cbmask) + { + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int gd = _grid->_gdimensions[dimension]; + + // Map to always positive shift modulo global full dimension. 
+ shift = (shift+fd)%fd; + + // the permute type + int permute_dim =_grid->PermuteDim(dimension); + int permute_type=_grid->PermuteType(dimension); + + for(int x=0;x_ostride[dimension]; + + int cb= (cbmask==0x2)? 1 : 0; + + int sshift = _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); + int sx = (x+sshift)%rd; + + int permute_slice=0; + if(permute_dim){ + int wrap = sshift/rd; + int num = sshift%rd; + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + } + + if ( permute_slice ) Copy_plane_permute(dimension,x,sx,cbmask,permute_type); + else Copy_plane (dimension,x,sx,cbmask); + + } + } + + void Stencil::Stencil_comms (int dimension,int shift,int cbmask) + { + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + GridBase *grid=_grid; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(simd_layout==1); + assert(comm_dim==1); + assert(shift>=0); + assert(shift_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; + // FIXME: Do something with buffer_size?? + + int cb= (cbmask==0x2)? 1 : 0; + int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); + + for(int x=0;x= rd ); + int sx = (x+sshift)%rd; + int comm_proc = (x+sshift)/rd; + + if (!offnode) { + + Copy_plane(dimension,x,sx,cbmask); + + } else { + + int words = send_buf.size(); + if (cbmask != 0x3) words=words>>1; + + int bytes = words * sizeof(vobj); + + Gather_plane_simple (dimension,sx,cbmask); + + int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + /* + grid->SendToRecvFrom((void *)&send_buf[0], + xmit_to_rank, + (void *)&recv_buf[0], + recv_from_rank, + bytes); + */ + Scatter_plane_simple (dimension,x,cbmask); + } + } + } + + void Stencil::Stencil_comms_simd(int dimension,int shift,int cbmask) + { + GridBase *grid=_grid; + const int Nsimd = _grid->Nsimd(); + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(comm_dim==1); + assert(simd_layout==2); + assert(shift>=0); + assert(shiftPermuteType(dimension); + + /////////////////////////////////////////////// + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = _grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; + // FIXME do something with buffer size + + std::vector pointers(Nsimd); // + std::vector rpointers(Nsimd); // received pointers + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + + int cb = (cbmask==0x2)? 
1 : 0; + int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); + + std::vector comm_offnode(simd_layout); + std::vector comm_proc (simd_layout); //relative processor coord in dim=dimension + std::vector icoor(grid->Nd()); + + for(int x=0;x= ld; + comm_any = comm_any | comm_offnode[s]; + comm_proc[s] = shifted_x/ld; + } + + int o = 0; + int bo = x*grid->_ostride[dimension]; + int sx = (x+sshift)%rd; + + if ( comm_any ) { + + for(int i=0;iiCoorFromIindex(icoor,i); + s = icoor[dimension]; + + if(comm_offnode[s]){ + + int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank); + + /* + grid->SendToRecvFrom((void *)&send_buf_extract[i][0], + xmit_to_rank, + (void *)&recv_buf_extract[i][0], + recv_from_rank, + bytes); + */ + + rpointers[i] = (scalar_type *)&recv_buf_extract[i][0]; + + } else { + + rpointers[i] = (scalar_type *)&send_buf_extract[i][0]; + + } + + } + + // Permute by swizzling pointers in merge + int permute_slice=0; + int lshift=sshift%ld; + int wrap =lshift/rd; + int num =lshift%rd; + + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + + int toggle_bit = (Nsimd>>(permute_type+1)); + int PermuteMap; + for(int i=0;i(y,extracted); } - friend inline void extract(vComplexD &y,std::vector &extracted) + friend inline void extract(const vComplexD &y,std::vector &extracted) + { + Gextract(y,extracted); + } + friend inline void merge(vComplexD &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vComplexD &y,std::vector &extracted) { Gextract(y,extracted); } @@ -184,6 +198,11 @@ namespace Grid { /////////////////////// // Splat /////////////////////// + friend inline void vsplat(vComplexD &ret,ComplexD c){ + float a= real(c); + float b= imag(c); + vsplat(ret,a,b); + } friend inline void vsplat(vComplexD &ret,double rl,double ig){ #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set_pd(ig,rl,ig,rl); @@ -215,7 +234,7 @@ namespace Grid { #endif } -friend inline void vstore(vComplexD &ret, ComplexD *a){ +friend inline void vstore(const vComplexD &ret, ComplexD *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_pd((double *)a,ret.v); #endif diff --git a/Grid_vComplexF.h b/Grid_vComplexF.h index b7fb3d6a..b6b7ebe9 100644 --- a/Grid_vComplexF.h +++ b/Grid_vComplexF.h @@ -20,6 +20,12 @@ namespace Grid { return (*this); } vComplexF(){}; + vComplexF(ComplexF a){ + vsplat(*this,a); + }; + vComplexF(double a){ + vsplat(*this,ComplexF(a)); + }; /////////////////////////////////////////////// // mac, mult, sub, add, adj @@ -161,7 +167,7 @@ namespace Grid { vsplat(ret,a,b); } -friend inline void vstore(vComplexF &ret, ComplexF *a){ +friend inline void vstore(const vComplexF &ret, ComplexF *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_ps((float *)a,ret.v); #endif @@ -210,27 +216,47 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){ #endif } - friend inline vComplexF operator * (const Complex &a, vComplexF b){ vComplexF va; vsplat(va,a); return va*b; } friend inline vComplexF operator * (vComplexF b,const Complex &a){ + return a*b; + } + + /* + template + friend inline vComplexF operator * (vComplexF b,const real &a){ vComplexF va; - vsplat(va,a); + Complex ca(a,0); + vsplat(va,ca); return va*b; } + template + friend inline vComplexF operator * (const real &a,vComplexF b){ + return a*b; + } + friend inline vComplexF operator + (const Complex &a, vComplexF b){ vComplexF va; vsplat(va,a); return va+b; } friend inline vComplexF 
operator + (vComplexF b,const Complex &a){ - vComplexF va; - vsplat(va,a); - return b+va; + return a+b; } + template + friend inline vComplexF operator + (vComplexF b,const real &a){ + vComplexF va; + Complex ca(a,0); + vsplat(va,ca); + return va+b; + } + template + friend inline vComplexF operator + (const real &a,vComplexF b){ + return a+b; + } friend inline vComplexF operator - (const Complex &a, vComplexF b){ vComplexF va; vsplat(va,a); @@ -241,7 +267,24 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){ vsplat(va,a); return b-va; } - // NB: Template the following on "type Complex" and then implement *,+,- for ComplexF, ComplexD, RealF, RealD above to + template + friend inline vComplexF operator - (vComplexF b,const real &a){ + vComplexF va; + Complex ca(a,0); + vsplat(va,ca); + return b-va; + } + template + friend inline vComplexF operator - (const real &a,vComplexF b){ + vComplexF va; + Complex ca(a,0); + vsplat(va,ca); + return va-b; + } + */ + + // NB: Template the following on "type Complex" and then implement *,+,- for + // ComplexF, ComplexD, RealF, RealD above to // get full generality of binops with scalars. friend inline void mac (vComplexF *__restrict__ y,const Complex *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); }; friend inline void mult(vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); } @@ -304,7 +347,15 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){ { Gmerge(y,extracted); } - friend inline void extract(vComplexF &y,std::vector &extracted) + friend inline void extract(const vComplexF &y,std::vector &extracted) + { + Gextract(y,extracted); + } + friend inline void merge(vComplexF &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vComplexF &y,std::vector &extracted) { Gextract(y,extracted); } diff --git a/Grid_vInteger.h b/Grid_vInteger.h index 6ddce191..82adbd8e 100644 --- a/Grid_vInteger.h +++ b/Grid_vInteger.h @@ -10,7 +10,7 @@ namespace Grid { typedef uint32_t Integer; - class vInteger { + class vInteger { protected: public: @@ -21,6 +21,13 @@ namespace Grid { typedef Integer scalar_type; vInteger(){}; + vInteger & operator = (const Zero & z){ + vzero(*this); + return (*this); + } + vInteger(Integer a){ + vsplat(*this,a); + }; //////////////////////////////////// // Arithmetic operator overloads +,-,* //////////////////////////////////// @@ -166,18 +173,18 @@ namespace Grid { #endif } -friend inline void vstore(vInteger &ret, Integer *a){ + friend inline void vstore(const vInteger &ret, Integer *a){ #if defined (AVX1)|| defined (AVX2) - _mm256_store_si256((__m256i*)a,ret.v); + _mm256_store_si256((__m256i*)a,ret.v); #endif #ifdef SSE2 - _mm_store_si128(a,ret.v); + _mm_store_si128(a,ret.v); #endif #ifdef AVX512 - _mm512_store_si512(a,ret.v); + _mm512_store_si512(a,ret.v); #endif #ifdef QPX - assert(0); + assert(0); #endif } @@ -185,6 +192,7 @@ friend inline void vstore(vInteger &ret, Integer *a){ { _mm_prefetch((const char*)&v.v,_MM_HINT_T0); } + // Unary negation friend inline vInteger operator -(const vInteger &r) { vInteger ret; @@ -210,9 +218,32 @@ friend inline void vstore(vInteger &ret, Integer *a){ *this = *this-r; return *this; } + + friend inline void permute(vInteger &y,const vInteger b,int perm) + { + Gpermute(y,b,perm); + } + friend inline void merge(vInteger &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vInteger &y,std::vector &extracted) + { + Gextract(y,extracted); 
+ } + friend inline void merge(vInteger &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vInteger &y,std::vector &extracted) + { + Gextract(y,extracted); + } + + public: - static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);} - }; + static inline int Nsimd(void) { return sizeof(ivec)/sizeof(Integer);} + }; inline vInteger localInnerProduct(const vInteger & l, const vInteger & r) { return l*r; } @@ -222,27 +253,7 @@ friend inline void vstore(vInteger &ret, Integer *a){ { return l*r; } - - - class vIntegerF : public vInteger - { - public: - static inline int Nsimd(void) { return sizeof(ivec)/sizeof(float);} - - friend inline void permute(vIntegerF &y,vIntegerF b,int perm) - { - Gpermute(y,b,perm); - } - friend inline void merge(vIntegerF &y,std::vector &extracted) - { - Gmerge(y,extracted); - } - friend inline void extract(vIntegerF &y,std::vector &extracted) - { - Gextract(y,extracted); - } - }; - + } #endif diff --git a/Grid_vRealD.h b/Grid_vRealD.h index 13ceedbe..a5b59be3 100644 --- a/Grid_vRealD.h +++ b/Grid_vRealD.h @@ -13,6 +13,9 @@ namespace Grid { typedef RealD scalar_type; vRealD(){}; + vRealD(RealD a){ + vsplat(*this,a); + }; friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);} friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);} @@ -112,7 +115,15 @@ namespace Grid { { Gmerge(y,extracted); } - friend inline void extract(vRealD &y,std::vector &extracted) + friend inline void extract(const vRealD &y,std::vector &extracted) + { + Gextract(y,extracted); + } + friend inline void merge(vRealD &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vRealD &y,std::vector &extracted) { Gextract(y,extracted); } @@ -157,7 +168,7 @@ namespace Grid { #endif } - friend inline void vstore(vRealD &ret, double *a){ + friend inline void vstore(const vRealD &ret, double *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_pd(a,ret.v); #endif diff --git a/Grid_vRealF.h b/Grid_vRealF.h index 185f4da8..0fe68f43 100644 --- a/Grid_vRealF.h +++ b/Grid_vRealF.h @@ -14,6 +14,9 @@ namespace Grid { typedef RealF scalar_type; vRealF(){}; + vRealF(RealF a){ + vsplat(*this,a); + }; //////////////////////////////////// // Arithmetic operator overloads +,-,* //////////////////////////////////// @@ -133,7 +136,15 @@ namespace Grid { { Gmerge(y,extracted); } - friend inline void extract(vRealF &y,std::vector &extracted) + friend inline void extract(const vRealF &y,std::vector &extracted) + { + Gextract(y,extracted); + } + friend inline void merge(vRealF &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vRealF &y,std::vector &extracted) { Gextract(y,extracted); } @@ -180,7 +191,7 @@ namespace Grid { //////////////////////////////////////////////////////////////////////// // FIXME: gonna remove these load/store, get, set, prefetch //////////////////////////////////////////////////////////////////////// -friend inline void vstore(vRealF &ret, float *a){ +friend inline void vstore(const vRealF &ret, float *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_ps(a,ret.v); #endif diff --git a/TODO b/TODO index d2373ba1..fb7d177a 100644 --- a/TODO +++ b/TODO @@ -1,29 +1,68 @@ + * FIXME audit * Remove vload/store etc.. * Replace vset with a call to merge. * Replace vset with a call to merge. 
+* Const audit +* extract / merge extra implementation removal -* Conditional execution Subset, where etc... -* Coordinate information, integers etc... -* Integer type padding/union to vector. -* LatticeCoordinate[mu] +* Conditional execution, where etc... -----DONE, simple test +* Integer relational support -----DONE +* Coordinate information, integers etc... -----DONE +* Integer type padding/union to vector. -----DONE +* LatticeCoordinate[mu] -----DONE -* Optimise the extract/merge SIMD routines -* Broadcast, reduction tests. +* Stencil operator support -----Initial thoughts + +* Subset support, slice sums etc... -----Only need slice sum? + -----Generic cartesian subslicing? + -----Array ranges / boost extents? + -----Multigrid grid transferral. + -----Suggests generalised cartesian subblocking + sums, returning modified grid. + + Two classes of subset; +i) red black parit subsetting. + (pick checkerboard). + +ii) Need to be able to project one Grid to another Grid. + Generic concept is to subdivide (based on RD so applies to red/black or full). + Return a type on SUB-grid from CellSum TOP-grid + SUB-grid need not distribute but be replicated in some dims if that is how the + cartesian communicator works. + +iii) No general permutation map. + + +* Consider switch std::vector to boost arrays. + boost::multi_array A()... to replace multi1d, multi2d etc.. + +*? Cell definition <-> sliceSum. + ? Replicated arrays. + + + +* Check for missing functionality - partially audited against QDP++ layout + +* Optimise the extract/merge SIMD routines; Azusa?? + + -- I have collated into single location at least. + + -- Need to use _mm_*insert/extract routines. + +* Conformable test in Cshift routines. + +* Gamma/Dirac structures + +* Fourspin, two spin project + +* Broadcast, reduction tests. innerProduct, localInnerProduct * QDP++ regression suite and comparative benchmark * NERSC Lattice loading, plaquette test -* Conformable test in Cshift routines. - -* Gamma/Dirac structures -* Fourspin, two spin project - -* Stencil operator support - -* Check for missing functionality * I/O support - MPI IO?
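The three items marked DONE above (integer relational support, LatticeCoordinate, conditional where) compose as in the test added to Grid_main.cc. A minimal sketch, assuming a GridCartesian named Fine as constructed in that test:

    LatticeColourMatrix Foo(&Fine), Bar(&Fine);
    LatticeInteger      coor(&Fine);

    LatticeCoordinate(coor, 3);          // fill each site with its global coordinate in direction 3
    Bar = zero;
    Bar = where(coor < 10, Foo, Bar);    // Foo is copied only on sites whose coordinate is below 10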