From 8f5281563e66092010f7b83e2d8c73124f3b09e3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 9 Apr 2015 08:06:03 +0200 Subject: [PATCH] "where" and integer comparisons logic implemented for conditional assignment. LatticeCoordinate helper to get global (reduced) coordinate. Some more work of similar type perhaps needed, but the bulk of the required structure for masked array assignment is now in place. --- Grid.h | 2 +- Grid_Lattice.h | 36 ++-- Grid_QCD.h | 27 ++- Grid_comparison.h | 264 +++++++++++++++++++++++++ Grid_cshift_common.h | 1 - Grid_main.cc | 73 ++++++- Grid_math_type_mapper.h | 83 ++++++++ Grid_math_types.h | 414 +++++++++++++++++++++++++++++----------- Grid_predicated.h | 62 ++++++ Grid_simd.h | 90 +++++---- Grid_stencil.h | 352 +++++++++++++++++++++++++++++++++- Grid_vComplexD.h | 23 ++- Grid_vComplexF.h | 67 ++++++- Grid_vInteger.h | 69 ++++--- Grid_vRealD.h | 15 +- Grid_vRealF.h | 15 +- TODO | 67 +++++-- 17 files changed, 1420 insertions(+), 240 deletions(-) create mode 100644 Grid_comparison.h create mode 100644 Grid_math_type_mapper.h create mode 100644 Grid_predicated.h diff --git a/Grid.h b/Grid.h index 0a4024bf..caa4ed5e 100644 --- a/Grid.h +++ b/Grid.h @@ -41,12 +41,12 @@ #include #endif - #include #include #include #include #include +#include #include namespace Grid { diff --git a/Grid_Lattice.h b/Grid_Lattice.h index ebf68560..01b95755 100644 --- a/Grid_Lattice.h +++ b/Grid_Lattice.h @@ -19,16 +19,14 @@ public: typedef typename vobj::vector_type vector_type; public: - Lattice(GridBase *grid) : _grid(grid) { _odata.reserve(_grid->oSites()); assert((((uint64_t)&_odata[0])&0xF) ==0); checkerboard=0; } - #include - + template friend void conformable(const Lattice &lhs,const Lattice &rhs); @@ -156,23 +154,23 @@ public: v_ptr[i]=drand48(); } }; - + // FIXME for debug; deprecate this friend void lex_sites(Lattice &l){ - Real *v_ptr = (Real *)&l._odata[0]; - size_t o_len = l._grid->oSites(); - size_t v_len = sizeof(vobj)/sizeof(vRealF); - size_t vec_len = vRealF::Nsimd(); + Real *v_ptr = (Real *)&l._odata[0]; + size_t o_len = l._grid->oSites(); + size_t v_len = sizeof(vobj)/sizeof(vRealF); + size_t vec_len = vRealF::Nsimd(); - for(int i=0;i &l){ // Zero mean, unit variance. 
@@ -195,7 +193,7 @@ public: } return ret; } - // *=,+=,-= operators + // *=,+=,-= operators inherit behvour from correspond */+/- operation template inline Lattice &operator *=(const T &r) { *this = (*this)*r; @@ -351,7 +349,6 @@ public: inline auto operator * (const left &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs._grid); - #pragma omp parallel for for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs*rhs._odata[ss]; @@ -383,7 +380,7 @@ public: { Lattice ret(lhs._grid); #pragma omp parallel for - for(int ss=0;ssoSites(); ss++){ + for(int ss=0;ssoSites(); ss++){ ret._odata[ss]=lhs._odata[ss]*rhs; } return ret; @@ -409,5 +406,6 @@ public: return ret; } + } #endif diff --git a/Grid_QCD.h b/Grid_QCD.h index 3e852633..1d52f911 100644 --- a/Grid_QCD.h +++ b/Grid_QCD.h @@ -45,7 +45,7 @@ namespace QCD { typedef Lattice LatticeComplex; - typedef Lattice LatticeInteger; // Predicates for "where" + typedef Lattice LatticeInteger; // Predicates for "where" typedef Lattice LatticeColourMatrix; typedef Lattice LatticeSpinMatrix; @@ -92,6 +92,31 @@ namespace QCD { } return ret; } + + // FIXME for debug; deprecate this + inline void LatticeCoordinate(LatticeInteger &l,int mu){ + GridBase *grid = l._grid; + int Nsimd = grid->iSites(); + std::vector gcoor; + std::vector mergebuf(Nsimd); + std::vector mergeptr(Nsimd); + for(int o=0;ooSites();o++){ + for(int i=0;iiSites();i++){ + // RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); + grid->RankIndexToGlobalCoor(0,o,i,gcoor); + mergebuf[i]=gcoor[mu]; + mergeptr[i]=&mergebuf[i]; + } + merge(l._odata[o],mergeptr); + } + }; + +#include + +#if 0 + +#endif + } //namespace QCD } // Grid #endif diff --git a/Grid_comparison.h b/Grid_comparison.h new file mode 100644 index 00000000..c890d342 --- /dev/null +++ b/Grid_comparison.h @@ -0,0 +1,264 @@ +#ifndef GRID_COMPARISON_H +#define GRID_COMPARISON_H +namespace Grid { + + // Generic list of functors + template class veq { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs == rhs; + } + }; + template class vne { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs != rhs; + } + }; + template class vlt { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs < rhs; + } + }; + template class vle { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs <= rhs; + } + }; + template class vgt { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs > rhs; + } + }; + template class vge { + public: + vInteger operator()(const lobj &lhs, const robj &rhs) + { + return lhs >= rhs; + } + }; + + // Generic list of functors + template class seq { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs == rhs; + } + }; + template class sne { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs != rhs; + } + }; + template class slt { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs < rhs; + } + }; + template class sle { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs <= rhs; + } + }; + template class sgt { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs > rhs; + } + }; + template class sge { + public: + Integer operator()(const lobj &lhs, const robj &rhs) + { + return lhs >= rhs; + } + }; + + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Integer gets extra relational 
functions. Could also implement these for RealF, RealD etc.. + ////////////////////////////////////////////////////////////////////////////////////////////////////// + template + inline vInteger Comparison(sfunctor sop,const vInteger & lhs, const vInteger & rhs) + { + std::vector vlhs(vInteger::Nsimd()); // Use functors to reduce this to single implementation + std::vector vrhs(vInteger::Nsimd()); + vInteger ret; + extract(lhs,vlhs); + extract(rhs,vrhs); + for(int s=0;s(),lhs,rhs); + } + inline vInteger operator <= (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(sle(),lhs,rhs); + } + inline vInteger operator > (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(sgt(),lhs,rhs); + } + inline vInteger operator >= (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(sge(),lhs,rhs); + } + inline vInteger operator == (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(seq(),lhs,rhs); + } + inline vInteger operator != (const vInteger & lhs, const vInteger & rhs) + { + return Comparison(sne(),lhs,rhs); + } + + ////////////////////////////////////////////////////////////////////////// + // relational operators + // + // Support <,>,<=,>=,==,!= + // + //Query supporting bitwise &, |, ^, ! + //Query supporting logical &&, ||, + ////////////////////////////////////////////////////////////////////////// + template + inline Lattice LLComparison(vfunctor op,const Lattice &lhs,const Lattice &rhs) + { + Lattice ret(rhs._grid); +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]); + } + return ret; + } + template + inline Lattice LSComparison(vfunctor op,const Lattice &lhs,const robj &rhs) + { + Lattice ret(lhs._grid); +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + ret._odata[ss]=op(lhs._odata[ss],rhs); + } + return ret; + } + template + inline Lattice SLComparison(vfunctor op,const lobj &lhs,const Lattice &rhs) + { + Lattice ret(rhs._grid); +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + ret._odata[ss]=op(lhs._odata[ss],rhs); + } + return ret; + } + + // Less than + template + inline Lattice operator < (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vlt(),lhs,rhs); + } + template + inline Lattice operator < (const Lattice & lhs, const robj & rhs) { + return LSComparison(vlt(),lhs,rhs); + } + template + inline Lattice operator < (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vlt(),lhs,rhs); + } + + // Less than equal + template + inline Lattice operator <= (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vle(),lhs,rhs); + } + template + inline Lattice operator <= (const Lattice & lhs, const robj & rhs) { + return LSComparison(vle(),lhs,rhs); + } + template + inline Lattice operator <= (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vle(),lhs,rhs); + } + + // Greater than + template + inline Lattice operator > (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vgt(),lhs,rhs); + } + template + inline Lattice operator > (const Lattice & lhs, const robj & rhs) { + return LSComparison(vgt(),lhs,rhs); + } + template + inline Lattice operator > (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vgt(),lhs,rhs); + } + + + // Greater than equal + template + inline Lattice operator >= (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vge(),lhs,rhs); + } + template + inline Lattice operator >= (const Lattice & lhs, const robj & rhs) { + return 
LSComparison(vge(),lhs,rhs); + } + template + inline Lattice operator >= (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vge(),lhs,rhs); + } + + + // equal + template + inline Lattice operator == (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(veq(),lhs,rhs); + } + template + inline Lattice operator == (const Lattice & lhs, const robj & rhs) { + return LSComparison(veq(),lhs,rhs); + } + template + inline Lattice operator == (const lobj & lhs, const Lattice & rhs) { + return SLComparison(veq(),lhs,rhs); + } + + + // not equal + template + inline Lattice operator != (const Lattice & lhs, const Lattice & rhs) { + return LLComparison(vne(),lhs,rhs); + } + template + inline Lattice operator != (const Lattice & lhs, const robj & rhs) { + return LSComparison(vne(),lhs,rhs); + } + template + inline Lattice operator != (const lobj & lhs, const Lattice & rhs) { + return SLComparison(vne(),lhs,rhs); + } + + +} +#endif diff --git a/Grid_cshift_common.h b/Grid_cshift_common.h index 08e75cff..2910151c 100644 --- a/Grid_cshift_common.h +++ b/Grid_cshift_common.h @@ -265,7 +265,6 @@ friend void Copy_plane_permute(Lattice& lhs,Lattice &rhs, int dimens ////////////////////////////////////////////////////// // Local to node Cshift ////////////////////////////////////////////////////// - friend void Cshift_local(Lattice& ret,Lattice &rhs,int dimension,int shift) { int sshift[2]; diff --git a/Grid_main.cc b/Grid_main.cc index 62356436..1676a07b 100644 --- a/Grid_main.cc +++ b/Grid_main.cc @@ -19,10 +19,10 @@ int main (int argc, char ** argv) std::vector simd_layout(4); std::vector mpi_layout(4); - mpi_layout[0]=2; - mpi_layout[1]=2; - mpi_layout[2]=2; - mpi_layout[3]=2; + mpi_layout[0]=1; + mpi_layout[1]=1; + mpi_layout[2]=1; + mpi_layout[3]=1; #ifdef AVX512 for(int omp=128;omp<236;omp+=16){ @@ -121,12 +121,34 @@ int main (int argc, char ** argv) // Non-lattice (const objects) * Lattice ColourMatrix cm; SpinColourMatrix scm; - + vSpinColourMatrix vscm; + Complex cplx(1.0); + Integer myint=1; + double mydouble=1.0; + + // vSpinColourMatrix vscm; scMat = cMat*scMat; scm = cm * scm; // SpinColourMatrix = ColourMatrix * SpinColourMatrix scm = scm *cm; // SpinColourMatrix = SpinColourMartix * ColourMatrix scm = GammaFive * scm ; // SpinColourMatrix = SpinMatrix * SpinColourMatrix scm = scm* GammaFive ; // SpinColourMatrix = SpinColourMatrix * SpinMatrix + + scm = scm*cplx; + vscm = vscm*cplx; + scMat = scMat*cplx; + + scm = cplx*scm; + vscm = cplx*vscm; + scMat = cplx*scMat; + scm = myint*scm; + vscm = myint*vscm; + scMat = scMat*myint; + + scm = scm*mydouble; + vscm = vscm*mydouble; + scMat = scMat*mydouble; + scMat = mydouble*scMat; + cMat = mydouble*cMat; sMat = adj(sMat); // LatticeSpinMatrix adjoint sMat = iGammaFive*sMat; // SpinMatrix * LatticeSpinMatrix @@ -160,7 +182,35 @@ int main (int argc, char ** argv) */ lex_sites(Foo); + Integer mm[4]; + mm[0]=1; + mm[1]=Fine._rdimensions[0]; + mm[2]=Fine._ldimensions[0]*Fine._ldimensions[1]; + mm[3]=Fine._ldimensions[0]*Fine._ldimensions[1]*Fine._ldimensions[2]; + LatticeInteger lex(&Fine); + lex=zero; + for(int d=0;d<4;d++){ + LatticeInteger coor(&Fine); + LatticeCoordinate(coor,d); + lex = lex + coor*mm[d]; + } + Bar = zero; + Bar = where(lex<10,Foo,Bar); + { + std::vector coor(4); + for(coor[3]=0;coor[3] diff; @@ -305,7 +361,8 @@ int main (int argc, char ** argv) double nn=Ttr._internal._internal; if ( nn > 0 ) cout<<"Shift real trace fail "< >::scalar_type == ComplexD. 
+////////////////////////////////////////////////////////////////////////////////// + + template class GridTypeMapper { + public: + typedef typename T::scalar_type scalar_type; + typedef typename T::vector_type vector_type; + typedef typename T::tensor_reduced tensor_reduced; + }; + +////////////////////////////////////////////////////////////////////////////////// +// Recursion stops with these template specialisations +////////////////////////////////////////////////////////////////////////////////// + template<> class GridTypeMapper { + public: + typedef RealF scalar_type; + typedef RealF vector_type; + typedef RealF tensor_reduced ; + }; + template<> class GridTypeMapper { + public: + typedef RealD scalar_type; + typedef RealD vector_type; + typedef RealD tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef ComplexF scalar_type; + typedef ComplexF vector_type; + typedef ComplexF tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef ComplexD scalar_type; + typedef ComplexD vector_type; + typedef ComplexD tensor_reduced; + }; + + template<> class GridTypeMapper { + public: + typedef RealF scalar_type; + typedef vRealF vector_type; + typedef vRealF tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef RealD scalar_type; + typedef vRealD vector_type; + typedef vRealD tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef ComplexF scalar_type; + typedef vComplexF vector_type; + typedef vComplexF tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef ComplexD scalar_type; + typedef vComplexD vector_type; + typedef vComplexD tensor_reduced; + }; + template<> class GridTypeMapper { + public: + typedef Integer scalar_type; + typedef vInteger vector_type; + typedef vInteger tensor_reduced; + }; + + // Again terminate the recursion. + inline vRealD TensorRemove(vRealD arg){ return arg;} + inline vRealF TensorRemove(vRealF arg){ return arg;} + inline vComplexF TensorRemove(vComplexF arg){ return arg;} + inline vComplexD TensorRemove(vComplexD arg){ return arg;} + inline vInteger TensorRemove(vInteger arg){ return arg;} + +} + +#endif diff --git a/Grid_math_types.h b/Grid_math_types.h index 25524966..a2a3f97c 100644 --- a/Grid_math_types.h +++ b/Grid_math_types.h @@ -1,63 +1,11 @@ #ifndef GRID_MATH_TYPES_H #define GRID_MATH_TYPES_H + +#include + namespace Grid { - -////////////////////////////////////////////////////////////////////////////////// -// Want to recurse: GridTypeMapper >::scalar_type == ComplexD. 
-////////////////////////////////////////////////////////////////////////////////// - - template class GridTypeMapper { - public: - typedef typename T::scalar_type scalar_type; - typedef typename T::vector_type vector_type; - }; - - template<> class GridTypeMapper { - public: - typedef RealF scalar_type; - typedef RealF vector_type; - }; - template<> class GridTypeMapper { - public: - typedef RealD scalar_type; - typedef RealD vector_type; - }; - template<> class GridTypeMapper { - public: - typedef ComplexF scalar_type; - typedef ComplexF vector_type; - }; - template<> class GridTypeMapper { - public: - typedef ComplexD scalar_type; - typedef ComplexD vector_type; - }; - - template<> class GridTypeMapper { - public: - typedef RealF scalar_type; - typedef vRealF vector_type; - }; - template<> class GridTypeMapper { - public: - typedef RealD scalar_type; - typedef vRealD vector_type; - }; - template<> class GridTypeMapper { - public: - typedef ComplexF scalar_type; - typedef vComplexF vector_type; - }; - template<> class GridTypeMapper { - public: - typedef ComplexD scalar_type; - typedef vComplexD vector_type; - }; - - - /////////////////////////////////////////////////// // Scalar, Vector, Matrix objects. // These can be composed to form tensor products of internal indices. @@ -70,9 +18,16 @@ public: typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; + typedef iScalar tensor_reduced; + iScalar(){}; + + iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type + iScalar(Zero &z){ *this = zero; }; + iScalar & operator= (const Zero &hero){ zeroit(*this); return *this; @@ -80,22 +35,27 @@ public: friend void zeroit(iScalar &that){ zeroit(that._internal); } - friend void permute(iScalar &out,iScalar &in,int permutetype){ + friend void permute(iScalar &out,const iScalar &in,int permutetype){ permute(out._internal,in._internal,permutetype); } - friend void extract(iScalar &in,std::vector &out){ + friend void extract(const iScalar &in,std::vector &out){ extract(in._internal,out); // extract advances the pointers in out } friend void merge(iScalar &in,std::vector &out){ merge(in._internal,out); // extract advances the pointers in out } + friend inline iScalar::vector_type TensorRemove(iScalar arg) + { + return TensorRemove(arg._internal); + } + // Unary negation friend inline iScalar operator -(const iScalar &r) { iScalar ret; ret._internal= -r._internal; return ret; } - // *=,+=,-= operators + // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour inline iScalar &operator *=(const iScalar &r) { *this = (*this)*r; return *this; @@ -117,6 +77,9 @@ public: typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; + typedef iScalar tensor_reduced; + iVector(Zero &z){ *this = zero; }; iVector() {}; @@ -129,12 +92,12 @@ public: zeroit(that._internal[i]); } } - friend void permute(iVector &out,iVector &in,int permutetype){ + friend void permute(iVector &out,const iVector &in,int permutetype){ for(int i=0;i &in,std::vector &out){ + friend void extract(const iVector &in,std::vector &out){ for(int i=0;i &operator *=(const iScalar &r) { *this = (*this)*r; return *this; @@ -163,10 +126,8 @@ public: *this = (*this)+r; return *this; } - }; - template class iMatrix { public: @@ -174,62 +135,64 @@ public: 
typedef typename GridTypeMapper::scalar_type scalar_type; typedef typename GridTypeMapper::vector_type vector_type; + typedef typename GridTypeMapper::tensor_reduced tensor_reduced_v; + typedef iScalar tensor_reduced; - iMatrix(Zero &z){ *this = zero; }; - iMatrix() {}; - iMatrix & operator= (Zero &hero){ - zeroit(*this); - return *this; - } - friend void zeroit(iMatrix &that){ - for(int i=0;i &out,iMatrix &in,int permutetype){ - for(int i=0;i & operator= (Zero &hero){ + zeroit(*this); + return *this; + } + friend void zeroit(iMatrix &that){ + for(int i=0;i &out,const iMatrix &in,int permutetype){ + for(int i=0;i &in,std::vector &out){ - for(int i=0;i &in,std::vector &out){ + for(int i=0;i &in,std::vector &out){ - for(int i=0;i &in,std::vector &out){ + for(int i=0;i operator -(const iMatrix &r) { - iMatrix ret; - for(int i=0;i - inline iMatrix &operator *=(const T &r) { - *this = (*this)*r; - return *this; - } - template - inline iMatrix &operator -=(const T &r) { - *this = (*this)-r; - return *this; - } - template - inline iMatrix &operator +=(const T &r) { - *this = (*this)+r; - return *this; - } + }} + } + // Unary negation + friend inline iMatrix operator -(const iMatrix &r) { + iMatrix ret; + for(int i=0;i + inline iMatrix &operator *=(const T &r) { + *this = (*this)*r; + return *this; + } + template + inline iMatrix &operator -=(const T &r) { + *this = (*this)-r; + return *this; + } + template + inline iMatrix &operator +=(const T &r) { + *this = (*this)+r; + return *this; + } }; @@ -642,7 +605,8 @@ iVector operator * (const iVector& lhs,const iScalar& r // mat x vec = vec // vec x scal = vec // scal x vec = vec - + // + // We can special case scalar_type ?? template inline auto operator * (const iScalar& lhs,const iScalar& rhs) -> iScalar { @@ -715,6 +679,229 @@ auto operator * (const iVector& lhs,const iScalar& rhs) -> iVector inline iScalar operator * (const iScalar& lhs,const typename iScalar::scalar_type rhs) +{ + typename iScalar::tensor_reduced srhs(rhs); + return lhs*srhs; +} +template inline iScalar operator * (const typename iScalar::scalar_type lhs,const iScalar& rhs) { return rhs*lhs; } + +template inline iVector operator * (const iVector& lhs,const typename iScalar::scalar_type rhs) +{ + typename iVector::tensor_reduced srhs(rhs); + return lhs*srhs; +} +template inline iVector operator * (const typename iScalar::scalar_type lhs,const iVector& rhs) { return rhs*lhs; } + +template inline iMatrix operator * (const iMatrix& lhs,const typename iScalar::scalar_type &rhs) +{ + typename iMatrix::tensor_reduced srhs(rhs); + return lhs*srhs; +} +template inline iMatrix operator * (const typename iScalar::scalar_type & lhs,const iMatrix& rhs) { return rhs*lhs; } + +//////////////////////////////////////////////////////////////////// +// Double support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator * (const iScalar& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iScalar operator * (double lhs,const iScalar& rhs) { return rhs*lhs; } + +template inline iVector operator * (const iVector& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iVector operator * (double lhs,const iVector& rhs) { return rhs*lhs; } + +template inline iMatrix operator * (const iMatrix& lhs,double rhs) +{ + typename 
iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iMatrix operator * (double lhs,const iMatrix& rhs) { return rhs*lhs; } + +//////////////////////////////////////////////////////////////////// +// Integer support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator * (const iScalar& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iScalar operator * (Integer lhs,const iScalar& rhs) { return rhs*lhs; } + +template inline iVector operator * (const iVector& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iVector operator * (Integer lhs,const iVector& rhs) { return rhs*lhs; } + +template inline iMatrix operator * (const iMatrix& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs*srhs; +} +template inline iMatrix operator * (Integer lhs,const iMatrix& rhs) { return rhs*lhs; } + + + +/////////////////////////////////////////////////////////////////////////////////////////////// +// addition by fundamental scalar type applies to matrix(down diag) and scalar +/////////////////////////////////////////////////////////////////////////////////////////////// +template inline iScalar operator + (const iScalar& lhs,const typename iScalar::scalar_type rhs) +{ + typename iScalar::tensor_reduced srhs(rhs); + return lhs+srhs; +} +template inline iScalar operator + (const typename iScalar::scalar_type lhs,const iScalar& rhs) { return rhs+lhs; } + +template inline iMatrix operator + (const iMatrix& lhs,const typename iScalar::scalar_type rhs) +{ + typename iMatrix::tensor_reduced srhs(rhs); + return lhs+srhs; +} +template inline iMatrix operator + (const typename iScalar::scalar_type lhs,const iMatrix& rhs) { return rhs+lhs; } + +//////////////////////////////////////////////////////////////////// +// Double support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator + (const iScalar& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs+srhs; +} +template inline iScalar operator + (double lhs,const iScalar& rhs) { return rhs+lhs; } + +template inline iMatrix operator + (const iMatrix& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs+srhs; +} +template inline iMatrix operator + (double lhs,const iMatrix& rhs) { return rhs+lhs; } + +//////////////////////////////////////////////////////////////////// +// Integer support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator + (const iScalar& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs+srhs; +} +template inline iScalar operator + (Integer lhs,const iScalar& rhs) { return rhs+lhs; } + +template inline iMatrix operator + (const iMatrix& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs+srhs; +} +template inline iMatrix operator + (Integer lhs,const iMatrix& rhs) { return rhs+lhs; } + + 
+/////////////////////////////////////////////////////////////////////////////////////////////// +// subtraction of fundamental scalar type applies to matrix(down diag) and scalar +/////////////////////////////////////////////////////////////////////////////////////////////// +template inline iScalar operator - (const iScalar& lhs,const typename iScalar::scalar_type rhs) +{ + typename iScalar::tensor_reduced srhs(rhs); + return lhs-srhs; +} +template inline iScalar operator - (const typename iScalar::scalar_type lhs,const iScalar& rhs) +{ + typename iScalar::tensor_reduced slhs(lhs); + return slhs-rhs; +} + +template inline iMatrix operator - (const iMatrix& lhs,const typename iScalar::scalar_type rhs) +{ + typename iScalar::tensor_reduced srhs(rhs); + return lhs-srhs; +} +template inline iMatrix operator - (const typename iScalar::scalar_type lhs,const iMatrix& rhs) +{ + typename iScalar::tensor_reduced slhs(lhs); + return slhs-rhs; +} + +//////////////////////////////////////////////////////////////////// +// Double support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator - (const iScalar& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs-srhs; +} +template inline iScalar operator - (double lhs,const iScalar& rhs) +{ + typename iScalar::scalar_type t(lhs); + typename iScalar::tensor_reduced slhs(t); + return slhs-rhs; +} + +template inline iMatrix operator - (const iMatrix& lhs,double rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs-srhs; +} +template inline iMatrix operator - (double lhs,const iMatrix& rhs) +{ + typename iScalar::scalar_type t(lhs); + typename iScalar::tensor_reduced slhs(t); + return slhs-rhs; +} + +//////////////////////////////////////////////////////////////////// +// Integer support; cast to "scalar_type" through constructor +//////////////////////////////////////////////////////////////////// +template inline iScalar operator - (const iScalar& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs-srhs; +} +template inline iScalar operator - (Integer lhs,const iScalar& rhs) +{ + typename iScalar::scalar_type t(lhs); + typename iScalar::tensor_reduced slhs(t); + return slhs-rhs; +} +template inline iMatrix operator - (const iMatrix& lhs,Integer rhs) +{ + typename iScalar::scalar_type t(rhs); + typename iScalar::tensor_reduced srhs(t); + return lhs-srhs; +} +template inline iMatrix operator - (Integer lhs,const iMatrix& rhs) +{ + typename iScalar::scalar_type t(lhs); + typename iScalar::tensor_reduced slhs(t); + return slhs-rhs; +} + + + + /////////////////////////////////////////////////////////////////////////////////////// // localInnerProduct Scalar x Scalar -> Scalar // localInnerProduct Vector x Vector -> Scalar @@ -907,6 +1094,7 @@ inline auto trace(const iScalar &arg) -> iScalar +inline void where(Lattice &ret,const LatticeInteger &predicate,Lattice &iftrue,Lattice &iffalse) +{ + conformable(iftrue,iffalse); + conformable(iftrue,predicate); + conformable(iftrue,ret); + + GridBase *grid=iftrue._grid; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + const int Nsimd = grid->Nsimd(); + const int words = sizeof(vobj)/sizeof(vector_type); + + std::vector mask(Nsimd); + std::vector > truevals (Nsimd,std::vector(words) ); + std::vector 
> falsevals(Nsimd,std::vector(words) ); + std::vector pointers(Nsimd); + +#pragma omp parallel for + for(int ss=0;ssoSites(); ss++){ + + for(int s=0;s +inline Lattice where(const LatticeInteger &predicate,Lattice &iftrue,Lattice &iffalse) +{ + conformable(iftrue,iffalse); + conformable(iftrue,predicate); + + Lattice ret(iftrue._grid); + + where(ret,predicate,iftrue,iffalse); + + return ret; +} + +#endif diff --git a/Grid_simd.h b/Grid_simd.h index 53cc8909..1229d328 100644 --- a/Grid_simd.h +++ b/Grid_simd.h @@ -11,28 +11,15 @@ // Vector types are arch dependent //////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////// - // SIMD Alignment controls - //////////////////////////////////////////////////////////// -#ifdef HAVE_VAR_ATTRIBUTE_ALIGNED -#define ALIGN_DIRECTIVE(A) __attribute__ ((aligned(A))) -#else -#define ALIGN_DIRECTIVE(A) __declspec(align(A)) -#endif #ifdef SSE2 #include -#define SIMDalign ALIGN_DIRECTIVE(16) #endif - #if defined(AVX1) || defined (AVX2) #include -#define SIMDalign ALIGN_DIRECTIVE(32) #endif - #ifdef AVX512 #include -#define SIMDalign ALIGN_DIRECTIVE(64) #endif namespace Grid { @@ -137,41 +124,66 @@ namespace Grid { // Generic extract/merge/permute ///////////////////////////////////////////////////////////////// template -inline void Gextract(vsimd &y,std::vector &extracted){ -#if 1 +inline void Gextract(const vsimd &y,std::vector &extracted){ // FIXME: bounce off stack is painful // temporary hack while I figure out better way. // There are intrinsics to do this work without the storage. - int Nsimd = extracted.size(); - { - std::vector > buf(Nsimd); - vstore(y,&buf[0]); - for(int i=0;i > buf(Nsimd); + vstore(y,&buf[0]); + for(int i=0;i inline void Gmerge(vsimd &y,std::vector &extracted){ -#if 1 - int Nsimd = extracted.size(); + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + std::vector buf(Nsimd); - for(int i=0;i +inline void Gextract(const vsimd &y,std::vector &extracted){ + // FIXME: bounce off stack is painful + // temporary hack while I figure out better way. + // There are intrinsics to do this work without the storage. + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + + std::vector > buf(Nsimd); + + vstore(y,&buf[0]); + + for(int i=0;i +inline void Gmerge(vsimd &y,std::vector &extracted){ + int Nextr=extracted.size(); + int Nsimd=vsimd::Nsimd(); + int s=Nsimd/Nextr; + + std::vector buf(Nsimd); + for(int i=0;i &extracted){ // Permute 4 possible on half precision @512bit vectors. ////////////////////////////////////////////////////////// template -inline void Gpermute(vsimd &y,vsimd b,int perm){ +inline void Gpermute(vsimd &y,const vsimd &b,int perm){ switch (perm){ #if defined(AVX1)||defined(AVX2) // 8x32 bits=>3 permutes @@ -214,10 +226,10 @@ inline void Gpermute(vsimd &y,vsimd b,int perm){ }; }; +#include #include #include #include #include -#include #endif diff --git a/Grid_stencil.h b/Grid_stencil.h index d93204fc..4d73f436 100644 --- a/Grid_stencil.h +++ b/Grid_stencil.h @@ -8,5 +8,355 @@ // Lattice could also allocate haloes which get used for stencil code. // // Grid could create a neighbour index table for a given stencil. -// Could also implement CovariantCshift. +// +// Could also implement CovariantCshift, to fuse the loops and enhance performance. 
+// +// +// General stencil computation: +// +// Generic services +// 0) Prebuild neighbour tables +// 1) Compute sizes of all haloes/comms buffers; allocate them. +// +// 2) Gather all faces, and communicate. +// 3) Loop over result sites, giving nbr index/offnode info for each +// +// Could take a +// SpinProjectFaces +// start comms +// complete comms +// Reconstruct Umu +// +// Approach. +// ////////////////////////////////////////////////////////////////////////////////////////// + +namespace Grid { + + class Stencil { + public: + + Stencil(GridBase *grid, + int npoints, + int checkerboard, + std::vector directions, + std::vector distances); + + void Stencil_local (int dimension,int shift,int cbmask); + void Stencil_comms (int dimension,int shift,int cbmask); + void Stencil_comms_simd(int dimension,int shift,int cbmask); + // Will need to implement actions for + // + Copy_plane; + Copy_plane_permute; + Gather_plane; + + + + // The offsets to all neibours in stencil in each direction + int _checkerboard; + int _npoints; // Move to template param? + GridBase * _grid; + + // Store these as SIMD Integer needed + // + // std::vector< iVector > _offsets; + // std::vector< iVector > _local; + // std::vector< iVector > _comm_buf_size; + // std::vector< iVector > _permute; + + std::vector > _offsets; + std::vector > _local; + std::vector _comm_buf_size; + std::vector _permute; + + }; + + Stencil::Stencil(GridBase *grid, + int npoints, + int checkerboard, + std::vector directions, + std::vector distances){ + + _npoints = npoints; + _grid = grid; + + for(int i=0;i_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + + _checkerboard = checkerboard; + + // the permute type + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim); + + int sshift[2]; + + if ( !comm_dim ) { + sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); + sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + + if ( sshift[0] == sshift[1] ) { + Stencil_local(dimension,shift,0x3); + } else { + Stencil_local(dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Stencil_local(dimension,shift,0x2);// both with block stride loop iteration + } + } else if ( splice_dim ) { + sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); + sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + + if ( sshift[0] == sshift[1] ) { + Stencil_comms_simd(dimension,shift,0x3); + } else { + Stencil_comms_simd(dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Stencil_comms_simd(dimension,shift,0x2);// both with block stride loop iteration + } + } else { + // Cshift_comms(ret,rhs,dimension,shift); + sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0); + sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1); + if ( sshift[0] == sshift[1] ) { + Stencil_comms(dimension,shift,0x3); + } else { + Stencil_comms(dimension,shift,0x1);// if checkerboard is unfavourable take two passes + Stencil_comms(dimension,shift,0x2);// both with block stride loop iteration + } + } + } + } + + + void Stencil::Stencil_local (int dimension,int shift,int cbmask) + { + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int gd = _grid->_gdimensions[dimension]; + + // Map to always positive shift modulo global full dimension. 
+ shift = (shift+fd)%fd; + + // the permute type + int permute_dim =_grid->PermuteDim(dimension); + int permute_type=_grid->PermuteType(dimension); + + for(int x=0;x_ostride[dimension]; + + int cb= (cbmask==0x2)? 1 : 0; + + int sshift = _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); + int sx = (x+sshift)%rd; + + int permute_slice=0; + if(permute_dim){ + int wrap = sshift/rd; + int num = sshift%rd; + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + } + + if ( permute_slice ) Copy_plane_permute(dimension,x,sx,cbmask,permute_type); + else Copy_plane (dimension,x,sx,cbmask); + + } + } + + void Stencil::Stencil_comms (int dimension,int shift,int cbmask) + { + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + GridBase *grid=_grid; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(simd_layout==1); + assert(comm_dim==1); + assert(shift>=0); + assert(shift_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; + // FIXME: Do something with buffer_size?? + + int cb= (cbmask==0x2)? 1 : 0; + int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); + + for(int x=0;x= rd ); + int sx = (x+sshift)%rd; + int comm_proc = (x+sshift)/rd; + + if (!offnode) { + + Copy_plane(dimension,x,sx,cbmask); + + } else { + + int words = send_buf.size(); + if (cbmask != 0x3) words=words>>1; + + int bytes = words * sizeof(vobj); + + Gather_plane_simple (dimension,sx,cbmask); + + int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + /* + grid->SendToRecvFrom((void *)&send_buf[0], + xmit_to_rank, + (void *)&recv_buf[0], + recv_from_rank, + bytes); + */ + Scatter_plane_simple (dimension,x,cbmask); + } + } + } + + void Stencil::Stencil_comms_simd(int dimension,int shift,int cbmask) + { + GridBase *grid=_grid; + const int Nsimd = _grid->Nsimd(); + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + int fd = _grid->_fdimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + assert(comm_dim==1); + assert(simd_layout==2); + assert(shift>=0); + assert(shiftPermuteType(dimension); + + /////////////////////////////////////////////// + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = _grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; + // FIXME do something with buffer size + + std::vector pointers(Nsimd); // + std::vector rpointers(Nsimd); // received pointers + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + + int cb = (cbmask==0x2)? 
1 : 0; + int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb); + + std::vector comm_offnode(simd_layout); + std::vector comm_proc (simd_layout); //relative processor coord in dim=dimension + std::vector icoor(grid->Nd()); + + for(int x=0;x= ld; + comm_any = comm_any | comm_offnode[s]; + comm_proc[s] = shifted_x/ld; + } + + int o = 0; + int bo = x*grid->_ostride[dimension]; + int sx = (x+sshift)%rd; + + if ( comm_any ) { + + for(int i=0;iiCoorFromIindex(icoor,i); + s = icoor[dimension]; + + if(comm_offnode[s]){ + + int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank); + + /* + grid->SendToRecvFrom((void *)&send_buf_extract[i][0], + xmit_to_rank, + (void *)&recv_buf_extract[i][0], + recv_from_rank, + bytes); + */ + + rpointers[i] = (scalar_type *)&recv_buf_extract[i][0]; + + } else { + + rpointers[i] = (scalar_type *)&send_buf_extract[i][0]; + + } + + } + + // Permute by swizzling pointers in merge + int permute_slice=0; + int lshift=sshift%ld; + int wrap =lshift/rd; + int num =lshift%rd; + + if ( x< rd-num ) permute_slice=wrap; + else permute_slice = 1-wrap; + + int toggle_bit = (Nsimd>>(permute_type+1)); + int PermuteMap; + for(int i=0;i(y,extracted); } - friend inline void extract(vComplexD &y,std::vector &extracted) + friend inline void extract(const vComplexD &y,std::vector &extracted) + { + Gextract(y,extracted); + } + friend inline void merge(vComplexD &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vComplexD &y,std::vector &extracted) { Gextract(y,extracted); } @@ -184,6 +198,11 @@ namespace Grid { /////////////////////// // Splat /////////////////////// + friend inline void vsplat(vComplexD &ret,ComplexD c){ + float a= real(c); + float b= imag(c); + vsplat(ret,a,b); + } friend inline void vsplat(vComplexD &ret,double rl,double ig){ #if defined (AVX1)|| defined (AVX2) ret.v = _mm256_set_pd(ig,rl,ig,rl); @@ -215,7 +234,7 @@ namespace Grid { #endif } -friend inline void vstore(vComplexD &ret, ComplexD *a){ +friend inline void vstore(const vComplexD &ret, ComplexD *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_pd((double *)a,ret.v); #endif diff --git a/Grid_vComplexF.h b/Grid_vComplexF.h index b7fb3d6a..b6b7ebe9 100644 --- a/Grid_vComplexF.h +++ b/Grid_vComplexF.h @@ -20,6 +20,12 @@ namespace Grid { return (*this); } vComplexF(){}; + vComplexF(ComplexF a){ + vsplat(*this,a); + }; + vComplexF(double a){ + vsplat(*this,ComplexF(a)); + }; /////////////////////////////////////////////// // mac, mult, sub, add, adj @@ -161,7 +167,7 @@ namespace Grid { vsplat(ret,a,b); } -friend inline void vstore(vComplexF &ret, ComplexF *a){ +friend inline void vstore(const vComplexF &ret, ComplexF *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_ps((float *)a,ret.v); #endif @@ -210,27 +216,47 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){ #endif } - friend inline vComplexF operator * (const Complex &a, vComplexF b){ vComplexF va; vsplat(va,a); return va*b; } friend inline vComplexF operator * (vComplexF b,const Complex &a){ + return a*b; + } + + /* + template + friend inline vComplexF operator * (vComplexF b,const real &a){ vComplexF va; - vsplat(va,a); + Complex ca(a,0); + vsplat(va,ca); return va*b; } + template + friend inline vComplexF operator * (const real &a,vComplexF b){ + return a*b; + } + friend inline vComplexF operator + (const Complex &a, vComplexF b){ vComplexF va; vsplat(va,a); return va+b; } friend inline vComplexF 
operator + (vComplexF b,const Complex &a){ - vComplexF va; - vsplat(va,a); - return b+va; + return a+b; } + template + friend inline vComplexF operator + (vComplexF b,const real &a){ + vComplexF va; + Complex ca(a,0); + vsplat(va,ca); + return va+b; + } + template + friend inline vComplexF operator + (const real &a,vComplexF b){ + return a+b; + } friend inline vComplexF operator - (const Complex &a, vComplexF b){ vComplexF va; vsplat(va,a); @@ -241,7 +267,24 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){ vsplat(va,a); return b-va; } - // NB: Template the following on "type Complex" and then implement *,+,- for ComplexF, ComplexD, RealF, RealD above to + template + friend inline vComplexF operator - (vComplexF b,const real &a){ + vComplexF va; + Complex ca(a,0); + vsplat(va,ca); + return b-va; + } + template + friend inline vComplexF operator - (const real &a,vComplexF b){ + vComplexF va; + Complex ca(a,0); + vsplat(va,ca); + return va-b; + } + */ + + // NB: Template the following on "type Complex" and then implement *,+,- for + // ComplexF, ComplexD, RealF, RealD above to // get full generality of binops with scalars. friend inline void mac (vComplexF *__restrict__ y,const Complex *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); }; friend inline void mult(vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); } @@ -304,7 +347,15 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){ { Gmerge(y,extracted); } - friend inline void extract(vComplexF &y,std::vector &extracted) + friend inline void extract(const vComplexF &y,std::vector &extracted) + { + Gextract(y,extracted); + } + friend inline void merge(vComplexF &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vComplexF &y,std::vector &extracted) { Gextract(y,extracted); } diff --git a/Grid_vInteger.h b/Grid_vInteger.h index 6ddce191..82adbd8e 100644 --- a/Grid_vInteger.h +++ b/Grid_vInteger.h @@ -10,7 +10,7 @@ namespace Grid { typedef uint32_t Integer; - class vInteger { + class vInteger { protected: public: @@ -21,6 +21,13 @@ namespace Grid { typedef Integer scalar_type; vInteger(){}; + vInteger & operator = (const Zero & z){ + vzero(*this); + return (*this); + } + vInteger(Integer a){ + vsplat(*this,a); + }; //////////////////////////////////// // Arithmetic operator overloads +,-,* //////////////////////////////////// @@ -166,18 +173,18 @@ namespace Grid { #endif } -friend inline void vstore(vInteger &ret, Integer *a){ + friend inline void vstore(const vInteger &ret, Integer *a){ #if defined (AVX1)|| defined (AVX2) - _mm256_store_si256((__m256i*)a,ret.v); + _mm256_store_si256((__m256i*)a,ret.v); #endif #ifdef SSE2 - _mm_store_si128(a,ret.v); + _mm_store_si128(a,ret.v); #endif #ifdef AVX512 - _mm512_store_si512(a,ret.v); + _mm512_store_si512(a,ret.v); #endif #ifdef QPX - assert(0); + assert(0); #endif } @@ -185,6 +192,7 @@ friend inline void vstore(vInteger &ret, Integer *a){ { _mm_prefetch((const char*)&v.v,_MM_HINT_T0); } + // Unary negation friend inline vInteger operator -(const vInteger &r) { vInteger ret; @@ -210,9 +218,32 @@ friend inline void vstore(vInteger &ret, Integer *a){ *this = *this-r; return *this; } + + friend inline void permute(vInteger &y,const vInteger b,int perm) + { + Gpermute(y,b,perm); + } + friend inline void merge(vInteger &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vInteger &y,std::vector &extracted) + { + Gextract(y,extracted); 
+ } + friend inline void merge(vInteger &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vInteger &y,std::vector &extracted) + { + Gextract(y,extracted); + } + + public: - static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);} - }; + static inline int Nsimd(void) { return sizeof(ivec)/sizeof(Integer);} + }; inline vInteger localInnerProduct(const vInteger & l, const vInteger & r) { return l*r; } @@ -222,27 +253,7 @@ friend inline void vstore(vInteger &ret, Integer *a){ { return l*r; } - - - class vIntegerF : public vInteger - { - public: - static inline int Nsimd(void) { return sizeof(ivec)/sizeof(float);} - - friend inline void permute(vIntegerF &y,vIntegerF b,int perm) - { - Gpermute(y,b,perm); - } - friend inline void merge(vIntegerF &y,std::vector &extracted) - { - Gmerge(y,extracted); - } - friend inline void extract(vIntegerF &y,std::vector &extracted) - { - Gextract(y,extracted); - } - }; - + } #endif diff --git a/Grid_vRealD.h b/Grid_vRealD.h index 13ceedbe..a5b59be3 100644 --- a/Grid_vRealD.h +++ b/Grid_vRealD.h @@ -13,6 +13,9 @@ namespace Grid { typedef RealD scalar_type; vRealD(){}; + vRealD(RealD a){ + vsplat(*this,a); + }; friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);} friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);} @@ -112,7 +115,15 @@ namespace Grid { { Gmerge(y,extracted); } - friend inline void extract(vRealD &y,std::vector &extracted) + friend inline void extract(const vRealD &y,std::vector &extracted) + { + Gextract(y,extracted); + } + friend inline void merge(vRealD &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vRealD &y,std::vector &extracted) { Gextract(y,extracted); } @@ -157,7 +168,7 @@ namespace Grid { #endif } - friend inline void vstore(vRealD &ret, double *a){ + friend inline void vstore(const vRealD &ret, double *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_pd(a,ret.v); #endif diff --git a/Grid_vRealF.h b/Grid_vRealF.h index 185f4da8..0fe68f43 100644 --- a/Grid_vRealF.h +++ b/Grid_vRealF.h @@ -14,6 +14,9 @@ namespace Grid { typedef RealF scalar_type; vRealF(){}; + vRealF(RealF a){ + vsplat(*this,a); + }; //////////////////////////////////// // Arithmetic operator overloads +,-,* //////////////////////////////////// @@ -133,7 +136,15 @@ namespace Grid { { Gmerge(y,extracted); } - friend inline void extract(vRealF &y,std::vector &extracted) + friend inline void extract(const vRealF &y,std::vector &extracted) + { + Gextract(y,extracted); + } + friend inline void merge(vRealF &y,std::vector &extracted) + { + Gmerge(y,extracted); + } + friend inline void extract(const vRealF &y,std::vector &extracted) { Gextract(y,extracted); } @@ -180,7 +191,7 @@ namespace Grid { //////////////////////////////////////////////////////////////////////// // FIXME: gonna remove these load/store, get, set, prefetch //////////////////////////////////////////////////////////////////////// -friend inline void vstore(vRealF &ret, float *a){ +friend inline void vstore(const vRealF &ret, float *a){ #if defined (AVX1)|| defined (AVX2) _mm256_store_ps(a,ret.v); #endif diff --git a/TODO b/TODO index d2373ba1..fb7d177a 100644 --- a/TODO +++ b/TODO @@ -1,29 +1,68 @@ + * FIXME audit * Remove vload/store etc.. * Replace vset with a call to merge. * Replace vset with a call to merge. 
+* Const audit
+* extract / merge extra implementation removal
-* Conditional execution Subset, where etc...
-* Coordinate information, integers etc...
-* Integer type padding/union to vector.
-* LatticeCoordinate[mu]
+* Conditional execution, where etc...       -----DONE, simple test
+* Integer relational support                -----DONE
+* Coordinate information, integers etc...   -----DONE
+* Integer type padding/union to vector.     -----DONE
+* LatticeCoordinate[mu]                     -----DONE
-* Optimise the extract/merge SIMD routines
-* Broadcast, reduction tests.
+* Stencil operator support                  -----Initial thoughts
+
+* Subset support, slice sums etc...         -----Only need slice sum?
+                                            -----Generic cartesian subslicing?
+                                            -----Array ranges / boost extents?
+                                            -----Multigrid grid transferral.
+                                            -----Suggests generalised cartesian subblocking
+                                                 sums, returning modified grid.
+
+   Two classes of subset;
+i)  red black parity subsetting.
+    (pick checkerboard).
+
+ii) Need to be able to project one Grid to another Grid.
+    Generic concept is to subdivide (based on RD so applies to red/black or full).
+    Return a type on SUB-grid from CellSum TOP-grid
+    SUB-grid need not distribute but be replicated in some dims if that is how the
+    cartesian communicator works.
+
+iii) No general permutation map.
+
+
+* Consider switch std::vector to boost arrays.
+  boost::multi_array A()... to replace multi1d, multi2d etc..
+
+*? Cell definition <-> sliceSum.
+   ? Replicated arrays.
+
+
+
+* Check for missing functionality - partially audited against QDP++ layout
+
+* Optimise the extract/merge SIMD routines; Azusa??
+
+  -- I have collated into single location at least.
+
+  -- Need to use _mm_*insert/extract routines.
+
+* Conformable test in Cshift routines.
+
+* Gamma/Dirac structures
+
+* Fourspin, two spin project
+
+* Broadcast, reduction tests. innerProduct, localInnerProduct
 * QDP++ regression suite and comparative benchmark
 * NERSC Lattice loading, plaquette test
-* Conformable test in Cshift routines.
-
-* Gamma/Dirac structures
-* Fourspin, two spin project
-
-* Stencil operator support
-
-* Check for missing functionality
 * I/O support - MPI IO?
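
For orientation, a minimal sketch of how the new pieces combine for masked array assignment: LatticeCoordinate builds a per-site integer, the integer relational operators form a LatticeInteger predicate, and where() performs the conditional copy. It assumes the GridCartesian "Fine" and the LatticeColourMatrix fields Foo and Bar constructed in the Grid_main.cc hunk above; the stride array mm[] follows that test code and all names are taken from it.

  Integer mm[4];
  mm[0]=1;
  mm[1]=Fine._rdimensions[0];
  mm[2]=Fine._ldimensions[0]*Fine._ldimensions[1];
  mm[3]=Fine._ldimensions[0]*Fine._ldimensions[1]*Fine._ldimensions[2];

  LatticeInteger lex(&Fine);               // lexicographic site index
  lex = zero;
  for(int d=0;d<4;d++){
    LatticeInteger coor(&Fine);
    LatticeCoordinate(coor,d);             // global (reduced) coordinate in direction d
    lex = lex + coor*mm[d];
  }

  Bar = zero;
  Bar = where(lex<10,Foo,Bar);             // lex<10 yields a LatticeInteger predicate;
                                           // where() assigns Foo only on the selected sites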