mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 01:05:38 +01:00

"where" and integer comparisons logic implemented for conditional

assignment. LatticeCoordinate helper to get global (reduced) coordinate.

Some more work of similar type perhaps needed, but the bulk of the required
structure for masked array assignment is now in place.
Peter Boyle 2015-04-09 08:06:03 +02:00
parent 4666acfbb0
commit 8f5281563e
17 changed files with 1420 additions and 240 deletions
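In outline, the new pieces compose as follows (a sketch based on the test added to Grid_main.cc below; Fine is an initialised GridCartesian, Foo and Bar are conformable LatticeColourMatrix fields, and mm[] holds the lexicographic strides built in that test):

    LatticeInteger coor(&Fine);
    LatticeInteger lex(&Fine);
    lex = zero;
    for(int d=0;d<4;d++){
        LatticeCoordinate(coor,d);   // new helper: global (reduced) coordinate in direction d
        lex = lex + coor*mm[d];      // integer arithmetic on the whole lattice
    }
    Bar = where(lex<10,Foo,Bar);     // new comparison + "where": Bar <- Foo on sites with lex<10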

Grid.h

@@ -41,12 +41,12 @@
#include <malloc.h>
#endif
#include <Grid_aligned_allocator.h>
#include <Grid_simd.h>
#include <Grid_math_types.h>
#include <Grid_Cartesian.h>
#include <Grid_Lattice.h>
+#include <Grid_comparison.h>
#include <Grid_QCD.h>
namespace Grid {

Grid_Lattice.h

@@ -19,16 +19,14 @@ public:
    typedef typename vobj::vector_type vector_type;
public:
    Lattice(GridBase *grid) : _grid(grid) {
        _odata.reserve(_grid->oSites());
        assert((((uint64_t)&_odata[0])&0xF) ==0);
        checkerboard=0;
    }
#include <Grid_cshift.h>
    template<class obj1,class obj2>
    friend void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs);
@@ -156,23 +154,23 @@ public:
            v_ptr[i]=drand48();
        }
    };
    // FIXME for debug; deprecate this
    friend void lex_sites(Lattice<vobj> &l){
        Real *v_ptr = (Real *)&l._odata[0];
        size_t o_len = l._grid->oSites();
        size_t v_len = sizeof(vobj)/sizeof(vRealF);
        size_t vec_len = vRealF::Nsimd();
        for(int i=0;i<o_len;i++){
        for(int j=0;j<v_len;j++){
        for(int vv=0;vv<vec_len;vv+=2){
            v_ptr[i*v_len*vec_len+j*vec_len+vv  ]= i+vv*500;
            v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
        }
        }}
-   };
+   }
    // FIXME Implement a consistent seed management strategy
    friend void gaussian(Lattice<vobj> &l){
        // Zero mean, unit variance.
@@ -195,7 +193,7 @@ public:
        }
        return ret;
    }
-   // *=,+=,-= operators
+   // *=,+=,-= operators inherit behaviour from the corresponding *,+,- operations
    template<class T>
    inline Lattice<vobj> &operator *=(const T &r) {
        *this = (*this)*r;
@@ -351,7 +349,6 @@ public:
    inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
    {
        Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
#pragma omp parallel for
        for(int ss=0;ss<rhs._grid->oSites(); ss++){
            ret._odata[ss]=lhs*rhs._odata[ss];
@@ -383,7 +380,7 @@ public:
    {
        Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
#pragma omp parallel for
-       for(int ss=0;ss<rhs._grid->oSites(); ss++){
+       for(int ss=0;ss<lhs._grid->oSites(); ss++){
            ret._odata[ss]=lhs._odata[ss]*rhs;
        }
        return ret;
@@ -409,5 +406,6 @@ public:
        return ret;
    }
}
#endif

Grid_QCD.h

@@ -45,7 +45,7 @@ namespace QCD {
    typedef Lattice<vTComplex> LatticeComplex;
-   typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where"
+   typedef Lattice<vInteger>  LatticeInteger; // Predicates for "where"
    typedef Lattice<vColourMatrix> LatticeColourMatrix;
    typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
@@ -92,6 +92,31 @@ namespace QCD {
        }
        return ret;
    }
+   // FIXME for debug; deprecate this
+   inline void LatticeCoordinate(LatticeInteger &l,int mu){
+       GridBase *grid = l._grid;
+       int Nsimd = grid->iSites();
+       std::vector<int> gcoor;
+       std::vector<Integer> mergebuf(Nsimd);
+       std::vector<Integer *> mergeptr(Nsimd);
+       for(int o=0;o<grid->oSites();o++){
+           for(int i=0;i<grid->iSites();i++){
+               // RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
+               grid->RankIndexToGlobalCoor(0,o,i,gcoor);
+               mergebuf[i]=gcoor[mu];
+               mergeptr[i]=&mergebuf[i];
+           }
+           merge(l._odata[o],mergeptr);
+       }
+   };
+#include <Grid_predicated.h>
+#if 0
+#endif
} //namespace QCD
} // Grid
#endif

Grid_comparison.h (new file)

@@ -0,0 +1,264 @@
#ifndef GRID_COMPARISON_H
#define GRID_COMPARISON_H
namespace Grid {
// Generic list of functors
template<class lobj,class robj> class veq {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs == rhs;
}
};
template<class lobj,class robj> class vne {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs != rhs;
}
};
template<class lobj,class robj> class vlt {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs < rhs;
}
};
template<class lobj,class robj> class vle {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs <= rhs;
}
};
template<class lobj,class robj> class vgt {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs > rhs;
}
};
template<class lobj,class robj> class vge {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return lhs >= rhs;
}
};
// Generic list of functors
template<class lobj,class robj> class seq {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs == rhs;
}
};
template<class lobj,class robj> class sne {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs != rhs;
}
};
template<class lobj,class robj> class slt {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs < rhs;
}
};
template<class lobj,class robj> class sle {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs <= rhs;
}
};
template<class lobj,class robj> class sgt {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs > rhs;
}
};
template<class lobj,class robj> class sge {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return lhs >= rhs;
}
};
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Integer gets extra relational functions. Could also implement these for RealF, RealD etc.
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class sfunctor>
inline vInteger Comparison(sfunctor sop,const vInteger & lhs, const vInteger & rhs)
{
std::vector<Integer> vlhs(vInteger::Nsimd()); // Use functors to reduce this to single implementation
std::vector<Integer> vrhs(vInteger::Nsimd());
vInteger ret;
extract(lhs,vlhs);
extract(rhs,vrhs);
for(int s=0;s<vInteger::Nsimd();s++){
vlhs[s] = sop(vlhs[s],vrhs[s]);
}
merge(ret,vlhs);
return ret;
}
inline vInteger operator < (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(slt<Integer,Integer>(),lhs,rhs);
}
inline vInteger operator <= (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(sle<Integer,Integer>(),lhs,rhs);
}
inline vInteger operator > (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(sgt<Integer,Integer>(),lhs,rhs);
}
inline vInteger operator >= (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(sge<Integer,Integer>(),lhs,rhs);
}
inline vInteger operator == (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(seq<Integer,Integer>(),lhs,rhs);
}
inline vInteger operator != (const vInteger & lhs, const vInteger & rhs)
{
return Comparison(sne<Integer,Integer>(),lhs,rhs);
}
//////////////////////////////////////////////////////////////////////////
// relational operators
//
// Support <,>,<=,>=,==,!=
//
//Query supporting bitwise &, |, ^, !
//Query supporting logical &&, ||,
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{
Lattice<vInteger> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{
Lattice<vInteger> ret(lhs._grid);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs);
}
return ret;
}
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{
Lattice<vInteger> ret(rhs._grid);
#pragma omp parallel for
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs,rhs._odata[ss]);
}
return ret;
}
// Less than
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vlt<lobj,robj>(),lhs,rhs);
}
// Less than equal
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vle<lobj,robj>(),lhs,rhs);
}
// Greater than
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vgt<lobj,robj>(),lhs,rhs);
}
// Greater than equal
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vge<lobj,robj>(),lhs,rhs);
}
// equal
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(veq<lobj,robj>(),lhs,rhs);
}
// not equal
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vne<lobj,robj>(),lhs,rhs);
}
}
#endif
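The three wrapper families cover every mixing of lattice and scalar operands, so a predicate mask is just another lattice field; a minimal sketch (hedged; lex is the LatticeInteger built in the Grid_main.cc test below):

    LatticeInteger mask(&Fine);
    mask = (lex < 10);      // Lattice vs scalar  -> LSComparison(vlt<...>(),lex,10)
    mask = (lex == lex);    // Lattice vs Lattice -> LLComparison(veq<...>(),lex,lex)
    // each site holds a vInteger of 1/0 truth values, ready for where()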

Grid_cshift_common.h

@@ -265,7 +265,6 @@ friend void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimens
//////////////////////////////////////////////////////
// Local to node Cshift
//////////////////////////////////////////////////////
friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
{
    int sshift[2];

Grid_main.cc

@@ -19,10 +19,10 @@ int main (int argc, char ** argv)
    std::vector<int> simd_layout(4);
    std::vector<int> mpi_layout(4);
-   mpi_layout[0]=2;
-   mpi_layout[1]=2;
-   mpi_layout[2]=2;
-   mpi_layout[3]=2;
+   mpi_layout[0]=1;
+   mpi_layout[1]=1;
+   mpi_layout[2]=1;
+   mpi_layout[3]=1;
#ifdef AVX512
    for(int omp=128;omp<236;omp+=16){
@@ -121,12 +121,34 @@ int main (int argc, char ** argv)
    // Non-lattice (const objects) * Lattice
    ColourMatrix cm;
    SpinColourMatrix scm;
+   vSpinColourMatrix vscm;
+   Complex cplx(1.0);
+   Integer myint=1;
+   double mydouble=1.0;
+   // vSpinColourMatrix vscm;
    scMat = cMat*scMat;
    scm = cm * scm;         // SpinColourMatrix = ColourMatrix * SpinColourMatrix
    scm = scm *cm;          // SpinColourMatrix = SpinColourMatrix * ColourMatrix
    scm = GammaFive * scm ; // SpinColourMatrix = SpinMatrix * SpinColourMatrix
    scm = scm* GammaFive ;  // SpinColourMatrix = SpinColourMatrix * SpinMatrix
+   scm = scm*cplx;
+   vscm = vscm*cplx;
+   scMat = scMat*cplx;
+   scm = cplx*scm;
+   vscm = cplx*vscm;
+   scMat = cplx*scMat;
+   scm = myint*scm;
+   vscm = myint*vscm;
+   scMat = scMat*myint;
+   scm = scm*mydouble;
+   vscm = vscm*mydouble;
+   scMat = scMat*mydouble;
+   scMat = mydouble*scMat;
+   cMat = mydouble*cMat;
    sMat = adj(sMat);       // LatticeSpinMatrix adjoint
    sMat = iGammaFive*sMat; // SpinMatrix * LatticeSpinMatrix
@@ -160,7 +182,35 @@ int main (int argc, char ** argv)
    */
    lex_sites(Foo);
+   Integer mm[4];
+   mm[0]=1;
+   mm[1]=Fine._rdimensions[0];
+   mm[2]=Fine._ldimensions[0]*Fine._ldimensions[1];
+   mm[3]=Fine._ldimensions[0]*Fine._ldimensions[1]*Fine._ldimensions[2];
+   LatticeInteger lex(&Fine);
+   lex=zero;
+   for(int d=0;d<4;d++){
+       LatticeInteger coor(&Fine);
+       LatticeCoordinate(coor,d);
+       lex = lex + coor*mm[d];
+   }
+   Bar = zero;
+   Bar = where(lex<10,Foo,Bar);
+   {
+       std::vector<int> coor(4);
+       for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
+       for(coor[2]=0;coor[2]<latt_size[2]/mpi_layout[2];coor[2]++){
+       for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
+       for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){
+           ColourMatrix bar;
+           peekSite(bar,Bar,coor);
+           for(int r=0;r<3;r++){
+           for(int c=0;c<3;c++){
+               cout<<"bar "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "<<bar._internal._internal[r][c]<<std::endl;
+           }}
+       }}}}
+   }
    //setCheckerboard(ShiftedCheck,rFoo);
    //setCheckerboard(ShiftedCheck,bFoo);
@@ -228,11 +278,15 @@ int main (int argc, char ** argv)
    double nrm=0;
+   LatticeColourMatrix deriv(&Fine);
+   double half=0.5;
+   deriv = 0.5*Cshift(Foo,0,1) - 0.5*Cshift(Foo,0,-1);
    for(int dir=0;dir<4;dir++){
        for(int shift=0;shift<latt_size[dir];shift++){
            pickCheckerboard(0,rFoo,Foo); // Pick out red or black checkerboards
            pickCheckerboard(1,bFoo,Foo);
@@ -254,6 +308,8 @@ int main (int argc, char ** argv)
    for(coor[2]=0;coor[2]<latt_size[2]/mpi_layout[2];coor[2]++){
    for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
    for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){
        std::complex<Grid::Real> diff;
@@ -305,7 +361,8 @@ int main (int argc, char ** argv)
    double nn=Ttr._internal._internal;
    if ( nn > 0 )
        cout<<"Shift real trace fail "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<endl;
    for(int r=0;r<3;r++){
    for(int c=0;c<3;c++){
        diff =shifted1._internal._internal[r][c]-shifted2._internal._internal[r][c];

Grid_math_type_mapper.h (new file)

@@ -0,0 +1,83 @@
#ifndef GRID_MATH_TYPE_MAPPER_H
#define GRID_MATH_TYPE_MAPPER_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////////
// Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
//////////////////////////////////////////////////////////////////////////////////
template <class T> class GridTypeMapper {
public:
typedef typename T::scalar_type scalar_type;
typedef typename T::vector_type vector_type;
typedef typename T::tensor_reduced tensor_reduced;
};
//////////////////////////////////////////////////////////////////////////////////
// Recursion stops with these template specialisations
//////////////////////////////////////////////////////////////////////////////////
template<> class GridTypeMapper<RealF> {
public:
typedef RealF scalar_type;
typedef RealF vector_type;
typedef RealF tensor_reduced ;
};
template<> class GridTypeMapper<RealD> {
public:
typedef RealD scalar_type;
typedef RealD vector_type;
typedef RealD tensor_reduced;
};
template<> class GridTypeMapper<ComplexF> {
public:
typedef ComplexF scalar_type;
typedef ComplexF vector_type;
typedef ComplexF tensor_reduced;
};
template<> class GridTypeMapper<ComplexD> {
public:
typedef ComplexD scalar_type;
typedef ComplexD vector_type;
typedef ComplexD tensor_reduced;
};
template<> class GridTypeMapper<vRealF> {
public:
typedef RealF scalar_type;
typedef vRealF vector_type;
typedef vRealF tensor_reduced;
};
template<> class GridTypeMapper<vRealD> {
public:
typedef RealD scalar_type;
typedef vRealD vector_type;
typedef vRealD tensor_reduced;
};
template<> class GridTypeMapper<vComplexF> {
public:
typedef ComplexF scalar_type;
typedef vComplexF vector_type;
typedef vComplexF tensor_reduced;
};
template<> class GridTypeMapper<vComplexD> {
public:
typedef ComplexD scalar_type;
typedef vComplexD vector_type;
typedef vComplexD tensor_reduced;
};
template<> class GridTypeMapper<vInteger> {
public:
typedef Integer scalar_type;
typedef vInteger vector_type;
typedef vInteger tensor_reduced;
};
// Again terminate the recursion.
inline vRealD TensorRemove(vRealD arg){ return arg;}
inline vRealF TensorRemove(vRealF arg){ return arg;}
inline vComplexF TensorRemove(vComplexF arg){ return arg;}
inline vComplexD TensorRemove(vComplexD arg){ return arg;}
inline vInteger TensorRemove(vInteger arg){ return arg;}
}
#endif
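The added tensor_reduced typedef is what lets a whole tensor collapse onto a single SIMD scalar; a hedged compile-time illustration of the recursion (std::is_same from <type_traits>):

    #include <type_traits>
    // Recursion terminates at the specialisations above:
    static_assert(std::is_same<GridTypeMapper<vComplexD>::scalar_type,ComplexD>::value,
                  "scalar_type of vComplexD is ComplexD");
    // Once iScalar/iVector/iMatrix re-export these typedefs (Grid_math_types.h below),
    // GridTypeMapper<iMatrix<vComplexD,3> >::scalar_type is ComplexD as well, and its
    // tensor_reduced is the iScalar tower iScalar<vComplexD> used for scalar broadcasts.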

Grid_math_types.h

@@ -1,63 +1,11 @@
#ifndef GRID_MATH_TYPES_H
#define GRID_MATH_TYPES_H
+#include <Grid_math_type_mapper.h>
namespace Grid {
-   //////////////////////////////////////////////////////////////////////////////////
-   // Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
-   //////////////////////////////////////////////////////////////////////////////////
-   template <class T> class GridTypeMapper {
-   public:
-       typedef typename T::scalar_type scalar_type;
-       typedef typename T::vector_type vector_type;
-   };
-   template<> class GridTypeMapper<RealF> {
-   public:
-       typedef RealF scalar_type;
-       typedef RealF vector_type;
-   };
-   template<> class GridTypeMapper<RealD> {
-   public:
-       typedef RealD scalar_type;
-       typedef RealD vector_type;
-   };
-   template<> class GridTypeMapper<ComplexF> {
-   public:
-       typedef ComplexF scalar_type;
-       typedef ComplexF vector_type;
-   };
-   template<> class GridTypeMapper<ComplexD> {
-   public:
-       typedef ComplexD scalar_type;
-       typedef ComplexD vector_type;
-   };
-   template<> class GridTypeMapper<vRealF> {
-   public:
-       typedef RealF scalar_type;
-       typedef vRealF vector_type;
-   };
-   template<> class GridTypeMapper<vRealD> {
-   public:
-       typedef RealD scalar_type;
-       typedef vRealD vector_type;
-   };
-   template<> class GridTypeMapper<vComplexF> {
-   public:
-       typedef ComplexF scalar_type;
-       typedef vComplexF vector_type;
-   };
-   template<> class GridTypeMapper<vComplexD> {
-   public:
-       typedef ComplexD scalar_type;
-       typedef vComplexD vector_type;
-   };
    ///////////////////////////////////////////////////
    // Scalar, Vector, Matrix objects.
    // These can be composed to form tensor products of internal indices.
@@ -70,9 +18,16 @@ public:
    typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
    typedef typename GridTypeMapper<vtype>::vector_type vector_type;
+   typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
+   typedef iScalar<tensor_reduced_v> tensor_reduced;
    iScalar(){};
+   iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type
    iScalar(Zero &z){ *this = zero; };
    iScalar<vtype> & operator= (const Zero &hero){
        zeroit(*this);
        return *this;
@@ -80,22 +35,27 @@ public:
    friend void zeroit(iScalar<vtype> &that){
        zeroit(that._internal);
    }
-   friend void permute(iScalar<vtype> &out,iScalar<vtype> &in,int permutetype){
+   friend void permute(iScalar<vtype> &out,const iScalar<vtype> &in,int permutetype){
        permute(out._internal,in._internal,permutetype);
    }
-   friend void extract(iScalar<vtype> &in,std::vector<scalar_type *> &out){
+   friend void extract(const iScalar<vtype> &in,std::vector<scalar_type *> &out){
        extract(in._internal,out); // extract advances the pointers in out
    }
    friend void merge(iScalar<vtype> &in,std::vector<scalar_type *> &out){
        merge(in._internal,out); // extract advances the pointers in out
    }
+   friend inline iScalar<vtype>::vector_type TensorRemove(iScalar<vtype> arg)
+   {
+       return TensorRemove(arg._internal);
+   }
    // Unary negation
    friend inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
        iScalar<vtype> ret;
        ret._internal= -r._internal;
        return ret;
    }
-   // *=,+=,-= operators
+   // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
    inline iScalar<vtype> &operator *=(const iScalar<vtype> &r) {
        *this = (*this)*r;
        return *this;
@@ -117,6 +77,9 @@ public:
    typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
    typedef typename GridTypeMapper<vtype>::vector_type vector_type;
+   typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
+   typedef iScalar<tensor_reduced_v> tensor_reduced;
    iVector(Zero &z){ *this = zero; };
    iVector() {};
@@ -129,12 +92,12 @@ public:
            zeroit(that._internal[i]);
        }
    }
-   friend void permute(iVector<vtype,N> &out,iVector<vtype,N> &in,int permutetype){
+   friend void permute(iVector<vtype,N> &out,const iVector<vtype,N> &in,int permutetype){
        for(int i=0;i<N;i++){
            permute(out._internal[i],in._internal[i],permutetype);
        }
    }
-   friend void extract(iVector<vtype,N> &in,std::vector<scalar_type *> &out){
+   friend void extract(const iVector<vtype,N> &in,std::vector<scalar_type *> &out){
        for(int i=0;i<N;i++){
            extract(in._internal[i],out);// extract advances pointers in out
        }
@@ -150,7 +113,7 @@ public:
        for(int i=0;i<N;i++) ret._internal[i]= -r._internal[i];
        return ret;
    }
-   // *=,+=,-= operators
+   // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
    inline iVector<vtype,N> &operator *=(const iScalar<vtype> &r) {
        *this = (*this)*r;
        return *this;
@@ -163,10 +126,8 @@ public:
        *this = (*this)+r;
        return *this;
    }
};
template<class vtype,int N> class iMatrix
{
public:
@@ -174,62 +135,64 @@ public:
    typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
    typedef typename GridTypeMapper<vtype>::vector_type vector_type;
+   typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
+   typedef iScalar<tensor_reduced_v> tensor_reduced;
    iMatrix(Zero &z){ *this = zero; };
    iMatrix() {};
    iMatrix<vtype,N> & operator= (Zero &hero){
        zeroit(*this);
        return *this;
    }
    friend void zeroit(iMatrix<vtype,N> &that){
        for(int i=0;i<N;i++){
        for(int j=0;j<N;j++){
            zeroit(that._internal[i][j]);
        }}
    }
-   friend void permute(iMatrix<vtype,N> &out,iMatrix<vtype,N> &in,int permutetype){
+   friend void permute(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int permutetype){
        for(int i=0;i<N;i++){
        for(int j=0;j<N;j++){
            permute(out._internal[i][j],in._internal[i][j],permutetype);
        }}
    }
-   friend void extract(iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
+   friend void extract(const iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
        for(int i=0;i<N;i++){
        for(int j=0;j<N;j++){
            extract(in._internal[i][j],out);// extract advances pointers in out
        }}
    }
    friend void merge(iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
        for(int i=0;i<N;i++){
        for(int j=0;j<N;j++){
            merge(in._internal[i][j],out);// extract advances pointers in out
        }}
    }
    // Unary negation
    friend inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
        iMatrix<vtype,N> ret;
        for(int i=0;i<N;i++){
        for(int j=0;j<N;j++){
            ret._internal[i][j]= -r._internal[i][j];
        }}
        return ret;
    }
-   // *=,+=,-= operators
+   // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
    template<class T>
    inline iMatrix<vtype,N> &operator *=(const T &r) {
        *this = (*this)*r;
        return *this;
    }
    template<class T>
    inline iMatrix<vtype,N> &operator -=(const T &r) {
        *this = (*this)-r;
        return *this;
    }
    template<class T>
    inline iMatrix<vtype,N> &operator +=(const T &r) {
        *this = (*this)+r;
        return *this;
    }
};
@@ -642,7 +605,8 @@ iVector<rtype,N> operator * (const iVector<mtype,N>& lhs,const iScalar<vtype>& r
    // mat x vec  = vec
    // vec x scal = vec
    // scal x vec = vec
+   //
+   // We can special case scalar_type ??
    template<class l,class r>
    inline auto operator * (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(lhs._internal * rhs._internal)>
    {
@@ -715,6 +679,229 @@ auto operator * (const iVector<l,N>& lhs,const iScalar<r>& rhs) -> iVector<declt
    }
    return ret;
}
//////////////////////////////////////////////////////////////////////////////////////////
// Must support native C++ types Integer, Complex, Real
//////////////////////////////////////////////////////////////////////////////////////////
// multiplication by fundamental scalar type
template<class l,int N> inline iScalar<l> operator * (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs)
{
typename iScalar<l>::tensor_reduced srhs(rhs);
return lhs*srhs;
}
template<class l,int N> inline iScalar<l> operator * (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) { return rhs*lhs; }
template<class l,int N> inline iVector<l,N> operator * (const iVector<l,N>& lhs,const typename iScalar<l>::scalar_type rhs)
{
typename iVector<l,N>::tensor_reduced srhs(rhs);
return lhs*srhs;
}
template<class l,int N> inline iVector<l,N> operator * (const typename iScalar<l>::scalar_type lhs,const iVector<l,N>& rhs) { return rhs*lhs; }
template<class l,int N> inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type &rhs)
{
typename iMatrix<l,N>::tensor_reduced srhs(rhs);
return lhs*srhs;
}
template<class l,int N> inline iMatrix<l,N> operator * (const typename iScalar<l>::scalar_type & lhs,const iMatrix<l,N>& rhs) { return rhs*lhs; }
////////////////////////////////////////////////////////////////////
// Double support; cast to "scalar_type" through constructor
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator * (const iScalar<l>& lhs,double rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
return lhs*srhs;
}
template<class l> inline iScalar<l> operator * (double lhs,const iScalar<l>& rhs) { return rhs*lhs; }
template<class l,int N> inline iVector<l,N> operator * (const iVector<l,N>& lhs,double rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
return lhs*srhs;
}
template<class l,int N> inline iVector<l,N> operator * (double lhs,const iVector<l,N>& rhs) { return rhs*lhs; }
template<class l,int N> inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,double rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
return lhs*srhs;
}
template<class l,int N> inline iMatrix<l,N> operator * (double lhs,const iMatrix<l,N>& rhs) { return rhs*lhs; }
////////////////////////////////////////////////////////////////////
// Integer support; cast to "scalar_type" through constructor
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator * (const iScalar<l>& lhs,Integer rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
return lhs*srhs;
}
template<class l> inline iScalar<l> operator * (Integer lhs,const iScalar<l>& rhs) { return rhs*lhs; }
template<class l,int N> inline iVector<l,N> operator * (const iVector<l,N>& lhs,Integer rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
return lhs*srhs;
}
template<class l,int N> inline iVector<l,N> operator * (Integer lhs,const iVector<l,N>& rhs) { return rhs*lhs; }
template<class l,int N> inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,Integer rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
return lhs*srhs;
}
template<class l,int N> inline iMatrix<l,N> operator * (Integer lhs,const iMatrix<l,N>& rhs) { return rhs*lhs; }
///////////////////////////////////////////////////////////////////////////////////////////////
// addition by fundamental scalar type applies to matrix(down diag) and scalar
///////////////////////////////////////////////////////////////////////////////////////////////
template<class l,int N> inline iScalar<l> operator + (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs)
{
typename iScalar<l>::tensor_reduced srhs(rhs);
return lhs+srhs;
}
template<class l,int N> inline iScalar<l> operator + (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) { return rhs+lhs; }
template<class l,int N> inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs)
{
typename iMatrix<l,N>::tensor_reduced srhs(rhs);
return lhs+srhs;
}
template<class l,int N> inline iMatrix<l,N> operator + (const typename iScalar<l>::scalar_type lhs,const iMatrix<l,N>& rhs) { return rhs+lhs; }
////////////////////////////////////////////////////////////////////
// Double support; cast to "scalar_type" through constructor
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator + (const iScalar<l>& lhs,double rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
return lhs+srhs;
}
template<class l> inline iScalar<l> operator + (double lhs,const iScalar<l>& rhs) { return rhs+lhs; }
template<class l,int N> inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,double rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
return lhs+srhs;
}
template<class l,int N> inline iMatrix<l,N> operator + (double lhs,const iMatrix<l,N>& rhs) { return rhs+lhs; }
////////////////////////////////////////////////////////////////////
// Integer support; cast to "scalar_type" through constructor
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator + (const iScalar<l>& lhs,Integer rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
return lhs+srhs;
}
template<class l> inline iScalar<l> operator + (Integer lhs,const iScalar<l>& rhs) { return rhs+lhs; }
template<class l,int N> inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,Integer rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
return lhs+srhs;
}
template<class l,int N> inline iMatrix<l,N> operator + (Integer lhs,const iMatrix<l,N>& rhs) { return rhs+lhs; }
///////////////////////////////////////////////////////////////////////////////////////////////
// subtraction of fundamental scalar type applies to matrix(down diag) and scalar
///////////////////////////////////////////////////////////////////////////////////////////////
template<class l,int N> inline iScalar<l> operator - (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs)
{
typename iScalar<l>::tensor_reduced srhs(rhs);
return lhs-srhs;
}
template<class l,int N> inline iScalar<l> operator - (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs)
{
typename iScalar<l>::tensor_reduced slhs(lhs);
return slhs-rhs;
}
template<class l,int N> inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs)
{
typename iScalar<l>::tensor_reduced srhs(rhs);
return lhs-srhs;
}
template<class l,int N> inline iMatrix<l,N> operator - (const typename iScalar<l>::scalar_type lhs,const iMatrix<l,N>& rhs)
{
typename iScalar<l>::tensor_reduced slhs(lhs);
return slhs-rhs;
}
////////////////////////////////////////////////////////////////////
// Double support; cast to "scalar_type" through constructor
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator - (const iScalar<l>& lhs,double rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
return lhs-srhs;
}
template<class l> inline iScalar<l> operator - (double lhs,const iScalar<l>& rhs)
{
typename iScalar<l>::scalar_type t(lhs);
typename iScalar<l>::tensor_reduced slhs(t);
return slhs-rhs;
}
template<class l,int N> inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,double rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
return lhs-srhs;
}
template<class l,int N> inline iMatrix<l,N> operator - (double lhs,const iMatrix<l,N>& rhs)
{
typename iScalar<l>::scalar_type t(lhs);
typename iScalar<l>::tensor_reduced slhs(t);
return slhs-rhs;
}
////////////////////////////////////////////////////////////////////
// Integer support; cast to "scalar_type" through constructor
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator - (const iScalar<l>& lhs,Integer rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
return lhs-srhs;
}
template<class l> inline iScalar<l> operator - (Integer lhs,const iScalar<l>& rhs)
{
typename iScalar<l>::scalar_type t(lhs);
typename iScalar<l>::tensor_reduced slhs(t);
return slhs-rhs;
}
template<class l,int N> inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,Integer rhs)
{
typename iScalar<l>::scalar_type t(rhs);
typename iScalar<l>::tensor_reduced srhs(t);
return lhs-srhs;
}
template<class l,int N> inline iMatrix<l,N> operator - (Integer lhs,const iMatrix<l,N>& rhs)
{
typename iScalar<l>::scalar_type t(lhs);
typename iScalar<l>::tensor_reduced slhs(t);
return slhs-rhs;
}
///////////////////////////////////////////////////////////////////////////////////////
// localInnerProduct Scalar x Scalar -> Scalar
// localInnerProduct Vector x Vector -> Scalar
@@ -907,6 +1094,7 @@ inline auto trace(const iScalar<vtype> &arg) -> iScalar<decltype(trace(arg._inte
    ret._internal=trace(arg._internal);
    return ret;
}
};
#endif
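All the fundamental-scalar overloads above share one promotion pattern: build the scalar_type, wrap it in tensor_reduced (an iScalar tower over the SIMD type, filled through the new vsplat constructors), then fall through to the existing tensor products. Unrolled by hand for one case (hedged sketch):

    iMatrix<vComplexD,3> m;
    iMatrix<vComplexD,3>::scalar_type    t(2.0);  // step 1: ComplexD(2.0)
    iMatrix<vComplexD,3>::tensor_reduced s(t);    // step 2: iScalar<vComplexD>, broadcast by vsplat
    iMatrix<vComplexD,3> m2 = m*s;                // step 3: existing iMatrix * iScalar product
    // ...which is what "m*2.0" dispatches to via operator*(const iMatrix<l,N>&,double).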

Grid_predicated.h (new file)

@@ -0,0 +1,62 @@
#ifndef GRID_PREDICATED_H
#define GRID_PREDICATED_H
// Must implement the predicate gating the
// Must be able to reduce the predicate down to a single vInteger per site.
// Must be able to require the type be iScalar x iScalar x ....
// give a GetVtype method in iScalar
// and blow away the tensor structures.
//
template<class vobj>
inline void where(Lattice<vobj> &ret,const LatticeInteger &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
{
conformable(iftrue,iffalse);
conformable(iftrue,predicate);
conformable(iftrue,ret);
GridBase *grid=iftrue._grid;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int Nsimd = grid->Nsimd();
const int words = sizeof(vobj)/sizeof(vector_type);
std::vector<Integer> mask(Nsimd);
std::vector<std::vector<scalar_type> > truevals (Nsimd,std::vector<scalar_type>(words) );
std::vector<std::vector<scalar_type> > falsevals(Nsimd,std::vector<scalar_type>(words) );
std::vector<scalar_type *> pointers(Nsimd);
#pragma omp parallel for
for(int ss=0;ss<iftrue._grid->oSites(); ss++){
for(int s=0;s<Nsimd;s++) pointers[s] = & truevals[s][0];
extract(iftrue._odata[ss] ,pointers);
for(int s=0;s<Nsimd;s++) pointers[s] = & falsevals[s][0];
extract(iffalse._odata[ss] ,pointers);
extract(predicate._odata[ss],mask);
for(int s=0;s<Nsimd;s++){
if (mask[s]) pointers[s]=&truevals[s][0];
else pointers[s]=&falsevals[s][0];
}
merge(ret._odata[ss],pointers);
}
}
template<class vobj>
inline Lattice<vobj> where(const LatticeInteger &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
{
conformable(iftrue,iffalse);
conformable(iftrue,predicate);
Lattice<vobj> ret(iftrue._grid);
where(ret,predicate,iftrue,iffalse);
return ret;
}
#endif
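Per outer site, where() extracts the predicate and both operands to scalars, selects pointer-wise per SIMD lane, and merges the winners back; from the caller's side it is one line (hedged sketch, reusing the objects of the Grid_main.cc test):

    LatticeColourMatrix sel(&Fine);
    sel = where(lex<10,Foo,Bar);   // lane s of each site takes Foo where mask[s]!=0, else Bar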

Grid_simd.h

@@ -11,28 +11,15 @@
// Vector types are arch dependent
////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////
-// SIMD Alignment controls
-////////////////////////////////////////////////////////////
-#ifdef HAVE_VAR_ATTRIBUTE_ALIGNED
-#define ALIGN_DIRECTIVE(A) __attribute__ ((aligned(A)))
-#else
-#define ALIGN_DIRECTIVE(A) __declspec(align(A))
-#endif
#ifdef SSE2
#include <pmmintrin.h>
-#define SIMDalign ALIGN_DIRECTIVE(16)
#endif
#if defined(AVX1) || defined (AVX2)
#include <immintrin.h>
-#define SIMDalign ALIGN_DIRECTIVE(32)
#endif
#ifdef AVX512
#include <immintrin.h>
-#define SIMDalign ALIGN_DIRECTIVE(64)
#endif
namespace Grid {
@@ -137,41 +124,66 @@ namespace Grid {
// Generic extract/merge/permute
/////////////////////////////////////////////////////////////////
template<class vsimd,class scalar>
-inline void Gextract(vsimd &y,std::vector<scalar *> &extracted){
+inline void Gextract(const vsimd &y,std::vector<scalar *> &extracted){
-#if 1
    // FIXME: bounce off stack is painful
    //        temporary hack while I figure out better way.
    //        There are intrinsics to do this work without the storage.
-   int Nsimd = extracted.size();
-   {
-       std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd);
-       vstore(y,&buf[0]);
-       for(int i=0;i<Nsimd;i++){
-           *extracted[i] = buf[i];
-           extracted[i]++;
-       }
-   }
-#else
-   int NSo = extracted.size();
-   int NSv = vsimd::Nsimd();
-   int sparse= NSv/NSo;
-   for(int i=0;i<NSv;i+=sparse){
-   }
-#endif
+   int Nextr=extracted.size();
+   int Nsimd=vsimd::Nsimd();
+   int s=Nsimd/Nextr;
+   std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd);
+   vstore(y,&buf[0]);
+   for(int i=0;i<Nextr;i++){
+       *extracted[i] = buf[i*s];
+       extracted[i]++;
+   }
};
template<class vsimd,class scalar>
inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
-#if 1
-   int Nsimd = extracted.size();
+   int Nextr=extracted.size();
+   int Nsimd=vsimd::Nsimd();
+   int s=Nsimd/Nextr;
    std::vector<scalar> buf(Nsimd);
-   for(int i=0;i<Nsimd;i++){
-       buf[i]=*extracted[i];
+   for(int i=0;i<Nextr;i++){
+       for(int ii=0;ii<s;ii++){
+           buf[i*s+ii]=*extracted[i];
+       }
        extracted[i]++;
    }
    vset(y,&buf[0]);
-#else
-#endif
};
+template<class vsimd,class scalar>
+inline void Gextract(const vsimd &y,std::vector<scalar> &extracted){
+   // FIXME: bounce off stack is painful
+   //        temporary hack while I figure out better way.
+   //        There are intrinsics to do this work without the storage.
+   int Nextr=extracted.size();
+   int Nsimd=vsimd::Nsimd();
+   int s=Nsimd/Nextr;
+   std::vector<scalar,alignedAllocator<scalar> > buf(Nsimd);
+   vstore(y,&buf[0]);
+   for(int i=0;i<Nextr;i++){
+       extracted[i] = buf[i*s];
+   }
+};
+template<class vsimd,class scalar>
+inline void Gmerge(vsimd &y,std::vector<scalar> &extracted){
+   int Nextr=extracted.size();
+   int Nsimd=vsimd::Nsimd();
+   int s=Nsimd/Nextr;
+   std::vector<scalar> buf(Nsimd);
+   for(int i=0;i<Nextr;i++){
+       for(int ii=0;ii<s;ii++){
+           buf[i*s+ii]=extracted[i];
+       }
+   }
+   vset(y,&buf[0]);
+};
//////////////////////////////////////////////////////////
@@ -183,7 +195,7 @@ inline void Gmerge(vsimd &y,std::vector<scalar *> &extracted){
// Permute 4 possible on half precision @512bit vectors.
//////////////////////////////////////////////////////////
template<class vsimd>
-inline void Gpermute(vsimd &y,vsimd b,int perm){
+inline void Gpermute(vsimd &y,const vsimd &b,int perm){
    switch (perm){
#if defined(AVX1)||defined(AVX2)
    // 8x32 bits=>3 permutes
@@ -214,10 +226,10 @@ inline void Gpermute(vsimd &y,vsimd b,int perm){
    };
};
+#include <Grid_vInteger.h>
#include <Grid_vRealF.h>
#include <Grid_vRealD.h>
#include <Grid_vComplexF.h>
#include <Grid_vComplexD.h>
-#include <Grid_vInteger.h>
#endif
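The rewritten Gextract/Gmerge no longer assume one destination per SIMD lane: with Nextr destinations over Nsimd lanes they move through the bounce buffer with stride s = Nsimd/Nextr. A self-contained sketch of just that index arithmetic:

    #include <cstdio>
    #include <vector>
    int main(void){
        const int Nsimd=8, Nextr=4;   // 8 SIMD lanes feeding 4 extraction targets
        const int s=Nsimd/Nextr;      // stride 2, exactly as in Gextract/Gmerge above
        std::vector<float> buf(Nsimd);
        for(int i=0;i<Nsimd;i++) buf[i]=(float)i;  // stands in for vstore(y,&buf[0])
        for(int i=0;i<Nextr;i++){                  // Gextract reads lanes 0,2,4,6 ...
            std::printf("extracted[%d] <- buf[%d] = %g\n",i,i*s,buf[i*s]);
        }                                          // ... and Gmerge writes each value
        return 0;                                  // back into s adjacent lanes.
    }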

Grid_stencil.h

@@ -8,5 +8,355 @@
// Lattice <foo> could also allocate haloes which get used for stencil code.
//
// Grid could create a neighbour index table for a given stencil.
-// Could also implement CovariantCshift.
+// Could also implement CovariantCshift, to fuse the loops and enhance performance.
//
//
// General stencil computation:
//
// Generic services
// 0) Prebuild neighbour tables
// 1) Compute sizes of all haloes/comms buffers; allocate them.
//
// 2) Gather all faces, and communicate.
// 3) Loop over result sites, giving nbr index/offnode info for each
//
// Could take a
// SpinProjectFaces
// start comms
// complete comms
// Reconstruct Umu
//
// Approach.
//
//////////////////////////////////////////////////////////////////////////////////////////
namespace Grid {
class Stencil {
public:
Stencil(GridBase *grid,
int npoints,
int checkerboard,
std::vector<int> directions,
std::vector<int> distances);
void Stencil_local (int dimension,int shift,int cbmask);
void Stencil_comms (int dimension,int shift,int cbmask);
void Stencil_comms_simd(int dimension,int shift,int cbmask);
// Will need to implement actions for
//
Copy_plane;
Copy_plane_permute;
Gather_plane;
// The offsets to all neibours in stencil in each direction
int _checkerboard;
int _npoints; // Move to template param?
GridBase * _grid;
// Store these as SIMD Integer needed
//
// std::vector< iVector<Integer, Npoint> > _offsets;
// std::vector< iVector<Integer, Npoint> > _local;
// std::vector< iVector<Integer, Npoint> > _comm_buf_size;
// std::vector< iVector<Integer, Npoint> > _permute;
std::vector<std::vector<int> > _offsets;
std::vector<std::vector<int> > _local;
std::vector<int> _comm_buf_size;
std::vector<int> _permute;
};
Stencil::Stencil(GridBase *grid,
int npoints,
int checkerboard,
std::vector<int> directions,
std::vector<int> distances){
_npoints = npoints;
_grid = grid;
for(int i=0;i<npoints;i++){
int dimension = directions[i];
int displacement = distances[i];
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
_checkerboard = checkerboard;
// the permute type
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
int sshift[2];
if ( !comm_dim ) {
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Stencil_local(dimension,shift,0x3);
} else {
Stencil_local(dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Stencil_local(dimension,shift,0x2);// both with block stride loop iteration
}
} else if ( splice_dim ) {
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Stencil_comms_simd(dimension,shift,0x3);
} else {
Stencil_comms_simd(dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Stencil_comms_simd(dimension,shift,0x2);// both with block stride loop iteration
}
} else {
// Cshift_comms(ret,rhs,dimension,shift);
sshift[0] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,0);
sshift[1] = _grid->CheckerBoardShift(_checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Stencil_comms(dimension,shift,0x3);
} else {
Stencil_comms(dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Stencil_comms(dimension,shift,0x2);// both with block stride loop iteration
}
}
}
}
void Stencil::Stencil_local (int dimension,int shift,int cbmask)
{
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int gd = _grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
// the permute type
int permute_dim =_grid->PermuteDim(dimension);
int permute_type=_grid->PermuteType(dimension);
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * _grid->_ostride[dimension];
int cb= (cbmask==0x2)? 1 : 0;
int sshift = _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
if ( permute_slice ) Copy_plane_permute(dimension,x,sx,cbmask,permute_type);
else Copy_plane (dimension,x,sx,cbmask);
}
}
void Stencil::Stencil_comms (int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=_grid;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
// FIXME: Do something with buffer_size??
int cb= (cbmask==0x2)? 1 : 0;
int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int offnode = ( x+sshift >= rd );
int sx = (x+sshift)%rd;
int comm_proc = (x+sshift)/rd;
if (!offnode) {
Copy_plane(dimension,x,sx,cbmask);
} else {
int words = send_buf.size();
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
Gather_plane_simple (dimension,sx,cbmask);
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
/*
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
*/
Scatter_plane_simple (dimension,x,cbmask);
}
}
}
void Stencil::Stencil_comms_simd(int dimension,int shift,int cbmask)
{
GridBase *grid=_grid;
const int Nsimd = _grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=_grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = _grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// FIXME do something with buffer size
std::vector<scalar_type *> pointers(Nsimd); //
std::vector<scalar_type *> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? 1 : 0;
int sshift= _grid->CheckerBoardShift(_checkerboard,dimension,shift,cb);
std::vector<int> comm_offnode(simd_layout);
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
std::vector<int> icoor(grid->Nd());
for(int x=0;x<rd;x++){
int comm_any = 0;
for(int s=0;s<simd_layout;s++) {
int shifted_x = x+s*rd+sshift;
comm_offnode[s] = shifted_x >= ld;
comm_any = comm_any | comm_offnode[s];
comm_proc[s] = shifted_x/ld;
}
int o = 0;
int bo = x*grid->_ostride[dimension];
int sx = (x+sshift)%rd;
if ( comm_any ) {
for(int i=0;i<Nsimd;i++){
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int s;
grid->iCoorFromIindex(icoor,i);
s = icoor[dimension];
if(comm_offnode[s]){
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
/*
grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
*/
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
} else {
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
}
// Permute by swizzling pointers in merge
int permute_slice=0;
int lshift=sshift%ld;
int wrap =lshift/rd;
int num =lshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
int toggle_bit = (Nsimd>>(permute_type+1));
int PermuteMap;
for(int i=0;i<Nsimd;i++){
if ( permute_slice ) {
PermuteMap=i^toggle_bit;
pointers[i] = rpointers[PermuteMap];
} else {
pointers[i] = rpointers[i];
}
}
Scatter_plane_merge(pointers,dimension,x,cbmask);
} else {
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
}
}
}
};
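Nothing drives the class yet, but the constructor fixes the intended call pattern; a hedged usage sketch for a four-point stencil (forward and backward neighbours in dimensions 0 and 1; Fine as in Grid_main.cc):

    std::vector<int> directions(4);   // dimension probed by each stencil point
    std::vector<int> distances(4);    // displacement along that dimension
    directions[0]=0; distances[0]= 1;
    directions[1]=0; distances[1]=-1;
    directions[2]=1; distances[2]= 1;
    directions[3]=1; distances[3]=-1;
    Stencil MyStencil(&Fine,4,0,directions,distances); // grid, npoints, checkerboard, dirs, dists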

Grid_vComplexD.h

@@ -16,6 +16,12 @@ namespace Grid {
        return (*this);
    }
    vComplexD(){};
+   vComplexD(ComplexD a){
+       vsplat(*this,a);
+   };
+   vComplexD(double a){
+       vsplat(*this,ComplexD(a));
+   };
    ///////////////////////////////////////////////
    // mac, mult, sub, add, adj
@@ -167,7 +173,15 @@
    {
        Gmerge<vComplexD,ComplexD >(y,extracted);
    }
-   friend inline void extract(vComplexD &y,std::vector<ComplexD *> &extracted)
+   friend inline void extract(const vComplexD &y,std::vector<ComplexD *> &extracted)
+   {
+       Gextract<vComplexD,ComplexD>(y,extracted);
+   }
+   friend inline void merge(vComplexD &y,std::vector<ComplexD > &extracted)
+   {
+       Gmerge<vComplexD,ComplexD >(y,extracted);
+   }
+   friend inline void extract(const vComplexD &y,std::vector<ComplexD > &extracted)
    {
        Gextract<vComplexD,ComplexD>(y,extracted);
    }
@@ -184,6 +198,11 @@ namespace Grid {
         ///////////////////////
         // Splat
         ///////////////////////
+        friend inline void vsplat(vComplexD &ret,ComplexD c){
+            double a = real(c); // double, not float: this is the double-precision type
+            double b = imag(c);
+            vsplat(ret,a,b);
+        }
         friend inline void vsplat(vComplexD &ret,double rl,double ig){
 #if defined (AVX1)|| defined (AVX2)
             ret.v = _mm256_set_pd(ig,rl,ig,rl);
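The new ComplexD overload of vsplat simply forwards to the (re,im) form, which replicates the pair into every lane; a subsequent vstore therefore writes the interleaved pattern re,im,re,im,... to memory. A sketch of that behaviour (assumes Grid headers and an AVX-class build; the buffer is deliberately larger than any Nsimd here):

    #include <Grid.h>
    using namespace Grid;

    void splat_layout_example(void) {
        vComplexD v;
        vsplat(v, ComplexD(3.0,4.0));                 // every lane becomes (3,4)
        ComplexD buf[8] __attribute__((aligned(32))); // >= Nsimd() entries
        vstore(v, buf);                               // buf[i] == ComplexD(3,4) for each lane i
    }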
@@ -215,7 +234,7 @@ namespace Grid {
 #endif
         }
-        friend inline void vstore(vComplexD &ret, ComplexD *a){
+        friend inline void vstore(const vComplexD &ret, ComplexD *a){
 #if defined (AVX1)|| defined (AVX2)
             _mm256_store_pd((double *)a,ret.v);
 #endif

View File

@@ -20,6 +20,12 @@ namespace Grid {
             return (*this);
         }
         vComplexF(){};
+        vComplexF(ComplexF a){
+            vsplat(*this,a);
+        };
+        vComplexF(double a){
+            vsplat(*this,ComplexF(a));
+        };
         ///////////////////////////////////////////////
         // mac, mult, sub, add, adj
@@ -161,7 +167,7 @@ namespace Grid {
             vsplat(ret,a,b);
         }
-        friend inline void vstore(vComplexF &ret, ComplexF *a){
+        friend inline void vstore(const vComplexF &ret, ComplexF *a){
 #if defined (AVX1)|| defined (AVX2)
             _mm256_store_ps((float *)a,ret.v);
 #endif
@@ -210,27 +216,47 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){
 #endif
         }
         friend inline vComplexF operator * (const Complex &a, vComplexF b){
             vComplexF va;
             vsplat(va,a);
             return va*b;
         }
         friend inline vComplexF operator * (vComplexF b,const Complex &a){
+            return a*b;
+        }
+        /*
+        template<class real>
+        friend inline vComplexF operator * (vComplexF b,const real &a){
             vComplexF va;
-            vsplat(va,a);
+            Complex ca(a,0);
+            vsplat(va,ca);
             return va*b;
         }
+        template<class real>
+        friend inline vComplexF operator * (const real &a,vComplexF b){
+            return a*b;
+        }
         friend inline vComplexF operator + (const Complex &a, vComplexF b){
             vComplexF va;
             vsplat(va,a);
             return va+b;
         }
         friend inline vComplexF operator + (vComplexF b,const Complex &a){
-            vComplexF va;
-            vsplat(va,a);
-            return b+va;
+            return a+b;
         }
+        template<class real>
+        friend inline vComplexF operator + (vComplexF b,const real &a){
+            vComplexF va;
+            Complex ca(a,0);
+            vsplat(va,ca);
+            return va+b;
+        }
+        template<class real>
+        friend inline vComplexF operator + (const real &a,vComplexF b){
+            return a+b;
+        }
         friend inline vComplexF operator - (const Complex &a, vComplexF b){
             vComplexF va;
             vsplat(va,a);
@@ -241,7 +267,24 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){
             vsplat(va,a);
             return b-va;
         }
-        // NB: Template the following on "type Complex" and then implement *,+,- for ComplexF, ComplexD, RealF, RealD above to
+        template<class real>
+        friend inline vComplexF operator - (vComplexF b,const real &a){
+            vComplexF va;
+            Complex ca(a,0);
+            vsplat(va,ca);
+            return b-va;
+        }
+        template<class real>
+        friend inline vComplexF operator - (const real &a,vComplexF b){
+            vComplexF va;
+            Complex ca(a,0);
+            vsplat(va,ca);
+            return va-b;
+        }
+        */
+        // NB: Template the following on "type Complex" and then implement *,+,- for
+        // ComplexF, ComplexD, RealF, RealD above to
         // get full generality of binops with scalars.
         friend inline void mac (vComplexF *__restrict__ y,const Complex *__restrict__ a,const vComplexF *__restrict__ x){ *y = (*a)*(*x)+(*y); };
         friend inline void mult(vComplexF *__restrict__ y,const Complex *__restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
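One way the NB above could play out, sketched only (this is not code from the commit; it mirrors the scalar promotion used by the parked template overloads and would sit inside the class body):

    // Hypothetical generic form: one overload per operator, with the
    // scalar promoted exactly as the commented-out code above does.
    template<class real>
    friend inline vComplexF operator * (const real &a, vComplexF b){
        vComplexF va;
        Complex ca(a,0);   // promote the real scalar to a Complex
        vsplat(va,ca);     // splat it across every SIMD lane
        return va*b;
    }

Some constraint on the template parameter (or reliance on the new scalar constructors instead) would be needed to avoid ambiguity with the existing Complex overloads, which is presumably why these versions are parked in a comment for now.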
@@ -304,7 +347,15 @@ friend inline void vstore(vComplexF &ret, ComplexF *a){
         {
             Gmerge<vComplexF,ComplexF >(y,extracted);
         }
-        friend inline void extract(vComplexF &y,std::vector<ComplexF *> &extracted)
+        friend inline void extract(const vComplexF &y,std::vector<ComplexF *> &extracted)
+        {
+            Gextract<vComplexF,ComplexF>(y,extracted);
+        }
+        friend inline void merge(vComplexF &y,std::vector<ComplexF > &extracted)
+        {
+            Gmerge<vComplexF,ComplexF >(y,extracted);
+        }
+        friend inline void extract(const vComplexF &y,std::vector<ComplexF > &extracted)
         {
             Gextract<vComplexF,ComplexF>(y,extracted);
         }
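Taken together, the const-corrected extract and the by-value merge give a lane-wise round trip, which is handy for debugging SIMD content. A usage sketch (assumes Grid headers; the signatures are the ones added in this hunk):

    #include <Grid.h>
    #include <vector>
    using namespace Grid;

    void lane_round_trip(const vComplexF &v) {
        std::vector<ComplexF> lanes(vComplexF::Nsimd());
        extract(v, lanes);   // spill each SIMD lane into a scalar slot
        // ... inspect or tweak individual lanes here ...
        vComplexF w;
        merge(w, lanes);     // rebuild the vector from the scalars
    }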

View File

@@ -10,7 +10,7 @@ namespace Grid {
     typedef uint32_t Integer;
     class vInteger {
     protected:
     public:
@@ -21,6 +21,13 @@ namespace Grid {
         typedef Integer scalar_type;
         vInteger(){};
+        vInteger & operator = (const Zero & z){
+            vzero(*this);
+            return (*this);
+        }
+        vInteger(Integer a){
+            vsplat(*this,a);
+        };
         ////////////////////////////////////
         // Arithmetic operator overloads +,-,*
         ////////////////////////////////////
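The new vInteger(Integer) constructor is what lets plain integers appear directly in vector expressions, which the integer relational support noted as DONE in the TODO below builds on. A small sketch (assumes Grid headers):

    #include <Grid.h>
    using namespace Grid;

    void integer_splat_example(void) {
        vInteger a(5);      // new constructor: splat 5 into every lane
        vInteger b(7);
        vInteger s = a + b; // lane-wise integer add: every lane holds 12
    }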
@@ -166,18 +173,18 @@ namespace Grid {
 #endif
         }
-        friend inline void vstore(vInteger &ret, Integer *a){
+        friend inline void vstore(const vInteger &ret, Integer *a){
 #if defined (AVX1)|| defined (AVX2)
             _mm256_store_si256((__m256i*)a,ret.v);
 #endif
 #ifdef SSE2
             _mm_store_si128(a,ret.v);
 #endif
 #ifdef AVX512
             _mm512_store_si512(a,ret.v);
 #endif
 #ifdef QPX
             assert(0);
 #endif
         }
@@ -185,6 +192,7 @@ friend inline void vstore(vInteger &ret, Integer *a){
         {
             _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
         }
+        // Unary negation
         friend inline vInteger operator -(const vInteger &r) {
             vInteger ret;
@@ -210,9 +218,32 @@ friend inline void vstore(vInteger &ret, Integer *a){
             *this = *this-r;
             return *this;
         }
+        friend inline void permute(vInteger &y,const vInteger b,int perm)
+        {
+            Gpermute<vInteger>(y,b,perm);
+        }
+        friend inline void merge(vInteger &y,std::vector<Integer *> &extracted)
+        {
+            Gmerge<vInteger,Integer>(y,extracted);
+        }
+        friend inline void extract(const vInteger &y,std::vector<Integer *> &extracted)
+        {
+            Gextract<vInteger,Integer>(y,extracted);
+        }
+        friend inline void merge(vInteger &y,std::vector<Integer> &extracted)
+        {
+            Gmerge<vInteger,Integer>(y,extracted);
+        }
+        friend inline void extract(const vInteger &y,std::vector<Integer> &extracted)
+        {
+            Gextract<vInteger,Integer>(y,extracted);
+        }
     public:
-        static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
+        static inline int Nsimd(void) { return sizeof(ivec)/sizeof(Integer);}
     };
     inline vInteger localInnerProduct(const vInteger & l, const vInteger & r) { return l*r; }
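vInteger now has the same permute/merge/extract surface as the floating types, so integer masks produced by comparisons can be inspected lane by lane; the Nsimd fix matters here, since the lane count must be derived from the integer vector type, not the float one. A sketch (assumes Grid headers; the extract signature is the one added above):

    #include <Grid.h>
    #include <vector>
    using namespace Grid;

    int count_set_lanes(const vInteger &mask) {
        std::vector<Integer> lanes(vInteger::Nsimd());
        extract(mask, lanes);                  // one Integer per SIMD lane
        int count = 0;
        for (int i = 0; i < vInteger::Nsimd(); i++) {
            if (lanes[i]) count++;             // nonzero where the predicate held
        }
        return count;
    }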
@@ -222,27 +253,7 @@ friend inline void vstore(vInteger &ret, Integer *a){
     {
         return l*r;
     }
-    class vIntegerF : public vInteger
-    {
-    public:
-        static inline int Nsimd(void) { return sizeof(ivec)/sizeof(float);}
-        friend inline void permute(vIntegerF &y,vIntegerF b,int perm)
-        {
-            Gpermute<vIntegerF>(y,b,perm);
-        }
-        friend inline void merge(vIntegerF &y,std::vector<Integer *> &extracted)
-        {
-            Gmerge<vIntegerF,Integer>(y,extracted);
-        }
-        friend inline void extract(vIntegerF &y,std::vector<Integer *> &extracted)
-        {
-            Gextract<vIntegerF,Integer>(y,extracted);
-        }
-    };
 }
 #endif

View File

@@ -13,6 +13,9 @@ namespace Grid {
         typedef RealD scalar_type;
         vRealD(){};
+        vRealD(RealD a){
+            vsplat(*this,a);
+        };
         friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
         friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}
@@ -112,7 +115,15 @@ namespace Grid {
         {
             Gmerge<vRealD,RealD >(y,extracted);
         }
-        friend inline void extract(vRealD &y,std::vector<RealD *> &extracted)
+        friend inline void extract(const vRealD &y,std::vector<RealD *> &extracted)
+        {
+            Gextract<vRealD,RealD>(y,extracted);
+        }
+        friend inline void merge(vRealD &y,std::vector<RealD > &extracted)
+        {
+            Gmerge<vRealD,RealD >(y,extracted);
+        }
+        friend inline void extract(const vRealD &y,std::vector<RealD > &extracted)
         {
             Gextract<vRealD,RealD>(y,extracted);
         }
@@ -157,7 +168,7 @@ namespace Grid {
 #endif
         }
-        friend inline void vstore(vRealD &ret, double *a){
+        friend inline void vstore(const vRealD &ret, double *a){
 #if defined (AVX1)|| defined (AVX2)
             _mm256_store_pd(a,ret.v);
 #endif

View File

@@ -14,6 +14,9 @@ namespace Grid {
         typedef RealF scalar_type;
         vRealF(){};
+        vRealF(RealF a){
+            vsplat(*this,a);
+        };
         ////////////////////////////////////
         // Arithmetic operator overloads +,-,*
         ////////////////////////////////////
@@ -133,7 +136,15 @@ namespace Grid {
         {
             Gmerge<vRealF,RealF >(y,extracted);
         }
-        friend inline void extract(vRealF &y,std::vector<RealF *> &extracted)
+        friend inline void extract(const vRealF &y,std::vector<RealF *> &extracted)
+        {
+            Gextract<vRealF,RealF>(y,extracted);
+        }
+        friend inline void merge(vRealF &y,std::vector<RealF> &extracted)
+        {
+            Gmerge<vRealF,RealF >(y,extracted);
+        }
+        friend inline void extract(const vRealF &y,std::vector<RealF> &extracted)
         {
             Gextract<vRealF,RealF>(y,extracted);
         }
@@ -180,7 +191,7 @@ namespace Grid {
         ////////////////////////////////////////////////////////////////////////
         // FIXME: gonna remove these load/store, get, set, prefetch
         ////////////////////////////////////////////////////////////////////////
-        friend inline void vstore(vRealF &ret, float *a){
+        friend inline void vstore(const vRealF &ret, float *a){
 #if defined (AVX1)|| defined (AVX2)
             _mm256_store_ps(a,ret.v);
 #endif

67
TODO
View File

@@ -1,29 +1,68 @@
 * FIXME audit
 * Remove vload/store etc..
 * Replace vset with a call to merge.
+* Const audit
+* extract / merge extra implementation removal
-* Conditional execution Subset, where etc...
-* Coordinate information, integers etc...
-* Integer type padding/union to vector.
-* LatticeCoordinate[mu]
-* Optimise the extract/merge SIMD routines
-* Broadcast, reduction tests.
+* Conditional execution, where etc...     -----DONE, simple test
+* Integer relational support              -----DONE
+* Coordinate information, integers etc... -----DONE
+* Integer type padding/union to vector.   -----DONE
+* LatticeCoordinate[mu]                   -----DONE
+* Stencil operator support                -----Initial thoughts
+* Subset support, slice sums etc...       -----Only need slice sum?
+                                          -----Generic cartesian subslicing?
+                                          -----Array ranges / boost extents?
+                                          -----Multigrid grid transferral.
+                                          -----Suggests generalised cartesian subblocking
+                                               sums, returning modified grid.
+  Two classes of subset;
+  i)   red black parity subsetting (pick checkerboard).
+  ii)  Need to be able to project one Grid to another Grid.
+       Generic concept is to subdivide (based on RD so applies to red/black or full).
+       Return a type on SUB-grid from CellSum TOP-grid.
+       SUB-grid need not distribute but be replicated in some dims if that is how the
+       cartesian communicator works.
+  iii) No general permutation map.
+* Consider switch std::vector to boost arrays.
+  boost::multi_array<type, 3> A()... to replace multi1d, multi2d etc..
+*? Cell definition <-> sliceSum.
+?  Replicated arrays.
+* Check for missing functionality - partially audited against QDP++ layout
+* Optimise the extract/merge SIMD routines; Azusa??
+  -- I have collated into single location at least.
+  -- Need to use _mm_*insert/extract routines.
+* Conformable test in Cshift routines.
+* Gamma/Dirac structures
+* Fourspin, two spin project
+* Broadcast, reduction tests. innerProduct, localInnerProduct
 * QDP++ regression suite and comparative benchmark
 * NERSC Lattice loading, plaquette test
-* Conformable test in Cshift routines.
-* Gamma/Dirac structures
-* Fourspin, two spin project
-* Stencil operator support
-* Check for missing functionality
 * I/O support
   - MPI IO?
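For reference, the DONE items above combine into the masked-assignment pattern this commit was building towards. A sketch of the intended usage; treat the LatticeInteger/LatticeReal aliases, the scalar assignments, and the exact where/== signatures as assumptions rather than confirmed API:

    #include <Grid.h>
    using namespace Grid;

    // Sketch: phase = +1 on the t==0 timeslice, -1 elsewhere,
    // via the coordinate helper plus conditional assignment.
    void timeslice_mask(GridBase *grid) {
        const int Tdir = 3;
        LatticeInteger t(grid);
        LatticeCoordinate(t,Tdir);   // global (reduced) coordinate along time
        LatticeReal plus(grid), minus(grid), phase(grid);
        plus  =  1.0;
        minus = -1.0;
        phase = where(t==Integer(0),plus,minus);
    }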