Hadrons: moving Hadrons to root directory, build system improvements

2025-06-18 15:57:05 +01:00 · 2018-08-28 15:00:40 +01:00
parent 5f206df775
commit fb7d021b9d
499 changed files with 429 additions and 846 deletions
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@ -0,0 +1,33 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/Lattice.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_H
+#define GRID_LATTICE_H
+
+#include <Grid/lattice/Lattice_base.h>
+
+#endif
--- a/Grid/lattice/Lattice_ET.h
+++ b/Grid/lattice/Lattice_ET.h
@ -0,0 +1,466 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/lattice/Lattice_ET.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LATTICE_ET_H
+#define GRID_LATTICE_ET_H
+
+#include <iostream>
+#include <tuple>
+#include <typeinfo>
+#include <vector>
+
+namespace Grid {
+
+////////////////////////////////////////////////////
+// Predicated where support
+////////////////////////////////////////////////////
+// Lane-by-lane select between two SIMD objects.
+//
+// predicate : object whose TensorRemove()'d value is extracted as one Integer
+//             per SIMD lane.
+// iftrue    : value taken for every lane whose mask entry is non-zero.
+// iffalse   : value taken for every other lane.
+//
+// All three operands are unpacked into per-lane scalar objects with extract(),
+// combined lane by lane, and repacked into a vobj with merge().
+template <class iobj, class vobj, class robj>
+inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
+                            const robj &iffalse) {
+  typename std::remove_const<vobj>::type ret;
+
+  typedef typename vobj::scalar_object scalar_object;
+
+  const int Nsimd = vobj::vector_type::Nsimd();
+
+  std::vector<Integer> mask(Nsimd);
+  std::vector<scalar_object> truevals(Nsimd);
+  std::vector<scalar_object> falsevals(Nsimd);
+
+  extract(iftrue, truevals);
+  extract(iffalse, falsevals);
+  extract<vInteger, Integer>(TensorRemove(predicate), mask);
+
+  // falsevals doubles as the result buffer: overwrite the lanes that passed.
+  for (int s = 0; s < Nsimd; s++) {
+    if (mask[s]) falsevals[s] = truevals[s];
+  }
+
+  merge(ret, falsevals);
+  return ret;
+}
+
+////////////////////////////////////////////
+// recursive evaluation of expressions; Could
+// switch to generic approach with variadics, a la
+// Antonin's Lat Sim but the repack to variadic with popped
+// from tuple is hideous; C++14 introduces std::make_index_sequence for this
+////////////////////////////////////////////
+
+// Leaf evaluation of a lattice; the overloads should be protected with
+// enable_if using the traits below.
+
+// Compile-time tags used to constrain the expression-template overloads:
+// is_lattice<T>::value is true when T derives from LatticeBase,
+// is_lattice_expr<T>::value when T derives from LatticeExpressionBase.
+template <typename T>
+using is_lattice = std::is_base_of<LatticeBase, T>;
+
+template <typename T>
+using is_lattice_expr = std::is_base_of<LatticeExpressionBase, T>;
+
+// getVectorType specialisation for lattices: `type` is the lattice's
+// vector_object.
+template <typename T>
+struct getVectorType<Lattice<T> > {
+  using type = typename Lattice<T>::vector_object;
+};
+ 
+// Leaf evaluation of a non-lattice operand: the site index is ignored and a
+// copy of the operand is returned.
+template <class sobj>
+inline sobj eval(const unsigned int /*ss*/, const sobj &arg) {
+  return arg;
+}
+
+// Leaf evaluation of a lattice: a const reference to element ss of _odata.
+template <class lobj>
+inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) {
+  return arg._odata[ss];
+}
+
+// handle nodes in syntax tree
+//
+// An expression node is accessed as a pair: expr.first is the operator object
+// (its func() is applied), expr.second is a tuple of operands. Each overload
+// evaluates every operand at index ss recursively and passes the results to
+// func(); the return type is whatever func() yields for those operand types.
+
+// Unary node: func(eval(operand 0)).
+template <typename Op, typename T1>
+auto inline eval(
+    const unsigned int ss,
+    const LatticeUnaryExpression<Op, T1> &expr)  // eval one operand
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) {
+  return expr.first.func(eval(ss, std::get<0>(expr.second)));
+}
+
+// Binary node: func(eval(operand 0), eval(operand 1)).
+template <typename Op, typename T1, typename T2>
+auto inline eval(
+    const unsigned int ss,
+    const LatticeBinaryExpression<Op, T1, T2> &expr)  // eval two operands
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
+                                eval(ss, std::get<1>(expr.second)))) {
+  return expr.first.func(eval(ss, std::get<0>(expr.second)),
+                         eval(ss, std::get<1>(expr.second)));
+}
+
+// Trinary node: func(eval(operand 0), eval(operand 1), eval(operand 2)).
+template <typename Op, typename T1, typename T2, typename T3>
+auto inline eval(const unsigned int ss,
+                 const LatticeTrinaryExpression<Op, T1, T2, T3>
+                     &expr)  // eval three operands
+    -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
+                                eval(ss, std::get<1>(expr.second)),
+                                eval(ss, std::get<2>(expr.second)))) {
+  return expr.first.func(eval(ss, std::get<0>(expr.second)),
+                         eval(ss, std::get<1>(expr.second)),
+                         eval(ss, std::get<2>(expr.second)));
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Obtain the grid from an expression, ensuring conformable. This must follow a
+// tree recursion
+//////////////////////////////////////////////////////////////////////////
+// Lattice leaf: if a grid has already been found, the lattice's grid must be
+// conformable with it; either way the lattice's grid becomes the current one.
+template <class T1,
+          typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
+inline void GridFromExpression(GridBase *&grid, const T1 &lat) {
+  if (grid != nullptr) {
+    conformable(grid, lat._grid);
+  }
+  grid = lat._grid;
+}
+
+// Non-lattice leaf: contributes no grid, so nothing happens.
+template <class T1,
+          typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
+inline void GridFromExpression(GridBase *&grid, const T1 & /*notlat*/) {
+}
+
+// Unary node: descend into the single operand.
+template <typename Op, typename T1>
+inline void GridFromExpression(GridBase *&grid,
+                               const LatticeUnaryExpression<Op, T1> &expr) {
+  const auto &operands = expr.second;
+  GridFromExpression(grid, std::get<0>(operands));
+}
+
+// Binary node: descend into both operands, left to right.
+template <typename Op, typename T1, typename T2>
+inline void GridFromExpression(
+    GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) {
+  const auto &operands = expr.second;
+  GridFromExpression(grid, std::get<0>(operands));
+  GridFromExpression(grid, std::get<1>(operands));
+}
+
+// Trinary node: descend into all three operands, left to right.
+template <typename Op, typename T1, typename T2, typename T3>
+inline void GridFromExpression(
+    GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
+  const auto &operands = expr.second;
+  GridFromExpression(grid, std::get<0>(operands));
+  GridFromExpression(grid, std::get<1>(operands));
+  GridFromExpression(grid, std::get<2>(operands));
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Obtain the CB from an expression, ensuring conformable. This must follow a
+// tree recursion
+//////////////////////////////////////////////////////////////////////////
+// Lattice leaf: once cb already holds Odd or Even, the lattice's checkerboard
+// is asserted to match it; either way cb is then set to the lattice's value.
+template <class T1,
+          typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
+inline void CBFromExpression(int &cb, const T1 &lat) {
+  const bool cbAlreadySet = (cb == Odd) || (cb == Even);
+  if (cbAlreadySet) {
+    assert(cb == lat.checkerboard);
+  }
+  cb = lat.checkerboard;
+}
+
+// Non-lattice leaf: carries no checkerboard, cb is left untouched.
+template <class T1,
+          typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
+inline void CBFromExpression(int & /*cb*/, const T1 & /*notlat*/) {
+}
+
+// Unary node: descend into the single operand.
+template <typename Op, typename T1>
+inline void CBFromExpression(int &cb,
+                             const LatticeUnaryExpression<Op, T1> &expr) {
+  const auto &operands = expr.second;
+  CBFromExpression(cb, std::get<0>(operands));
+}
+
+// Binary node: descend into both operands, left to right.
+template <typename Op, typename T1, typename T2>
+inline void CBFromExpression(int &cb,
+                             const LatticeBinaryExpression<Op, T1, T2> &expr) {
+  const auto &operands = expr.second;
+  CBFromExpression(cb, std::get<0>(operands));
+  CBFromExpression(cb, std::get<1>(operands));
+}
+
+// Trinary node: descend into all three operands, left to right.
+template <typename Op, typename T1, typename T2, typename T3>
+inline void CBFromExpression(
+    int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
+  const auto &operands = expr.second;
+  CBFromExpression(cb, std::get<0>(operands));
+  CBFromExpression(cb, std::get<1>(operands));
+  CBFromExpression(cb, std::get<2>(operands));
+}
+
+////////////////////////////////////////////
+// Unary operators and funcs
+////////////////////////////////////////////
+// GridUnopClass(name, ret) defines a class template `name<arg>` with a single
+// static member func(const arg a) that returns the expression `ret`, which is
+// written in terms of the parameter `a`. The return type is decltype(ret).
+// The macro body already ends in ';', so the ';' after each use below is an
+// extra empty declaration.
+#define GridUnopClass(name, ret)                                          \
+  template <class arg>                                                    \
+  struct name {                                                           \
+    static auto inline func(const arg a) -> decltype(ret) { return ret; } \
+  };
+
+// One functor per supported unary operation.
+GridUnopClass(UnarySub, -a);
+GridUnopClass(UnaryNot, Not(a));
+GridUnopClass(UnaryAdj, adj(a));
+GridUnopClass(UnaryConj, conjugate(a));
+GridUnopClass(UnaryTrace, trace(a));
+GridUnopClass(UnaryTranspose, transpose(a));
+GridUnopClass(UnaryTa, Ta(a));
+GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
+GridUnopClass(UnaryReal, real(a));
+GridUnopClass(UnaryImag, imag(a));
+GridUnopClass(UnaryToReal, toReal(a));
+GridUnopClass(UnaryToComplex, toComplex(a));
+GridUnopClass(UnaryTimesI, timesI(a));
+GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
+GridUnopClass(UnaryAbs, abs(a));
+GridUnopClass(UnarySqrt, sqrt(a));
+GridUnopClass(UnaryRsqrt, rsqrt(a));
+GridUnopClass(UnarySin, sin(a));
+GridUnopClass(UnaryCos, cos(a));
+GridUnopClass(UnaryAsin, asin(a));
+GridUnopClass(UnaryAcos, acos(a));
+GridUnopClass(UnaryLog, log(a));
+GridUnopClass(UnaryExp, exp(a));
+////////////////////////////////////////////
+// Binary operators
+////////////////////////////////////////////
+// GridBinOpClass(name, combination) defines a class template
+// `name<left, right>` with a single static member func(lhs, rhs) that returns
+// the expression `combination`, written in terms of `lhs` and `rhs`.
+// NOTE(review): the `const` after decltype(combination) is part of the
+// trailing return type, i.e. func returns a const-qualified value; closure()
+// and eval() derive their types from this via decltype, so do not drop it
+// without checking those users.
+#define GridBinOpClass(name, combination)                      \
+  template <class left, class right>                           \
+  struct name {                                                \
+    static auto inline func(const left &lhs, const right &rhs) \
+        -> decltype(combination) const {                       \
+      return combination;                                      \
+    }                                                          \
+  }
+// Arithmetic functors.
+GridBinOpClass(BinaryAdd, lhs + rhs);
+GridBinOpClass(BinarySub, lhs - rhs);
+GridBinOpClass(BinaryMul, lhs *rhs);
+GridBinOpClass(BinaryDiv, lhs /rhs);
+
+// Bitwise and logical functors.
+GridBinOpClass(BinaryAnd, lhs &rhs);
+GridBinOpClass(BinaryOr, lhs | rhs);
+GridBinOpClass(BinaryAndAnd, lhs &&rhs);
+GridBinOpClass(BinaryOrOr, lhs || rhs);
+
+////////////////////////////////////////////////////
+// Trinary conditional op
+////////////////////////////////////////////////////
+// GridTrinOpClass(name, combination) defines a class template
+// `name<predicate, left, right>` with a single static member
+// func(pred, lhs, rhs) that returns the expression `combination`.
+// NOTE(review): as in GridBinOpClass, the trailing `const` belongs to the
+// return type.
+#define GridTrinOpClass(name, combination)                                     \
+  template <class predicate, class left, class right>                          \
+  struct name {                                                                \
+    static auto inline func(const predicate &pred, const left &lhs,            \
+                            const right &rhs) -> decltype(combination) const { \
+      return combination;                                                      \
+    }                                                                          \
+  }
+
+// TrinaryWhere forwards to predicatedWhere(); references are stripped from the
+// left/right template arguments before they are used as its value types.
+GridTrinOpClass(
+    TrinaryWhere,
+    (predicatedWhere<predicate, typename std::remove_reference<left>::type,
+                     typename std::remove_reference<right>::type>(pred, lhs,
+                                                                  rhs)));
+
+////////////////////////////////////////////
+// Operator syntactical glue
+////////////////////////////////////////////
+
+// Helpers that instantiate an operator functor with the types obtained by
+// evaluating the operands at index 0. They are only meaningful inside the
+// GRID_DEF_* macros, where `arg`, `lhs`, `rhs` and `pred` name the function
+// parameters.
+#define GRID_UNOP(name) name<decltype(eval(0, arg))>
+#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+#define GRID_TRINOP(name) \
+  name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+
+// GRID_DEF_UNOP(op, name) defines a function `op(arg)` that is enabled only
+// when the argument is a lattice or a lattice expression. It performs no
+// arithmetic: it returns a LatticeUnaryExpression pairing a `name` functor
+// with a tuple holding a const reference to the argument.
+// Because the operand is held by reference, the returned node is only valid
+// while the argument is alive.
+#define GRID_DEF_UNOP(op, name)                                             \
+  template <typename T1,                                                    \
+            typename std::enable_if<is_lattice<T1>::value ||                \
+                                        is_lattice_expr<T1>::value,         \
+                                    T1>::type * = nullptr>                  \
+  inline auto op(const T1 &arg)                                             \
+      ->decltype(LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(       \
+          std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \
+    return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(             \
+        std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)));     \
+  }
+
+// GRID_BINOP_LEFT(op, name) defines `op(lhs, rhs)` for the case where the LEFT
+// operand is a lattice or lattice expression (the right operand may be
+// anything). It returns a LatticeBinaryExpression holding a `name` functor and
+// const references to both operands.
+#define GRID_BINOP_LEFT(op, name)                                             \
+  template <typename T1, typename T2,                                         \
+            typename std::enable_if<is_lattice<T1>::value ||                  \
+                                        is_lattice_expr<T1>::value,           \
+                                    T1>::type * = nullptr>                    \
+  inline auto op(const T1 &lhs, const T2 &rhs)                                \
+      ->decltype(                                                             \
+          LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \
+              std::make_pair(GRID_BINOP(name)(),                              \
+                             std::forward_as_tuple(lhs, rhs)))) {             \
+    return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
+        std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
+  }
+
+// GRID_BINOP_RIGHT(op, name) covers the complementary case: the left operand
+// is neither a lattice nor a lattice expression and the RIGHT operand is one.
+// The two enable_if conditions keep this overload disjoint from
+// GRID_BINOP_LEFT.
+#define GRID_BINOP_RIGHT(op, name)                                            \
+  template <typename T1, typename T2,                                         \
+            typename std::enable_if<!is_lattice<T1>::value &&                 \
+                                        !is_lattice_expr<T1>::value,          \
+                                    T1>::type * = nullptr,                    \
+            typename std::enable_if<is_lattice<T2>::value ||                  \
+                                        is_lattice_expr<T2>::value,           \
+                                    T2>::type * = nullptr>                    \
+  inline auto op(const T1 &lhs, const T2 &rhs)                                \
+      ->decltype(                                                             \
+          LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(  \
+              std::make_pair(GRID_BINOP(name)(),                              \
+                             std::forward_as_tuple(lhs, rhs)))) {             \
+    return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
+        std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
+  }
+
+// GRID_DEF_BINOP(op, name) emits both overloads above for one operator.
+#define GRID_DEF_BINOP(op, name) \
+  GRID_BINOP_LEFT(op, name);     \
+  GRID_BINOP_RIGHT(op, name);
+
+// GRID_DEF_TRINOP(op, name) defines `op(pred, lhs, rhs)` returning a
+// LatticeTrinaryExpression holding a `name` functor and const references to
+// the three operands.
+// NOTE(review): unlike the unary and binary definitions, this template has no
+// enable_if guard, so it accepts any three argument types.
+#define GRID_DEF_TRINOP(op, name)                                              \
+  template <typename T1, typename T2, typename T3>                             \
+  inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs)                 \
+      ->decltype(                                                              \
+          LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &,  \
+                                   const T3 &>(std::make_pair(                 \
+              GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) {  \
+    return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
+                                    const T3 &>(std::make_pair(                \
+        GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)));          \
+  }
+////////////////////////
+// Operator definitions
+////////////////////////
+
+// Unary operators and functions on lattices / lattice expressions. Each line
+// defines one function that builds a LatticeUnaryExpression node.
+// Not() and operator! both map to UnaryNot.
+GRID_DEF_UNOP(operator-, UnarySub);
+GRID_DEF_UNOP(Not, UnaryNot);
+GRID_DEF_UNOP(operator!, UnaryNot);
+GRID_DEF_UNOP(adj, UnaryAdj);
+GRID_DEF_UNOP(conjugate, UnaryConj);
+GRID_DEF_UNOP(trace, UnaryTrace);
+GRID_DEF_UNOP(transpose, UnaryTranspose);
+GRID_DEF_UNOP(Ta, UnaryTa);
+GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
+GRID_DEF_UNOP(real, UnaryReal);
+GRID_DEF_UNOP(imag, UnaryImag);
+GRID_DEF_UNOP(toReal, UnaryToReal);
+GRID_DEF_UNOP(toComplex, UnaryToComplex);
+GRID_DEF_UNOP(timesI, UnaryTimesI);
+GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
+GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the
+                               // abs-fabs-dabs-labs thing
+GRID_DEF_UNOP(sqrt, UnarySqrt);
+GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
+GRID_DEF_UNOP(sin, UnarySin);
+GRID_DEF_UNOP(cos, UnaryCos);
+GRID_DEF_UNOP(asin, UnaryAsin);
+GRID_DEF_UNOP(acos, UnaryAcos);
+GRID_DEF_UNOP(log, UnaryLog);
+GRID_DEF_UNOP(exp, UnaryExp);
+
+// Binary arithmetic operators; each line defines the left-lattice and the
+// right-lattice overload.
+GRID_DEF_BINOP(operator+, BinaryAdd);
+GRID_DEF_BINOP(operator-, BinarySub);
+GRID_DEF_BINOP(operator*, BinaryMul);
+GRID_DEF_BINOP(operator/, BinaryDiv);
+
+// Binary bitwise and logical operators.
+GRID_DEF_BINOP(operator&, BinaryAnd);
+GRID_DEF_BINOP(operator|, BinaryOr);
+GRID_DEF_BINOP(operator&&, BinaryAndAnd);
+GRID_DEF_BINOP(operator||, BinaryOrOr);
+
+// where(pred, lhs, rhs): three-operand select built on TrinaryWhere.
+GRID_DEF_TRINOP(where, TrinaryWhere);
+
+/////////////////////////////////////////////////////////////
+// Closure convenience to force expression to evaluate
+/////////////////////////////////////////////////////////////
+// Unary node: construct a Lattice from the expression and return it. The
+// element type is whatever the node's func() yields for its operand evaluated
+// at index 0.
+template <class Op, class T1>
+auto closure(const LatticeUnaryExpression<Op, T1> &expr)
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> {
+  typedef decltype(expr.first.func(eval(0, std::get<0>(expr.second))))
+      site_type;
+  Lattice<site_type> ret(expr);
+  return ret;
+}
+
+// Binary node: same as above with two operands.
+template <class Op, class T1, class T2>
+auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+                                        eval(0, std::get<1>(expr.second))))> {
+  typedef decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+                                   eval(0, std::get<1>(expr.second))))
+      site_type;
+  Lattice<site_type> ret(expr);
+  return ret;
+}
+
+// Trinary node: same as above with three operands.
+template <class Op, class T1, class T2, class T3>
+auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
+    -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+                                        eval(0, std::get<1>(expr.second)),
+                                        eval(0, std::get<2>(expr.second))))> {
+  typedef decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
+                                   eval(0, std::get<1>(expr.second)),
+                                   eval(0, std::get<2>(expr.second))))
+      site_type;
+  Lattice<site_type> ret(expr);
+  return ret;
+}
+
+// Remove the glue macros so they do not leak out of this header.
+// NOTE(review): GRID_BINOP_LEFT, GRID_BINOP_RIGHT and the Grid*OpClass macros
+// are not undefined in the lines shown here.
+#undef GRID_UNOP
+#undef GRID_BINOP
+#undef GRID_TRINOP
+
+#undef GRID_DEF_UNOP
+#undef GRID_DEF_BINOP
+#undef GRID_DEF_TRINOP
+}
+
+#if 0
+// Disabled usage sketch (never compiled): builds a binary expression node by
+// hand, then through operator+, and assigns expressions to a lattice.
+using namespace Grid;
+        
+ int main(int argc,char **argv){
+   
+   Lattice<double> v1(16);
+   Lattice<double> v2(16);
+   Lattice<double> v3(16);
+
+   BinaryAdd<double,double> tmp;
+   LatticeBinaryExpression<BinaryAdd<double,double>,Lattice<double> &,Lattice<double> &> 
+     expr(std::make_pair(tmp,
+    std::forward_as_tuple(v1,v2)));
+   tmp.func(eval(0,v1),eval(0,v2));
+
+   auto var = v1+v2;
+   std::cout<<GridLogMessage<<typeid(var).name()<<std::endl;
+
+   v3=v1+v2;
+   v3=v1+v2+v1*v2;
+ };
+
+void testit(Lattice<double> &v1,Lattice<double> &v2,Lattice<double> &v3)
+{
+   v3=v1+v2+v1*v2;
+}
+#endif
+
+#endif
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@ -0,0 +1,255 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_arith.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_ARITH_H
+#define GRID_LATTICE_ARITH_H
+
+namespace Grid {
+
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  //  Lattice (op) Lattice: mult, mac, sub, add that write into ret and so avoid a copy back
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Lattice * Lattice: applies the site-level mult() to element ss of lhs and
+  // rhs for every ss below lhs._grid->oSites(), storing into ret.
+  // ret takes the checkerboard of lhs before the conformable() checks run.
+  template<class obj1,class obj2,class obj3> strong_inline
+    void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,rhs);
+    conformable(lhs,rhs);
+    parallel_for(int ss=0;ss<lhs._grid->oSites();++ss){
+#ifndef STREAMING_STORES
+      // Direct path: compute straight into the destination element.
+      mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#else
+      // Streaming path: compute into a local, then hand it to vstream().
+      obj1 site;
+      mult(&site,&lhs._odata[ss],&rhs._odata[ss]);
+      vstream(ret._odata[ss],site);
+#endif
+    }
+  }
+  
+  // Lattice mac (multiply-accumulate) over three lattices: applies the
+  // site-level mac() to element ss of ret, lhs and rhs for every ss below
+  // lhs._grid->oSites().
+  // ret takes the checkerboard of lhs before the conformable() checks run.
+  template<class obj1,class obj2,class obj3> strong_inline
+    void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,rhs);
+    conformable(lhs,rhs);
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
+      // The non-streaming branch operates in place on ret, so the temporary
+      // must start from the current value of ret; leaving it uninitialised
+      // would make the two branches disagree whenever the site-level mac()
+      // reads its destination.
+      obj1 tmp = ret._odata[ss];
+      mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
+      vstream(ret._odata[ss],tmp);
+#else
+      mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#endif
+    }
+  }
+  
+  // Lattice - Lattice: applies the site-level sub() to element ss of lhs and
+  // rhs for every ss below lhs._grid->oSites(), storing into ret.
+  // ret takes the checkerboard of lhs before the conformable() checks run.
+  template<class obj1,class obj2,class obj3> strong_inline
+    void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,rhs);
+    conformable(lhs,rhs);
+    parallel_for(int ss=0;ss<lhs._grid->oSites();++ss){
+#ifndef STREAMING_STORES
+      // Direct path: compute straight into the destination element.
+      sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#else
+      // Streaming path: compute into a local, then hand it to vstream().
+      obj1 site;
+      sub(&site,&lhs._odata[ss],&rhs._odata[ss]);
+      vstream(ret._odata[ss],site);
+#endif
+    }
+  }
+  // Lattice + Lattice: applies the site-level add() to element ss of lhs and
+  // rhs for every ss below lhs._grid->oSites(), storing into ret.
+  // ret takes the checkerboard of lhs before the conformable() checks run.
+  template<class obj1,class obj2,class obj3> strong_inline
+    void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,rhs);
+    conformable(lhs,rhs);
+    parallel_for(int ss=0;ss<lhs._grid->oSites();++ss){
+#ifndef STREAMING_STORES
+      // Direct path: compute straight into the destination element.
+      add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
+#else
+      // Streaming path: compute into a local, then hand it to vstream().
+      obj1 site;
+      add(&site,&lhs._odata[ss],&rhs._odata[ss]);
+      vstream(ret._odata[ss],site);
+#endif
+    }
+  }
+  
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  //  Lattice (op) site object: mult, mac, sub, add that write into ret and so avoid a copy back
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class obj1,class obj2,class obj3> strong_inline
+    void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+    ret.checkerboard = lhs.checkerboard;
+    conformable(lhs,ret);
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+      obj1 tmp;
+      mult(&tmp,&lhs._odata[ss],&rhs);
+      vstream(ret._odata[ss],tmp);
+    }
+  }
+  
+  template<class obj1,class obj2,class obj3> strong_inline
+    void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,lhs);
+    parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
+      obj1 tmp;
+      mac(&tmp,&lhs._odata[ss],&rhs);
+      vstream(ret._odata[ss],tmp);
+    }
+  }
+  
+  // Lattice - single object: applies the site-level sub() to element ss of
+  // lhs and the same rhs object for every ss below lhs._grid->oSites(),
+  // storing into ret. ret takes the checkerboard of lhs.
+  template<class obj1,class obj2,class obj3> strong_inline
+    void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+    ret.checkerboard = lhs.checkerboard;
+    conformable(ret,lhs);
+    parallel_for(int ss=0;ss<lhs._grid->oSites();++ss){
+#ifndef STREAMING_STORES
+      // Direct path: compute straight into the destination element.
+      sub(&ret._odata[ss],&lhs._odata[ss],&rhs);
+#else
+      // Streaming path: compute into a local, then hand it to vstream().
+      obj1 site;
+      sub(&site,&lhs._odata[ss],&rhs);
+      vstream(ret._odata[ss],site);
+#endif
+    }
+  }
+  // Lattice + single object: applies the site-level add() to element ss of
+  // lhs and the same rhs object for every ss below lhs._grid->oSites(),
+  // storing into ret. ret takes the checkerboard of lhs.
+  template<class obj1,class obj2,class obj3> strong_inline
+    void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+    ret.checkerboard = lhs.checkerboard;
+    conformable(lhs,ret);
+    parallel_for(int ss=0;ss<lhs._grid->oSites();++ss){
+#ifndef STREAMING_STORES
+      // Direct path: compute straight into the destination element.
+      add(&ret._odata[ss],&lhs._odata[ss],&rhs);
+#else
+      // Streaming path: compute into a local, then hand it to vstream().
+      obj1 site;
+      add(&site,&lhs._odata[ss],&rhs);
+      vstream(ret._odata[ss],site);
+#endif
+    }
+  }
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  //  Site object (op) Lattice: mult, mac, sub, add that write into ret and so avoid a copy back
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Single object * Lattice: applies the site-level mult() to the same lhs
+    // object and element ss of rhs for every ss below rhs._grid->oSites(),
+    // storing into ret. ret takes the checkerboard of rhs.
+    template<class obj1,class obj2,class obj3> strong_inline
+    void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = rhs.checkerboard;
+    conformable(ret,rhs);
+    parallel_for(int ss=0;ss<rhs._grid->oSites();++ss){
+#ifndef STREAMING_STORES
+      // Direct path: compute straight into the destination element.
+      mult(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#else
+      // Streaming path: compute into a local, then hand it to vstream().
+      obj1 site;
+      mult(&site,&lhs,&rhs._odata[ss]);
+      vstream(ret._odata[ss],site);
+#endif
+    }
+  }
+  
+  // Lattice mac (multiply-accumulate) with a single lhs object: applies the
+  // site-level mac() to element ss of ret, the lhs object and element ss of
+  // rhs for every ss below rhs._grid->oSites(). ret takes the checkerboard of
+  // rhs.
+  template<class obj1,class obj2,class obj3> strong_inline
+    void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = rhs.checkerboard;
+    conformable(ret,rhs);
+    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
+#ifdef STREAMING_STORES
+      // The non-streaming branch operates in place on ret, so the temporary
+      // must start from the current value of ret rather than being left
+      // uninitialised.
+      obj1 tmp = ret._odata[ss];
+      mac(&tmp,&lhs,&rhs._odata[ss]);
+      vstream(ret._odata[ss],tmp);
+#else 
+      mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#endif
+    }
+  }
+  
+  // Single object - Lattice: applies the site-level sub() to the same lhs
+  // object and element ss of rhs for every ss below rhs._grid->oSites(),
+  // storing into ret. ret takes the checkerboard of rhs.
+  template<class obj1,class obj2,class obj3> strong_inline
+    void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = rhs.checkerboard;
+    conformable(ret,rhs);
+    parallel_for(int ss=0;ss<rhs._grid->oSites();++ss){
+#ifndef STREAMING_STORES
+      // Direct path: compute straight into the destination element.
+      sub(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#else
+      // Streaming path: compute into a local, then hand it to vstream().
+      obj1 site;
+      sub(&site,&lhs,&rhs._odata[ss]);
+      vstream(ret._odata[ss],site);
+#endif
+    }
+  }
+  // Single object + Lattice: applies the site-level add() to the same lhs
+  // object and element ss of rhs for every ss below rhs._grid->oSites(),
+  // storing into ret. ret takes the checkerboard of rhs.
+  template<class obj1,class obj2,class obj3> strong_inline
+    void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+    ret.checkerboard = rhs.checkerboard;
+    conformable(ret,rhs);
+    parallel_for(int ss=0;ss<rhs._grid->oSites();++ss){
+#ifndef STREAMING_STORES
+      // Direct path: compute straight into the destination element.
+      add(&ret._odata[ss],&lhs,&rhs._odata[ss]);
+#else
+      // Streaming path: compute into a local, then hand it to vstream().
+      obj1 site;
+      add(&site,&lhs,&rhs._odata[ss]);
+      vstream(ret._odata[ss],site);
+#endif
+    }
+  }
+  
+  // ret = a*x + y, element by element, for every ss below x._grid->oSites().
+  // ret takes the checkerboard of x before the conformable() checks run.
+  template<class sobj,class vobj> strong_inline
+  void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    ret.checkerboard = x.checkerboard;
+    conformable(ret,x);
+    conformable(x,y);
+    parallel_for(int ss=0;ss<x._grid->oSites();++ss){
+#ifndef STREAMING_STORES
+      // Direct path: assign straight into the destination element.
+      ret._odata[ss]=a*x._odata[ss]+y._odata[ss];
+#else
+      // Streaming path: form the value locally, then hand it to vstream().
+      vobj site = a*x._odata[ss]+y._odata[ss];
+      vstream(ret._odata[ss],site);
+#endif
+    }
+  }
+  // ret = a*x + b*y, element by element, for every ss below
+  // x._grid->oSites(). ret takes the checkerboard of x before the
+  // conformable() checks run.
+  template<class sobj,class vobj> strong_inline
+  void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    ret.checkerboard = x.checkerboard;
+    conformable(ret,x);
+    conformable(x,y);
+    parallel_for(int ss=0;ss<x._grid->oSites();++ss){
+#ifndef STREAMING_STORES
+      // Direct path: assign straight into the destination element.
+      ret._odata[ss]=a*x._odata[ss]+b*y._odata[ss];
+#else
+      // Streaming path: form the value locally, then hand it to vstream().
+      vobj site = a*x._odata[ss]+b*y._odata[ss];
+      vstream(ret._odata[ss],site);
+#endif
+    }
+  }
+
+  // Thin wrapper: forwards all arguments to axpy_norm_fast() and returns the
+  // RealD it produces.
+  template<class sobj,class vobj> strong_inline
+  RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    const RealD nrm = axpy_norm_fast(ret,a,x,y);
+    return nrm;
+  }
+  // Thin wrapper: forwards all arguments to axpby_norm_fast() and returns the
+  // RealD it produces.
+  template<class sobj,class vobj> strong_inline
+  RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
+    const RealD nrm = axpby_norm_fast(ret,a,b,x,y);
+    return nrm;
+  }
+
+}
+#endif
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@ -0,0 +1,375 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/lattice/Lattice_base.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LATTICE_BASE_H
+#define GRID_LATTICE_BASE_H
+
+#define STREAMING_STORES
+
+namespace Grid {
+
+// TODO: 
+//       mac,real,imag
+
+// Functionality:
+//     -=,+=,*=,()
+//     add,+,sub,-,mult,mac,*
+//     adj,conjugate
+//     real,imag
+//     transpose,transposeIndex  
+//     trace,traceIndex
+//     peekIndex
+//     innerProduct,outerProduct,
+//     localNorm2
+//     localInnerProduct
+
+extern int GridCshiftPermuteMap[4][16];
+
+////////////////////////////////////////////////
+// Basic expressions used in Expression Template
+////////////////////////////////////////////////
+
+// Non-templated base shared by every Lattice<vobj>; it only carries the grid
+// pointer and a virtual destructor.
+class LatticeBase
+{
+public:
+    virtual ~LatticeBase(void) = default;
+    // Grid this lattice is defined on. Defaults to nullptr so the pointer is
+    // never indeterminate before a derived-class constructor assigns it.
+    GridBase *_grid = nullptr;
+};
+    
+// Empty tag base inherited by the unary/binary/trinary expression node classes.
+class LatticeExpressionBase {};
+
+// Expression node with one operand: a std::pair of the operation object and a
+// one-element tuple holding the operand, tagged with LatticeExpressionBase.
+template <typename Op, typename T1>
+class LatticeUnaryExpression  : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
+  typedef std::pair<Op,std::tuple<T1> > node_pair;
+ public:
+  // Copies the supplied (op, operands) pair into the pair base.
+  LatticeUnaryExpression(const node_pair &arg) : node_pair(arg) {}
+};
+
+// Expression node with two operands: a std::pair of the operation object and a
+// two-element tuple holding the operands, tagged with LatticeExpressionBase.
+template <typename Op, typename T1, typename T2>
+class LatticeBinaryExpression : public std::pair<Op,std::tuple<T1,T2> > , public LatticeExpressionBase {
+  typedef std::pair<Op,std::tuple<T1,T2> > node_pair;
+ public:
+  // Copies the supplied (op, operands) pair into the pair base.
+  LatticeBinaryExpression(const node_pair &arg) : node_pair(arg) {}
+};
+
+// Expression node with three operands: a std::pair of the operation object and a
+// three-element tuple holding the operands, tagged with LatticeExpressionBase.
+template <typename Op, typename T1, typename T2, typename T3>
+class LatticeTrinaryExpression :public std::pair<Op,std::tuple<T1,T2,T3> >, public LatticeExpressionBase {
+  typedef std::pair<Op,std::tuple<T1,T2,T3> > node_pair;
+ public:
+  // Copies the supplied (op, operands) pair into the pair base.
+  LatticeTrinaryExpression(const node_pair &arg) : node_pair(arg) {}
+};
+
+// Two grids are conformable only when they are the very same GridBase object;
+// any other pair trips the assert.
+inline void conformable(GridBase *lhs,GridBase *rhs)
+{
+  assert(lhs == rhs);
+}
+
+// Lattice<vobj>: a field holding one vobj per outer site of a GridBase.
+// The site data lives in _odata (sized to _grid->oSites() by the constructors)
+// and the object carries a checkerboard tag next to the grid pointer inherited
+// from LatticeBase.
+template<class vobj>
+class Lattice : public LatticeBase
+{
+public:
+    // Checkerboard tag. Set to 0 by the grid constructor and by reset();
+    // closures from expressions assert that it ends up as Odd or Even.
+    int checkerboard;
+    // Per-outer-site storage.
+    Vector<vobj> _odata;
+    
+    // to pthread need a computable loop where loop induction is not required
+    // begin()/end() give the index range [0,_odata.size()); operator[] indexes _odata.
+    int begin(void) { return 0;};
+    int end(void)   { return _odata.size(); }
+    vobj & operator[](int i) { return _odata[i]; };
+    const vobj & operator[](int i) const { return _odata[i]; };
+
+public:
+    typedef typename vobj::scalar_type scalar_type;
+    typedef typename vobj::vector_type vector_type;
+    typedef vobj vector_object;
+   
+  ////////////////////////////////////////////////////////////////////////////////
+  // Expression Template closure support
+  //
+  // Each operator= below follows the same steps:
+  //   1. fetch the grid of the expression and assert it is this lattice's grid;
+  //   2. fetch the checkerboard of the expression, assert it is Odd or Even and
+  //      adopt it;
+  //   3. evaluate the expression at every outer site into _odata.
+  ////////////////////////////////////////////////////////////////////////////////
+  template <typename Op, typename T1>                         strong_inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
+  {
+    GridBase *egrid(nullptr);
+    GridFromExpression(egrid,expr);
+    assert(egrid!=nullptr);
+    conformable(_grid,egrid);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+#ifdef STREAMING_STORES
+      vobj tmp = eval(ss,expr);
+      vstream(_odata[ss] ,tmp);
+#else
+      _odata[ss]=eval(ss,expr);
+#endif
+    }
+    return *this;
+  }
+  // Binary-expression closure; same steps as the unary form.
+  template <typename Op, typename T1,typename T2> strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
+  {
+    GridBase *egrid(nullptr);
+    GridFromExpression(egrid,expr);
+    assert(egrid!=nullptr);
+    conformable(_grid,egrid);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+#ifdef STREAMING_STORES
+      vobj tmp = eval(ss,expr);
+      vstream(_odata[ss] ,tmp);
+#else
+      _odata[ss]=eval(ss,expr);
+#endif
+    }
+    return *this;
+  }
+  // Trinary-expression closure; same steps, except that the streaming branch
+  // passes the eval() result straight to vstream() without a named temporary.
+  template <typename Op, typename T1,typename T2,typename T3> strong_inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
+  {
+    GridBase *egrid(nullptr);
+    GridFromExpression(egrid,expr);
+    assert(egrid!=nullptr);
+    conformable(_grid,egrid);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+#ifdef STREAMING_STORES
+      //vobj tmp = eval(ss,expr);
+      vstream(_odata[ss] ,eval(ss,expr));
+#else
+      _odata[ss] = eval(ss,expr);
+#endif
+    }
+    return *this;
+  }
+  //GridFromExpression is tricky to do
+  // Constructors from expressions: the grid is taken from the expression itself
+  // (asserted non-null), the checkerboard is taken from the expression (asserted
+  // Odd or Even), storage is sized to oSites() and then filled site by site.
+  template<class Op,class T1>
+    Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
+    _grid = nullptr;
+    GridFromExpression(_grid,expr);
+    assert(_grid!=nullptr);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
+    _odata.resize(_grid->oSites());
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+#ifdef STREAMING_STORES
+      vobj tmp = eval(ss,expr);
+      vstream(_odata[ss] ,tmp);
+#else
+      _odata[ss]=eval(ss,expr);
+#endif
+    }
+  };
+  template<class Op,class T1, class T2>
+  Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
+    _grid = nullptr;
+    GridFromExpression(_grid,expr);
+    assert(_grid!=nullptr);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
+    _odata.resize(_grid->oSites());
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+#ifdef STREAMING_STORES
+      vobj tmp = eval(ss,expr);
+      vstream(_odata[ss] ,tmp);
+#else
+      _odata[ss]=eval(ss,expr);
+#endif
+    }
+  };
+  // NOTE: unlike the unary/binary constructors, this one calls vstream()
+  // unconditionally; there is no STREAMING_STORES guard here.
+  template<class Op,class T1, class T2, class T3>
+  Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
+    _grid = nullptr;
+    GridFromExpression(_grid,expr);
+    assert(_grid!=nullptr);
+
+    int cb=-1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    checkerboard=cb;
+
+    _odata.resize(_grid->oSites());
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+      vstream(_odata[ss] ,eval(ss,expr));
+    }
+  };
+
+  //////////////////////////////////////////////////////////////////
+  // Constructor requires "grid" passed.
+  // what about a default grid?
+  //
+  // Allocates oSites() elements, asserts that the address of the first
+  // element has its low four bits clear (16-byte aligned) and sets the
+  // checkerboard to 0.
+  // NOTE(review): the alignment assert takes &_odata[0], so it presumably
+  // expects a grid with at least one outer site - confirm.
+  //////////////////////////////////////////////////////////////////
+  Lattice(GridBase *grid) : _odata(grid->oSites()) {
+    _grid = grid;
+    //        _odata.reserve(_grid->oSites());
+    //        _odata.resize(_grid->oSites());
+    //      std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
+    assert((((uint64_t)&_odata[0])&0xF) ==0);
+    checkerboard=0;
+  }
+  
+  // Copies grid pointer and checkerboard, sizes the storage, then copies
+  // the site data element by element.
+  Lattice(const Lattice& r){ // copy constructor
+    _grid = r._grid;
+    checkerboard = r.checkerboard;
+    _odata.resize(_grid->oSites());// essential
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+      _odata[ss]=r._odata[ss];
+    }  	
+  }
+
+  // Copies grid pointer and checkerboard and moves the storage out of r.
+  Lattice(Lattice&& r){ // move constructor
+    _grid = r._grid;
+    checkerboard = r.checkerboard;
+    _odata=std::move(r._odata);
+  }
+  
+  // Move assignment: same member-wise transfer as the move constructor.
+  inline Lattice<vobj> & operator = (Lattice<vobj> && r)
+  {
+    _grid        = r._grid;
+    checkerboard = r.checkerboard;
+    _odata       =std::move(r._odata);
+    return *this;
+  }
+
+  // Copy assignment: adopts r's grid and checkerboard (no conformable check),
+  // resizes the storage to the new grid and copies the site data.
+  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
+    _grid        = r._grid;
+    checkerboard = r.checkerboard;
+    _odata.resize(_grid->oSites());// essential
+    
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+      _odata[ss]=r._odata[ss];
+    }  	
+    return *this;
+  }
+
+  // Assignment from a lattice of a different site type. The checkerboard is
+  // copied before conformable() is called and the storage is not resized, so
+  // the two lattices are expected to already share a grid.
+  template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
+    this->checkerboard = r.checkerboard;
+    conformable(*this,r);
+    
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+      this->_odata[ss]=r._odata[ss];
+    }
+    return *this;
+  }
+
+  virtual ~Lattice(void) = default;
+    
+  // Re-targets the lattice to another grid. Nothing happens when the grid
+  // pointer is unchanged; otherwise the storage is resized to the new grid's
+  // oSites() and the checkerboard is reset to 0.
+  void reset(GridBase* grid) {
+    if (_grid != grid) {
+      _grid = grid;
+      _odata.resize(grid->oSites());
+      checkerboard = 0;
+    }
+  }
+  
+
+  // Assignment from a non-lattice object: every outer site is assigned r.
+  // Grid and checkerboard are left untouched.
+  template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
+    parallel_for(int ss=0;ss<_grid->oSites();ss++){
+      this->_odata[ss]=r;
+    }
+    return *this;
+  }
+  
+  
+  // *=,+=,-= operators inherit behaviour from the corresponding */+/- operation
+  template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {
+    *this = (*this)*r;
+    return *this;
+  }
+  
+  template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) {
+    *this = (*this)-r;
+    return *this;
+  }
+  template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) {
+    *this = (*this)+r;
+    return *this;
+  }
+}; // class Lattice
+  
+  // Stream a lattice one global site per line, formatted as
+  //   [c0,c1,...]<TAB><site value>
+  // Sites are visited in global-index order; each value is fetched with peekSite().
+  template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
+    typedef typename vobj::scalar_object sobj;
+    std::vector<int> coor;
+    sobj site;
+    for(int gidx=0;gidx<o._grid->_gsites;gidx++){
+      o._grid->GlobalIndexToGlobalCoor(gidx,coor);
+      peekSite(site,o,coor);
+      stream<<"[";
+      for(std::size_t d=0;d<coor.size();d++){
+        // separator goes in front of every coordinate except the first
+        if(d>0) stream<<",";
+        stream<<coor[d];
+      }
+      stream<<"]\t"<<site<<std::endl;
+    }
+    return stream;
+  }
+  
+}
+
+
+
+#include "Lattice_conformable.h"
+#define GRID_LATTICE_EXPRESSION_TEMPLATES
+#ifdef  GRID_LATTICE_EXPRESSION_TEMPLATES
+#include "Lattice_ET.h"
+#else 
+#include "Lattice_overload.h"
+#endif
+#include "Lattice_arith.h"
+#include "Lattice_trace.h"
+#include "Lattice_transpose.h"
+#include "Lattice_local.h"
+#include "Lattice_reduction.h"
+#include "Lattice_peekpoke.h"
+#include "Lattice_reality.h"
+#include "Lattice_comparison_utils.h"
+#include "Lattice_comparison.h"
+#include "Lattice_coordinate.h"
+#include "Lattice_where.h"
+#include "Lattice_rng.h"
+#include "Lattice_unary.h"
+#include "Lattice_transfer.h"
+
+
+#endif
--- a/Grid/lattice/Lattice_comparison.h
+++ b/Grid/lattice/Lattice_comparison.h
@ -0,0 +1,169 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_comparison.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_COMPARISON_H
+#define GRID_LATTICE_COMPARISON_H
+
+namespace Grid {
+
+    //////////////////////////////////////////////////////////////////////////
+    // relational operators
+    // 
+    // Support <,>,<=,>=,==,!=
+    //
+    //Query supporting bitwise &, |, ^, !
+    //Query supporting logical &&, ||, 
+    //////////////////////////////////////////////////////////////////////////
+
+  //////////////////////////////////////////////////////////////////////////
+  // compare lattice to lattice
+  //////////////////////////////////////////////////////////////////////////
+  // Site-by-site comparison of two lattices: op is applied to the matching
+  // sites of lhs and rhs and the results are collected in a Lattice<vInteger>
+  // built on rhs's grid. No conformable check is made between lhs and rhs here.
+  template<class vfunctor,class lobj,class robj>
+    inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
+  {
+    Lattice<vInteger> mask(rhs._grid);
+    parallel_for(int site=0;site<rhs._grid->oSites(); site++){
+      mask._odata[site]=op(lhs._odata[site],rhs._odata[site]);
+    }
+    return mask;
+  }
+  //////////////////////////////////////////////////////////////////////////
+  // compare lattice to scalar
+  //////////////////////////////////////////////////////////////////////////
+  // Site-by-site comparison of a lattice (left) against one fixed object
+  // (right): op(lhs_site,rhs) is stored at every outer site of a
+  // Lattice<vInteger> built on lhs's grid.
+  template<class vfunctor,class lobj,class robj>
+    inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
+  {
+    Lattice<vInteger> mask(lhs._grid);
+    parallel_for(int site=0;site<lhs._grid->oSites(); site++){
+      mask._odata[site]=op(lhs._odata[site],rhs);
+    }
+    return mask;
+  }
+  //////////////////////////////////////////////////////////////////////////
+  // compare scalar to lattice
+  //////////////////////////////////////////////////////////////////////////
+  // Site-by-site comparison of one fixed object (left) against a lattice
+  // (right): op(lhs,rhs_site) is stored at every outer site of a
+  // Lattice<vInteger> built on rhs's grid.
+  //
+  // The scalar is the LEFT operand and the lattice site the RIGHT operand, so
+  // non-symmetric relations (<, <=, >, >=) keep their meaning. The earlier body
+  // indexed lhs._odata and passed the whole lattice as the right operand.
+  template<class vfunctor,class lobj,class robj> 
+    inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
+  {
+    Lattice<vInteger> ret(rhs._grid);
+    parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+      ret._odata[ss]=op(lhs,rhs._odata[ss]);
+    }
+    return ret;
+  }
+  
+  //////////////////////////////////////////////////////////////////////////
+  // Map to functors
+  //////////////////////////////////////////////////////////////////////////
+  // Less than
+  // operator < : one overload per operand arrangement (lattice/lattice,
+  // lattice/object, object/lattice). Each builds a vlt functor for the two
+  // operand types and hands it to the matching comparison driver.
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+    vlt<lobj,robj> cmp;
+    return LLComparison(cmp,lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
+    vlt<lobj,robj> cmp;
+    return LSComparison(cmp,lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
+    vlt<lobj,robj> cmp;
+    return SLComparison(cmp,lhs,rhs);
+  }
+  
+  // Less than equal
+  // operator <= : one overload per operand arrangement (lattice/lattice,
+  // lattice/object, object/lattice). Each builds a vle functor for the two
+  // operand types and hands it to the matching comparison driver.
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+    vle<lobj,robj> cmp;
+    return LLComparison(cmp,lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
+    vle<lobj,robj> cmp;
+    return LSComparison(cmp,lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
+    vle<lobj,robj> cmp;
+    return SLComparison(cmp,lhs,rhs);
+  }
+  
+  // Greater than 
+  // operator > : one overload per operand arrangement (lattice/lattice,
+  // lattice/object, object/lattice). Each builds a vgt functor for the two
+  // operand types and hands it to the matching comparison driver.
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+    vgt<lobj,robj> cmp;
+    return LLComparison(cmp,lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
+    vgt<lobj,robj> cmp;
+    return LSComparison(cmp,lhs,rhs);
+  }
+  template<class lobj,class robj>
+    inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
+    vgt<lobj,robj> cmp;
+    return SLComparison(cmp,lhs,rhs);
+  }
+  
+  
+  // Greater than equal
+   // operator >= : one overload per operand arrangement (lattice/lattice,
+   // lattice/object, object/lattice). Each builds a vge functor for the two
+   // operand types and hands it to the matching comparison driver.
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+     vge<lobj,robj> cmp;
+     return LLComparison(cmp,lhs,rhs);
+   }
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const robj & rhs) {
+     vge<lobj,robj> cmp;
+     return LSComparison(cmp,lhs,rhs);
+   }
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
+     vge<lobj,robj> cmp;
+     return SLComparison(cmp,lhs,rhs);
+   }
+   
+   // equal
+   // operator == : one overload per operand arrangement (lattice/lattice,
+   // lattice/object, object/lattice). Each builds a veq functor for the two
+   // operand types and hands it to the matching comparison driver.
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+     veq<lobj,robj> cmp;
+     return LLComparison(cmp,lhs,rhs);
+   }
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
+     veq<lobj,robj> cmp;
+     return LSComparison(cmp,lhs,rhs);
+   }
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
+     veq<lobj,robj> cmp;
+     return SLComparison(cmp,lhs,rhs);
+   }
+   
+   
+   // not equal
+   // operator != : one overload per operand arrangement (lattice/lattice,
+   // lattice/object, object/lattice). Each builds a vne functor for the two
+   // operand types and hands it to the matching comparison driver.
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+     vne<lobj,robj> cmp;
+     return LLComparison(cmp,lhs,rhs);
+   }
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
+     vne<lobj,robj> cmp;
+     return LSComparison(cmp,lhs,rhs);
+   }
+   template<class lobj,class robj>
+     inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
+     vne<lobj,robj> cmp;
+     return SLComparison(cmp,lhs,rhs);
+   }
+}
+#endif
--- a/Grid/lattice/Lattice_comparison_utils.h
+++ b/Grid/lattice/Lattice_comparison_utils.h
@ -0,0 +1,232 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_comparison_utils.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_COMPARISON_H
+#define GRID_COMPARISON_H
+
+namespace Grid {
+
+  /////////////////////////////////////////
+  // This implementation is a bit poor.
+  //
+  // Only support relational logical operations (<, >  etc)
+  // on scalar objects. Therefore can strip any tensor structures.
+  //
+  // Should guard this with isGridTensor<> enable if?
+  /////////////////////////////////////////
+  //
+  // Generic list of functors
+  //
+  // Functor returning (lhs == rhs) as a vInteger.
+  template<class lobj,class robj> class veq {
+  public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    {
+      vInteger pred = (lhs == rhs);
+      return pred;
+    }
+  };
+  // Functor returning (lhs != rhs) as a vInteger.
+  template<class lobj,class robj> class vne {
+  public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    {
+      vInteger pred = (lhs != rhs);
+      return pred;
+    }
+  };
+  // Functor returning (lhs < rhs) as a vInteger.
+  template<class lobj,class robj> class vlt {
+  public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    {
+      vInteger pred = (lhs < rhs);
+      return pred;
+    }
+  };
+  // Functor returning (lhs <= rhs) as a vInteger.
+  template<class lobj,class robj> class vle {
+  public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    {
+      vInteger pred = (lhs <= rhs);
+      return pred;
+    }
+  };
+  // Functor returning (lhs > rhs) as a vInteger.
+  template<class lobj,class robj> class vgt {
+  public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    {
+      vInteger pred = (lhs > rhs);
+      return pred;
+    }
+  };
+  // Functor returning (lhs >= rhs) as a vInteger.
+  template<class lobj,class robj> class vge {
+  public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    {
+      vInteger pred = (lhs >= rhs);
+      return pred;
+    }
+  };
+  
+  // Generic list of functors
+  // Functor returning (lhs == rhs) as an Integer.
+  template<class lobj,class robj> class seq {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    {
+      Integer flag = (lhs == rhs);
+      return flag;
+    }
+  };
+  // Functor returning (lhs != rhs) as an Integer.
+  template<class lobj,class robj> class sne {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    {
+      Integer flag = (lhs != rhs);
+      return flag;
+    }
+  };
+  // Functor returning (lhs < rhs) as an Integer.
+  template<class lobj,class robj> class slt {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    {
+      Integer flag = (lhs < rhs);
+      return flag;
+    }
+  };
+  // Functor returning (lhs <= rhs) as an Integer.
+  template<class lobj,class robj> class sle {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    {
+      Integer flag = (lhs <= rhs);
+      return flag;
+    }
+  };
+  // Functor returning (lhs > rhs) as an Integer.
+  template<class lobj,class robj> class sgt {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    {
+      Integer flag = (lhs > rhs);
+      return flag;
+    }
+  };
+  // Functor returning (lhs >= rhs) as an Integer.
+  template<class lobj,class robj> class sge {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    {
+      Integer flag = (lhs >= rhs);
+      return flag;
+    }
+  };
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Integer and real get extra relational functions.
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // simd-vs-simd comparison: both operands are extracted into per-element
+  // scalar buffers, sop is applied element by element, and the Integer
+  // results are merged back into a vInteger.
+  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
+    inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
+    {
+      typedef typename vsimd::scalar_type scalar;
+      std::vector<scalar>  left (vsimd::Nsimd());
+      std::vector<scalar>  right(vsimd::Nsimd());
+      std::vector<Integer> flags(vsimd::Nsimd());
+      extract<vsimd,scalar>(lhs,left);
+      extract<vsimd,scalar>(rhs,right);
+      for(int idx=0;idx<vsimd::Nsimd();idx++){
+        flags[idx] = sop(left[idx],right[idx]);
+      }
+      vInteger packed;
+      merge<vInteger,Integer>(packed,flags);
+      return packed;
+    }
+
+  // simd-vs-scalar comparison: the simd operand is extracted into a scalar
+  // buffer, sop(element,rhs) is applied element by element, and the Integer
+  // results are merged back into a vInteger.
+  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
+    inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
+    {
+      typedef typename vsimd::scalar_type scalar;
+      std::vector<scalar>  left (vsimd::Nsimd());
+      std::vector<Integer> flags(vsimd::Nsimd());
+      extract<vsimd,scalar>(lhs,left);
+      for(int idx=0;idx<vsimd::Nsimd();idx++){
+        flags[idx] = sop(left[idx],rhs);
+      }
+      vInteger packed;
+      merge<vInteger,Integer>(packed,flags);
+      return packed;
+    }
+
+  // scalar-vs-simd comparison: the simd operand is extracted into a scalar
+  // buffer, sop(lhs,element) is applied element by element, and the Integer
+  // results are merged back into a vInteger.
+  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
+    inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
+    {
+      typedef typename vsimd::scalar_type scalar;
+      std::vector<scalar>  right(vsimd::Nsimd());
+      std::vector<Integer> flags(vsimd::Nsimd());
+      extract<vsimd,scalar>(rhs,right);
+      for(int idx=0;idx<vsimd::Nsimd();idx++){
+        flags[idx] = sop(lhs,right[idx]);
+      }
+      vInteger packed;
+      merge<vInteger,Integer>(packed,flags);
+      return packed;
+    }
+
+// DECLARE_RELATIONAL_EQ(op,functor) emits five overloads of the operator `op`:
+//   simd op simd, simd op scalar, scalar op simd
+//       -> Comparison(functor<scalar,scalar>(), lhs, rhs)
+//   iScalar op scalar, scalar op iScalar
+//       -> the same operator applied to the iScalar's _internal member
+#define DECLARE_RELATIONAL_EQ(op,functor) \
+  template<class vsimd,IfSimd<vsimd> = 0> \
+    inline vInteger operator op (const vsimd & lhs, const vsimd & rhs) \
+    { \
+      typedef typename vsimd::scalar_type scalar_t; \
+      return Comparison(functor<scalar_t,scalar_t>(),lhs,rhs); \
+    } \
+  template<class vsimd,IfSimd<vsimd> = 0> \
+    inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
+    { \
+      typedef typename vsimd::scalar_type scalar_t; \
+      return Comparison(functor<scalar_t,scalar_t>(),lhs,rhs); \
+    } \
+  template<class vsimd,IfSimd<vsimd> = 0> \
+    inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
+    { \
+      typedef typename vsimd::scalar_type scalar_t; \
+      return Comparison(functor<scalar_t,scalar_t>(),lhs,rhs); \
+    } \
+  template<class vsimd> \
+    inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
+    { \
+      return lhs._internal op rhs; \
+    } \
+  template<class vsimd> \
+    inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
+    { \
+      return lhs op rhs._internal; \
+    }
+
+// DECLARE_RELATIONAL(op,functor): everything DECLARE_RELATIONAL_EQ emits, plus
+// an iScalar op iScalar overload that compares the two _internal members.
+#define DECLARE_RELATIONAL(op,functor) \
+  DECLARE_RELATIONAL_EQ(op,functor) \
+  template<class vsimd> \
+    inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs) \
+    { \
+      return lhs._internal op rhs._internal; \
+    }
+
+// Instantiate the relational operators. `==` uses the _EQ form only, so no
+// iScalar == iScalar overload is generated by this header.
+DECLARE_RELATIONAL(<,slt);
+DECLARE_RELATIONAL(<=,sle);
+DECLARE_RELATIONAL(>,sgt);
+DECLARE_RELATIONAL(>=,sge);
+DECLARE_RELATIONAL_EQ(==,seq);
+DECLARE_RELATIONAL(!=,sne);
+
+// Both generator macros are local helpers; undefine both so neither leaks
+// into code that includes this header.
+#undef DECLARE_RELATIONAL
+#undef DECLARE_RELATIONAL_EQ
+
+}
+
+
+#endif
--- a/Grid/lattice/Lattice_conformable.h
+++ b/Grid/lattice/Lattice_conformable.h
@ -0,0 +1,40 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_conformable.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_CONFORMABLE_H
+#define GRID_LATTICE_CONFORMABLE_H
+
+namespace Grid {
+
+    // Two lattices are conformable when they share the same grid pointer and
+    // carry the same checkerboard; either mismatch trips an assert.
+    template<class obj1,class obj2>
+    void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
+    {
+      assert(lhs._grid == rhs._grid);
+      assert(lhs.checkerboard == rhs.checkerboard);
+    }
+
+}
+#endif
--- a/Grid/lattice/Lattice_coordinate.h
+++ b/Grid/lattice/Lattice_coordinate.h
@ -0,0 +1,56 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_coordinate.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_COORDINATE_H
+#define GRID_LATTICE_COORDINATE_H
+
+namespace Grid {
+
+    // Fills l with a coordinate field: for each (outer,inner) site of this rank
+    // the coordinate returned by RankIndexToGlobalCoor() is looked up, its
+    // component mu is taken, and the inner-site values are merged into the
+    // vector object stored at that outer site.
+    template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
+    {
+      typedef typename iobj::scalar_type scalar_type;
+      typedef typename iobj::vector_type vector_type;
+
+      GridBase *grid = l._grid;
+      int Nsimd = grid->iSites();
+
+      std::vector<int> coor;
+      std::vector<scalar_type> lanebuf(Nsimd);
+      vector_type packed;
+
+      for(int osite=0;osite<grid->oSites();osite++){
+        for(int isite=0;isite<grid->iSites();isite++){
+          grid->RankIndexToGlobalCoor(grid->ThisRank(),osite,isite,coor);
+          lanebuf[isite]=(Integer)coor[mu];
+        }
+        merge<vector_type,scalar_type>(packed,lanebuf);
+        l._odata[osite]=packed;
+      }
+    }
+
+}
+#endif
--- a/Grid/lattice/Lattice_local.h
+++ b/Grid/lattice/Lattice_local.h
@ -0,0 +1,75 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_local.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_LOCALREDUCTION_H
+#define GRID_LATTICE_LOCALREDUCTION_H
+
+///////////////////////////////////////////////
+// localInner, localNorm, outerProduct
+///////////////////////////////////////////////
+
+namespace Grid {
+
+  /////////////////////////////////////////////////////
+  // Site-local routines: reduced within each site, not reduced across sites
+  /////////////////////////////////////////////////////
+  
+  // localNorm2,
+  // Per-site innerProduct of rhs with itself, returned as a lattice of
+  // vobj::tensor_reduced built on rhs's grid.
+  template<class vobj>
+    inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
+    {
+      typedef Lattice<typename vobj::tensor_reduced> ReducedField;
+      ReducedField nrm(rhs._grid);
+      parallel_for(int site=0;site<rhs._grid->oSites(); site++){
+        nrm._odata[site]=innerProduct(rhs._odata[site],rhs._odata[site]);
+      }
+      return nrm;
+    }
+  
+  // localInnerProduct
+  // Per-site innerProduct(lhs_site,rhs_site), returned as a lattice of
+  // vobj::tensor_reduced built on rhs's grid. No conformable check is made here.
+  template<class vobj>
+    inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
+    {
+      typedef Lattice<typename vobj::tensor_reduced> ReducedField;
+      ReducedField prod(rhs._grid);
+      parallel_for(int site=0;site<rhs._grid->oSites(); site++){
+        prod._odata[site]=innerProduct(lhs._odata[site],rhs._odata[site]);
+      }
+      return prod;
+    }
+  
+  // outerProduct Scalar x Scalar -> Scalar
+  //              Vector x Vector -> Matrix
+  // Per-site outerProduct(lhs_site,rhs_site). The result's site type is whatever
+  // the site-level outerProduct returns; the result lattice is built on rhs's grid.
+  template<class ll,class rr>
+    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
+  {
+    typedef decltype(outerProduct(lhs._odata[0],rhs._odata[0])) site_type;
+    Lattice<site_type> prod(rhs._grid);
+    parallel_for(int site=0;site<rhs._grid->oSites(); site++){
+      prod._odata[site]=outerProduct(lhs._odata[site],rhs._odata[site]);
+    }
+    return prod;
+  }
+}
+#endif
--- a/Grid/lattice/Lattice_overload.h
+++ b/Grid/lattice/Lattice_overload.h
@ -0,0 +1,138 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_overload.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_OVERLOAD_H
+#define GRID_LATTICE_OVERLOAD_H
+
+namespace Grid {
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // unary negation
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Returns a new lattice on r's grid whose every site is the negation of r's site.
+  template<class vobj>
+  inline Lattice<vobj> operator -(const Lattice<vobj> &r)
+  {
+    Lattice<vobj> negated(r._grid);
+    const auto osites = r._grid->oSites();
+    parallel_for(int site=0; site<osites; site++){
+      vstream(negated._odata[site], -r._odata[site]);
+    }
+    return negated;
+  }
+  /////////////////////////////////////////////////////////////////////////////////////
+  // Lattice BinOp Lattice,
+  //NB mult performs conformable check. Do not reapply here for performance.
+  /////////////////////////////////////////////////////////////////////////////////////
+  // Lattice * Lattice: allocates the result on rhs's grid and delegates to mult().
+  template<class left,class right>
+    inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
+  {
+    typedef decltype(lhs._odata[0]*rhs._odata[0]) product_type;
+    Lattice<product_type> result(rhs._grid);
+    mult(result,lhs,rhs);
+    return result;
+  }
+  // Lattice + Lattice: allocates the result on rhs's grid and delegates to add().
+  template<class left,class right>
+    inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
+  {
+    typedef decltype(lhs._odata[0]+rhs._odata[0]) sum_type;
+    Lattice<sum_type> result(rhs._grid);
+    add(result,lhs,rhs);
+    return result;
+  }
+  // Lattice - Lattice: allocates the result on rhs's grid and delegates to sub().
+  template<class left,class right>
+    inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
+  {
+    typedef decltype(lhs._odata[0]-rhs._odata[0]) difference_type;
+    Lattice<difference_type> result(rhs._grid);
+    sub(result,lhs,rhs);
+    return result;
+  }
+  
+  // Scalar BinOp Lattice; the return type is generated from the site-level expression.
+  // scalar * Lattice: multiplies every site of rhs by lhs from the left.
+  template<class left,class right>
+  inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
+  {
+    typedef decltype(lhs*rhs._odata[0]) site_type;
+    Lattice<site_type> result(rhs._grid);
+    const auto osites = rhs._grid->oSites();
+    parallel_for(int site=0; site<osites; site++){
+      // compute into a temporary, then store it with vstream
+      site_type scaled = lhs*rhs._odata[site];
+      vstream(result._odata[site],scaled);
+    }
+    return result;
+  }
+  // scalar + Lattice: adds lhs to every site of rhs and returns the result
+  // as a new lattice on rhs's grid.
+  template<class left,class right>
+    inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
+    {
+      Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
+      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+        // Fixed: this used to evaluate lhs-rhs._odata[ss], so "scalar + Lattice"
+        // silently subtracted instead of adding.
+        decltype(lhs+rhs._odata[0]) tmp = lhs+rhs._odata[ss];
+        vstream(ret._odata[ss],tmp);
+      }
+      return ret;
+    }
+  // scalar - Lattice: for every site of rhs computes lhs - site.
+  template<class left,class right>
+    inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
+  {
+    typedef decltype(lhs-rhs._odata[0]) site_type;
+    Lattice<site_type> result(rhs._grid);
+    const auto osites = rhs._grid->oSites();
+    parallel_for(int site=0; site<osites; site++){
+      site_type diff = lhs-rhs._odata[site];
+      vstream(result._odata[site],diff);
+    }
+    return result;
+  }
+    // Lattice * scalar: multiplies every site of lhs by rhs from the right.
+    template<class left,class right>
+      inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
+    {
+      typedef decltype(lhs._odata[0]*rhs) site_type;
+      Lattice<site_type> result(lhs._grid);
+      const auto osites = lhs._grid->oSites();
+      parallel_for(int site=0; site<osites; site++){
+        site_type scaled = lhs._odata[site]*rhs;
+        vstream(result._odata[site],scaled);
+      }
+      return result;
+    }
+    // Lattice + scalar: adds rhs to every site of lhs and returns the result
+    // as a new lattice on lhs's grid.
+    template<class left,class right>
+      inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
+    {
+        Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
+        // Fixed: the loop bound used rhs._grid, but rhs is the scalar operand here;
+        // the site count must come from the lattice operand lhs.
+        parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
+          decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
+          vstream(ret._odata[ss],tmp);
+        }
+        return ret;
+    }
+    // Lattice - scalar: subtracts rhs from every site of lhs and returns the result
+    // as a new lattice on lhs's grid.
+    template<class left,class right>
+      inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
+    {
+      Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
+      // Fixed: the loop bound used rhs._grid, but rhs is the scalar operand here;
+      // the site count must come from the lattice operand lhs.
+      parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
+        decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
+        vstream(ret._odata[ss],tmp);
+      }
+      return ret;
+    }
+}
+#endif
--- a/Grid/lattice/Lattice_peekpoke.h
+++ b/Grid/lattice/Lattice_peekpoke.h
@ -0,0 +1,205 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_peekpoke.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_PEEK_H
+#define GRID_LATTICE_PEEK_H
+
+///////////////////////////////////////////////
+// Peeking and poking around
+///////////////////////////////////////////////
+
+namespace Grid {
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Peek internal indices of a Lattice object
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Applies peekIndex<Index>(site,i) to every outer site of lhs.
+    // The result lives on lhs's grid and inherits lhs's checkerboard.
+    template<int Index,class vobj>
+       auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))>
+    {
+      typedef decltype(peekIndex<Index>(lhs._odata[0],i)) peeked_type;
+      Lattice<peeked_type> peeked(lhs._grid);
+      peeked.checkerboard = lhs.checkerboard;
+      const auto osites = lhs._grid->oSites();
+      parallel_for(int site=0; site<osites; site++){
+        peeked._odata[site] = peekIndex<Index>(lhs._odata[site],i);
+      }
+      return peeked;
+    }
+    // Applies peekIndex<Index>(site,i,j) to every outer site of lhs.
+    // The result lives on lhs's grid and inherits lhs's checkerboard.
+    template<int Index,class vobj>
+      auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
+    {
+      typedef decltype(peekIndex<Index>(lhs._odata[0],i,j)) peeked_type;
+      Lattice<peeked_type> peeked(lhs._grid);
+      peeked.checkerboard = lhs.checkerboard;
+      const auto osites = lhs._grid->oSites();
+      parallel_for(int site=0; site<osites; site++){
+        peeked._odata[site] = peekIndex<Index>(lhs._odata[site],i,j);
+      }
+      return peeked;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Poke internal indices of a Lattice object
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Applies pokeIndex<Index>(lhs site, rhs site, i) to every outer site, modifying lhs in place.
+    template<int Index,class vobj> 
+    void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
+    {
+      const auto osites = lhs._grid->oSites();
+      parallel_for(int site=0; site<osites; site++){
+        pokeIndex<Index>(lhs._odata[site],rhs._odata[site],i);
+      }
+    }
+    // Applies pokeIndex<Index>(lhs site, rhs site, i, j) to every outer site, modifying lhs in place.
+    template<int Index,class vobj>
+      void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
+    {
+      const auto osites = lhs._grid->oSites();
+      parallel_for(int site=0; site<osites; site++){
+        pokeIndex<Index>(lhs._odata[site],rhs._odata[site],i,j);
+      }
+    }
+
+    //////////////////////////////////////////////////////
+    // Poke a scalar object into the SIMD array
+    //////////////////////////////////////////////////////
+    // Writes the scalar object s into the global coordinate `site` of lattice l.
+    // s is broadcast from the boss rank first; afterwards only the rank reported by
+    // GlobalCoorToRankIndex for this coordinate modifies its local data.
+    // Asserts that l's checkerboard matches the site and that vobj is exactly
+    // Nsimd copies of sobj in size.
+    template<class vobj,class sobj>
+    void pokeSite(const sobj &s,Lattice<vobj> &l,const std::vector<int> &site){
+
+      GridBase *grid=l._grid;
+      const int Nsimd = grid->Nsimd();
+
+      assert( l.checkerboard== l._grid->CheckerBoard(site));
+      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+
+      // Locate the owner rank, the outer site and the SIMD lane of this coordinate.
+      int owner,osite,lane;
+      grid->GlobalCoorToRankIndex(owner,osite,lane,site);
+      // Optional to broadcast from node 0.
+      grid->Broadcast(grid->BossRank(),s);
+
+      std::vector<sobj> lanes(Nsimd);
+
+      // extract-modify-merge cycle is easiest way and this is not perf critical
+      if ( owner == grid->ThisRank() ) {
+        extract(l._odata[osite],lanes);
+        lanes[lane] = s;
+        merge(l._odata[osite],lanes);
+      }
+    }
+
+
+    //////////////////////////////////////////////////////////
+    // Peek a scalar object from the SIMD array
+    //////////////////////////////////////////////////////////
+    // Reads the scalar object at the global coordinate `site` of lattice l into s.
+    // Every rank extracts the lane selected by GlobalCoorToRankIndex from its own data;
+    // s is then broadcast from the rank that GlobalCoorToRankIndex reported.
+    // Asserts that l's checkerboard matches the site.
+    template<class vobj,class sobj>
+      void peekSite(sobj &s,const Lattice<vobj> &l,const std::vector<int> &site){
+
+      GridBase *grid=l._grid;
+      const int Nsimd = grid->Nsimd();
+
+      assert( l.checkerboard == l._grid->CheckerBoard(site));
+
+      int owner,osite,lane;
+      grid->GlobalCoorToRankIndex(owner,osite,lane,site);
+
+      // split the SIMD object into its Nsimd scalar objects and pick the requested lane
+      std::vector<sobj> lanes(Nsimd);
+      extract(l._odata[osite],lanes);
+      s = lanes[lane];
+
+      grid->Broadcast(owner,s);
+    }
+
+
+    //////////////////////////////////////////////////////////
+    // Peek/poke a scalar object from/to the SIMD array on the local node (no broadcast)
+    //////////////////////////////////////////////////////////
+    // Copies the scalar object at `site` out of lattice l into s, with no communication.
+    // The outer site and SIMD lane come from grid->oIndex / grid->iIndex. The site object
+    // is read as an array of scalar_type in which word w of lane `lane` sits at
+    // index lane + w*Nsimd.
+    // Asserts that l's checkerboard matches the site and that vobj is exactly
+    // Nsimd copies of sobj in size.
+    template<class vobj,class sobj>
+    void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
+
+      typedef typename vobj::scalar_type scalar_type;
+      typedef typename vobj::vector_type vector_type;
+
+      GridBase *grid = l._grid;
+      const int Nsimd = grid->Nsimd();
+
+      assert( l.checkerboard== l._grid->CheckerBoard(site));
+      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+
+      // number of vector_type words making up one site object
+      static const int words=sizeof(vobj)/sizeof(vector_type);
+
+      const int lane  = grid->iIndex(site);
+      const int osite = grid->oIndex(site);
+
+      scalar_type * src = (scalar_type *)&l._odata[osite];
+      scalar_type * dst = (scalar_type *)&s;
+
+      for(int w=0;w<words;w++){
+        dst[w] = src[lane+w*Nsimd];
+      }
+    }
+
+    // Copies the scalar object s into lattice l at `site`, with no communication.
+    // The outer site and SIMD lane come from grid->oIndex / grid->iIndex. The site object
+    // is written as an array of scalar_type in which word w of lane `lane` sits at
+    // index lane + w*Nsimd.
+    // Asserts that l's checkerboard matches the site and that vobj is exactly
+    // Nsimd copies of sobj in size.
+    template<class vobj,class sobj>
+    void pokeLocalSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
+
+      typedef typename vobj::scalar_type scalar_type;
+      typedef typename vobj::vector_type vector_type;
+
+      GridBase *grid=l._grid;
+      const int Nsimd = grid->Nsimd();
+
+      assert( l.checkerboard== l._grid->CheckerBoard(site));
+      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+
+      // number of vector_type words making up one site object
+      static const int words=sizeof(vobj)/sizeof(vector_type);
+
+      const int lane  = grid->iIndex(site);
+      const int osite = grid->oIndex(site);
+
+      scalar_type * dst = (scalar_type *)&l._odata[osite];
+      scalar_type * src = (scalar_type *)&s;
+
+      for(int w=0;w<words;w++){
+        dst[lane+w*Nsimd] = src[w];
+      }
+    }
+
+}
+#endif
+
--- a/Grid/lattice/Lattice_reality.h
+++ b/Grid/lattice/Lattice_reality.h
@ -0,0 +1,57 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_reality.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_REALITY_H
+#define GRID_LATTICE_REALITY_H
+
+
+// FIXME .. this is the sector of the code 
+// I am most worried about the directions
+// The choice of burying complex in the SIMD
+// is making the use of "real" and "imag" very cumbersome
+
+namespace Grid {
+
+    // Returns a new lattice on lhs's grid whose every site is adj() of the lhs site.
+    template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
+        Lattice<vobj> adjoint(lhs._grid);
+        const auto osites = lhs._grid->oSites();
+        parallel_for(int site=0; site<osites; site++){
+            adjoint._odata[site] = adj(lhs._odata[site]);
+        }
+        return adjoint;
+    }
+
+    // Returns a new lattice on lhs's grid whose every site is conjugate() of the lhs site.
+    template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
+        Lattice<vobj> conj(lhs._grid);
+        const auto osites = lhs._grid->oSites();
+        parallel_for(int site=0; site<osites; site++){
+            conj._odata[site] = conjugate(lhs._odata[site]);
+        }
+        return conj;
+    }
+}
+#endif
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@ -0,0 +1,733 @@
+/*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid 
+    Source file: ./lib/lattice/Lattice_reduction.h
+    Copyright (C) 2015
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_REDUCTION_H
+#define GRID_LATTICE_REDUCTION_H
+
+#include <Grid/Grid_Eigen_Dense.h>
+
+namespace Grid {
+#ifdef GRID_WARN_SUBOPTIMAL
+#warning "Optimisation alert all these reduction loops are NOT threaded "
+#endif     
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Deterministic Reduction operations
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+// Squared norm of a lattice: the real part of innerProduct(arg,arg).
+template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
+  return std::real(innerProduct(arg,arg));
+}
+
+// Double inner product.
+// Each of the SumArraySize() work slots accumulates innerProductD over its share of
+// the outer sites (as handed out by GridThread::GetWork) and reduces the accumulator
+// across SIMD lanes. The per-slot partial sums are then added serially and finally
+// combined with GlobalSum on the grid.
+template<class vobj>
+inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
+{
+  GridBase *grid = left._grid;
+  const int pad = 8; // spacing between slots in the partial-sum array
+
+  Vector<ComplexD> partial(grid->SumArraySize()*pad);
+
+  parallel_for(int slot=0;slot<grid->SumArraySize();slot++){
+    int count, first;
+    GridThread::GetWork(left._grid->oSites(),slot,count,first);
+
+    // private to this slot; sub summation
+    typedef decltype(innerProductD(left._odata[0],right._odata[0])) inner_type;
+    inner_type acc=zero;
+    for(int ss=first;ss<first+count; ss++){
+      acc = acc + innerProductD(left._odata[ss],right._odata[ss]);
+    }
+    // All slots sum across SIMD; reduces the serial work at the end.
+    // One write per cacheline with streaming store.
+    ComplexD lanesum = Reduce(TensorRemove(acc)) ;
+    vstream(partial[slot*pad],lanesum);
+  }
+
+  // serial sum over the slots, in slot order
+  ComplexD inner=0.0;
+  for(int i=0;i<grid->SumArraySize();i++){
+    inner = inner+partial[i*pad];
+  }
+  right._grid->GlobalSum(inner);
+  return inner;
+}
+
+/////////////////////////
+// Fast axpby_norm
+// z = a x + b y
+// return |z|^2 (the squared norm of z)
+/////////////////////////
+// z = a x + y, returning the value computed by axpby_norm_fast with b = 1.
+template<class sobj,class vobj> strong_inline RealD 
+axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) 
+{
+  const sobj unit(1.0);
+  return axpby_norm_fast(z,a,unit,x,y);
+}
+
+// Computes z = a x + b y site by site and, in the same pass, accumulates
+// innerProductD(z,z). Returns the real part of that sum after a GlobalSum over the grid.
+// z takes x's checkerboard; z, x and y must be conformable.
+template<class sobj,class vobj> strong_inline RealD 
+axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) 
+{
+  const int pad = 8; // spacing between slots in the partial-sum array
+  z.checkerboard = x.checkerboard;
+  conformable(z,x);
+  conformable(x,y);
+
+  GridBase *grid = x._grid;
+
+  Vector<RealD> partial(grid->SumArraySize()*pad);
+
+  parallel_for(int slot=0;slot<grid->SumArraySize();slot++){
+    int count, first;
+    GridThread::GetWork(x._grid->oSites(),slot,count,first);
+
+    // private to this slot; sub summation
+    typedef decltype(innerProductD(z._odata[0],z._odata[0])) norm_type;
+    norm_type acc=zero;
+    for(int ss=first;ss<first+count; ss++){
+      vobj site = a*x._odata[ss]+b*y._odata[ss];
+      acc = acc + innerProductD(site,site);
+      vstream(z._odata[ss],site);
+    }
+    vstream(partial[slot*pad],real(Reduce(TensorRemove(acc)))) ;
+  }
+
+  // sum across slots; linear in slot count but fast
+  RealD nrm = 0.0;
+  for(int i=0;i<grid->SumArraySize();i++){
+    nrm = nrm+partial[i*pad];
+  }
+  z._grid->GlobalSum(nrm);
+  return nrm;
+}
+
+ 
+// sum of a unary lattice expression: forwards closure(expr) to sum().
+template<class Op,class T1>
+inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
+  ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
+{
+  const auto &evaluated = closure(expr);
+  return sum(evaluated);
+}
+
+// sum of a binary lattice expression: forwards closure(expr) to sum().
+template<class Op,class T1,class T2>
+inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
+      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
+{
+  const auto &evaluated = closure(expr);
+  return sum(evaluated);
+}
+
+
+// sum of a trinary lattice expression: forwards closure(expr) to sum().
+template<class Op,class T1,class T2,class T3>
+inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
+  ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+                                      eval(0,std::get<1>(expr.second)),
+                                      eval(0,std::get<2>(expr.second))
+                                      ))::scalar_object
+{
+  const auto &evaluated = closure(expr);
+  return sum(evaluated);
+}
+
+// Sum of all sites of a lattice, returned as a scalar object.
+// Stages: per-slot vector partial sums (in parallel), a serial sum over slots,
+// a sum over the Nsimd lanes of the resulting vector object, then GlobalSum on the grid.
+template<class vobj>
+inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+{
+  typedef typename vobj::scalar_object sobj;
+
+  GridBase *grid=arg._grid;
+  const int Nsimd = grid->Nsimd();
+
+  std::vector<vobj,alignedAllocator<vobj> > partial(grid->SumArraySize());
+  for(int i=0;i<grid->SumArraySize();i++){
+    partial[i]=zero;
+  }
+
+  parallel_for(int slot=0;slot<grid->SumArraySize();slot++){
+    int count, first;
+    GridThread::GetWork(grid->oSites(),slot,count,first);
+
+    vobj acc=zero;
+    for(int ss=first;ss<first+count; ss++){
+      acc = acc + arg._odata[ss];
+    }
+    partial[slot]=acc;
+  }
+
+  // sum across slots, in slot order
+  vobj vtotal=zero;
+  for(int i=0;i<grid->SumArraySize();i++){
+    vtotal = vtotal+partial[i];
+  }
+
+  // split the SIMD object and add up its lanes
+  std::vector<sobj> lanes(Nsimd);
+  extract(vtotal,lanes);
+
+  sobj total=zero;
+  for(int i=0;i<Nsimd;i++) total = total + lanes[i];
+  arg._grid->GlobalSum(total);
+
+  return total;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Sums Data over every plane orthogonal to direction orthogdim.
+// On return result has one entry per global coordinate along orthogdim
+// (grid->_fdimensions[orthogdim] entries), and each entry has been passed through
+// GlobalSum on the grid.
+template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
+{
+  ///////////////////////////////////////////////////////
+  // FIXME precision promoted summation
+  // may be important for correlation functions
+  // But easily avoided by using double precision fields
+  ///////////////////////////////////////////////////////
+  typedef typename vobj::scalar_object sobj;
+  GridBase  *grid = Data._grid;
+  assert(grid!=NULL);
+
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+
+  assert(orthogdim >= 0);
+  assert(orthogdim < Nd);
+
+  const int full_len    = grid->_fdimensions[orthogdim];
+  const int local_len   = grid->_ldimensions[orthogdim];
+  const int reduced_len = grid->_rdimensions[orthogdim];
+
+  std::vector<vobj,alignedAllocator<vobj> > planeSum(reduced_len); // will locally sum vectors first
+  std::vector<sobj> sliceLocal(local_len,zero);                    // sum across these down to scalars
+  std::vector<sobj> lanes(Nsimd);                                  // splitting the SIMD
+
+  result.resize(full_len); // And then global sum to return the same vector to every node
+  for(int r=0;r<reduced_len;r++){
+    planeSum[r]=zero;
+  }
+
+  const int nblock = grid->_slice_nblock[orthogdim];
+  const int block  = grid->_slice_block [orthogdim];
+  const int stride = grid->_slice_stride[orthogdim];
+
+  // sum over reduced dimension planes, breaking out orthog dir
+  // Parallel over orthog direction
+  parallel_for(int r=0;r<reduced_len;r++){
+
+    const int base=r*grid->_ostride[orthogdim]; // base offset for start of plane
+
+    for(int n=0;n<nblock;n++){
+      for(int b=0;b<block;b++){
+        planeSum[r]=planeSum[r]+Data._odata[base+n*stride+b];
+      }
+    }
+  }
+
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  std::vector<int> icoor(Nd);
+
+  for(int rt=0;rt<reduced_len;rt++){
+
+    extract(planeSum[rt],lanes);
+
+    for(int lane=0;lane<Nsimd;lane++){
+
+      grid->iCoorFromIindex(icoor,lane);
+
+      // local coordinate along orthogdim held by this lane of reduced plane rt
+      const int ldx =rt+icoor[orthogdim]*reduced_len;
+
+      sliceLocal[ldx]=sliceLocal[ldx]+lanes[lane];
+    }
+  }
+
+  // sum over nodes: only the processor plane owning slice t contributes non-zero
+  sobj gsum;
+  for(int t=0;t<full_len;t++){
+    const int pt = t/local_len; // processor plane
+    const int lt = t%local_len;
+    if ( pt == grid->_processor_coor[orthogdim] ) {
+      gsum=sliceLocal[lt];
+    } else {
+      gsum=zero;
+    }
+
+    grid->GlobalSum(gsum);
+
+    result[t]=gsum;
+  }
+}
+
+// Slice inner product done in two steps: localSliceInnerProductVector fills the
+// per-node sums, globalSliceInnerProductVector then combines them across nodes into result.
+template<class vobj>
+static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
+{
+  std::vector<typename vobj::scalar_type> localSums;
+  localSliceInnerProductVector(result, lhs, rhs, localSums, orthogdim);
+  globalSliceInnerProductVector(result, lhs, localSums, orthogdim);
+}
+
+// Node-local part of the slice inner product.
+// For every local coordinate along orthogdim, sums innerProduct(lhs,rhs) over the
+// plane orthogonal to orthogdim and stores it in lsSum (ld entries on return).
+// result is only resized here (to the full extent fd); it is filled by
+// globalSliceInnerProductVector.
+template <class vobj>
+static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
+{
+  typedef typename vobj::vector_type   vector_type;
+  typedef typename vobj::scalar_type   scalar_type;
+  GridBase  *grid = lhs._grid;
+  assert(grid!=NULL);
+  conformable(grid,rhs._grid);
+
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+
+  assert(orthogdim >= 0);
+  assert(orthogdim < Nd);
+
+  int fd=grid->_fdimensions[orthogdim];
+  int ld=grid->_ldimensions[orthogdim];
+  int rd=grid->_rdimensions[orthogdim];
+
+  std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
+  // Fixed: resize(ld,0) leaves entries of a non-empty caller vector untouched, and the
+  // loop below accumulates into lsSum; assign() guarantees every entry starts at zero.
+  lsSum.assign(ld,scalar_type(0.0));                    // sum across these down to scalars
+  std::vector<iScalar<scalar_type>> extracted(Nsimd);   // splitting the SIMD
+
+  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
+  for(int r=0;r<rd;r++){
+    lvSum[r]=zero;
+  }
+
+  int e1=    grid->_slice_nblock[orthogdim];
+  int e2=    grid->_slice_block [orthogdim];
+  int stride=grid->_slice_stride[orthogdim];
+
+  // Parallel over the reduced orthogonal direction; each r owns lvSum[r].
+  parallel_for(int r=0;r<rd;r++)
+  {
+
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
+
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+        int ss = so + n * stride + b;
+        // Fixed: the temporary used to be declared once outside the parallel loop and
+        // was therefore shared (and raced on) by all threads. It is now loop-local,
+        // matching sliceInnerProductVector.
+        vector_type vv = TensorRemove(innerProduct(lhs._odata[ss], rhs._odata[ss]));
+        lvSum[r] = lvSum[r] + vv;
+      }
+    }
+  }
+
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  std::vector<int> icoor(Nd);
+  for(int rt=0;rt<rd;rt++){
+
+    iScalar<vector_type> temp;
+    temp._internal = lvSum[rt];
+    extract(temp,extracted);
+
+    for(int idx=0;idx<Nsimd;idx++){
+
+      grid->iCoorFromIindex(icoor,idx);
+
+      // local coordinate along orthogdim held by this lane of reduced plane rt
+      int ldx =rt+icoor[orthogdim]*rd;
+
+      lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
+
+    }
+  }
+}
+// Cross-node part of the slice inner product.
+// Takes the node-local sums lsSum (one per local coordinate along orthogdim) and fills
+// result (one entry per global coordinate; its size on entry defines that extent)
+// using a single GlobalSumVector call on lhs's grid.
+template <class vobj>
+static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
+{
+  typedef typename vobj::scalar_type scalar_type;
+  GridBase *grid = lhs._grid;
+  int fd = result.size();
+  int ld = lsSum.size();
+  // sum over nodes.
+  std::vector<scalar_type> gsum;
+  gsum.resize(fd, scalar_type(0.0));
+  for(int t=0;t<fd;t++){
+    int pt = t/ld; // processor plane
+    int lt = t%ld;
+    // only the processor plane owning slice t contributes; all others leave zero
+    if ( pt == grid->_processor_coor[orthogdim] ) {
+      gsum[t]=lsSum[lt];
+    }
+  }
+  grid->GlobalSumVector(&gsum[0], fd);
+
+  // Fixed: "result = gsum" assigned a std::vector<scalar_type> to a std::vector<ComplexD>,
+  // which only compiles when scalar_type is ComplexD. Convert element by element instead,
+  // as sliceInnerProductVector does.
+  for(int t=0;t<fd;t++){
+    result[t] = gsum[t];
+  }
+}
+// Inner product of lhs and rhs restricted to each plane orthogonal to orthogdim.
+// On return result has one entry per global coordinate along orthogdim
+// (grid->_fdimensions[orthogdim] entries), and each entry has been passed through
+// GlobalSum on the grid.
+template<class vobj>
+static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
+{
+  typedef typename vobj::vector_type   vector_type;
+  typedef typename vobj::scalar_type   scalar_type;
+  GridBase  *grid = lhs._grid;
+  assert(grid!=NULL);
+  conformable(grid,rhs._grid);
+
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+
+  assert(orthogdim >= 0);
+  assert(orthogdim < Nd);
+
+  const int full_len    = grid->_fdimensions[orthogdim];
+  const int local_len   = grid->_ldimensions[orthogdim];
+  const int reduced_len = grid->_rdimensions[orthogdim];
+
+  std::vector<vector_type,alignedAllocator<vector_type> > planeSum(reduced_len); // will locally sum vectors first
+  std::vector<scalar_type > sliceLocal(local_len,scalar_type(0.0));              // sum across these down to scalars
+  std::vector<iScalar<scalar_type> > lanes(Nsimd);                               // splitting the SIMD
+
+  result.resize(full_len); // And then global sum to return the same vector to every node for IO to file
+  for(int r=0;r<reduced_len;r++){
+    planeSum[r]=zero;
+  }
+
+  const int nblock = grid->_slice_nblock[orthogdim];
+  const int block  = grid->_slice_block [orthogdim];
+  const int stride = grid->_slice_stride[orthogdim];
+
+  // Parallel over the reduced orthogonal direction; each r owns planeSum[r].
+  parallel_for(int r=0;r<reduced_len;r++){
+
+    const int base=r*grid->_ostride[orthogdim]; // base offset for start of plane
+
+    for(int n=0;n<nblock;n++){
+      for(int b=0;b<block;b++){
+        const int ss= base+n*stride+b;
+        vector_type site_ip = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss]));
+        planeSum[r]=planeSum[r]+site_ip;
+      }
+    }
+  }
+
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  std::vector<int> icoor(Nd);
+  for(int rt=0;rt<reduced_len;rt++){
+
+    // wrap the bare SIMD word in an iScalar before extracting its lanes
+    iScalar<vector_type> wrapped;
+    wrapped._internal = planeSum[rt];
+    extract(wrapped,lanes);
+
+    for(int lane=0;lane<Nsimd;lane++){
+
+      grid->iCoorFromIindex(icoor,lane);
+
+      // local coordinate along orthogdim held by this lane of reduced plane rt
+      const int ldx =rt+icoor[orthogdim]*reduced_len;
+
+      sliceLocal[ldx]=sliceLocal[ldx]+lanes[lane]._internal;
+    }
+  }
+
+  // sum over nodes: only the processor plane owning slice t contributes non-zero
+  scalar_type gsum;
+  for(int t=0;t<full_len;t++){
+    const int pt = t/local_len; // processor plane
+    const int lt = t%local_len;
+    if ( pt == grid->_processor_coor[orthogdim] ) {
+      gsum=sliceLocal[lt];
+    } else {
+      gsum=scalar_type(0.0);
+    }
+
+    grid->GlobalSum(gsum);
+
+    result[t]=gsum;
+  }
+}
+// Per-slice squared norm: sn[t] is the real part of the slice inner product of rhs
+// with itself, for every global coordinate t along direction Orthog.
+template<class vobj>
+static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog) 
+{
+  const int Nblock = rhs._grid->GlobalDimensions()[Orthog];
+  std::vector<ComplexD> ip(Nblock);
+  sn.resize(Nblock);
+
+  sliceInnerProductVector(ip,rhs,rhs,Orthog);
+  for(int t=0;t<Nblock;t++){
+    sn[t] = real(ip[t]);
+  }
+}
+
+
+// R = (scale * a[slice]) * X + Y, where the coefficient applied at a site is selected
+// by that site's local coordinate along orthogdim.
+template<class vobj>
+static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
+                            int orthogdim,RealD scale=1.0) 
+{
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::tensor_reduced tensor_reduced;
+
+  scalar_type zscale(scale);
+
+  GridBase *grid  = X._grid;
+
+  const int Nsimd       = grid->Nsimd();
+  const int reduced_len = grid->_rdimensions[orthogdim];
+
+  const int nblock = grid->_slice_nblock[orthogdim];
+  const int block  = grid->_slice_block [orthogdim];
+  const int stride = grid->_slice_stride[orthogdim];
+
+  // NOTE(review): icoor is passed to iCoorFromIindex unsized; presumably that call
+  // sizes it -- confirm.
+  std::vector<int> icoor;
+
+  for(int r=0;r<reduced_len;r++){
+
+    const int base=r*grid->_ostride[orthogdim]; // base offset for start of plane
+
+    // Build the SIMD coefficient for this reduced plane, one lane at a time.
+    vector_type    av;
+    scalar_type *lanes =(scalar_type *)&av;
+
+    for(int l=0;l<Nsimd;l++){
+      grid->iCoorFromIindex(icoor,l);
+      const int ldx =r+icoor[orthogdim]*reduced_len;
+      lanes[l] = scalar_type(a[ldx])*zscale;
+    }
+
+    tensor_reduced at; at=av;
+
+    parallel_for_nest2(int n=0;n<nblock;n++){
+      for(int b=0;b<block;b++){
+        const int ss= base+n*stride+b;
+        R._odata[ss] = at*X._odata[ss]+Y._odata[ss];
+      }
+    }
+  }
+}
+
+/*
+inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
+{
+  int NN    = BlockSolverGrid->_ndimension;
+  int nsimd = BlockSolverGrid->Nsimd();
+  
+  std::vector<int> latt_phys(0);
+  std::vector<int> simd_phys(0);
+  std::vector<int>  mpi_phys(0);
+  
+  for(int d=0;d<NN;d++){
+    if( d!=Orthog ) { 
+      latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
+      simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
+      mpi_phys.push_back(BlockSolverGrid->_processors[d]);
+    }
+  }
+  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
+}
+*/
+
+// Slice-mixing multiply-add.  For every site of the plane orthogonal to Orthog:
+//   R(slice i) = Y(slice i) + sum_j X(slice j) * (scale*aa(j,i))
+// Requires the Orthog direction not to be SIMD-vectorised (asserted).
+// NOTE(review): slices are addressed as o+i*ostride for i up to the global extent, which
+// presumably assumes the Orthog direction is entirely node-local -- confirm against callers.
+template<class vobj>
+static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
+{    
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  GridBase *FullGrid = X._grid;
+
+  const int Nblock = FullGrid->GlobalDimensions()[Orthog];
+
+  assert( FullGrid->_simd_layout[Orthog]==1);
+
+  //FIXME package in a convenient iterator
+  // Geometry of the plane orthogonal to direction "Orthog"
+  const int stride  = FullGrid->_slice_stride[Orthog];
+  const int block   = FullGrid->_slice_block [Orthog];
+  const int nblock  = FullGrid->_slice_nblock[Orthog];
+  const int ostride = FullGrid->_ostride[Orthog];
+#pragma omp parallel 
+  {
+    // per-thread copy of X on all slices at the current plane site
+    std::vector<vobj> xcol(Nblock);
+
+#pragma omp for collapse(2)
+    for(int n=0;n<nblock;n++){
+    for(int b=0;b<block;b++){
+      const int base = n*stride + b;
+
+      for(int j=0;j<Nblock;j++){
+        xcol[j] = X[base+j*ostride];
+      }
+
+      vobj acc;
+      for(int i=0;i<Nblock;i++){
+        acc = Y[base+i*ostride];
+        for(int j=0;j<Nblock;j++){
+          acc = acc + xcol[j]*(scale*aa(j,i));
+        }
+        R[base+i*ostride]=acc;
+      }
+    }}
+  }
+};
+
+// Slice-mixing multiply.  For every site of the plane orthogonal to Orthog:
+//   R(slice i) = sum_j X(slice j) * (scale*aa(j,i))
+// The j=0 term initialises the accumulator, the remaining terms are added in order.
+// Requires the Orthog direction not to be SIMD-vectorised (asserted).
+template<class vobj>
+static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
+{    
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  GridBase *FullGrid = X._grid;
+
+  const int Nblock = FullGrid->GlobalDimensions()[Orthog];
+
+  assert( FullGrid->_simd_layout[Orthog]==1);
+
+  //FIXME package in a convenient iterator
+  // Geometry of the plane orthogonal to direction "Orthog"
+  const int stride  = FullGrid->_slice_stride[Orthog];
+  const int block   = FullGrid->_slice_block [Orthog];
+  const int nblock  = FullGrid->_slice_nblock[Orthog];
+  const int ostride = FullGrid->_ostride[Orthog];
+#pragma omp parallel 
+  {
+    // per-thread copy of X on all slices at the current plane site
+    std::vector<vobj> xcol(Nblock);
+
+#pragma omp for collapse(2)
+    for(int n=0;n<nblock;n++){
+    for(int b=0;b<block;b++){
+      const int base = n*stride + b;
+
+      for(int j=0;j<Nblock;j++){
+        xcol[j] = X[base+j*ostride];
+      }
+
+      vobj acc;
+      for(int i=0;i<Nblock;i++){
+        acc = xcol[0]*(scale*aa(0,i));
+        for(int j=1;j<Nblock;j++){
+          acc = acc + xcol[j]*(scale*aa(j,i));
+        }
+        R[base+i*ostride]=acc;
+      }
+    }}
+  }
+
+};
+
+
+// Slice-by-slice inner product matrix: mat(i,j) accumulates, over every site of the plane
+// orthogonal to Orthog, the reduced innerProduct of lhs on slice i with rhs on slice j.
+// Each thread accumulates into a private matrix which is merged under a critical section;
+// every entry is then summed across nodes with GlobalSum.
+// Requires the Orthog direction not to be SIMD-vectorised (asserted).
+template<class vobj>
+static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
+{
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::vector_typeD vector_typeD;
+
+  GridBase *FullGrid = lhs._grid;
+
+  const int Nblock = FullGrid->GlobalDimensions()[Orthog];
+
+  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  assert( FullGrid->_simd_layout[Orthog]==1);
+
+  //FIXME package in a convenient iterator
+  // Geometry of the plane orthogonal to direction "Orthog"
+  const int stride  = FullGrid->_slice_stride[Orthog];
+  const int block   = FullGrid->_slice_block [Orthog];
+  const int nblock  = FullGrid->_slice_nblock[Orthog];
+  const int ostride = FullGrid->_ostride[Orthog];
+
+#pragma omp parallel 
+  {
+    // thread-private scratch and partial result
+    std::vector<vobj> lcol(Nblock);
+    std::vector<vobj> rcol(Nblock);
+    Eigen::MatrixXcd  partial = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+#pragma omp for collapse(2)
+    for(int n=0;n<nblock;n++){
+    for(int b=0;b<block;b++){
+
+      const int base = n*stride + b;
+
+      for(int k=0;k<Nblock;k++){
+        lcol[k] = lhs[base+k*ostride];
+        rcol[k] = rhs[base+k*ostride];
+      }
+
+      for(int i=0;i<Nblock;i++){
+      for(int j=0;j<Nblock;j++){
+        auto ip     = innerProduct(lcol[i],rcol[j]);
+        auto ip_vec = TensorRemove(ip);
+        partial(i,j) += Reduce(ip_vec);
+      }}
+    }}
+#pragma omp critical
+    {
+      mat += partial;
+    }  
+  }
+
+  // node-local result -> global result, one entry at a time
+  for(int i=0;i<Nblock;i++){
+  for(int j=0;j<Nblock;j++){
+    ComplexD entry = mat(i,j);
+    FullGrid->GlobalSum(entry);
+    mat(i,j)=entry;
+  }}
+}
+
+} /*END NAMESPACE GRID*/
+#endif
+
+
+
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@ -0,0 +1,520 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_rng.h
+
+    Copyright (C) 2015
+
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_RNG_H
+#define GRID_LATTICE_RNG_H
+
+#include <random>
+
+#ifdef RNG_SITMO
+#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
+#endif 
+
+#if defined(RNG_SITMO)
+#define RNG_FAST_DISCARD
+#else 
+#undef  RNG_FAST_DISCARD
+#endif
+
+namespace Grid {
+
+  //////////////////////////////////////////////////////////////
+  // Allow the RNG state to be less dense than the fine grid.
+  // Returns the ratio of reduced (_rdimensions) volumes fine/coarse, where the coarse grid
+  // is matched against the trailing dimensions of the fine grid.  Asserts that the extra
+  // leading fine dimensions are neither SIMD- nor processor-split, that processor and SIMD
+  // layouts agree on the shared dimensions, and that each extent divides exactly.
+  //////////////////////////////////////////////////////////////
+  inline int RNGfillable(GridBase *coarse,GridBase *fine)
+  {
+    const int rngdims   = coarse->_ndimension;
+
+    // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
+    const int lowerdims = fine->_ndimension - rngdims;
+    assert(lowerdims >= 0);
+
+    int multiplicity=1;
+
+    // extra leading dimensions of the fine grid contribute their whole reduced extent
+    for(int d=0;d<lowerdims;d++){
+      assert(fine->_simd_layout[d]==1);
+      assert(fine->_processors[d]==1);
+      multiplicity *= fine->_rdimensions[d];
+    }
+
+    // local and global volumes subdivide cleanly after SIMDization
+    for(int d=0;d<rngdims;d++){
+      const int fd        = d+lowerdims;
+      const int fine_rd   = fine->_rdimensions[fd];
+      const int coarse_rd = coarse->_rdimensions[d];
+
+      assert(coarse->_processors[d]  == fine->_processors[fd]);
+      assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
+      assert((fine_rd % coarse_rd)==0);
+
+      multiplicity = multiplicity * fine_rd / coarse_rd;
+    }
+    return multiplicity;
+  }
+
+  
+// merge of April 11 2017
+  // this function is necessary for the LS vectorised field
+  // Returns the ratio of local site counts fine/coarse.  Unlike RNGfillable it only requires
+  // equal total SIMD width, not matching per-dimension SIMD layouts.
+  inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
+  {
+    const int rngdims   = coarse->_ndimension;
+
+    // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
+    const int lowerdims = fine->_ndimension - rngdims;
+    assert(lowerdims >= 0);
+
+    // assumes that the higher dimensions are not using more processors
+    // all further divisions are local
+    for(int d=0;d<lowerdims;d++){
+      assert(fine->_processors[d]==1);
+    }
+    for(int d=0;d<rngdims;d++){
+      assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
+    }
+
+    // then divide the number of local sites
+    // check that the total number of SIMD lanes agrees, which means the iSites are the same
+    assert(fine->Nsimd() == coarse->Nsimd());
+
+    const auto fine_sites   = fine->lSites();
+    const auto coarse_sites = coarse->lSites();
+
+    // check that the two grids divide cleanly
+    assert( (fine_sites / coarse_sites ) * coarse_sites == fine_sites );
+
+    return fine_sites / coarse_sites;
+  }
+  
+  // Draw one scalar value from distribution dist using generator gen and store it in s.
+  // Generic overload: real scalars are one component, so a single draw is made.
+  template<class scalar,class distribution,class generator> 
+  void fillScalar(scalar &s,distribution &dist,generator & gen)
+  {
+    s=dist(gen);
+  }
+  // ComplexF overload: two draws are made, one per component.
+  // NOTE(review): both draws are arguments of a single constructor call; C++ does not fix
+  // the evaluation order of function arguments, so which draw becomes the real part and
+  // which the imaginary part may differ between compilers -- confirm whether cross-compiler
+  // reproducibility of complex fills matters before relying on it.
+  template<class distribution,class generator> 
+  void fillScalar(ComplexF &s,distribution &dist, generator &gen)
+  {
+    s=ComplexF(dist(gen),dist(gen));
+  }
+  // ComplexD overload: as for ComplexF, two draws with the same evaluation-order caveat.
+  template<class distribution,class generator> 
+  void fillScalar(ComplexD &s,distribution &dist,generator &gen)
+  {
+    s=ComplexD(dist(gen),dist(gen));
+  }
+  
+  // Common storage and state handling for the serial and parallel RNG classes:
+  // a vector of engines plus one vector per supported distribution, and helpers to
+  // reseed, skip ahead, and save/restore the textual state of an engine.
+  class GridRNGbase {
+  public:
+    // One generator per site.
+    // Uniform and Gaussian distributions from these generators.
+    // The engine type, the word type of its serialised state and the number of state
+    // words are selected at compile time by exactly one of the RNG_* macros.
+#ifdef RNG_RANLUX
+    typedef std::ranlux48 RngEngine;
+    typedef uint64_t      RngStateType;
+    static const int RngStateCount = 15;
+#endif 
+#ifdef RNG_MT19937 
+    typedef std::mt19937 RngEngine;
+    typedef uint32_t     RngStateType;
+    static const int     RngStateCount = std::mt19937::state_size;
+#endif
+#ifdef RNG_SITMO
+    typedef sitmo::prng_engine 	RngEngine;
+    typedef uint64_t    	RngStateType;
+    static const int    	RngStateCount = 13;
+#endif
+
+    std::vector<RngEngine>                             _generators;
+    std::vector<std::uniform_real_distribution<RealD> > _uniform;
+    std::vector<std::normal_distribution<RealD> >       _gaussian;
+    std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
+    std::vector<std::uniform_int_distribution<uint32_t> > _uid;
+
+    ///////////////////////
+    // support for parallel init
+    ///////////////////////
+#ifdef RNG_FAST_DISCARD
+    // Advance eng by site * 2^30 draws, so that successive sites occupy disjoint windows
+    // of a single stream.
+    static void Skip(RngEngine &eng,uint64_t site)
+    {
+      /////////////////////////////////////////////////////////////////////////////////////
+      // Originally the spacing between successive lattice sites was 2^40 (about 10^12)
+      // elements: at 1000 quenched sweeps per second that is of order 10^9 seconds of
+      // running before windows overlap, and HMC consumes numbers far more slowly still.
+      // The spacing was reduced to 2^30 to avoid problems on large volumes.
+      // Sitmo could be hacked to skip in the higher order words of state if necessary.
+      /////////////////////////////////////////////////////////////////////////////////////
+      const int shift = 30;
+
+      const uint64_t skip = site<<shift;
+
+      assert((skip >> shift)==site); // check for overflow
+
+      eng.discard(skip);
+    } 
+#endif
+    // Build a new engine seeded from four 32-bit words drawn from eng (eng is advanced).
+    static RngEngine Reseed(RngEngine &eng)
+    {
+      std::vector<uint32_t> scratch;
+      std::uniform_int_distribution<uint32_t> draw;
+      return Reseed(eng,scratch,draw);
+    }
+    // As above, but with caller-provided scratch storage and distribution object so that
+    // repeated calls can reuse them.  On return newseed holds the four words used.
+    static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed,
+			    std::uniform_int_distribution<uint32_t> &uid)
+    {
+      const int reseeds=4;
+
+      newseed.resize(reseeds);
+      for(auto &word : newseed){
+        word = uid(eng);
+      }
+
+      std::seed_seq sseq(newseed.begin(),newseed.end());
+      return RngEngine(sseq);
+    }    
+
+    // Serialise eng through its stream inserter and parse the first RngStateCount
+    // whitespace-separated words into saved.
+    void GetState(std::vector<RngStateType> & saved,RngEngine &eng) {
+      saved.resize(RngStateCount);
+
+      std::stringstream text;
+      text<<eng;
+      text.seekg(0,text.beg);
+
+      for(auto &word : saved){
+        text>>word;
+      }
+    }
+    void GetState(std::vector<RngStateType> & saved,int gen) {
+      GetState(saved,_generators[gen]);
+    }
+    // Inverse of GetState: write the saved words separated by blanks and let the
+    // engine's stream extractor parse them back.
+    void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
+      assert(saved.size()==RngStateCount);
+
+      std::stringstream text;
+      for(int w=0;w<RngStateCount;w++){
+        text<< saved[w]<<" ";
+      }
+      text.seekg(0,text.beg);
+      text>>eng;
+    }
+    void SetState(std::vector<RngStateType> & saved,int gen){
+      SetState(saved,_generators[gen]);
+    }
+    // Copy a whole engine into / out of slot gen.
+    void SetEngine(RngEngine &Eng, int gen){
+      _generators[gen]=Eng;
+    }
+    void GetEngine(RngEngine &Eng, int gen){
+      Eng=_generators[gen];
+    }
+    // Replace generator gen by an engine constructed from the seed source src.
+    template<class source> void Seed(source &src, int gen)
+    {
+      _generators[gen] = RngEngine(src);
+    }    
+  };
+
+  // Single-generator RNG.  Every fill() draws from generator 0 and then broadcasts the
+  // bytes of the result from world rank 0, so all ranks end up with rank 0's values.
+  class GridSerialRNG : public GridRNGbase {
+  private:
+
+    // Reset dist, then draw nwords consecutive values into buf from generator 0.
+    template <class word,class distribution> inline void drawWords(word *buf,int nwords,distribution &dist){
+      dist.reset();
+      for(int w=0;w<nwords;w++){
+        fillScalar(buf[w],dist,_generators[0]);
+      }
+    }
+
+  public:
+
+    GridSerialRNG() : GridRNGbase() {
+      _generators.resize(1);
+      _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
+      _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
+      _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
+      _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
+    }
+
+    // Generic object fill: the object is treated as a flat array of its scalar_type.
+    template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){
+
+      typedef typename sobj::scalar_type scalar_type;
+
+      const int words = sizeof(sobj)/sizeof(scalar_type);
+
+      drawWords((scalar_type *) & l,words,dist[0]);
+
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+
+    }
+
+    // scalar fills: a single fillScalar call on the value itself
+    template <class distribution>  inline void fill(ComplexF &l,std::vector<distribution> &dist){
+      drawWords(&l,1,dist[0]);
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    }
+    template <class distribution>  inline void fill(ComplexD &l,std::vector<distribution> &dist){
+      drawWords(&l,1,dist[0]);
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    }
+    template <class distribution>  inline void fill(RealF &l,std::vector<distribution> &dist){
+      drawWords(&l,1,dist[0]);
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    }
+    template <class distribution>  inline void fill(RealD &l,std::vector<distribution> &dist){
+      drawWords(&l,1,dist[0]);
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    }
+    // vector fills: the SIMD word is filled as an array of real components
+    // (2*Nsimd of them for the complex types, Nsimd for the real types)
+    template <class distribution>  inline void fill(vComplexF &l,std::vector<distribution> &dist){
+      drawWords((RealF *)&l,2*vComplexF::Nsimd(),dist[0]);
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    }
+    template <class distribution>  inline void fill(vComplexD &l,std::vector<distribution> &dist){
+      drawWords((RealD *)&l,2*vComplexD::Nsimd(),dist[0]);
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    }
+    template <class distribution>  inline void fill(vRealF &l,std::vector<distribution> &dist){
+      drawWords((RealF *)&l,vRealF::Nsimd(),dist[0]);
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    }
+    template <class distribution>  inline void fill(vRealD &l,std::vector<distribution> &dist){
+      drawWords((RealD *)&l,vRealD::Nsimd(),dist[0]);
+      CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+    }
+    
+    // Seed generator 0 from a seed_seq over the given integers.  The integers are first
+    // broadcast from world rank 0 into the caller's storage.
+    void SeedFixedIntegers(const std::vector<int> &seeds){
+      CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
+      std::seed_seq sseq(seeds.begin(),seeds.end());
+      Seed(sseq,0);
+    }
+
+    // Derive integer seeds from the string via GridChecksum::sha256_seeds, log them in
+    // hex, and seed with them.
+    void SeedUniqueString(const std::string &s){
+      std::vector<int> seeds = GridChecksum::sha256_seeds(s);
+
+      std::stringstream sha;
+      for(int word : seeds){
+        sha << std::hex << word;
+      }
+
+      std::cout << GridLogMessage << "Intialising serial RNG with unique string '" 
+                << s << "'" << std::endl;
+      std::cout << GridLogMessage << "Seed SHA256: " << sha.str() << std::endl;
+      SeedFixedIntegers(seeds);
+    }
+  };
+
+  // One generator (and one instance of each distribution) per local lattice site of _grid.
+  // fill() draws site-local random numbers into a Lattice; the Seed* functions set up the
+  // per-site generators from a common seed.
+  class GridParallelRNG : public GridRNGbase {
+
+    // Time accumulated inside fill(), in the units returned by usecond();
+    // Report() divides it by 1e3 and labels the result "ms".
+    double _time_counter;
+
+  public:
+    GridBase *_grid;
+    unsigned int _vol;
+
+    // Flat index into _generators / the distribution vectors for outer site os and
+    // inner (SIMD lane) site is.
+    int generator_idx(int os,int is) {
+      return is*_grid->oSites()+os;
+    }
+
+    // Allocates iSites()*oSites() generators and distributions for grid.
+    // _time_counter is zeroed here: fill() accumulates into it with += and Report() prints
+    // it, so leaving it uninitialised would make the reported time indeterminate.
+    GridParallelRNG(GridBase *grid) : GridRNGbase(), _time_counter(0.0) {
+      _grid = grid;
+      _vol  =_grid->iSites()*_grid->oSites();
+
+      _generators.resize(_vol);
+      _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
+      _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
+      _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
+      _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
+    }
+
+    // Fill every site of l with draws from dist (one distribution object per generator).
+    // l may live on a grid that is finer than _grid by an integer factor "multiplicity"
+    // (checked by RNGfillable_general); each generator then serves multiplicity
+    // consecutive outer sites of l.
+    template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
+
+      typedef typename vobj::scalar_object scalar_object;
+      typedef typename vobj::scalar_type scalar_type;
+      typedef typename vobj::vector_type vector_type;
+
+      double inner_time_counter = usecond();
+
+      int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid
+      int Nsimd  = _grid->Nsimd();  // guaranteed to be the same for l._grid too
+      int osites = _grid->oSites();  // guaranteed to be <= l._grid->oSites() by a factor multiplicity
+      int words  = sizeof(scalar_object) / sizeof(scalar_type);
+
+      parallel_for(int ss=0;ss<osites;ss++){
+        // one scalar object per SIMD lane, merged into a vector object below
+        std::vector<scalar_object> buf(Nsimd);
+        for (int m = 0; m < multiplicity; m++) {  // Draw from same generator multiplicity times
+
+          int sm = multiplicity * ss + m;  // Maps the generator site to the fine site
+
+          for (int si = 0; si < Nsimd; si++) {
+            
+            int gdx = generator_idx(ss, si);  // index of generator state
+            scalar_type *pointer = (scalar_type *)&buf[si];
+            dist[gdx].reset();
+            for (int idx = 0; idx < words; idx++) 
+              fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
+          }
+          // merge into SIMD lanes, FIXME suboptimal implementation
+          merge(l._odata[sm], buf);
+        }
+      }
+
+      _time_counter += usecond()- inner_time_counter;
+    };
+
+    // Derive integer seeds from the string via GridChecksum::sha256_seeds, log them in
+    // hex, and seed with them.
+    void SeedUniqueString(const std::string &s){
+      std::vector<int> seeds;
+      std::stringstream sha;
+      seeds = GridChecksum::sha256_seeds(s);
+      for(int i=0;i<seeds.size();i++) { 
+        sha << std::hex << seeds[i];
+      }
+      std::cout << GridLogMessage << "Intialising parallel RNG with unique string '" 
+                << s << "'" << std::endl;
+      std::cout << GridLogMessage << "Seed SHA256: " << sha.str() << std::endl;
+      SeedFixedIntegers(seeds);
+    }
+    // Seed all local generators from the given integers.
+    void SeedFixedIntegers(const std::vector<int> &seeds){
+
+      // Everyone generates the same seed_seq based on input seeds
+      CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
+
+      std::seed_seq source(seeds.begin(),seeds.end());
+
+      RngEngine master_engine(source);
+
+#ifdef RNG_FAST_DISCARD
+      ////////////////////////////////////////////////
+      // Skip ahead through a single stream.
+      // Applicable to SITMO and other hash based/crypto RNGs
+      // Should be applicable to Mersenne Twister, but the C++11
+      // MT implementation does not implement fast discard even though
+      // in principle this is possible
+      ////////////////////////////////////////////////
+
+      // Everybody loops over global volume.
+      parallel_for(int gidx=0;gidx<_grid->_gsites;gidx++){
+
+        // Where is it?
+        int rank,o_idx,i_idx;
+        std::vector<int> gcoor;
+
+        _grid->GlobalIndexToGlobalCoor(gidx,gcoor);
+        _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
+
+        // If this is one of mine we take it
+        if( rank == _grid->ThisRank() ){
+          int l_idx=generator_idx(o_idx,i_idx);
+          _generators[l_idx] = master_engine;
+          Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
+        }
+
+      }
+#else 
+      ////////////////////////////////////////////////////////////////
+      // Machine and thread decomposition dependent seeding is efficient
+      // and maximally parallel; but NOT reproducible from machine to machine. 
+      // Not ideal, but fastest way to reseed all nodes.
+      ////////////////////////////////////////////////////////////////
+      {
+        // Obtain one Reseed per processor; every rank walks the same sequence and keeps
+        // the engine at its own rank index
+        int Nproc = _grid->ProcessorCount();
+        std::vector<RngEngine> seeders(Nproc);
+        int me= _grid->ThisRank();
+        for(int p=0;p<Nproc;p++){
+          seeders[p] = Reseed(master_engine);
+        }
+        master_engine = seeders[me];
+      }
+
+      {
+        // Obtain one reseeded generator per thread
+        int Nthread = GridThread::GetThreads();
+        std::vector<RngEngine> seeders(Nthread);
+        for(int t=0;t<Nthread;t++){
+          seeders[t] = Reseed(master_engine);
+        }
+
+        parallel_for(int t=0;t<Nthread;t++) {
+          // set up one per local site in threaded fashion:
+          // iteration t handles the local sites l with l%Nthread==t
+          std::vector<uint32_t> newseeds;
+          std::uniform_int_distribution<uint32_t> uid;	
+          for(int l=0;l<_grid->lSites();l++) {
+            if ( (l%Nthread)==t ) {
+              _generators[l] = Reseed(seeders[t],newseeds,uid);
+            }
+          }
+        }
+      }
+#endif
+    }
+
+    // Print the time accumulated by fill() so far.
+    void Report(){
+      std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl;
+    }
+
+
+    ////////////////////////////////////////////////////////////////////////
+    // Support for rigorous test of RNG's
+    // Return uniform random uint32_t from requested site generator
+    ////////////////////////////////////////////////////////////////////////
+    uint32_t GlobalU01(int gsite){
+
+      // Given a defined value so that ranks which do not own the site never hand
+      // indeterminate bytes to Broadcast; the owner's draw overwrites it everywhere.
+      uint32_t the_number = 0;
+      // who
+      std::vector<int> gcoor;
+      int rank,o_idx,i_idx;
+      _grid->GlobalIndexToGlobalCoor(gsite,gcoor);
+      _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
+
+      // draw
+      int l_idx=generator_idx(o_idx,i_idx);
+      if( rank == _grid->ThisRank() ){
+        the_number = _uid[l_idx](_generators[l_idx]);
+      }
+      
+      // share & return
+      _grid->Broadcast(rank,(void *)&the_number,sizeof(the_number));
+      return the_number;
+    }
+
+  };
+
+  // Lattice fills from the parallel RNG: each forwards to GridParallelRNG::fill with the
+  // matching per-site distribution vector.
+  template <class vobj>
+  inline void random(GridParallelRNG &rng,Lattice<vobj> &l)
+  {
+    rng.fill(l,rng._uniform);
+  }
+  template <class vobj>
+  inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l)
+  {
+    rng.fill(l,rng._gaussian);
+  }
+  template <class vobj>
+  inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l)
+  {
+    rng.fill(l,rng._bernoulli);
+  }
+
+  // Single-object fills from the serial RNG: each forwards to GridSerialRNG::fill with the
+  // matching distribution vector.
+  template <class sobj>
+  inline void random(GridSerialRNG &rng,sobj &l)
+  {
+    rng.fill(l,rng._uniform  );
+  }
+  template <class sobj>
+  inline void gaussian(GridSerialRNG &rng,sobj &l)
+  {
+    rng.fill(l,rng._gaussian );
+  }
+  template <class sobj>
+  inline void bernoulli(GridSerialRNG &rng,sobj &l)
+  {
+    rng.fill(l,rng._bernoulli);
+  }
+
+}
+#endif
--- a/Grid/lattice/Lattice_trace.h
+++ b/Grid/lattice/Lattice_trace.h
@ -0,0 +1,67 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_trace.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_TRACE_H
+#define GRID_LATTICE_TRACE_H
+
+///////////////////////////////////////////////
+// Tracing, transposing, peeking, poking
+///////////////////////////////////////////////
+
+namespace Grid {
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Trace: applies the site-level trace() to every outer site of lhs and returns the
+    // result as a new Lattice on the same grid.
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    template<class vobj>
+    inline auto trace(const Lattice<vobj> &lhs)
+      -> Lattice<decltype(trace(lhs._odata[0]))>
+    {
+      typedef decltype(trace(lhs._odata[0])) traced_type;
+
+      Lattice<traced_type> result(lhs._grid);
+      const auto osites = lhs._grid->oSites();
+
+      parallel_for(int site=0;site<osites;site++){
+        result._odata[site] = trace(lhs._odata[site]);
+      }
+      return result;
+    };
+    
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Trace Index level dependent operation: applies traceIndex<Index>() to every outer
+    // site of lhs and returns the result as a new Lattice on the same grid.
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    template<int Index,class vobj>
+    inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
+    {
+      typedef decltype(traceIndex<Index>(lhs._odata[0])) traced_type;
+
+      Lattice<traced_type> result(lhs._grid);
+      const auto osites = lhs._grid->oSites();
+
+      parallel_for(int site=0;site<osites;site++){
+        result._odata[site] = traceIndex<Index>(lhs._odata[site]);
+      }
+      return result;
+    };
+
+
+}
+#endif
+
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
--- a/Grid/lattice/Lattice_transpose.h
+++ b/Grid/lattice/Lattice_transpose.h
@ -0,0 +1,63 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_transpose.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_TRANSPOSE_H
+#define GRID_LATTICE_TRANSPOSE_H
+
+///////////////////////////////////////////////
+// Transpose
+///////////////////////////////////////////////
+
+namespace Grid {
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Transpose: applies the site-level transpose() to every outer site of lhs and returns
+    // the result as a new Lattice on the same grid.
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class vobj>
+    inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
+    Lattice<vobj> result(lhs._grid);
+    const auto osites = lhs._grid->oSites();
+
+    parallel_for(int site=0;site<osites;site++){
+      result._odata[site] = transpose(lhs._odata[site]);
+    }
+    return result;
+  };
+    
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Index level dependent transpose: applies transposeIndex<Index>() to every outer site
+  // of lhs and returns the result as a new Lattice on the same grid.
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<int Index,class vobj>
+    inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
+  {
+    typedef decltype(transposeIndex<Index>(lhs._odata[0])) transposed_type;
+
+    Lattice<transposed_type> result(lhs._grid);
+    const auto osites = lhs._grid->oSites();
+
+    parallel_for(int site=0;site<osites;site++){
+      result._odata[site] = transposeIndex<Index>(lhs._odata[site]);
+    }
+    return result;
+  };
+}
+#endif
--- a/Grid/lattice/Lattice_unary.h
+++ b/Grid/lattice/Lattice_unary.h
@ -0,0 +1,84 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_unary.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_UNARY_H
+#define GRID_LATTICE_UNARY_H
+
+namespace Grid {
+
+  // Site-wise power: each outer site of the result is pow(site of rhs, y).
+  // The result lives on rhs's grid and inherits rhs's checkerboard.
+  template<class obj>
+  Lattice<obj> pow(const Lattice<obj> &rhs, RealD y)
+  {
+    Lattice<obj> result(rhs._grid);
+    // NOTE(review): keep this assignment ahead of conformable(); the check
+    // presumably compares checkerboards as well as grids - confirm.
+    result.checkerboard = rhs.checkerboard;
+    conformable(result,rhs);
+
+    const auto nsites = rhs._grid->oSites();
+    parallel_for(int site=0; site<nsites; site++){
+      result._odata[site] = pow(rhs._odata[site],y);
+    }
+    return result;
+  }
+  // Site-wise modulus: each outer site of the result is mod(site of rhs, y).
+  // The result lives on rhs's grid and inherits rhs's checkerboard.
+  template<class obj>
+  Lattice<obj> mod(const Lattice<obj> &rhs, Integer y)
+  {
+    Lattice<obj> result(rhs._grid);
+    // NOTE(review): keep this assignment ahead of conformable(); the check
+    // presumably compares checkerboards as well as grids - confirm.
+    result.checkerboard = rhs.checkerboard;
+    conformable(result,rhs);
+
+    const auto nsites = rhs._grid->oSites();
+    parallel_for(int site=0; site<nsites; site++){
+      result._odata[site] = mod(rhs._odata[site],y);
+    }
+    return result;
+  }
+
+  // Site-wise division: each outer site of the result is div(site of rhs, y).
+  // The result lives on rhs's grid and inherits rhs's checkerboard.
+  template<class obj>
+  Lattice<obj> div(const Lattice<obj> &rhs, Integer y)
+  {
+    Lattice<obj> result(rhs._grid);
+    // NOTE(review): keep this assignment ahead of conformable(); the check
+    // presumably compares checkerboards as well as grids - confirm.
+    result.checkerboard = rhs.checkerboard;
+    conformable(result,rhs);
+
+    const auto nsites = rhs._grid->oSites();
+    parallel_for(int site=0; site<nsites; site++){
+      result._odata[site] = div(rhs._odata[site],y);
+    }
+    return result;
+  }
+
+  // Site-wise matrix exponential: each outer site of the result is
+  // Exponentiate(site of rhs, alpha, Nexp).
+  //   alpha - real factor forwarded unchanged to Exponentiate()
+  //   Nexp  - forwarded unchanged to Exponentiate(); defaults to DEFAULT_MAT_EXP
+  // The result lives on rhs's grid and inherits rhs's checkerboard.
+  template<class obj>
+  Lattice<obj> expMat(const Lattice<obj> &rhs, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP)
+  {
+    Lattice<obj> result(rhs._grid);
+    // NOTE(review): keep this assignment ahead of conformable(); the check
+    // presumably compares checkerboards as well as grids - confirm.
+    result.checkerboard = rhs.checkerboard;
+    conformable(result,rhs);
+
+    const auto nsites = rhs._grid->oSites();
+    parallel_for(int site=0; site<nsites; site++){
+      result._odata[site] = Exponentiate(rhs._odata[site],alpha,Nexp);
+    }
+    return result;
+  }
+
+
+
+}
+#endif
--- a/Grid/lattice/Lattice_where.h
+++ b/Grid/lattice/Lattice_where.h
@ -0,0 +1,86 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_where.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_WHERE_H
+#define GRID_LATTICE_WHERE_H
+namespace Grid {
+// Must implement the predicate gating the choice between iftrue and iffalse.
+// Must be able to reduce the predicate down to a single vInteger per site.
+// Must be able to require the type be iScalar x iScalar x ....
+//                              give a GetVtype method in iScalar
+//                              and blow away the tensor structures.
+//
+// In-place select between two lattices, entry by entry.
+//
+// For every outer site the vector objects of iftrue and iffalse are unpacked
+// with extract() into Nsimd scalar objects, and the predicate (after
+// TensorRemove()) into Nsimd Integers. Each entry of the output is taken from
+// iftrue when the matching predicate entry is non-zero and from iffalse
+// otherwise; the entries are then packed back into ret with merge().
+//
+//   ret       - output lattice, overwritten site by site
+//   predicate - lattice whose sites reduce to a vInteger via TensorRemove()
+//   iftrue    - source of entries where the predicate entry is non-zero
+//   iffalse   - source of entries where the predicate entry is zero
+//
+// All four lattices must pass conformable() against iftrue.
+template<class vobj,class iobj>
+inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
+{
+  conformable(iftrue,iffalse);
+  conformable(iftrue,predicate);
+  conformable(iftrue,ret);
+
+  GridBase *grid=iftrue._grid;
+
+  typedef typename vobj::scalar_object scalar_object;
+
+  const int Nsimd = grid->Nsimd();
+
+  parallel_for(int ss=0;ss<grid->oSites(); ss++){
+
+    // Scratch buffers are declared inside the loop body so every iteration
+    // owns its own copy. Declaring them once outside the loop would let
+    // concurrently running iterations overwrite each other's entries
+    // (assumes parallel_for executes iterations in parallel).
+    std::vector<Integer> mask(Nsimd);
+    std::vector<scalar_object> truevals (Nsimd);
+    std::vector<scalar_object> falsevals(Nsimd);
+
+    extract(iftrue._odata[ss]   ,truevals);
+    extract(iffalse._odata[ss]  ,falsevals);
+    extract<vInteger,Integer>(TensorRemove(predicate._odata[ss]),mask);
+
+    // falsevals doubles as the output buffer: overwrite only the entries
+    // whose predicate is set.
+    for(int s=0;s<Nsimd;s++){
+      if (mask[s]) falsevals[s]=truevals[s];
+    }
+
+    merge(ret._odata[ss],falsevals);
+  }
+}
+
+// Value-returning form of whereWolf.
+//
+// Allocates the result on iftrue's grid, fills it through the four-argument
+// whereWolf overload and returns it by value.
+//
+//   predicate - lattice selecting, entry by entry, between the two inputs
+//   iftrue    - source of entries where the predicate entry is non-zero
+//   iffalse   - source of entries where the predicate entry is zero
+template<class vobj,class iobj>
+inline Lattice<vobj> whereWolf(const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
+{
+  conformable(iftrue,iffalse);
+  conformable(iftrue,predicate);
+
+  Lattice<vobj> ret(iftrue._grid);
+
+  // Delegate to the in-place overload. The previous call named `where`,
+  // which is not the name of that overload.
+  whereWolf(ret,predicate,iftrue,iffalse);
+
+  return ret;
+}
+}
+#endif