Merge branch 'develop' into feature/gpu-port

2025-10-22 08:44:48 +01:00 · 2018-12-13 05:11:34 +00:00
parent adbdc4e65b c509bd3fe2
commit b57a4d32aa
647 changed files with 49155 additions and 11160 deletions
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -0,0 +1,46 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/Lattice.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
+#include <Grid/lattice/Lattice_base.h>
+#include <Grid/lattice/Lattice_conformable.h>
+#include <Grid/lattice/Lattice_ET.h>
+#include <Grid/lattice/Lattice_arith.h>
+#include <Grid/lattice/Lattice_trace.h>
+#include <Grid/lattice/Lattice_transpose.h>
+#include <Grid/lattice/Lattice_local.h>
+#include <Grid/lattice/Lattice_reduction.h>
+#include <Grid/lattice/Lattice_peekpoke.h>
+#include <Grid/lattice/Lattice_reality.h>
+#include <Grid/lattice/Lattice_comparison_utils.h>
+#include <Grid/lattice/Lattice_comparison.h>
+#include <Grid/lattice/Lattice_coordinate.h>
+//#include <Grid/lattice/Lattice_where.h>
+#include <Grid/lattice/Lattice_rng.h>
+#include <Grid/lattice/Lattice_unary.h>
+#include <Grid/lattice/Lattice_transfer.h>
+
--- a/Grid/lattice/Lattice_ET.h
+++ b/Grid/lattice/Lattice_ET.h
@@ -0,0 +1,407 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/lattice/Lattice_ET.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#ifndef GRID_LATTICE_ET_H
+#define GRID_LATTICE_ET_H
+
+#include <iostream>
+#include <tuple>
+#include <typeinfo>
+#include <vector>
+
+NAMESPACE_BEGIN(Grid);
+
+////////////////////////////////////////////////////
+// Predicated where support
+////////////////////////////////////////////////////
+/// Lane-by-lane select between two SIMD objects.
+///
+/// The predicate and both value operands are unpacked into per-lane scalar
+/// buffers. Every lane whose mask entry is non-zero takes its value from
+/// \p iftrue; the remaining lanes keep the value from \p iffalse. The lanes
+/// are then merged back into one vector object, which is returned by value.
+template <class iobj, class vobj, class robj>
+accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
+                            const robj &iffalse) {
+  typedef typename vobj::scalar_object scalar_object;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  typename std::remove_const<vobj>::type merged;
+
+  const int Nsimd = vector_type::Nsimd();
+
+  // One scratch slot per SIMD lane.
+  ExtractBuffer<Integer> mask(Nsimd);
+  ExtractBuffer<scalar_object> truevals(Nsimd);
+  ExtractBuffer<scalar_object> falsevals(Nsimd);
+
+  extract(iftrue, truevals);
+  extract(iffalse, falsevals);
+  extract<vInteger, Integer>(TensorRemove(predicate), mask);
+
+  // falsevals doubles as the output buffer: overwrite only the selected lanes.
+  for (int lane = 0; lane < Nsimd; ++lane) {
+    if (!mask[lane]) continue;
+    falsevals[lane] = truevals[lane];
+  }
+
+  merge(merged, falsevals);
+  return merged;
+}
+
+/////////////////////////////////////////////////////
+//Specialization of getVectorType for lattices
+/////////////////////////////////////////////////////
+// For a Lattice<T> the associated vector type is the lattice's own
+// vector_object.
+template<typename T>
+struct getVectorType<Lattice<T> >{
+  using type = typename Lattice<T>::vector_object;
+};
+
+////////////////////////////////////////////
+//--  recursive evaluation of expressions; --
+// handle leaves of syntax tree
+///////////////////////////////////////////////////
+// Generic leaf: the operand is returned unchanged, by value; the index ss is
+// not used.
+template<class sobj> accelerator_inline 
+sobj eval(const uint64_t ss, const sobj &arg)
+{
+  return arg;
+}
+
+// LatticeView leaf: returns element ss of the view by const reference.
+template <class lobj> accelerator_inline 
+const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg) 
+{
+  return arg[ss];
+}
+// Lattice leaf: obtains a view of the lattice and returns element ss of that
+// view by const reference.
+// NOTE(review): the reference is obtained through a function-local view
+// object; this is only valid if the view's operator[] refers to storage that
+// outlives the view itself -- confirm against LatticeView.
+template <class lobj> accelerator_inline 
+const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg) 
+{
+  auto view = arg.View();
+  return view[ss];
+}
+
+///////////////////////////////////////////////////
+// handle nodes in syntax tree- eval one operand
+///////////////////////////////////////////////////
+// Unary node: evaluate the single operand at index ss and apply the node's
+// operation to it. The return type is whatever op.func yields for that operand.
+template <typename Op, typename T1> accelerator_inline 
+auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)  
+  -> decltype(expr.op.func( eval(ss, expr.arg1)))
+{
+  return expr.op.func( eval(ss, expr.arg1) );
+}
+///////////////////////
+// eval two operands
+///////////////////////
+// Binary node: evaluate both operands at index ss and combine them with the
+// node's operation. The return type is deduced from op.func.
+template <typename Op, typename T1, typename T2> accelerator_inline
+auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)  
+  -> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
+{
+  return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
+}
+///////////////////////
+// eval three operands
+///////////////////////
+// Trinary node: evaluate all three operands at index ss and pass them, in
+// order arg1, arg2, arg3, to the node's operation.
+template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
+auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)  
+  -> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)))
+{
+  return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3));
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Obtain the grid from an expression, ensuring conformable. This must follow a
+// tree recursion; must retain grid pointer in the LatticeView class which sucks
+// Use a different method, and make it void *.
+// Perhaps a conformable method.
+//////////////////////////////////////////////////////////////////////////
+// Lattice leaf (selected when is_lattice<T1> holds): hands the running grid
+// pointer to the lattice's Conformable() method. grid is taken by reference,
+// so the callee is able to update it.
+// NOTE(review): presumably Conformable() records the lattice's grid when grid
+// is still unset and checks equality otherwise -- confirm in Lattice_base.h.
+template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
+accelerator_inline void GridFromExpression(GridBase *&grid, const T1 &lat)  // Lattice leaf
+{
+  lat.Conformable(grid);
+}
+
+// Non-lattice leaf (selected when is_lattice<T1> does not hold): contributes
+// nothing, grid is left untouched.
+template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
+accelerator_inline 
+void GridFromExpression(GridBase *&grid,const T1 &notlat)  // non-lattice leaf
+{}
+
+// Unary node: descend into the single operand.
+template <typename Op, typename T1>
+accelerator_inline 
+void GridFromExpression(GridBase *&grid,const LatticeUnaryExpression<Op, T1> &expr) 
+{
+  GridFromExpression(grid, expr.arg1);  // recurse
+}
+
+// Binary node: descend into the left operand, then the right one, threading
+// the same grid reference through both.
+template <typename Op, typename T1, typename T2>
+accelerator_inline 
+void GridFromExpression(GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) 
+{
+  GridFromExpression(grid, expr.arg1);  // recurse
+  GridFromExpression(grid, expr.arg2);
+}
+// Trinary node: descend into all three operands in order, threading the same
+// grid reference through each.
+template <typename Op, typename T1, typename T2, typename T3>
+accelerator_inline 
+void GridFromExpression(GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) 
+{
+  GridFromExpression(grid, expr.arg1);  // recurse
+  GridFromExpression(grid, expr.arg2);  // recurse
+  GridFromExpression(grid, expr.arg3);  // recurse
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Obtain the CB from an expression, ensuring conformable. This must follow a
+// tree recursion
+//////////////////////////////////////////////////////////////////////////
+// Lattice leaf: if cb already holds Even or Odd it must agree with this
+// lattice's checkerboard (checked by assert); in every case cb is then set to
+// the lattice's checkerboard.
+template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
+inline void CBFromExpression(int &cb, const T1 &lat)  // Lattice leaf
+{
+  const bool cb_already_fixed = (cb == Even) || (cb == Odd);
+  if (cb_already_fixed) {
+    assert(cb == lat.Checkerboard());
+  }
+  cb = lat.Checkerboard();
+}
+// Non-lattice leaf: carries no checkerboard, so cb is left unchanged.
+template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
+inline void CBFromExpression(int &cb, const T1 &notlat)  // non-lattice leaf
+{
+}
+
+// Unary node: descend into the single operand.
+template <typename Op, typename T1> inline 
+void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr) 
+{
+  CBFromExpression(cb, expr.arg1);  // recurse AST
+}
+
+// Binary node: descend into the left operand, then the right one; the lattice
+// leaves reached on the way check and update cb.
+template <typename Op, typename T1, typename T2> inline 
+void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr) 
+{
+  CBFromExpression(cb, expr.arg1);  // recurse AST
+  CBFromExpression(cb, expr.arg2);  // recurse AST
+}
+// Trinary node: descend into all three operands in order.
+template <typename Op, typename T1, typename T2, typename T3>
+inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) 
+{
+  CBFromExpression(cb, expr.arg1);  // recurse AST
+  CBFromExpression(cb, expr.arg2);  // recurse AST
+  CBFromExpression(cb, expr.arg3);  // recurse AST
+}
+
+////////////////////////////////////////////
+// Unary operators and funcs
+////////////////////////////////////////////
+// GridUnopClass(name, ret) declares a class template `name<arg>` with a single
+// static member func(a) that returns the expression `ret`; `ret` is written in
+// terms of the parameter called `a`, and the return type is decltype(ret).
+#define GridUnopClass(name, ret)					\
+  template <class arg>							\
+  struct name {								\
+    static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \
+  };
+
+// One functor class per supported unary operation.
+GridUnopClass(UnarySub, -a);
+GridUnopClass(UnaryNot, Not(a));
+GridUnopClass(UnaryAdj, adj(a));
+GridUnopClass(UnaryConj, conjugate(a));
+GridUnopClass(UnaryTrace, trace(a));
+GridUnopClass(UnaryTranspose, transpose(a));
+GridUnopClass(UnaryTa, Ta(a));
+GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
+GridUnopClass(UnaryReal, real(a));
+GridUnopClass(UnaryImag, imag(a));
+GridUnopClass(UnaryToReal, toReal(a));
+GridUnopClass(UnaryToComplex, toComplex(a));
+GridUnopClass(UnaryTimesI, timesI(a));
+GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
+GridUnopClass(UnaryAbs, abs(a));
+GridUnopClass(UnarySqrt, sqrt(a));
+GridUnopClass(UnaryRsqrt, rsqrt(a));
+GridUnopClass(UnarySin, sin(a));
+GridUnopClass(UnaryCos, cos(a));
+GridUnopClass(UnaryAsin, asin(a));
+GridUnopClass(UnaryAcos, acos(a));
+GridUnopClass(UnaryLog, log(a));
+GridUnopClass(UnaryExp, exp(a));
+
+////////////////////////////////////////////
+// Binary operators
+////////////////////////////////////////////
+// GridBinOpClass(name, combination) declares a class template
+// `name<left, right>` with a single static member func(lhs, rhs) that returns
+// the expression `combination`; `combination` is written in terms of the
+// parameters called `lhs` and `rhs`, both taken by const reference.
+#define GridBinOpClass(name, combination)			\
+  template <class left, class right>				\
+  struct name {							\
+    static auto accelerator_inline				\
+    func(const left &lhs, const right &rhs)			\
+      -> decltype(combination) const				\
+    {								\
+      return combination;					\
+    }								\
+  };
+
+// One functor class per supported binary operator.
+GridBinOpClass(BinaryAdd, lhs + rhs);
+GridBinOpClass(BinarySub, lhs - rhs);
+GridBinOpClass(BinaryMul, lhs *rhs);
+GridBinOpClass(BinaryDiv, lhs /rhs);
+GridBinOpClass(BinaryAnd, lhs &rhs);
+GridBinOpClass(BinaryOr, lhs | rhs);
+GridBinOpClass(BinaryAndAnd, lhs &&rhs);
+GridBinOpClass(BinaryOrOr, lhs || rhs);
+
+////////////////////////////////////////////////////
+// Trinary conditional op
+////////////////////////////////////////////////////
+// GridTrinOpClass(name, combination) declares a class template
+// `name<predicate, left, right>` with a single static member
+// func(pred, lhs, rhs) that returns the expression `combination`.
+#define GridTrinOpClass(name, combination)				\
+  template <class predicate, class left, class right>			\
+  struct name {								\
+    static auto accelerator_inline					\
+    func(const predicate &pred, const left &lhs, const right &rhs)	\
+      -> decltype(combination) const					\
+    {									\
+      return combination;						\
+    }									\
+  };
+
+// TrinaryWhere forwards to predicatedWhere; references are stripped from the
+// left/right types before they are used as its template arguments.
+GridTrinOpClass(TrinaryWhere,
+		(predicatedWhere<predicate, 
+		 typename std::remove_reference<left>::type,
+		 typename std::remove_reference<right>::type>(pred, lhs,rhs)));
+
+////////////////////////////////////////////
+// Operator syntactical glue
+////////////////////////////////////////////
+
+// Spell the functor type for an operation: the functor template `name` is
+// instantiated with the types obtained by evaluating each operand at index 0.
+// These macros rely on operands being named arg / lhs / rhs / pred at the
+// point of use.
+#define GRID_UNOP(name)   name<decltype(eval(0, arg))>
+#define GRID_BINOP(name)  name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+
+// GRID_DEF_UNOP(op, name): defines a function `op` taking one operand that is
+// a lattice or a lattice expression and returning a LatticeUnaryExpression
+// node wrapping that operand together with a default-constructed functor.
+#define GRID_DEF_UNOP(op, name)						\
+  template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
+  inline auto op(const T1 &arg) ->decltype(LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg)) \
+  {									\
+    return     LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg); \
+  }
+
+// GRID_BINOP_LEFT(op, name): binary overload selected when the LEFT operand is
+// a lattice or lattice expression; the right operand is unconstrained.
+// Returns a LatticeBinaryExpression node.
+#define GRID_BINOP_LEFT(op, name)					\
+  template <typename T1, typename T2,					\
+            typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
+  inline auto op(const T1 &lhs, const T2 &rhs)				\
+    ->decltype(LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs,rhs)) \
+  {									\
+    return     LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs,rhs);\
+  }
+
+// GRID_BINOP_RIGHT(op, name): binary overload selected when the left operand
+// is NEITHER a lattice nor a lattice expression and the RIGHT operand is one.
+// The two constraints make it disjoint from GRID_BINOP_LEFT.
+#define GRID_BINOP_RIGHT(op, name)					\
+  template <typename T1, typename T2,					\
+            typename std::enable_if<!is_lattice<T1>::value&&!is_lattice_expr<T1>::value,T1>::type * = nullptr, \
+            typename std::enable_if< is_lattice<T2>::value|| is_lattice_expr<T2>::value,T2>::type * = nullptr> \
+  inline auto op(const T1 &lhs, const T2 &rhs)				\
+    ->decltype(LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs, rhs)) \
+  {									\
+    return     LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs, rhs); \
+  }
+
+// GRID_DEF_BINOP(op, name): emits both the left-lattice and the right-lattice
+// overload of `op`.
+#define GRID_DEF_BINOP(op, name)		\
+  GRID_BINOP_LEFT(op, name);			\
+  GRID_BINOP_RIGHT(op, name);
+
+// GRID_DEF_TRINOP(op, name): defines a three-operand function `op` returning a
+// LatticeTrinaryExpression node. Unlike the unary/binary forms there is no
+// enable_if on the operand types; the only constraint is that the trailing
+// return type must be well-formed.
+#define GRID_DEF_TRINOP(op, name)					\
+  template <typename T1, typename T2, typename T3>			\
+  inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs)		\
+    ->decltype(LatticeTrinaryExpression<GRID_TRINOP(name),T1,T2,T3>(GRID_TRINOP(name)(),pred, lhs, rhs)) \
+  {									\
+    return LatticeTrinaryExpression<GRID_TRINOP(name),T1,T2,T3>(GRID_TRINOP(name)(),pred, lhs, rhs); \
+  }
+
+////////////////////////
+// Operator definitions
+////////////////////////
+// Unary entry points. Each line creates the user-facing function/operator that
+// builds an expression node for the named functor.
+GRID_DEF_UNOP(operator-, UnarySub);
+// Not() and operator! are two spellings of the same node type.
+GRID_DEF_UNOP(Not, UnaryNot);
+GRID_DEF_UNOP(operator!, UnaryNot);
+GRID_DEF_UNOP(adj, UnaryAdj);
+GRID_DEF_UNOP(conjugate, UnaryConj);
+GRID_DEF_UNOP(trace, UnaryTrace);
+GRID_DEF_UNOP(transpose, UnaryTranspose);
+GRID_DEF_UNOP(Ta, UnaryTa);
+GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
+GRID_DEF_UNOP(real, UnaryReal);
+GRID_DEF_UNOP(imag, UnaryImag);
+GRID_DEF_UNOP(toReal, UnaryToReal);
+GRID_DEF_UNOP(toComplex, UnaryToComplex);
+GRID_DEF_UNOP(timesI, UnaryTimesI);
+GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
+GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the
+                               // abs-fabs-dabs-labs thing
+GRID_DEF_UNOP(sqrt, UnarySqrt);
+GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
+GRID_DEF_UNOP(sin, UnarySin);
+GRID_DEF_UNOP(cos, UnaryCos);
+GRID_DEF_UNOP(asin, UnaryAsin);
+GRID_DEF_UNOP(acos, UnaryAcos);
+GRID_DEF_UNOP(log, UnaryLog);
+GRID_DEF_UNOP(exp, UnaryExp);
+
+// Arithmetic binary operators (both lattice-on-left and lattice-on-right).
+GRID_DEF_BINOP(operator+, BinaryAdd);
+GRID_DEF_BINOP(operator-, BinarySub);
+GRID_DEF_BINOP(operator*, BinaryMul);
+GRID_DEF_BINOP(operator/, BinaryDiv);
+
+// Bitwise and logical binary operators.
+GRID_DEF_BINOP(operator&, BinaryAnd);
+GRID_DEF_BINOP(operator|, BinaryOr);
+GRID_DEF_BINOP(operator&&, BinaryAndAnd);
+GRID_DEF_BINOP(operator||, BinaryOrOr);
+
+// where(pred, lhs, rhs) builds a TrinaryWhere node.
+GRID_DEF_TRINOP(where, TrinaryWhere);
+
+/////////////////////////////////////////////////////////////
+// Closure convenience to force expression to evaluate
+/////////////////////////////////////////////////////////////
+// Force a unary expression node to be evaluated: a Lattice whose element type
+// is the node's per-site result type is constructed from the expression and
+// returned by value.
+template <class Op, class T1>
+auto closure(const LatticeUnaryExpression<Op, T1> &expr)
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
+{
+  typedef decltype(expr.op.func(eval(0, expr.arg1))) site_type;
+  Lattice<site_type> evaluated(expr);
+  return evaluated;
+}
+// Force a binary expression node to be evaluated: a Lattice whose element type
+// is the node's per-site result type is constructed from the expression and
+// returned by value.
+template <class Op, class T1, class T2>
+auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
+{
+  typedef decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2))) site_type;
+  Lattice<site_type> evaluated(expr);
+  return evaluated;
+}
+// Force a trinary expression node to be evaluated: a Lattice whose element
+// type is the node's per-site result type is constructed from the expression
+// and returned by value.
+template <class Op, class T1, class T2, class T3>
+auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
+				   eval(0, expr.arg2),
+				   eval(0, expr.arg3)))>
+{
+  typedef decltype(expr.op.func(eval(0, expr.arg1),
+				eval(0, expr.arg2),
+				eval(0, expr.arg3))) site_type;
+  Lattice<site_type> evaluated(expr);
+  return evaluated;
+}
+
+// Retire the syntactic-glue macros so they do not leak out of this header.
+#undef GRID_UNOP
+#undef GRID_BINOP
+#undef GRID_TRINOP
+
+#undef GRID_DEF_UNOP
+#undef GRID_DEF_BINOP
+// GRID_BINOP_LEFT / GRID_BINOP_RIGHT expand to GRID_BINOP, which is undefined
+// just above, so they cannot be used past this point; remove them as well.
+#undef GRID_BINOP_LEFT
+#undef GRID_BINOP_RIGHT
+#undef GRID_DEF_TRINOP
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/lattice/Lattice_arith.h
+++ b/Grid/lattice/Lattice_arith.h
@@ -0,0 +1,314 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_arith.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LATTICE_ARITH_H
+#define GRID_LATTICE_ARITH_H
+
+NAMESPACE_BEGIN(Grid);
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+//  avoid copy back routines for mult, mac, sub, add: Lattice (op) Lattice
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+// ret = lhs (mult) rhs for three lattices.
+// ret inherits the checkerboard of lhs; ret/rhs and lhs/rhs must be
+// conformable. The element-level mult() is applied at every index visited by
+// accelerator_loop over the lhs view. With STREAMING_STORES the result is
+// formed in a temporary and written with vstream(); otherwise it is written
+// directly into the ret view.
+template<class obj1,class obj2,class obj3> inline
+void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+  ret.Checkerboard() = lhs.Checkerboard();
+  // Check the operands before acquiring any views, matching mac/sub/add.
+  conformable(ret,rhs);
+  conformable(lhs,rhs);
+  auto ret_v = ret.View();
+  auto lhs_v = lhs.View();
+  auto rhs_v = rhs.View();
+#ifdef STREAMING_STORES
+  accelerator_loop(ss,lhs_v,{
+    obj1 tmp;
+    mult(&tmp,&lhs_v[ss],&rhs_v[ss]);
+    vstream(ret_v[ss],tmp);
+  });
+#else
+  accelerator_loop(ss,lhs_v,{
+    mult(&ret_v[ss],&lhs_v[ss],&rhs_v[ss]);
+  });
+#endif
+}
+  
+// Multiply-accumulate over three lattices: the element-level mac() is applied
+// with ret as destination and lhs, rhs as factors.
+// ret inherits the checkerboard of lhs; ret/rhs and lhs/rhs must be
+// conformable.
+template<class obj1,class obj2,class obj3> inline
+void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+  ret.Checkerboard() = lhs.Checkerboard();
+  conformable(ret,rhs);
+  conformable(lhs,rhs);
+  auto ret_v = ret.View();
+  auto lhs_v = lhs.View();
+  auto rhs_v = rhs.View();
+#ifdef STREAMING_STORES
+  accelerator_loop(ss,lhs_v,{
+    // The non-streaming branch accumulates onto the existing ret_v[ss]; seed
+    // the temporary with that value so both branches agree and mac() never
+    // reads an uninitialised accumulator.
+    obj1 tmp = ret_v[ss];
+    mac(&tmp,&lhs_v[ss],&rhs_v[ss]);
+    vstream(ret_v[ss],tmp);
+  });
+#else
+  accelerator_loop(ss,lhs_v,{
+    mac(&ret_v[ss],&lhs_v[ss],&rhs_v[ss]);
+  });
+#endif
+}
+  
+// ret = lhs (sub) rhs for three lattices.
+// ret inherits the checkerboard of lhs; ret/rhs and lhs/rhs must be
+// conformable. The element-level sub() is applied at every index visited by
+// accelerator_loop over the lhs view.
+template<class obj1,class obj2,class obj3> inline
+void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+  ret.Checkerboard() = lhs.Checkerboard();
+  conformable(ret,rhs);
+  conformable(lhs,rhs);
+  auto ret_v = ret.View();
+  auto lhs_v = lhs.View();
+  auto rhs_v = rhs.View();
+#ifdef STREAMING_STORES
+  accelerator_loop(ss,lhs_v,{
+    obj1 tmp;
+    sub(&tmp,&lhs_v[ss],&rhs_v[ss]);
+    vstream(ret_v[ss],tmp);
+  });
+#else
+  accelerator_loop(ss,lhs_v,{
+    // Write through the view (ret_v), like every other kernel in this file,
+    // rather than indexing the Lattice object itself.
+    sub(&ret_v[ss],&lhs_v[ss],&rhs_v[ss]);
+  });
+#endif
+}
+// ret = lhs (add) rhs for three lattices.
+// ret inherits the checkerboard of lhs; ret/rhs and lhs/rhs must be
+// conformable. The element-level add() is applied at every index visited by
+// accelerator_loop over the lhs view.
+template<class obj1,class obj2,class obj3> inline
+void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
+  ret.Checkerboard() = lhs.Checkerboard();
+  conformable(ret,rhs);
+  conformable(lhs,rhs);
+  auto out_v   = ret.View();
+  auto left_v  = lhs.View();
+  auto right_v = rhs.View();
+#ifdef STREAMING_STORES
+  // Form the result in a temporary, then stream it out.
+  accelerator_loop(site,left_v,{
+    obj1 total;
+    add(&total,&left_v[site],&right_v[site]);
+    vstream(out_v[site],total);
+  });
+#else
+  // Write the result straight into the destination view.
+  accelerator_loop(site,left_v,{
+    add(&out_v[site],&left_v[site],&right_v[site]);
+  });
+#endif
+}
+  
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+//  avoid copy back routines for mult, mac, sub, add: Lattice (op) non-lattice object
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+// ret = lhs (mult) rhs, with lhs a lattice and rhs a single object applied at
+// every index. ret inherits the checkerboard of lhs and must be conformable
+// with it. The result is always formed in a temporary and written with
+// vstream(), regardless of STREAMING_STORES.
+template<class obj1,class obj2,class obj3> inline
+void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+  ret.Checkerboard() = lhs.Checkerboard();
+  conformable(lhs,ret);
+  auto out_v  = ret.View();
+  auto left_v = lhs.View();
+  accelerator_loop(site,left_v,{
+    obj1 product;
+    mult(&product,&left_v[site],&rhs);
+    vstream(out_v[site],product);
+  });
+}
+  
+// Multiply-accumulate with lhs a lattice and rhs a single object applied at
+// every index: the element-level mac() is applied with ret as destination.
+// ret inherits the checkerboard of lhs and must be conformable with it.
+template<class obj1,class obj2,class obj3> inline
+void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+  ret.Checkerboard() = lhs.Checkerboard();
+  conformable(ret,lhs);
+  auto ret_v = ret.View();
+  auto lhs_v = lhs.View();
+  accelerator_loop(ss,lhs_v,{
+    // Seed the accumulator with the current destination value; a
+    // default-constructed temporary would make mac() accumulate onto
+    // uninitialised data.
+    obj1 tmp = ret_v[ss];
+    mac(&tmp,&lhs_v[ss],&rhs);
+    vstream(ret_v[ss],tmp);
+  });
+}
+  
+// ret = lhs (sub) rhs, with lhs a lattice and rhs a single object applied at
+// every index. ret inherits the checkerboard of lhs and must be conformable
+// with it.
+template<class obj1,class obj2,class obj3> inline
+void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+  ret.Checkerboard() = lhs.Checkerboard();
+  conformable(ret,lhs);
+  auto out_v  = ret.View();
+  auto left_v = lhs.View();
+#ifdef STREAMING_STORES
+  // Form the result in a temporary, then stream it out.
+  accelerator_loop(site,left_v,{
+    obj1 difference;
+    sub(&difference,&left_v[site],&rhs);
+    vstream(out_v[site],difference);
+  });
+#else
+  // Write the result straight into the destination view.
+  accelerator_loop(site,left_v,{
+    sub(&out_v[site],&left_v[site],&rhs);
+  });
+#endif
+}
+// ret = lhs (add) rhs, with lhs a lattice and rhs a single object applied at
+// every index. ret inherits the checkerboard of lhs and must be conformable
+// with it.
+template<class obj1,class obj2,class obj3> inline
+void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
+  ret.Checkerboard() = lhs.Checkerboard();
+  conformable(lhs,ret);
+  auto out_v  = ret.View();
+  auto left_v = lhs.View();
+#ifdef STREAMING_STORES
+  // Form the result in a temporary, then stream it out.
+  accelerator_loop(site,left_v,{
+    obj1 total;
+    add(&total,&left_v[site],&rhs);
+    vstream(out_v[site],total);
+  });
+#else
+  // Write the result straight into the destination view.
+  accelerator_loop(site,left_v,{
+    add(&out_v[site],&left_v[site],&rhs);
+  });
+#endif
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+//  avoid copy back routines for mult, mac, sub, add: non-lattice object (op) Lattice
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+// ret = lhs (mult) rhs, with lhs a single object applied at every index and
+// rhs a lattice. ret inherits the checkerboard of rhs and must be conformable
+// with it.
+template<class obj1,class obj2,class obj3> inline
+void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+  ret.Checkerboard() = rhs.Checkerboard();
+  conformable(ret,rhs);
+  auto ret_v = ret.View();
+  // The view must come from the lattice operand (rhs); lhs is the non-lattice
+  // object in this overload.
+  auto rhs_v = rhs.View();
+#ifdef STREAMING_STORES
+  accelerator_loop(ss,rhs_v,{
+    obj1 tmp;
+    mult(&tmp,&lhs,&rhs_v[ss]);
+    vstream(ret_v[ss],tmp);
+  });
+#else 
+  accelerator_loop(ss,rhs_v,{
+    mult(&ret_v[ss],&lhs,&rhs_v[ss]);
+  });
+#endif
+}
+  
+// Multiply-accumulate with lhs a single object applied at every index and rhs
+// a lattice: the element-level mac() is applied with ret as destination.
+// ret inherits the checkerboard of rhs and must be conformable with it.
+template<class obj1,class obj2,class obj3> inline
+void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+  ret.Checkerboard() = rhs.Checkerboard();
+  conformable(ret,rhs);
+  auto ret_v = ret.View();
+  // The view must come from the lattice operand (rhs); lhs is the non-lattice
+  // object in this overload.
+  auto rhs_v = rhs.View();
+#ifdef STREAMING_STORES
+  accelerator_loop(ss,rhs_v,{
+    // Seed the accumulator with the current destination value so this branch
+    // agrees with the non-streaming one, which accumulates onto ret_v[ss].
+    obj1 tmp = ret_v[ss];
+    mac(&tmp,&lhs,&rhs_v[ss]);
+    vstream(ret_v[ss],tmp);
+  });
+#else 
+  accelerator_loop(ss,rhs_v,{
+    mac(&ret_v[ss],&lhs,&rhs_v[ss]);
+  });
+#endif
+}
+  
+// ret = lhs (sub) rhs, with lhs a single object applied at every index and
+// rhs a lattice. ret inherits the checkerboard of rhs and must be conformable
+// with it.
+template<class obj1,class obj2,class obj3> inline
+void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+  ret.Checkerboard() = rhs.Checkerboard();
+  conformable(ret,rhs);
+  auto ret_v = ret.View();
+  // The view must come from the lattice operand (rhs); lhs is the non-lattice
+  // object in this overload.
+  auto rhs_v = rhs.View();
+#ifdef STREAMING_STORES
+  accelerator_loop(ss,rhs_v,{
+    obj1 tmp;
+    sub(&tmp,&lhs,&rhs_v[ss]);
+    vstream(ret_v[ss],tmp);
+  });
+#else 
+  accelerator_loop(ss,rhs_v,{
+    sub(&ret_v[ss],&lhs,&rhs_v[ss]);
+  });
+#endif
+}
+// ret = lhs (add) rhs, with lhs a single object applied at every index and
+// rhs a lattice. ret inherits the checkerboard of rhs and must be conformable
+// with it.
+template<class obj1,class obj2,class obj3> inline
+void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
+  ret.Checkerboard() = rhs.Checkerboard();
+  conformable(ret,rhs);
+  auto ret_v = ret.View();
+  // The view must come from the lattice operand (rhs); lhs is the non-lattice
+  // object in this overload.
+  auto rhs_v = rhs.View();
+#ifdef STREAMING_STORES
+  accelerator_loop(ss,rhs_v,{
+    obj1 tmp;
+    add(&tmp,&lhs,&rhs_v[ss]);
+    vstream(ret_v[ss],tmp);
+  });
+#else 
+  accelerator_loop(ss,rhs_v,{
+    add(&ret_v[ss],&lhs,&rhs_v[ss]);
+  });
+#endif
+}
+  
+// ret = a*x + y at every index visited by accelerator_loop over the x view.
+// ret inherits the checkerboard of x; ret/x and x/y must be conformable.
+template<class sobj,class vobj> inline
+void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
+  ret.Checkerboard() = x.Checkerboard();
+  conformable(ret,x);
+  conformable(x,y);
+  auto out_v = ret.View();
+  auto xv    = x.View();
+  auto yv    = y.View();
+#ifdef STREAMING_STORES
+  // Form the result in a temporary, then stream it out.
+  accelerator_loop(site,xv,{
+    vobj value = a*xv[site]+yv[site];
+    vstream(out_v[site],value);
+  });
+#else
+  // Assign the result straight into the destination view.
+  accelerator_loop(site,xv,{
+    out_v[site]=a*xv[site]+yv[site];
+  });
+#endif
+}
+// ret = a*x + b*y at every index visited by accelerator_loop over the x view.
+// ret inherits the checkerboard of x; ret/x and x/y must be conformable.
+template<class sobj,class vobj> inline
+void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
+  ret.Checkerboard() = x.Checkerboard();
+  conformable(ret,x);
+  conformable(x,y);
+  auto out_v = ret.View();
+  auto xv    = x.View();
+  auto yv    = y.View();
+#ifdef STREAMING_STORES
+  // Form the result in a temporary, then stream it out.
+  accelerator_loop(site,xv,{
+    vobj value = a*xv[site]+b*yv[site];
+    vstream(out_v[site],value);
+  });
+#else
+  // Assign the result straight into the destination view.
+  accelerator_loop(site,xv,{
+    out_v[site]=a*xv[site]+b*yv[site];
+  });
+#endif
+}
+
+// Thin wrapper: forwards all arguments unchanged to axpy_norm_fast and
+// returns its result.
+template<class sobj,class vobj> inline
+RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
+{
+    const RealD result = axpy_norm_fast(ret,a,x,y);
+    return result;
+}
+// Thin wrapper: forwards all arguments unchanged to axpby_norm_fast and
+// returns its result.
+template<class sobj,class vobj> inline
+RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
+{
+    const RealD result = axpby_norm_fast(ret,a,b,x,y);
+    return result;
+}
+
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/lattice/Lattice_base.h
+++ b/Grid/lattice/Lattice_base.h
@@ -0,0 +1,488 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/lattice/Lattice_base.h
+
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#pragma once 
+
+#define STREAMING_STORES
+
+NAMESPACE_BEGIN(Grid);
+
+extern int GridCshiftPermuteMap[4][16];
+
+///////////////////////////////////////////////////////////////////
+// Base class which can be used by traits to pick up behaviour
+///////////////////////////////////////////////////////////////////
+// Empty tag type; is_lattice<T> and ViewMap<T> detect it with std::is_base_of.
+class LatticeBase {};
+
+/////////////////////////////////////////////////////////////////////////////////////////
+// Conformable checks; same instance of Grid required
+/////////////////////////////////////////////////////////////////////////////////////////
+// Two grids are conformable only when they are the very same GridBase instance:
+// the check is a pointer comparison enforced with assert.
+void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
+{
+  assert(lhs == rhs);
+}
+
+////////////////////////////////////////////////////////////////////////////
+// Minimal base class containing only data valid to access from accelerator
+// _odata will be a managed pointer in CUDA
+////////////////////////////////////////////////////////////////////////////
+// Force access to lattice through a view object.
+// prevents writing of code that will not offload to GPU, but perhaps annoyingly
+// strict since host code could in principle directly access data through the lattice object
+// Need to decide programming model.
+#define LATTICE_VIEW_STRICT
+// Holds the state shared by Lattice and LatticeView: grid pointer, checkerboard,
+// data pointer and number of outer sites. It defines no destructor and never
+// allocates or frees _odata itself.
+template<class vobj> class LatticeAccelerator : public LatticeBase
+{
+protected:
+  GridBase *_grid;
+  int checkerboard;
+  vobj     *_odata;    // A managed pointer
+  uint64_t _odata_size;
+public:
+  // Initialisers are listed in declaration order, which is the order in which
+  // members are actually initialised.
+  accelerator_inline LatticeAccelerator() : _grid(nullptr), checkerboard(0), _odata(nullptr), _odata_size(0) { }
+  // Number of outer sites recorded for _odata.
+  accelerator_inline uint64_t oSites(void) const { return _odata_size; }
+  accelerator_inline int  Checkerboard(void) const { return checkerboard; }
+  accelerator_inline int &Checkerboard(void) { return this->checkerboard; } // can assign checkerboard on a container, not a view
+  // If grid is already set, assert that it is this object's grid;
+  // otherwise report this object's grid through the reference.
+  accelerator_inline void Conformable(GridBase * &grid) const
+  {
+    if (grid) conformable(grid, _grid);
+    else      grid = _grid;
+  }
+#ifndef LATTICE_VIEW_STRICT
+  // Direct element access is only compiled when LATTICE_VIEW_STRICT is not defined.
+  accelerator_inline vobj       & operator[](size_t i)       { return this->_odata[i]; }
+  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }
+#endif
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////
+// A View class which provides accessor to the data.
+// This will be safe to call from accelerator_loops and is trivially copy constructible
+// The copy constructor for this will need to be used by device lambda functions
+/////////////////////////////////////////////////////////////////////////////////////////
+// Accessor object: a copy of a lattice's LatticeAccelerator state (grid pointer,
+// checkerboard, data pointer, size) with indexing and range queries on top.
+template<class vobj>
+class LatticeView : public LatticeAccelerator<vobj>
+{
+public:
+  // Copy the accelerator-visible state of refer_to_me; no site data is duplicated.
+  LatticeView(const LatticeAccelerator<vobj> &refer_to_me)
+    : LatticeAccelerator<vobj> (refer_to_me)
+  {}
+
+  // Half-open index range [begin,end) over the outer sites.
+  accelerator_inline uint64_t begin(void) const { return 0; }
+  accelerator_inline uint64_t end(void)   const { return this->_odata_size; }
+  accelerator_inline uint64_t size(void)  const { return this->_odata_size; }
+
+#ifdef LATTICE_VIEW_STRICT
+  // Unchecked element access straight into the data pointer.
+  accelerator_inline vobj       & operator[](size_t i)       { return this->_odata[i]; }
+  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }
+#endif
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////
+// Lattice expression types used by ET to assemble the AST
+// 
+// Need to be able to detect code paths according to the whether a lattice object or not
+// so introduce some trait type things
+/////////////////////////////////////////////////////////////////////////////////////////
+
+// Empty tag type; is_lattice_expr<T> detects it with std::is_base_of.
+class LatticeExpressionBase {};
+
+// is_lattice<T>      : true when T derives from LatticeBase.
+// is_lattice_expr<T> : true when T derives from LatticeExpressionBase.
+template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
+template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
+
+// ViewMap<T>::Type is the type an expression node stores for an operand of type T:
+// T itself in the general case, LatticeView<T::vector_object> when T derives from LatticeBase.
+template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
+template<class T>                 struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
+template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
+
+// Expression-template node for a one-operand operation.
+// The operand is stored as ViewMap<_T1>::Type: a LatticeView for lattice
+// operands, the operand type itself otherwise.
+template <typename Op, typename _T1>
+class LatticeUnaryExpression : public  LatticeExpressionBase
+{
+public:
+  using T1 = typename ViewMap<_T1>::Type;
+
+  Op op;
+  T1 arg1;
+
+  LatticeUnaryExpression(Op _op,const _T1 &_arg1)
+    : op(_op), arg1(_arg1)
+  {}
+};
+
+// Expression-template node for a two-operand operation.
+// Each operand is stored as ViewMap<...>::Type: a LatticeView for lattice
+// operands, the operand type itself otherwise.
+template <typename Op, typename _T1, typename _T2>
+class LatticeBinaryExpression : public LatticeExpressionBase
+{
+public:
+  using T1 = typename ViewMap<_T1>::Type;
+  using T2 = typename ViewMap<_T2>::Type;
+
+  Op op;
+  T1 arg1;
+  T2 arg2;
+
+  LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2)
+    : op(_op), arg1(_arg1), arg2(_arg2)
+  {}
+};
+
+// Expression-template node for a three-operand operation.
+// Each operand is stored as ViewMap<...>::Type: a LatticeView for lattice
+// operands, the operand type itself otherwise.
+template <typename Op, typename _T1, typename _T2, typename _T3>
+class LatticeTrinaryExpression : public LatticeExpressionBase
+{
+public:
+  using T1 = typename ViewMap<_T1>::Type;
+  using T2 = typename ViewMap<_T2>::Type;
+  using T3 = typename ViewMap<_T3>::Type;
+
+  Op op;
+  T1 arg1;
+  T2 arg2;
+  T3 arg3;
+
+  LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3)
+    : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3)
+  {}
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////
+// The real lattice class, with normal copy and assignment semantics.
+// This contains extra (host resident) grid pointer data that may be accessed by host code
+/////////////////////////////////////////////////////////////////////////////////////////
+template<class vobj>
+class Lattice : public LatticeAccelerator<vobj>
+{
+public:
+  // Returns the GridBase pointer stored in the LatticeAccelerator base.
+  GridBase *Grid(void) const { return this->_grid; }
+  ///////////////////////////////////////////////////
+  // Member types
+  ///////////////////////////////////////////////////
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  typedef vobj vector_object;
+
+private:
+  // Hand the site storage back to alignedAllocator and mark the lattice empty.
+  // Nothing happens when the recorded size is already zero.
+  void dealloc(void)
+  {
+    alignedAllocator<vobj> alloc;
+    if( this->_odata_size == 0 ) return;
+
+    alloc.deallocate(this->_odata,this->_odata_size);
+    this->_odata_size=0;
+    this->_odata=nullptr;
+  }
+  // Make the storage hold exactly `size` outer sites.
+  //  - size unchanged : keep the current buffer. Allocating again here would
+  //                     overwrite _odata without freeing it, i.e. leak it.
+  //  - size changed   : release the old buffer, then allocate a fresh one
+  //                     (or leave _odata null when size is 0).
+  // Contents of a freshly allocated buffer are whatever the allocator returns.
+  void resize(uint64_t size)
+  {
+    if ( this->_odata_size == size ) {
+      if ( size == 0 ) this->_odata = nullptr;  // empty lattice never holds a pointer
+      return;
+    }
+    dealloc();
+    this->_odata_size = size;
+    if ( size ) {
+      alignedAllocator<vobj> alloc;
+      this->_odata    = alloc.allocate(this->_odata_size);
+    } else {
+      this->_odata    = nullptr;
+    }
+  }
+#if 0
+  // Compiled out. Would release the current storage and adopt ptr as the data pointer.
+  // NOTE(review): dealloc() leaves _odata_size at 0, so the assert below can only
+  // hold for count == 0 — revisit before re-enabling.
+  void copy_vec(vobj *ptr,uint64_t count)
+  {
+    dealloc();
+    this->_odata = ptr;
+    assert(this->_odata_size == count);
+  }
+#endif
+public:
+  /////////////////////////////////////////////////////////////////////////////////
+  // Return a view object that may be dereferenced in site loops.
+  // The view is trivially copy constructible and may be copied to an accelerator device
+  // in device lambdas
+  /////////////////////////////////////////////////////////////////////////////////
+  // Build a LatticeView from this lattice's LatticeAccelerator base sub-object.
+  LatticeView<vobj> View (void) const
+  {
+    const LatticeAccelerator<vobj> &base = *this;
+    return LatticeView<vobj>(base);
+  }
+  
+  // Release the site storage if any is recorded; an empty lattice frees nothing.
+  ~Lattice() {
+    const bool has_storage = (this->_odata_size != 0);
+    if ( has_storage ) dealloc();
+  }
+  ////////////////////////////////////////////////////////////////////////////////
+  // Expression Template closure support
+  ////////////////////////////////////////////////////////////////////////////////
+  // Close a unary expression: evaluate expr at every outer site of this lattice.
+  // Asserts that the expression's grid is this lattice's grid and that its
+  // checkerboard is Odd or Even; that checkerboard is then adopted.
+  template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
+  {
+    GridBase *egrid = nullptr;
+    GridFromExpression(egrid,expr);
+    assert(egrid!=nullptr);
+    conformable(this->_grid,egrid);
+
+    int cb = -1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    this->checkerboard = cb;
+
+    auto self_v = View();
+#ifdef STREAMING_STORES
+    // evaluate into a temporary, then pass it to vstream for the store
+    accelerator_loop(site,self_v,{
+      vobj val = eval(site,expr);
+      vstream(self_v[site] ,val);
+    });
+#else
+    accelerator_loop(site,self_v,{
+      self_v[site]=eval(site,expr);
+    });
+#endif
+    return *this;
+  }
+  // Close a binary expression: evaluate expr at every outer site of this lattice.
+  // Asserts that the expression's grid is this lattice's grid and that its
+  // checkerboard is Odd or Even; that checkerboard is then adopted.
+  template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
+  {
+    GridBase *egrid = nullptr;
+    GridFromExpression(egrid,expr);
+    assert(egrid!=nullptr);
+    conformable(this->_grid,egrid);
+
+    int cb = -1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    this->checkerboard = cb;
+
+    auto self_v = View();
+#ifdef STREAMING_STORES
+    // evaluate into a temporary, then pass it to vstream for the store
+    accelerator_loop(site,self_v,{
+      vobj val = eval(site,expr);
+      vstream(self_v[site] ,val);
+    });
+#else
+    accelerator_loop(site,self_v,{
+      self_v[site]=eval(site,expr);
+    });
+#endif
+    return *this;
+  }
+  // Close a trinary expression: evaluate expr at every outer site of this lattice.
+  // Asserts that the expression's grid is this lattice's grid and that its
+  // checkerboard is Odd or Even; that checkerboard is then adopted.
+  template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
+  {
+    GridBase *egrid = nullptr;
+    GridFromExpression(egrid,expr);
+    assert(egrid!=nullptr);
+    conformable(this->_grid,egrid);
+
+    int cb = -1;
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    this->checkerboard = cb;
+
+    auto self_v = View();
+#ifdef STREAMING_STORES
+    // evaluate into a temporary, then pass it to vstream for the store
+    accelerator_loop(site,self_v,{
+      vobj val = eval(site,expr);
+      vstream(self_v[site] ,val);
+    });
+#else
+    accelerator_loop(site,self_v,{
+      self_v[site] = eval(site,expr);
+    });
+#endif
+    return *this;
+  }
+  //GridFromExpression is tricky to do
+  // Construct from a unary expression: take grid and checkerboard from the
+  // expression (both asserted valid), allocate storage for the grid's outer
+  // sites, then evaluate the expression through the matching operator=.
+  template<class Op,class T1>
+  Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
+    this->_grid = nullptr;
+    GridFromExpression(this->_grid,expr);
+    assert(this->_grid!=nullptr);
+
+    int cb(-1);
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    this->checkerboard = cb;
+
+    resize(this->_grid->oSites());
+    this->operator=(expr);
+  }
+  // Construct from a binary expression: take grid and checkerboard from the
+  // expression (both asserted valid), allocate storage for the grid's outer
+  // sites, then evaluate the expression through the matching operator=.
+  template<class Op,class T1, class T2>
+  Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
+    this->_grid = nullptr;
+    GridFromExpression(this->_grid,expr);
+    assert(this->_grid!=nullptr);
+
+    int cb(-1);
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    this->checkerboard = cb;
+
+    resize(this->_grid->oSites());
+    this->operator=(expr);
+  }
+  // Construct from a trinary expression: take grid and checkerboard from the
+  // expression (both asserted valid), allocate storage for the grid's outer
+  // sites, then evaluate the expression through the matching operator=.
+  template<class Op,class T1, class T2, class T3>
+  Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
+    this->_grid = nullptr;
+    GridFromExpression(this->_grid,expr);
+    assert(this->_grid!=nullptr);
+
+    int cb(-1);
+    CBFromExpression(cb,expr);
+    assert( (cb==Odd) || (cb==Even));
+    this->checkerboard = cb;
+
+    resize(this->_grid->oSites());
+    this->operator=(expr);
+  }
+
+  // Broadcast: assign the single value r to every outer site.
+  // Grid and checkerboard are left as they are.
+  template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
+    auto self_v = View();
+    accelerator_loop(site,self_v,{
+      self_v[site]=r;
+    });
+    return *this;
+  }
+
+  //////////////////////////////////////////////////////////////////
+  // Follow rule of five, with Constructor requires "grid" passed
+  // to user defined constructor
+  ///////////////////////////////////////////
+  // user defined constructor
+  ///////////////////////////////////////////
+  // Allocate storage for every outer site of grid, with checkerboard 0.
+  // The assert checks that the low four address bits of the storage are zero.
+  Lattice(GridBase *grid) {
+    this->checkerboard=0;
+    this->_grid = grid;
+    resize(grid->oSites());
+    assert((((uint64_t)&this->_odata[0])&0xF) ==0);
+  }
+  
+  //  virtual ~Lattice(void) = default;
+    
+  // Re-target this lattice onto another grid.
+  // When grid differs from the current one, the storage is resized to the new
+  // grid's outer-site count and the checkerboard returns to 0; the same grid is a no-op.
+  void reset(GridBase* grid) {
+    if (this->_grid != grid) {
+      this->_grid = grid;
+      // _odata is a raw vobj* and has no resize(); storage is managed by the member resize().
+      resize(grid->oSites());
+      this->checkerboard = 0;
+    }
+  }
+  ///////////////////////////////////////////
+  // copy constructor
+  ///////////////////////////////////////////
+  // Deep copy: adopt r's grid, allocate matching storage, then let the
+  // copy assignment transfer the checkerboard and the site data.
+  Lattice(const Lattice& r){
+    GridBase *src_grid = r.Grid();
+    this->_grid = src_grid;
+    resize(src_grid->oSites());
+    this->operator=(r);
+  }
+  ///////////////////////////////////////////
+  // move constructor
+  ///////////////////////////////////////////
+  // Steal r's storage: copy grid, checkerboard, data pointer and size, then
+  // null r's pointer and size so that r's destructor frees nothing.
+  Lattice(Lattice && r){
+    this->checkerboard= r.Checkerboard();
+    this->_grid       = r.Grid();
+    this->_odata_size = r._odata_size;
+    this->_odata      = r._odata;
+
+    r._odata_size = 0;
+    r._odata      = nullptr;
+  }
+  ///////////////////////////////////////////
+  // assignment template
+  ///////////////////////////////////////////
+  // Site-wise assignment from a lattice with a different site type (robj != vobj).
+  // The checkerboard is adopted from r BEFORE the conformable check, exactly as
+  // the same-type copy assignment does. conformable() asserts equal checkerboards,
+  // so checking first would reject any source on the other checkerboard.
+  template<class robj> inline Lattice<vobj> & operator = (const Lattice<robj> & r){
+    // Compile-time guard: instantiating this template with robj == vobj is an error.
+    typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
+    this->checkerboard = r.Checkerboard();
+    conformable(*this,r);
+    auto me =   View();
+    auto him= r.View();
+    accelerator_loop(ss,me,{
+      me[ss]=him[ss];
+    });
+    return *this;
+  }
+
+  ///////////////////////////////////////////
+  // Copy assignment 
+  ///////////////////////////////////////////
+  // Copy assignment: adopt r's checkerboard, require the same grid (via
+  // conformable), then copy every outer site. No storage is reallocated.
+  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
+    this->checkerboard = r.Checkerboard();
+    conformable(*this,r);
+    auto dst_v =   View();
+    auto src_v = r.View();
+    accelerator_loop(site,dst_v,{
+      dst_v[site]=src_v[site];
+    });
+    return *this;
+  }
+  ///////////////////////////////////////////
+  // Move assignment possible if same type
+  ///////////////////////////////////////////
+  // Move assignment: free this lattice's storage, take over r's grid,
+  // checkerboard, data pointer and size, and leave r empty.
+  inline Lattice<vobj> & operator = (Lattice<vobj> && r){
+    // drop whatever this lattice currently holds
+    resize(0);
+
+    this->checkerboard= r.Checkerboard();
+    this->_grid       = r.Grid();
+    this->_odata_size = r._odata_size;
+    this->_odata      = r._odata;
+
+    // r no longer refers to the buffer, so its destructor frees nothing
+    r._odata_size = 0;
+    r._odata      = nullptr;
+    return *this;
+  }
+
+  /////////////////////////////////////////////////////////////////////////////
+  // *=,+=,-= operators inherit behaviour from the corresponding *, +, - operation
+  /////////////////////////////////////////////////////////////////////////////
+  // In-place multiply, expressed as assignment of (*this)*r.
+  template<class T> inline Lattice<vobj> &operator *=(const T &r) {
+    return (*this = (*this)*r);
+  }
+  
+  // In-place subtract, expressed as assignment of (*this)-r.
+  template<class T> inline Lattice<vobj> &operator -=(const T &r) {
+    return (*this = (*this)-r);
+  }
+  // In-place add, expressed as assignment of (*this)+r.
+  template<class T> inline Lattice<vobj> &operator +=(const T &r) {
+    return (*this = (*this)+r);
+  }
+
+  // Exchange two conformable lattices by swapping their LatticeAccelerator
+  // state (grid, checkerboard, data pointer, size); no site data is copied.
+  friend inline void swap(Lattice &l, Lattice &r) {
+    conformable(l,r);
+    LatticeAccelerator<vobj> &l_base = l;
+    LatticeAccelerator<vobj> &r_base = r;
+    LatticeAccelerator<vobj> tmp;
+    tmp    = l_base;
+    l_base = r_base;
+    r_base = tmp;
+  }
+
+}; // class Lattice
+
+// Print one line per global site index: "[c0,c1,...]", a tab, the site value
+// obtained with peekSite, then std::endl.
+template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
+  typedef typename vobj::scalar_object sobj;
+  for(int g=0;g<o.Grid()->_gsites;g++){
+    // global index -> global coordinate
+    Coordinate gcoor;
+    o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
+
+    sobj site_val;
+    peekSite(site_val,o,gcoor);
+
+    // comma goes in front of every component but the first
+    stream<<"[";
+    for(int d=0;d<gcoor.size();d++){
+      if(d!=0) stream<<",";
+      stream<<gcoor[d];
+    }
+    stream<<"]\t"<<site_val<<std::endl;
+  }
+  return stream;
+}
+  
+NAMESPACE_END(Grid);
+
--- a/Grid/lattice/Lattice_comparison.h
+++ b/Grid/lattice/Lattice_comparison.h
@@ -0,0 +1,176 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_comparison.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LATTICE_COMPARISON_H
+#define GRID_LATTICE_COMPARISON_H
+
+NAMESPACE_BEGIN(Grid);
+
+//////////////////////////////////////////////////////////////////////////
+// relational operators
+// 
+// Support <,>,<=,>=,==,!=
+//
+//Query supporting bitwise &, |, ^, !
+//Query supporting logical &&, ||, 
+//////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////
+// compare lattice to lattice
+//////////////////////////////////////////////////////////////////////////
+// Apply op to matching sites of two lattices; the result lives on rhs's grid.
+template<class vfunctor,class lobj,class robj>
+inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
+{
+  Lattice<vInteger> result(rhs.Grid());
+  auto l_view   = lhs.View();
+  auto r_view   = rhs.View();
+  auto out_view = result.View();
+  accelerator_loop( site, r_view, {
+    out_view[site]=op(l_view[site],r_view[site]);
+  });
+  return result;
+}
+//////////////////////////////////////////////////////////////////////////
+// compare lattice to scalar
+//////////////////////////////////////////////////////////////////////////
+// Apply op to every site of lhs against the single value rhs; the result lives on lhs's grid.
+template<class vfunctor,class lobj,class robj>
+inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
+{
+  Lattice<vInteger> result(lhs.Grid());
+  auto l_view   = lhs.View();
+  auto out_view = result.View();
+  accelerator_loop( site, l_view, {
+    out_view[site]=op(l_view[site],rhs);
+  });
+  return result;
+}
+//////////////////////////////////////////////////////////////////////////
+// compare scalar to lattice
+//////////////////////////////////////////////////////////////////////////
+// Apply op to the single value lhs against every site of rhs; the result lives on rhs's grid.
+template<class vfunctor,class lobj,class robj>
+inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
+{
+  Lattice<vInteger> result(rhs.Grid());
+  auto r_view   = rhs.View();
+  auto out_view = result.View();
+  accelerator_loop( site, r_view, {
+    out_view[site]=op(lhs,r_view[site]);
+  });
+  return result;
+}
+  
+//////////////////////////////////////////////////////////////////////////
+// Map to functors
+//////////////////////////////////////////////////////////////////////////
+// Less than
+// operator< for lattice/lattice, lattice/scalar and scalar/lattice:
+// each forwards a vlt functor to the matching comparison helper.
+template<class lobj,class robj>
+inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+  typedef vlt<lobj,robj> functor;
+  return LLComparison(functor(),lhs,rhs);
+}
+template<class lobj,class robj>
+inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
+  typedef vlt<lobj,robj> functor;
+  return LSComparison(functor(),lhs,rhs);
+}
+template<class lobj,class robj>
+inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
+  typedef vlt<lobj,robj> functor;
+  return SLComparison(functor(),lhs,rhs);
+}
+  
+// Less than equal
+// operator<= for lattice/lattice, lattice/scalar and scalar/lattice:
+// each forwards a vle functor to the matching comparison helper.
+template<class lobj,class robj>
+inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+  typedef vle<lobj,robj> functor;
+  return LLComparison(functor(),lhs,rhs);
+}
+template<class lobj,class robj>
+inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
+  typedef vle<lobj,robj> functor;
+  return LSComparison(functor(),lhs,rhs);
+}
+template<class lobj,class robj>
+inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
+  typedef vle<lobj,robj> functor;
+  return SLComparison(functor(),lhs,rhs);
+}
+  
+// Greater than 
+// operator> for lattice/lattice, lattice/scalar and scalar/lattice:
+// each forwards a vgt functor to the matching comparison helper.
+template<class lobj,class robj>
+inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+  typedef vgt<lobj,robj> functor;
+  return LLComparison(functor(),lhs,rhs);
+}
+template<class lobj,class robj>
+inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
+  typedef vgt<lobj,robj> functor;
+  return LSComparison(functor(),lhs,rhs);
+}
+template<class lobj,class robj>
+inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
+  typedef vgt<lobj,robj> functor;
+  return SLComparison(functor(),lhs,rhs);
+}
+  
+  
+// Greater than equal
+// operator>= for lattice/lattice, lattice/scalar and scalar/lattice:
+// each forwards a vge functor to the matching comparison helper.
+template<class lobj,class robj>
+inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+  typedef vge<lobj,robj> functor;
+  return LLComparison(functor(),lhs,rhs);
+}
+template<class lobj,class robj>
+inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const robj & rhs) {
+  typedef vge<lobj,robj> functor;
+  return LSComparison(functor(),lhs,rhs);
+}
+template<class lobj,class robj>
+inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
+  typedef vge<lobj,robj> functor;
+  return SLComparison(functor(),lhs,rhs);
+}
+   
+// equal
+// operator== for lattice/lattice, lattice/scalar and scalar/lattice:
+// each forwards a veq functor to the matching comparison helper.
+template<class lobj,class robj>
+inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+  typedef veq<lobj,robj> functor;
+  return LLComparison(functor(),lhs,rhs);
+}
+template<class lobj,class robj>
+inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
+  typedef veq<lobj,robj> functor;
+  return LSComparison(functor(),lhs,rhs);
+}
+template<class lobj,class robj>
+inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
+  typedef veq<lobj,robj> functor;
+  return SLComparison(functor(),lhs,rhs);
+}
+   
+   
+// not equal
+// operator!= for lattice/lattice, lattice/scalar and scalar/lattice:
+// each forwards a vne functor to the matching comparison helper.
+template<class lobj,class robj>
+inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
+  typedef vne<lobj,robj> functor;
+  return LLComparison(functor(),lhs,rhs);
+}
+template<class lobj,class robj>
+inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
+  typedef vne<lobj,robj> functor;
+  return LSComparison(functor(),lhs,rhs);
+}
+template<class lobj,class robj>
+inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
+  typedef vne<lobj,robj> functor;
+  return SLComparison(functor(),lhs,rhs);
+}
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/lattice/Lattice_comparison_utils.h
+++ b/Grid/lattice/Lattice_comparison_utils.h
@@ -0,0 +1,231 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_comparison_utils.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_COMPARISON_H
+#define GRID_COMPARISON_H
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////
+// This implementation is a bit poor.
+//
+// Only support relational logical operations (<, >  etc)
+// on scalar objects. Therefore can strip any tensor structures.
+//
+// Should guard this with isGridTensor<> enable if?
+/////////////////////////////////////////
+//
+// Generic list of functors
+//
+// Relational functors returning vInteger: each applies the operator in its
+// name (eq ==, ne !=, lt <, le <=, gt >, ge >=) to its two operands.
+template<class lobj,class robj> struct veq {
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
+  {
+    return (lhs) == (rhs);
+  }
+};
+template<class lobj,class robj> struct vne {
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
+  {
+    return (lhs) != (rhs);
+  }
+};
+template<class lobj,class robj> struct vlt {
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
+  {
+    return (lhs) < (rhs);
+  }
+};
+template<class lobj,class robj> struct vle {
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
+  {
+    return (lhs) <= (rhs);
+  }
+};
+template<class lobj,class robj> struct vgt {
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
+  {
+    return (lhs) > (rhs);
+  }
+};
+template<class lobj,class robj> struct vge {
+  accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
+  {
+    return (lhs) >= (rhs);
+  }
+};
+  
+// Generic list of functors
+// Relational functors returning Integer: each applies the operator in its
+// name (eq ==, ne !=, lt <, le <=, gt >, ge >=) to its two operands.
+template<class lobj,class robj> struct seq {
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
+  {
+    return (lhs) == (rhs);
+  }
+};
+template<class lobj,class robj> struct sne {
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
+  {
+    return (lhs) != (rhs);
+  }
+};
+template<class lobj,class robj> struct slt {
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
+  {
+    return (lhs) < (rhs);
+  }
+};
+template<class lobj,class robj> struct sle {
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
+  {
+    return (lhs) <= (rhs);
+  }
+};
+template<class lobj,class robj> struct sgt {
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
+  {
+    return (lhs) > (rhs);
+  }
+};
+template<class lobj,class robj> struct sge {
+  accelerator Integer operator()(const lobj &lhs, const robj &rhs)
+  {
+    return (lhs) >= (rhs);
+  }
+};
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+// Integer and real get extra relational functions.
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+// SIMD-vs-SIMD comparison: extract both operands into per-lane scalars, apply
+// sop lane by lane, and merge the Integer results back into a vInteger.
+template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
+accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
+{
+  typedef typename vsimd::scalar_type scalar;
+  const auto lanes = vsimd::Nsimd();
+  ExtractBuffer<scalar>  lhs_lanes(lanes);
+  ExtractBuffer<scalar>  rhs_lanes(lanes);
+  ExtractBuffer<Integer> pred_lanes(lanes);
+
+  extract<vsimd,scalar>(lhs,lhs_lanes);
+  extract<vsimd,scalar>(rhs,rhs_lanes);
+  for(int s=0;s<lanes;s++){
+    pred_lanes[s] = sop(lhs_lanes[s],rhs_lanes[s]);
+  }
+
+  vInteger ret;
+  merge<vInteger,Integer>(ret,pred_lanes);
+  return ret;
+}
+
+// SIMD-vs-scalar comparison: extract lhs into per-lane scalars, apply sop to
+// each lane against rhs, and merge the Integer results back into a vInteger.
+template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
+accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
+{
+  typedef typename vsimd::scalar_type scalar;
+  const auto lanes = vsimd::Nsimd();
+  ExtractBuffer<scalar>  lhs_lanes(lanes);
+  ExtractBuffer<Integer> pred_lanes(lanes);
+
+  extract<vsimd,scalar>(lhs,lhs_lanes);
+  for(int s=0;s<lanes;s++){
+    pred_lanes[s] = sop(lhs_lanes[s],rhs);
+  }
+
+  vInteger ret;
+  merge<vInteger,Integer>(ret,pred_lanes);
+  return ret;
+}
+
+// Scalar-vs-SIMD comparison: extract rhs into per-lane scalars, apply sop to
+// lhs against each lane, and merge the Integer results back into a vInteger.
+template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
+accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
+{
+  typedef typename vsimd::scalar_type scalar;
+  const auto lanes = vsimd::Nsimd();
+  ExtractBuffer<scalar>  rhs_lanes(lanes);
+  ExtractBuffer<Integer> pred_lanes(lanes);
+
+  extract<vsimd,scalar>(rhs,rhs_lanes);
+  for(int s=0;s<lanes;s++){
+    pred_lanes[s] = sop(lhs,rhs_lanes[s]);
+  }
+
+  vInteger ret;
+  merge<vInteger,Integer>(ret,pred_lanes);
+  return ret;
+}
+
+// DECLARE_RELATIONAL_EQ(op,functor) emits six overloads of `operator op`:
+//   vsimd op vsimd, vsimd op scalar, scalar op vsimd   -> Comparison(functor<scalar,scalar>(),lhs,rhs)
+//   iScalar op iScalar, iScalar op scalar, scalar op iScalar -> applies op to the wrapped _internal value
+// All of them return vInteger.
+#define DECLARE_RELATIONAL_EQ(op,functor) \
+  template<class vsimd,IfSimd<vsimd> = 0>				\
+  accelerator_inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)	\
+  {									\
+    typedef typename vsimd::scalar_type scalar;				\
+    return Comparison(functor<scalar,scalar>(),lhs,rhs);		\
+  }									\
+  template<class vsimd,IfSimd<vsimd> = 0>				\
+  accelerator_inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
+  {									\
+    typedef typename vsimd::scalar_type scalar;				\
+    return Comparison(functor<scalar,scalar>(),lhs,rhs);		\
+  }									\
+  template<class vsimd,IfSimd<vsimd> = 0>				\
+  accelerator_inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
+  {									\
+    typedef typename vsimd::scalar_type scalar;				\
+    return Comparison(functor<scalar,scalar>(),lhs,rhs);		\
+  }									\
+  template<class vsimd>							\
+  accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs) \
+  {									\
+    return lhs._internal op rhs._internal;				\
+  }									\
+  template<class vsimd>							\
+  accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
+  {									\
+    return lhs._internal op rhs;					\
+  }									\
+  template<class vsimd>							\
+  accelerator_inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
+  {									\
+    return lhs op rhs._internal;					\
+    }									\
+
+// DECLARE_RELATIONAL is an alias for DECLARE_RELATIONAL_EQ.
+#define DECLARE_RELATIONAL(op,functor) DECLARE_RELATIONAL_EQ(op,functor)    
+
+// Instantiate the overload sets, pairing each operator with its scalar functor.
+DECLARE_RELATIONAL(<,slt);
+DECLARE_RELATIONAL(<=,sle);
+DECLARE_RELATIONAL(>,sgt);
+DECLARE_RELATIONAL(>=,sge);
+DECLARE_RELATIONAL_EQ(==,seq);
+DECLARE_RELATIONAL(!=,sne);
+
+// NOTE(review): only DECLARE_RELATIONAL is undefined here; DECLARE_RELATIONAL_EQ
+// stays defined for everything that includes this header — confirm that is intended.
+#undef DECLARE_RELATIONAL
+
+NAMESPACE_END(Grid);
+
+
+#endif
--- a/Grid/lattice/Lattice_conformable.h
+++ b/Grid/lattice/Lattice_conformable.h
@@ -0,0 +1,40 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_conformable.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LATTICE_CONFORMABLE_H
+#define GRID_LATTICE_CONFORMABLE_H
+
+NAMESPACE_BEGIN(Grid);
+
+// Two lattices are conformable when they share the same grid pointer and the
+// same checkerboard; both conditions are enforced with assert. The site types
+// obj1 and obj2 may differ.
+template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
+{
+  assert(lhs.Grid() == rhs.Grid());
+  assert(lhs.Checkerboard() == rhs.Checkerboard());
+}
+
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/lattice/Lattice_coordinate.h
+++ b/Grid/lattice/Lattice_coordinate.h
@@ -0,0 +1,74 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_coordinate.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once 
+
+NAMESPACE_BEGIN(Grid);
+
+// Fill every site of l with its global coordinate along direction mu.
+//   l  : lattice to overwrite; each outer site receives one packed vector word.
+//   mu : index into the global coordinate returned by RankIndexToGlobalCoor.
+template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
+{
+  typedef typename iobj::scalar_type scalar_type;
+  typedef typename iobj::vector_type vector_type;
+
+  GridBase *grid = l.Grid();
+  const int lanes = grid->iSites();
+
+  Coordinate global;
+  ExtractBuffer<scalar_type> lane_values(lanes);
+  vector_type packed;
+
+  auto l_v = l.View();
+  for(int osite=0;osite<grid->oSites();osite++){
+    // Collect the mu-th global coordinate of each inner site of this outer site ...
+    for(int lane=0;lane<lanes;lane++){
+      grid->RankIndexToGlobalCoor(grid->ThisRank(),osite,lane,global);
+      lane_values[lane]=(Integer)global[mu];
+    }
+    // ... then merge them into a single vector word and store it.
+    merge<vector_type,scalar_type>(packed,lane_values);
+    l_v[osite]=packed;
+  }
+}
+
+// FIXME: debug-only helper; deprecate this, it is made obsolete by
+// LatticeCoordinate().
+// Debug helper: overwrite the raw storage of l with a recognisable pattern.
+// The lattice storage is viewed as a flat array of Real; for outer site i the
+// entries vv and vv+1 (vv even) of every word are both set to i + vv*500.
+// NOTE(review): the word count is derived from sizeof(vRealF) while the data
+// is addressed as Real - presumably these are meant to match; confirm.
+template<class vobj> void lex_sites(Lattice<vobj> &l){
+  auto l_v = l.View();
+  Real *v_ptr = (Real *)&l_v[0];
+  size_t o_len = l.Grid()->oSites();
+  size_t v_len = sizeof(vobj)/sizeof(vRealF);
+  size_t vec_len = vRealF::Nsimd();
+
+  for(int site=0;site<o_len;site++){
+    for(int word=0;word<v_len;word++){
+      // Start of this word's run of vec_len entries in the flat array.
+      Real *lanes = v_ptr + (site*v_len*vec_len+word*vec_len);
+      for(int pair=0;pair<vec_len;pair+=2){
+        const Real tag = site+pair*500;
+        lanes[pair  ]= tag;
+        lanes[pair+1]= tag;
+      }
+    }
+  }
+}
+
+NAMESPACE_END(Grid);
+
--- a/Grid/lattice/Lattice_local.h
+++ b/Grid/lattice/Lattice_local.h
@@ -0,0 +1,83 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_local.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LATTICE_LOCALREDUCTION_H
+#define GRID_LATTICE_LOCALREDUCTION_H
+
+///////////////////////////////////////////////
+// localInner, localNorm, outerProduct
+///////////////////////////////////////////////
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////////////
+// Locally reduced routines: the result is a per-site Lattice, not a sum over sites
+/////////////////////////////////////////////////////
+  
+// localNorm2,
+// Per-site squared norm: out[ss] = innerProduct(rhs[ss],rhs[ss]).
+// Returns a lattice of vobj::tensor_reduced on the same grid as rhs.
+// NOTE(review): the result's checkerboard is not copied from rhs here.
+template<class vobj>
+inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
+{
+  typedef typename vobj::tensor_reduced reduced_t;
+  Lattice<reduced_t> norms(rhs.Grid());
+  auto in_v  = rhs.View();
+  auto out_v = norms.View();
+  accelerator_loop(ss,in_v,{
+    out_v[ss]=innerProduct(in_v[ss],in_v[ss]);
+  });
+  return norms;
+}
+  
+// localInnerProduct
+// Per-site inner product: out[ss] = innerProduct(lhs[ss],rhs[ss]).
+// The result lives on rhs's grid; the loop range is taken from rhs's view.
+// No conformable() check is made between lhs and rhs here.
+template<class vobj>
+inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
+{
+  typedef typename vobj::tensor_reduced reduced_t;
+  Lattice<reduced_t> products(rhs.Grid());
+  auto left_v  = lhs.View();
+  auto right_v = rhs.View();
+  auto out_v   = products.View();
+  accelerator_loop(ss,right_v,{
+    out_v[ss]=innerProduct(left_v[ss],right_v[ss]);
+  });
+  return products;
+}
+  
+// outerProduct Scalar x Scalar -> Scalar
+//              Vector x Vector -> Matrix
+// Per-site outer product: out[ss] = outerProduct(lhs[ss],rhs[ss]).
+// The site type of the result is whatever the site-level outerProduct(ll,rr)
+// returns; the result lives on rhs's grid.
+template<class ll,class rr>
+inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(ll(),rr()))>
+{
+  typedef decltype(outerProduct(ll(),rr())) outer_t;
+  Lattice<outer_t> products(rhs.Grid());
+  auto left_v  = lhs.View();
+  auto right_v = rhs.View();
+  auto out_v   = products.View();
+  accelerator_loop(ss,right_v,{
+    out_v[ss]=outerProduct(left_v[ss],right_v[ss]);
+  });
+  return products;
+}
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/lattice/Lattice_matrix_reduction.h
+++ b/Grid/lattice/Lattice_matrix_reduction.h
@@ -0,0 +1,202 @@
+/*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid 
+    Source file: ./lib/lattice/Lattice_reduction.h
+    Copyright (C) 2015
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once 
+#include <Grid/Grid_Eigen_Dense.h>
+
+#ifdef GRID_WARN_SUBOPTIMAL
+#warning "Optimisation alert all these reduction loops are NOT threaded "
+#endif     
+
+NAMESPACE_BEGIN(Grid);
+
+// Slice-wise matrix multiply-add along direction Orthog.
+// For every site o of the plane orthogonal to Orthog and every slice i:
+//   R[o+i*ostride] = Y[o+i*ostride] + sum_j X[o+j*ostride] * (scale*aa(j,i))
+//   R      : output lattice
+//   aa     : coefficient matrix, indexed aa(j,i) with i,j in [0,Nblock)
+//   X, Y   : input lattices
+//   Orthog : direction whose slices are mixed; must have SIMD layout 1 (asserted)
+//   scale  : extra factor applied to every coefficient of aa
+// NOTE(review): Nblock is the *global* extent in Orthog but the indexing is on
+// local views - presumably Orthog must not be split across ranks; confirm.
+template<class vobj>
+static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
+{    
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
+
+  GridBase *FullGrid  = X.Grid();
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+
+  //  Lattice<vobj> Xslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
+
+  assert( FullGrid->_simd_layout[Orthog]==1);
+
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+  auto X_v = X.View();
+  auto Y_v = Y.View();
+  auto R_v = R.View();
+  thread_region
+  {
+    // Scratch copy of the X values along Orthog, declared inside thread_region.
+    std::vector<vobj> s_x(Nblock);
+
+    thread_loop_collapse2( (int n=0;n<nblock;n++),{
+      for(int b=0;b<block;b++){
+	int o  = n*stride + b;
+
+	// Copy all X slices first, so R may alias X without corrupting the inputs.
+	for(int i=0;i<Nblock;i++){
+	  s_x[i] = X_v[o+i*ostride];
+	}
+
+	vobj dot;
+	for(int i=0;i<Nblock;i++){
+	  dot = Y_v[o+i*ostride];
+	  for(int j=0;j<Nblock;j++){
+	    dot = dot + s_x[j]*(scale*aa(j,i));
+	  }
+	  R_v[o+i*ostride]=dot;
+	}
+      }});
+  }
+};
+
+// Slice-wise matrix multiply along direction Orthog.
+// For every site o of the plane orthogonal to Orthog and every slice i:
+//   R[o+i*ostride] = sum_j X[o+j*ostride] * (scale*aa(j,i))
+//   R      : output lattice
+//   aa     : coefficient matrix, indexed aa(j,i) with i,j in [0,Nblock)
+//   X      : input lattice
+//   Orthog : direction whose slices are mixed; must have SIMD layout 1 (asserted)
+//   scale  : extra factor applied to every coefficient of aa
+// NOTE(review): Nblock is the *global* extent in Orthog but the indexing is on
+// local views - presumably Orthog must not be split across ranks; confirm.
+template<class vobj>
+static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
+{    
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
+
+  GridBase *FullGrid  = X.Grid();
+  assert( FullGrid->_simd_layout[Orthog]==1);
+
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+
+  auto X_v = X.View();
+  auto R_v = R.View();
+
+  thread_region
+  {
+    // Scratch copy of the X values along Orthog, declared inside thread_region.
+    std::vector<vobj> s_x(Nblock);
+    
+    thread_loop_collapse2( (int n=0;n<nblock;n++),{
+      for(int b=0;b<block;b++){
+	int o  = n*stride + b;
+
+	// Copy all X slices first, so R may alias X without corrupting the inputs.
+	for(int i=0;i<Nblock;i++){
+	  s_x[i] = X_v[o+i*ostride];
+	}
+
+	vobj dot;
+	for(int i=0;i<Nblock;i++){
+	  // Seed the sum with the j=0 term, then accumulate j=1..Nblock-1.
+	  dot = s_x[0]*(scale*aa(0,i));
+	  for(int j=1;j<Nblock;j++){
+	    dot = dot + s_x[j]*(scale*aa(j,i));
+	  }
+	  R_v[o+i*ostride]=dot;
+	}
+    }});
+  }
+
+};
+
+
+// Matrix of slice-by-slice inner products along direction Orthog.
+//   mat(i,j) = sum over the plane orthogonal to Orthog of
+//              Reduce(TensorRemove(innerProduct(lhs slice i, rhs slice j)))
+// followed by a GlobalSum of every element.
+//   mat      : output, reset to an Nblock x Nblock zero matrix on entry
+//   lhs, rhs : input lattices (only lhs's grid is consulted for the geometry)
+//   Orthog   : direction that labels the slices; must have SIMD layout 1 (asserted)
+// NOTE(review): Nblock is the *global* extent in Orthog but the indexing is on
+// local views - presumably Orthog must not be split across ranks; confirm.
+template<class vobj>
+static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
+{
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  
+  GridBase *FullGrid  = lhs.Grid();
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  
+  int Nblock = FullGrid->GlobalDimensions()[Orthog];
+  
+  //  Lattice<vobj> Lslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
+  
+  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+  assert( FullGrid->_simd_layout[Orthog]==1);
+  //  int nh =  FullGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  //  int nl = nh-1;
+
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+
+  typedef typename vobj::vector_typeD vector_typeD;
+  auto lhs_v = lhs.View();
+  auto rhs_v = rhs.View();
+  thread_region {
+    // Scratch buffers and a private accumulator declared inside thread_region;
+    // the accumulator is folded into mat under thread_critical below.
+    std::vector<vobj> Left(Nblock);
+    std::vector<vobj> Right(Nblock);
+    Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+    thread_loop_collapse2((int n=0;n<nblock;n++),{
+      for(int b=0;b<block;b++){
+
+	int o  = n*stride + b;
+
+	for(int i=0;i<Nblock;i++){
+	  Left [i] = lhs_v[o+i*ostride];
+	  Right[i] = rhs_v[o+i*ostride];
+	}
+
+	for(int i=0;i<Nblock;i++){
+	  for(int j=0;j<Nblock;j++){
+	    auto tmp = innerProduct(Left[i],Right[j]);
+	    auto rtmp = TensorRemove(tmp);
+	    ComplexD z = Reduce(rtmp);
+	    mat_thread(i,j) += std::complex<double>(real(z),imag(z));
+	  }}
+    }});
+    thread_critical {
+      mat += mat_thread;
+    }  
+  }
+
+  // Combine the per-rank partial results, one GlobalSum call per element.
+  for(int i=0;i<Nblock;i++){
+    for(int j=0;j<Nblock;j++){
+      ComplexD sum = mat(i,j);
+      FullGrid->GlobalSum(sum);
+      mat(i,j)=sum;
+    }}
+
+  return;
+}
+
+NAMESPACE_END(Grid);
+
+
+
--- a/Grid/lattice/Lattice_overload.h
+++ b/Grid/lattice/Lattice_overload.h
@@ -0,0 +1,138 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_overload.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_OVERLOAD_H
+#define GRID_LATTICE_OVERLOAD_H
+
+namespace Grid {
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // unary negation
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Unary minus: returns a new lattice on r's grid holding -r at every outer site.
+  template<class vobj>
+  inline Lattice<vobj> operator -(const Lattice<vobj> &r)
+  {
+    Lattice<vobj> negated(r._grid);
+    parallel_for(int site=0;site<r._grid->oSites();site++){
+      vstream(negated._odata[site], -r._odata[site]);
+    }
+    return negated;
+  }
+  /////////////////////////////////////////////////////////////////////////////////////
+  // Lattice BinOp Lattice,
+  //NB mult performs conformable check. Do not reapply here for performance.
+  /////////////////////////////////////////////////////////////////////////////////////
+  // Lattice * Lattice: allocates the result on rhs's grid and delegates to mult().
+  template<class left,class right>
+    inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
+  {
+    typedef decltype(lhs._odata[0]*rhs._odata[0]) product_t;
+    Lattice<product_t> result(rhs._grid);
+    mult(result,lhs,rhs);
+    return result;
+  }
+  // Lattice + Lattice: allocates the result on rhs's grid and delegates to add().
+  template<class left,class right>
+    inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
+  {
+    typedef decltype(lhs._odata[0]+rhs._odata[0]) sum_t;
+    Lattice<sum_t> result(rhs._grid);
+    add(result,lhs,rhs);
+    return result;
+  }
+  // Lattice - Lattice: allocates the result on rhs's grid and delegates to sub().
+  template<class left,class right>
+    inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
+  {
+    typedef decltype(lhs._odata[0]-rhs._odata[0]) difference_t;
+    Lattice<difference_t> result(rhs._grid);
+    sub(result,lhs,rhs);
+    return result;
+  }
+  
+  // Scalar BinOp Lattice ;generate return type
+  // scalar * Lattice: result[ss] = lhs * rhs[ss] for every outer site of rhs's grid.
+  template<class left,class right>
+  inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
+  {
+    typedef decltype(lhs*rhs._odata[0]) product_t;
+    Lattice<product_t> result(rhs._grid);
+    parallel_for(int site=0;site<rhs._grid->oSites(); site++){
+      // Form the site value in a temporary, then hand it to vstream for the store.
+      product_t scaled=lhs*rhs._odata[site];
+      vstream(result._odata[site],scaled);
+    }
+    return result;
+  }
+  // scalar + Lattice: ret[ss] = lhs + rhs[ss] for every outer site of rhs's grid.
+  //   lhs : scalar (non-lattice) operand applied to every site
+  //   rhs : lattice operand; its grid is used for the result
+  // Returns a new lattice whose site type is decltype(lhs+rhs._odata[0]).
+  template<class left,class right>
+    inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
+    {
+      Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
+      parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
+	// Must be '+': the previous body computed lhs-rhs here, so operator+
+	// silently returned the difference instead of the sum.
+	decltype(lhs+rhs._odata[0]) tmp =lhs+rhs._odata[ss];  
+	vstream(ret._odata[ss],tmp);
+      }
+        return ret;
+    }
+  // scalar - Lattice: result[ss] = lhs - rhs[ss] for every outer site of rhs's grid.
+  template<class left,class right>
+    inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
+  {
+    typedef decltype(lhs-rhs._odata[0]) difference_t;
+    Lattice<difference_t> result(rhs._grid);
+    parallel_for(int site=0;site<rhs._grid->oSites(); site++){
+      // Form the site value in a temporary, then hand it to vstream for the store.
+      difference_t shifted=lhs-rhs._odata[site];
+      vstream(result._odata[site],shifted);
+    }
+    return result;
+  }
+    // Lattice * scalar: result[ss] = lhs[ss] * rhs for every outer site of lhs's grid.
+    template<class left,class right>
+      inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
+    {
+      typedef decltype(lhs._odata[0]*rhs) product_t;
+      Lattice<product_t> result(lhs._grid);
+      parallel_for(int site=0;site<lhs._grid->oSites(); site++){
+        // Form the site value in a temporary, then hand it to vstream for the store.
+        product_t scaled =lhs._odata[site]*rhs;
+        vstream(result._odata[site],scaled);
+      }
+      return result;
+    }
+    // Lattice + scalar: ret[ss] = lhs[ss] + rhs for every outer site of lhs's grid.
+    //   lhs : lattice operand; its grid is used for the result and the loop bound
+    //   rhs : scalar (non-lattice) operand applied to every site
+    // Returns a new lattice whose site type is decltype(lhs._odata[0]+rhs).
+    template<class left,class right>
+      inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
+    {
+        Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
+	// The bound must come from lhs: rhs is the scalar operand and has no
+	// _grid member, so the previous rhs._grid->oSites() could not instantiate.
+	parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
+	  decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs; 
+	  vstream(ret._odata[ss],tmp);
+        }
+        return ret;
+    }
+    // Lattice - scalar: ret[ss] = lhs[ss] - rhs for every outer site of lhs's grid.
+    //   lhs : lattice operand; its grid is used for the result and the loop bound
+    //   rhs : scalar (non-lattice) operand applied to every site
+    // Returns a new lattice whose site type is decltype(lhs._odata[0]-rhs).
+    template<class left,class right>
+      inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
+    {
+      Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
+      // The bound must come from lhs: rhs is the scalar operand and has no
+      // _grid member, so the previous rhs._grid->oSites() could not instantiate.
+      parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
+	  decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
+	  vstream(ret._odata[ss],tmp);
+      }
+      return ret;
+    }
+}
+#endif
--- a/Grid/lattice/Lattice_peekpoke.h
+++ b/Grid/lattice/Lattice_peekpoke.h
@@ -0,0 +1,217 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_peekpoke.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LATTICE_PEEK_H
+#define GRID_LATTICE_PEEK_H
+
+///////////////////////////////////////////////
+// Peeking and poking around
+///////////////////////////////////////////////
+
+NAMESPACE_BEGIN(Grid);
+
+
+// FIXME accelerator_loop and accelerator_inline these
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Peek internal indices of a Lattice object
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Extract component i of tensor index level Index from every site of lhs.
+// The result lives on lhs's grid and inherits lhs's checkerboard.
+template<int Index,class vobj> 
+auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(vobj(),i))>
+{
+  typedef decltype(peekIndex<Index>(vobj(),i)) peeked_t;
+  Lattice<peeked_t> out(lhs.Grid());
+  out.Checkerboard()=lhs.Checkerboard();
+  auto out_v = out.View();
+  auto in_v  = lhs.View();
+  cpu_loop( ss, in_v, {
+    out_v[ss] = peekIndex<Index>(in_v[ss],i);
+  });
+  return out;
+}
+// Extract component (i,j) of tensor index level Index from every site of lhs.
+// The result lives on lhs's grid and inherits lhs's checkerboard.
+template<int Index,class vobj> 
+auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(vobj(),i,j))>
+{
+  typedef decltype(peekIndex<Index>(vobj(),i,j)) peeked_t;
+  Lattice<peeked_t> out(lhs.Grid());
+  out.Checkerboard()=lhs.Checkerboard();
+  auto out_v = out.View();
+  auto in_v  = lhs.View();
+  cpu_loop( ss, in_v, {
+    out_v[ss] = peekIndex<Index>(in_v[ss],i,j);
+  });
+  return out;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Poke internal indices of a Lattice object
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Write rhs into component i of tensor index level Index at every site of lhs.
+// The loop range comes from lhs; no conformable() check is made against rhs.
+template<int Index,class vobj>  
+void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
+{
+  auto src_v = rhs.View();
+  auto dst_v = lhs.View();
+  cpu_loop( ss, dst_v, {
+    pokeIndex<Index>(dst_v[ss],src_v[ss],i);
+  });
+}
+// Write rhs into component (i,j) of tensor index level Index at every site of lhs.
+// The loop range comes from lhs; no conformable() check is made against rhs.
+template<int Index,class vobj> 
+void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
+{
+  auto src_v = rhs.View();
+  auto dst_v = lhs.View();
+  cpu_loop( ss, dst_v, {
+    pokeIndex<Index>(dst_v[ss],src_v[ss],i,j);
+  });
+}
+
+//////////////////////////////////////////////////////
+// Poke a scalar object into the SIMD array
+//////////////////////////////////////////////////////
+// Store the scalar object s at global coordinate site of lattice l.
+// s is first broadcast from the grid's boss rank; then only the rank that
+// GlobalCoorToRankIndex reports for the site modifies its local data.
+// Asserts that the site's checkerboard matches l and that Nsimd scalar
+// objects have the same size as one vector object.
+template<class vobj,class sobj> 
+void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
+
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  GridBase *grid=l.Grid();
+  int Nsimd = grid->Nsimd();
+
+  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
+  assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+
+  // Locate the site: owning rank, outer site index and inner (lane) index.
+  int owner,outer,lane;
+  grid->GlobalCoorToRankIndex(owner,outer,lane,site);
+  // Optional to broadcast from node 0.
+  grid->Broadcast(grid->BossRank(),s);
+
+  // extract-modify-merge cycle is easiest way and this is not perf critical
+  ExtractBuffer<sobj> lanes(Nsimd);
+  auto l_v = l.View();
+  const bool is_mine = ( owner == grid->ThisRank() );
+  if ( is_mine ) {
+    extract(l_v[outer],lanes);
+    lanes[lane] = s;
+    merge(l_v[outer],lanes);
+  }
+}
+
+
+//////////////////////////////////////////////////////////
+// Peek a scalar object from the SIMD array
+//////////////////////////////////////////////////////////
+// Read the scalar object at global coordinate site of lattice l into s.
+// Every rank extracts from its own local data at the computed indices, then
+// the value held by the rank reported for the site is broadcast into s.
+// Asserts that the site's checkerboard matches l.
+template<class vobj,class sobj>
+void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
+
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  GridBase *grid=l.Grid();
+  int Nsimd = grid->Nsimd();
+
+  assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
+
+  // Locate the site: owning rank, outer site index and inner (lane) index.
+  int owner,outer,lane;
+  grid->GlobalCoorToRankIndex(owner,outer,lane,site);
+
+  ExtractBuffer<sobj> lanes(Nsimd);
+  auto l_v = l.View();
+  extract(l_v[outer],lanes);
+  s = lanes[lane];
+
+  // Overwrite s everywhere with the owner's value.
+  grid->Broadcast(owner,s);
+}
+
+
+//////////////////////////////////////////////////////////
+// Peek a scalar object from the SIMD array
+//////////////////////////////////////////////////////////
+// Read the scalar object at coordinate site from this rank's data, with no
+// communication. The vector object at the outer site is addressed as a flat
+// array of scalar_type; word w of s is taken from entry idx + w*Nsimd.
+// Asserts that the site's checkerboard matches l and that Nsimd scalar
+// objects have the same size as one vector object.
+template<class vobj,class sobj>
+void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
+
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  GridBase *grid = l.Grid();
+  int Nsimd = grid->Nsimd();
+
+  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
+  assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+
+  static const int words=sizeof(vobj)/sizeof(vector_type);
+  const int lane  = grid->iIndex(site);
+  const int outer = grid->oIndex(site);
+
+  auto l_v = l.View();
+  scalar_type * packed = (scalar_type *)&l_v[outer];
+  scalar_type * out    = (scalar_type *)&s;
+
+  // Walk this lane's entries, one per vector word.
+  scalar_type * src = packed + lane;
+  for(int w=0;w<words;w++){
+    out[w] = src[w*Nsimd];
+  }
+}
+
+// Store the scalar object s at coordinate site in this rank's data, with no
+// communication. The vector object at the outer site is addressed as a flat
+// array of scalar_type; word w of s is written to entry idx + w*Nsimd.
+//   s    : value to store (read only)
+//   l    : lattice to modify
+//   site : coordinate passed to iIndex/oIndex to locate the lane and outer site
+// Asserts that the site's checkerboard matches l and that Nsimd scalar
+// objects have the same size as one vector object.
+template<class vobj,class sobj>
+void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
+
+  GridBase *grid=l.Grid();
+
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  int Nsimd = grid->Nsimd();
+
+  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
+  assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+
+  static const int words=sizeof(vobj)/sizeof(vector_type);
+  int odx,idx;
+  idx= grid->iIndex(site);
+  odx= grid->oIndex(site);
+
+  auto l_v = l.View();
+  scalar_type * vp = (scalar_type *)&l_v[odx];
+  // s is only read: keep the pointer const rather than casting constness away,
+  // so an accidental write through pt is rejected by the compiler.
+  const scalar_type * pt = reinterpret_cast<const scalar_type *>(&s);
+  for(int w=0;w<words;w++){
+    vp[idx+w*Nsimd] = pt[w];
+  }
+
+  return;
+}
+
+NAMESPACE_END(Grid);
+#endif
+
--- a/Grid/lattice/Lattice_reality.h
+++ b/Grid/lattice/Lattice_reality.h
@@ -0,0 +1,63 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_reality.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LATTICE_REALITY_H
+#define GRID_LATTICE_REALITY_H
+
+
+// FIXME .. this is the sector of the code 
+// I am most worried about the directions
+// The choice of burying complex in the SIMD
+// is making the use of "real" and "imag" very cumbersome
+
+NAMESPACE_BEGIN(Grid);
+
+// Site-wise adjoint: ret[ss] = adj(lhs[ss]) for every site of lhs.
+//   lhs : input lattice
+// Returns a new lattice on lhs's grid carrying lhs's checkerboard.
+template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
+  Lattice<vobj> ret(lhs.Grid());
+  // Carry the checkerboard tag over to the result (as PeekIndex does);
+  // previously the result kept whatever the constructor assigned.
+  ret.Checkerboard() = lhs.Checkerboard();
+  auto lhs_v = lhs.View();
+  auto ret_v = ret.View();
+  accelerator_loop( ss, lhs_v, {
+    ret_v[ss] = adj(lhs_v[ss]);
+  });
+  return ret;
+}
+
+// Site-wise complex conjugate: ret[ss] = conjugate(lhs[ss]) for every site of lhs.
+//   lhs : input lattice
+// Returns a new lattice on lhs's grid carrying lhs's checkerboard.
+template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
+  Lattice<vobj> ret(lhs.Grid());
+  // Carry the checkerboard tag over to the result (as PeekIndex does);
+  // previously the result kept whatever the constructor assigned.
+  ret.Checkerboard() = lhs.Checkerboard();
+  auto lhs_v = lhs.View();
+  auto ret_v = ret.View();
+  accelerator_loop( ss, lhs_v, {
+    ret_v[ss] = conjugate(lhs_v[ss]);
+  });
+  return ret;
+}
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -0,0 +1,738 @@
+/*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid 
+    Source file: ./lib/lattice/Lattice_reduction.h
+    Copyright (C) 2015
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#pragma once
+
+#include <Grid/Grid_Eigen_Dense.h>
+
+NAMESPACE_BEGIN(Grid);
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Deterministic Reduction operations
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){ // squared norm of arg
+  ComplexD self_product = innerProduct(arg,arg); // inner product of arg with itself
+  return real(self_product);                     // only its real part is returned
+}
+
+// Double precision inner product of two lattices, summed over all sites and nodes.
+template<class vobj>
+inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
+{
+  typedef typename vobj::vector_typeD vector_type;
+  // The site products are accumulated in vector_typeD, so the scalar that receives
+  // the SIMD reduction is a ComplexD: holding it in vobj::scalar_type would round a
+  // single precision field's result to single precision before the sum over nodes.
+  GridBase *grid = left.Grid();
+
+  Vector<vector_type> sumarray(grid->SumArraySize()); // one partial sum per work slot
+
+  auto left_v = left.View();
+  auto right_v=right.View();
+
+  thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
+    int mywork, myoff;
+    GridThread::GetWork(left.Grid()->oSites(),thr,mywork,myoff); // slot thr sums sites [myoff,myoff+mywork)
+
+    decltype(innerProductD(left_v[0],right_v[0])) vnrm=Zero(); // private to thread; sub summation
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+      vnrm = vnrm + innerProductD(left_v[ss],right_v[ss]);
+    }
+    sumarray[thr]=TensorRemove(vnrm) ;
+  });
+
+  vector_type vvnrm; vvnrm=Zero();  // sum across threads
+  for(int i=0;i<grid->SumArraySize();i++){
+    vvnrm = vvnrm+sumarray[i];
+  } 
+  ComplexD nrm = Reduce(vvnrm);// sum across simd, kept in double precision
+  right.Grid()->GlobalSum(nrm); // sum over nodes
+  return nrm;
+}
+
+/////////////////////////
+// Fast axpby_norm
+// z = a x + b y
+// return norm z
+/////////////////////////
+template<class sobj,class vobj> strong_inline RealD 
+axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) 
+{ // z = a x + y : forwards to axpby_norm_fast with the coefficient of y fixed to one
+  sobj unit_coeff(1.0);
+  return axpby_norm_fast(z,a,unit_coeff,x,y);
+}
+
+template<class sobj,class vobj> strong_inline RealD 
+axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) 
+{
+  // Computes z = a x + b y site by site and, in the same pass, accumulates
+  // innerProductD(z,z); the real part of that sum, summed over nodes, is returned.
+  const int pad = 8; // spacing between per-slot partial sums (presumably to keep them on separate cache lines)
+  z.Checkerboard() = x.Checkerboard();
+  conformable(z,x);
+  conformable(x,y);
+
+  RealD  nrm;
+
+  GridBase *grid = x.Grid();
+
+  Vector<RealD> sumarray(grid->SumArraySize()*pad);
+
+  auto x_v=x.View();
+  auto y_v=y.View();
+  auto z_v=z.View();
+  thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),
+  {
+    int mywork, myoff;
+    GridThread::GetWork(x.Grid()->oSites(),thr,mywork,myoff); // slot thr handles sites [myoff,myoff+mywork)
+
+    // private to thread; sub summation
+    decltype(innerProductD(z_v[0],z_v[0])) vnrm=Zero(); 
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+      vobj tmp = a*x_v[ss]+b*y_v[ss];
+      vnrm = vnrm + innerProductD(tmp,tmp);
+      vstream(z_v[ss],tmp);
+    }
+    vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
+  });
+
+  nrm = 0.0; // sum across threads; linear in thread count but fast
+  for(int i=0;i<grid->SumArraySize();i++){
+    nrm = nrm+sumarray[i*pad];
+  } 
+  z.Grid()->GlobalSum(nrm);
+  return nrm; 
+}
+
+ 
+template<class Op,class T1> // sum() of a unary lattice expression
+inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
+  ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object // scalar_object of the type the expression yields when evaluated at index 0
+{
+  return sum(closure(expr)); // forwards closure(expr) to another sum overload (presumably closure evaluates the expression into a lattice)
+}
+
+template<class Op,class T1,class T2> // sum() of a binary lattice expression
+inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
+      ->typename decltype(expr.op.func(eval(0,expr.arg1),eval(0,expr.arg2)))::scalar_object // scalar_object of the type the expression yields when evaluated at index 0
+{
+  return sum(closure(expr)); // forwards closure(expr) to another sum overload (presumably closure evaluates the expression into a lattice)
+}
+
+
+template<class Op,class T1,class T2,class T3> // sum() of a three-argument lattice expression
+inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
+  ->typename decltype(expr.op.func(eval(0,expr.arg1), // return type: scalar_object of the type the expression
+				      eval(0,expr.arg2), // yields when its three arguments are evaluated
+				      eval(0,expr.arg3)  // at index 0
+				      ))::scalar_object
+{
+  return sum(closure(expr)); // forwards closure(expr) to another sum overload (presumably closure evaluates the expression into a lattice)
+}
+
+template<class vobj>
+inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+{
+  // Sums every site of arg into one scalar object: per-slot partial sums first,
+  // then a sum over slots, over SIMD lanes and finally GlobalSum on the grid.
+  GridBase *grid=arg.Grid();
+  int Nsimd = grid->Nsimd();
+
+  Vector<vobj> sumarray(grid->SumArraySize());
+  for(int i=0;i<grid->SumArraySize();i++){
+    sumarray[i]=Zero();
+  }
+
+  auto arg_v=arg.View();
+  thread_loop( (int thr=0;thr<grid->SumArraySize();thr++),{
+    int mywork, myoff;
+    GridThread::GetWork(grid->oSites(),thr,mywork,myoff); // slot thr sums sites [myoff,myoff+mywork)
+
+    vobj vvsum=Zero(); // private to this slot
+    for(int ss=myoff;ss<mywork+myoff; ss++){
+      vvsum = vvsum + arg_v[ss];
+    }
+    sumarray[thr]=vvsum;
+  });
+
+  vobj vsum=Zero();  // sum across threads
+  for(int i=0;i<grid->SumArraySize();i++){
+    vsum = vsum+sumarray[i];
+  } 
+
+  typedef typename vobj::scalar_object sobj;
+  sobj ssum=Zero();
+  ExtractBuffer<sobj>               buf(Nsimd);
+  extract(vsum,buf); // one scalar object per SIMD lane
+  for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
+  arg.Grid()->GlobalSum(ssum);
+
+  return ssum;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim) // result[t] = sum of Data over the slice at coordinate t along orthogdim
+{
+  ///////////////////////////////////////////////////////
+  // FIXME precision promoted summation
+  // may be important for correlation functions
+  // But easily avoided by using double precision fields
+  ///////////////////////////////////////////////////////
+  typedef typename vobj::scalar_object sobj;
+  GridBase  *grid = Data.Grid();
+  assert(grid!=NULL);
+
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+
+  assert(orthogdim >= 0);
+  assert(orthogdim < Nd);
+
+  int fd=grid->_fdimensions[orthogdim]; // size of result; presumably the global extent along orthogdim
+  int ld=grid->_ldimensions[orthogdim]; // number of scalar slice sums held on this node
+  int rd=grid->_rdimensions[orthogdim]; // number of SIMD-vector slice sums held on this node
+
+  Vector<vobj> lvSum(rd); // will locally sum vectors first
+  Vector<sobj> lsSum(ld,Zero());                    // sum across these down to scalars
+  ExtractBuffer<sobj> extracted(Nsimd);                  // splitting the SIMD
+
+  result.resize(fd); // And then global sum to return the same vector to every node 
+  for(int r=0;r<rd;r++){
+    lvSum[r]=Zero();
+  }
+
+  int e1=    grid->_slice_nblock[orthogdim];
+  int e2=    grid->_slice_block [orthogdim];
+  int stride=grid->_slice_stride[orthogdim];
+
+  // sum over reduced dimension planes, breaking out orthog dir
+  // Parallel over orthog direction
+  auto Data_v=Data.View();
+  thread_loop( (int r=0;r<rd;r++), {
+
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int ss= so+n*stride+b; // site index: e1 blocks of e2 consecutive sites, stride apart
+	lvSum[r]=lvSum[r]+Data_v[ss]; // each r is written by one iteration only
+      }
+    }
+  });
+
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  Coordinate icoor(Nd);
+
+  for(int rt=0;rt<rd;rt++){
+
+    extract(lvSum[rt],extracted); // one scalar object per SIMD lane
+
+    for(int idx=0;idx<Nsimd;idx++){
+
+      grid->iCoorFromIindex(icoor,idx); // coordinate of lane idx within the SIMD layout
+
+      int ldx =rt+icoor[orthogdim]*rd; // local slice index this lane belongs to
+
+      lsSum[ldx]=lsSum[ldx]+extracted[idx];
+
+    }
+  }
+  
+  // sum over nodes.
+  sobj gsum;
+  for(int t=0;t<fd;t++){
+    int pt = t/ld; // processor plane
+    int lt = t%ld; // slice index within that plane
+    if ( pt == grid->_processor_coor[orthogdim] ) {
+      gsum=lsSum[lt]; // this node holds slice t
+    } else {
+      gsum=Zero(); // other nodes contribute nothing for slice t
+    }
+
+    grid->GlobalSum(gsum); // one GlobalSum call per slice
+
+    result[t]=gsum;
+  }
+}
+
+template<class vobj>
+static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
+{
+  // Two-stage slice inner product along orthogdim: the node-local slice sums
+  // are computed first and then combined across nodes into result.
+  typedef typename vobj::scalar_type scalar_type;
+  std::vector<scalar_type> local_slice_sums;
+  localSliceInnerProductVector(result, lhs, rhs, local_slice_sums, orthogdim);
+  globalSliceInnerProductVector(result, lhs, local_slice_sums, orthogdim);
+  // result has been sized and filled by the two helpers above
+}
+
+template <class vobj>
+static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
+{
+  // std::cout << GridLogMessage << "Start prep" << std::endl;
+  typedef typename vobj::vector_type   vector_type;
+  typedef typename vobj::scalar_type   scalar_type;
+  GridBase  *grid = lhs.Grid();
+  assert(grid!=NULL);
+  conformable(grid,rhs.Grid());
+  // Node-local part of the slice inner product: lsSum[l] receives the sum of
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+  // innerProduct(lhs,rhs) over local slice l along orthogdim; result is only resized.
+  assert(orthogdim >= 0);
+  assert(orthogdim < Nd);
+
+  int fd=grid->_fdimensions[orthogdim];
+  int ld=grid->_ldimensions[orthogdim];
+  int rd=grid->_rdimensions[orthogdim];
+  // std::cout << GridLogMessage << "Start alloc" << std::endl;
+
+  Vector<vector_type> lvSum(rd); // will locally sum vectors first
+  lsSum.assign(ld,scalar_type(0.0));  // assign, not resize: stale entries of a reused lsSum must be zeroed too
+  ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);   // splitting the SIMD  
+  // std::cout << GridLogMessage << "End alloc" << std::endl;
+
+  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
+  for(int r=0;r<rd;r++){
+    lvSum[r]=Zero();
+  }
+
+  int e1=    grid->_slice_nblock[orthogdim];
+  int e2=    grid->_slice_block [orthogdim];
+  int stride=grid->_slice_stride[orthogdim];
+  // std::cout << GridLogMessage << "End prep" << std::endl;
+  // std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl;
+  // The per-site temporary vv lives inside the loop body: one vv shared by all iterations could be overwritten by another thread.
+  auto l_v=lhs.View();
+  auto r_v=rhs.View();
+  thread_loop( (int r=0;r<rd;r++),{
+
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+        int ss = so + n * stride + b;
+        vector_type vv = TensorRemove(innerProduct(l_v[ss], r_v[ss]));
+        lvSum[r] = lvSum[r] + vv;
+      }
+    }
+  });
+  // std::cout << GridLogMessage << "End parallel inner product" << std::endl;
+
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  Coordinate icoor(Nd);
+  for(int rt=0;rt<rd;rt++){
+
+    iScalar<vector_type> temp; 
+    temp._internal = lvSum[rt];
+    extract(temp,extracted);
+
+    for(int idx=0;idx<Nsimd;idx++){
+
+      grid->iCoorFromIindex(icoor,idx);
+
+      int ldx =rt+icoor[orthogdim]*rd; // local slice index this lane belongs to
+
+      lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
+
+    }
+  }
+  // std::cout << GridLogMessage << "End sum over simd lanes" << std::endl;
+}
+template <class vobj>
+static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
+{
+  // Combines the node-local slice sums lsSum across nodes and stores them in
+  // result, which must already hold one entry per slice to be returned.
+  typedef typename vobj::scalar_type scalar_type;
+  GridBase *grid = lhs.Grid();
+  int fd = result.size();
+  int ld = lsSum.size();
+  // sum over nodes.
+  std::vector<scalar_type> gsum;
+  gsum.resize(fd, scalar_type(0.0));
+  for(int t=0;t<fd;t++){
+    int pt = t/ld; // processor plane
+    int lt = t%ld;
+    if ( pt == grid->_processor_coor[orthogdim] ) {
+      gsum[t]=lsSum[lt]; // only the plane held by this node contributes; other entries stay zero
+    }
+  }
+  grid->GlobalSumVector(&gsum[0], fd);
+
+  // Copy element by element so that a scalar_type other than ComplexD is
+  // converted; assigning the whole vector only compiles when the types match.
+  for(int t=0;t<fd;t++) result[t]=gsum[t];
+}
+template<class vobj> // result[t] = inner product of lhs and rhs restricted to the slice at coordinate t along orthogdim
+static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
+{
+  typedef typename vobj::vector_type   vector_type;
+  typedef typename vobj::scalar_type   scalar_type;
+  GridBase  *grid = lhs.Grid();
+  assert(grid!=NULL);
+  conformable(grid,rhs.Grid());
+
+  const int    Nd = grid->_ndimension;
+  const int Nsimd = grid->Nsimd();
+
+  assert(orthogdim >= 0);
+  assert(orthogdim < Nd);
+
+  int fd=grid->_fdimensions[orthogdim]; // size of result; presumably the global extent along orthogdim
+  int ld=grid->_ldimensions[orthogdim]; // number of scalar slice sums held on this node
+  int rd=grid->_rdimensions[orthogdim]; // number of SIMD-vector slice sums held on this node
+
+  Vector<vector_type> lvSum(rd); // will locally sum vectors first
+  Vector<scalar_type > lsSum(ld,scalar_type(0.0));                    // sum across these down to scalars
+  ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);   // splitting the SIMD  
+
+  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
+  for(int r=0;r<rd;r++){
+    lvSum[r]=Zero();
+  }
+
+  int e1=    grid->_slice_nblock[orthogdim];
+  int e2=    grid->_slice_block [orthogdim];
+  int stride=grid->_slice_stride[orthogdim];
+
+  auto lhv=lhs.View();
+  auto rhv=rhs.View();
+  thread_loop( (int r=0;r<rd;r++),{
+
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int ss= so+n*stride+b; // site index: e1 blocks of e2 consecutive sites, stride apart
+	vector_type vv = TensorRemove(innerProduct(lhv[ss],rhv[ss])); // per-site product, local to this iteration
+	lvSum[r]=lvSum[r]+vv; // each r is written by one iteration only
+      }
+    }
+  });
+
+  // Sum across simd lanes in the plane, breaking out orthog dir.
+  Coordinate icoor(Nd);
+  for(int rt=0;rt<rd;rt++){
+
+    iScalar<vector_type> temp; // wraps the bare SIMD vector so that extract() can be applied to it
+    temp._internal = lvSum[rt];
+    extract(temp,extracted);
+
+    for(int idx=0;idx<Nsimd;idx++){
+
+      grid->iCoorFromIindex(icoor,idx); // coordinate of lane idx within the SIMD layout
+
+      int ldx =rt+icoor[orthogdim]*rd; // local slice index this lane belongs to
+
+      lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
+
+    }
+  }
+  
+  // sum over nodes.
+  scalar_type gsum;
+  for(int t=0;t<fd;t++){
+    int pt = t/ld; // processor plane
+    int lt = t%ld; // slice index within that plane
+    if ( pt == grid->_processor_coor[orthogdim] ) {
+      gsum=lsSum[lt]; // this node holds slice t
+    } else {
+      gsum=scalar_type(0.0); // other nodes contribute nothing for slice t
+    }
+
+    grid->GlobalSum(gsum); // one GlobalSum call per slice
+
+    result[t]=gsum;
+  }
+}
+template<class vobj>
+static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog) 
+{
+  // Fills sn with one value per global coordinate along Orthog: the real part
+  // of the slice inner product of rhs with itself.
+  // ip is a std::vector because sliceInnerProductVector takes a
+  // std::vector<ComplexD>&; a Vector<ComplexD> cannot bind to that reference.
+  int Nblock = rhs.Grid()->GlobalDimensions()[Orthog];
+  std::vector<ComplexD> ip(Nblock);
+  sn.resize(Nblock);
+
+  sliceInnerProductVector(ip,rhs,rhs,Orthog);
+  for(int ss=0;ss<Nblock;ss++){
+    sn[ss] = real(ip[ss]);
+  }
+};
+
+
+template<class vobj> // R = (scale*a[slice]) * X + Y, with a separate coefficient for every slice along orthogdim
+static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
+			    int orthogdim,RealD scale=1.0) 
+{    
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::tensor_reduced tensor_reduced;
+  
+  scalar_type zscale(scale);
+
+  GridBase *grid  = X.Grid();
+
+  int Nsimd  =grid->Nsimd();
+  int Nblock =grid->GlobalDimensions()[orthogdim]; // not used below
+
+  int fd     =grid->_fdimensions[orthogdim]; // not used below
+  int ld     =grid->_ldimensions[orthogdim]; // not used below
+  int rd     =grid->_rdimensions[orthogdim];
+
+  int e1     =grid->_slice_nblock[orthogdim];
+  int e2     =grid->_slice_block [orthogdim];
+  int stride =grid->_slice_stride[orthogdim];
+
+  Coordinate icoor; // filled by iCoorFromIindex below
+  for(int r=0;r<rd;r++){
+
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+
+    vector_type    av; // SIMD vector holding one coefficient per lane
+
+    for(int l=0;l<Nsimd;l++){
+      grid->iCoorFromIindex(icoor,l);
+      int ldx =r+icoor[orthogdim]*rd; // slice index of lane l; NOTE(review): a is indexed by this node-local index - confirm against callers
+      scalar_type *as =(scalar_type *)&av; // av is written lane by lane through a scalar pointer
+      as[l] = scalar_type(a[ldx])*zscale;
+    }
+
+    tensor_reduced at; at=av;
+
+    auto Rv=R.View(); // views are re-created for every r
+    auto Xv=X.View();
+    auto Yv=Y.View();
+    thread_loop_collapse2( (int n=0;n<e1;n++) , {
+      for(int b=0;b<e2;b++){
+	int ss= so+n*stride+b; // site index: e1 blocks of e2 consecutive sites, stride apart
+	Rv[ss] = at*Xv[ss]+Yv[ss];
+      }
+    });
+  }
+};
+
+/*
+inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
+{
+  int NN    = BlockSolverGrid->_ndimension;
+  int nsimd = BlockSolverGrid->Nsimd();
+  
+  std::vector<int> latt_phys(0);
+  std::vector<int> simd_phys(0);
+  std::vector<int>  mpi_phys(0);
+  
+  for(int d=0;d<NN;d++){
+    if( d!=Orthog ) { 
+      latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
+      simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
+      mpi_phys.push_back(BlockSolverGrid->_processors[d]);
+    }
+  }
+  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
+}
+*/
+
+template<class vobj> // for every slice i along Orthog: R_i = Y_i + sum_j X_j * (scale*aa(j,i))
+static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
+{    
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  int Nblock = X.Grid()->GlobalDimensions()[Orthog]; // number of slices; also the range of both indices used on aa
+
+  GridBase *FullGrid  = X.Grid();
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+
+  //  Lattice<vobj> Xslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
+
+  assert( FullGrid->_simd_layout[Orthog]==1); // the Orthog direction must not be split across SIMD lanes
+  int nh =  FullGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  int nl = nh-1; // nh and nl are not used below
+
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+  thread_region
+  {
+    Vector<vobj> s_x(Nblock); // declared inside thread_region, outside the loop: scratch copy of X along Orthog
+
+    thread_loop_collapse_in_region(2 ,(int n=0;n<nblock;n++), {
+     for(int b=0;b<block;b++){
+      int o  = n*stride + b; // site offset within the plane orthogonal to Orthog
+
+      for(int i=0;i<Nblock;i++){
+	s_x[i] = X[o+i*ostride]; // NOTE(review): indexes Nblock (global extent) slices from this offset - confirm Orthog is not split across nodes
+      }
+
+      vobj dot;
+      for(int i=0;i<Nblock;i++){
+	dot = Y[o+i*ostride]; // start from Y, then add the matrix-weighted combination of X
+	for(int j=0;j<Nblock;j++){
+	  dot = dot + s_x[j]*(scale*aa(j,i));
+	}
+	R[o+i*ostride]=dot;
+      }
+    }});
+  }
+};
+
+template<class vobj> // for every slice i along Orthog: R_i = sum_j X_j * (scale*aa(j,i))
+static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
+{    
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+
+  int Nblock = X.Grid()->GlobalDimensions()[Orthog]; // number of slices; also the range of both indices used on aa
+
+  GridBase *FullGrid  = X.Grid();
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  Lattice<vobj> Xslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
+
+  assert( FullGrid->_simd_layout[Orthog]==1); // the Orthog direction must not be split across SIMD lanes
+  int nh =  FullGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  int nl=1; // nh and nl are not used below
+
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+  thread_region
+  {
+    std::vector<vobj> s_x(Nblock); // declared inside thread_region, outside the loop: scratch copy of X along Orthog
+
+
+    thread_loop_collapse_in_region( 2 , (int n=0;n<nblock;n++),{
+    for(int b=0;b<block;b++){
+      int o  = n*stride + b; // site offset within the plane orthogonal to Orthog
+
+      for(int i=0;i<Nblock;i++){
+	s_x[i] = X[o+i*ostride]; // NOTE(review): indexes Nblock (global extent) slices from this offset - confirm Orthog is not split across nodes
+      }
+
+      vobj dot;
+      for(int i=0;i<Nblock;i++){
+	dot = s_x[0]*(scale*aa(0,i)); // the j=0 term initialises the sum
+	for(int j=1;j<Nblock;j++){
+	  dot = dot + s_x[j]*(scale*aa(j,i));
+	}
+	R[o+i*ostride]=dot;
+      }
+    }});
+  }
+};
+
+
+template<class vobj> // mat(i,j) = inner product of slice i of lhs with slice j of rhs, slices taken along Orthog
+static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
+{
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_type scalar_type;
+  typedef typename vobj::vector_type vector_type;
+  
+  GridBase *FullGrid  = lhs.Grid();
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  
+  int Nblock = FullGrid->GlobalDimensions()[Orthog]; // number of slices; mat is Nblock x Nblock
+  
+  //  Lattice<vobj> Lslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
+  
+  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); // any previous content of mat is discarded
+
+  assert( FullGrid->_simd_layout[Orthog]==1); // the Orthog direction must not be split across SIMD lanes
+  int nh =  FullGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  int nl = nh-1; // nh and nl are not used below
+
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+
+  typedef typename vobj::vector_typeD vector_typeD; // not used below
+
+  thread_region
+  {
+    std::vector<vobj> Left(Nblock); // scratch copies of lhs and rhs along Orthog, declared inside thread_region
+    std::vector<vobj> Right(Nblock);
+    Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock); // partial result, merged into mat below
+
+    thread_loop_collapse_in_region( 2, (int n=0;n<nblock;n++),{
+    for(int b=0;b<block;b++){
+
+      int o  = n*stride + b; // site offset within the plane orthogonal to Orthog
+
+      for(int i=0;i<Nblock;i++){
+	Left [i] = lhs[o+i*ostride]; // NOTE(review): indexes Nblock (global extent) slices from this offset - confirm Orthog is not split across nodes
+	Right[i] = rhs[o+i*ostride];
+      }
+
+      for(int i=0;i<Nblock;i++){
+      for(int j=0;j<Nblock;j++){
+	auto tmp = innerProduct(Left[i],Right[j]);
+	auto rtmp = TensorRemove(tmp);
+	mat_thread(i,j) += Reduce(rtmp); // Reduce sums over the SIMD lanes
+      }}
+    }});
+    thread_critical
+    {
+      mat += mat_thread; // partial matrices are added to mat inside thread_critical
+    }  
+  }
+
+  for(int i=0;i<Nblock;i++){
+  for(int j=0;j<Nblock;j++){
+    ComplexD sum = mat(i,j);
+    FullGrid->GlobalSum(sum); // one GlobalSum call per matrix element
+    mat(i,j)=sum;
+  }}
+
+  return;
+}
+
+NAMESPACE_END(Grid);
+
+
+
+
--- a/Grid/lattice/Lattice_rng.h
+++ b/Grid/lattice/Lattice_rng.h
@@ -0,0 +1,518 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_rng.h
+
+    Copyright (C) 2015
+
+    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+    Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LATTICE_RNG_H
+#define GRID_LATTICE_RNG_H
+
+#include <random>
+
+#ifdef RNG_SITMO
+#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
+#endif 
+
+#if defined(RNG_SITMO)
+#define RNG_FAST_DISCARD
+#else 
+#undef  RNG_FAST_DISCARD
+#endif
+
+NAMESPACE_BEGIN(Grid);
+
+//////////////////////////////////////////////////////////////
+// Allow the RNG state to be less dense than the fine grid
+//////////////////////////////////////////////////////////////
+inline int RNGfillable(GridBase *coarse,GridBase *fine)
+{
+  // Returns how many sites of the fine grid correspond to each site of the
+  // coarse (RNG) grid, after asserting that the two layouts are compatible.
+  const int coarse_nd  = coarse->_ndimension;
+  const int extra_dims = fine->_ndimension - coarse_nd; // leading dimensions present only in fine
+  assert(extra_dims >= 0);
+
+  // the extra leading dimensions must be neither SIMD-split nor processor-split
+  for(int d=0;d<extra_dims;d++){
+    assert(fine->_simd_layout[d]==1);
+    assert(fine->_processors[d]==1);
+  }
+
+  int ratio=1;
+  for(int d=0;d<extra_dims;d++){
+    ratio *= fine->_rdimensions[d];
+  }
+  // shared dimensions: same processor and SIMD layout, and the coarse extent divides the fine one
+  for(int d=0;d<coarse_nd;d++){
+    const int f = d+extra_dims; // matching dimension index in the fine grid
+    assert(coarse->_processors[d]  == fine->_processors[f]);
+    assert(coarse->_simd_layout[d] == fine->_simd_layout[f]);
+    assert(fine->_rdimensions[f] % coarse->_rdimensions[d] == 0);
+    ratio = ratio * fine->_rdimensions[f] / coarse->_rdimensions[d];
+  }
+  return ratio;
+}
+
+  
+// merge of April 11 2017
+// this function is necessary for the LS vectorised field
+inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
+{
+  // Returns the number of local fine-grid sites per local coarse-grid site,
+  // after asserting that the two decompositions are compatible.
+  const int coarse_nd  = coarse->_ndimension;
+  const int extra_dims = fine->_ndimension - coarse_nd;
+  assert(extra_dims >= 0);
+
+  // The extra leading dimensions must not be split over processors, and the
+  // shared dimensions must use the same processor grid; all further divisions are local.
+  for(int d=0;d<extra_dims;d++) assert(fine->_processors[d]==1);
+  for(int d=0;d<coarse_nd;d++) assert(coarse->_processors[d] == fine->_processors[d+extra_dims]);
+
+  // The total number of SIMD lanes must agree, which means the iSites are the same.
+  assert(fine->Nsimd() == coarse->Nsimd());
+
+  // The local site counts must divide cleanly.
+  assert(fine->lSites() % coarse->lSites() == 0);
+  return fine->lSites() / coarse->lSites();
+}
+  
+// real scalars are one component
+template<class scalar,class distribution,class generator> // generic overload: one draw per scalar
+void fillScalar(scalar &s,distribution &dist,generator & gen)
+{
+  s=dist(gen); // a single draw from dist, driven by gen, is stored in s
+}
+template<class distribution,class generator> void fillScalar(ComplexF &s,distribution &dist, generator &gen)
+{
+  s.real(dist(gen)); // real part is drawn first: ComplexF(dist(gen),dist(gen)) left the order of the two draws unspecified
+  s.imag(dist(gen)); // imaginary part is drawn second
+}
+template<class distribution,class generator> void fillScalar(ComplexD &s,distribution &dist,generator &gen)
+{
+  s.real(dist(gen)); // real part is drawn first: ComplexD(dist(gen),dist(gen)) left the order of the two draws unspecified
+  s.imag(dist(gen)); // imaginary part is drawn second
+}
+  
+class GridRNGbase { // common storage and state save/restore helpers for the RNG classes
+public:
+  // One generator per site.
+  // Uniform and Gaussian distributions from these generators.
+#ifdef RNG_RANLUX
+  typedef std::ranlux48 RngEngine;
+  typedef uint64_t      RngStateType;
+  static const int RngStateCount = 15; // number of words read back from the engine's text form in GetState
+#endif 
+#ifdef RNG_MT19937 
+  typedef std::mt19937 RngEngine;
+  typedef uint32_t     RngStateType;
+  static const int     RngStateCount = std::mt19937::state_size;
+#endif
+#ifdef RNG_SITMO
+  typedef sitmo::prng_engine 	RngEngine;
+  typedef uint64_t    	RngStateType;
+  static const int    	RngStateCount = 13;
+#endif
+  // NOTE(review): RngEngine is only defined when one of the RNG_* macros above is set
+  std::vector<RngEngine>                             _generators;
+  std::vector<std::uniform_real_distribution<RealD> > _uniform;
+  std::vector<std::normal_distribution<RealD> >       _gaussian;
+  std::vector<std::discrete_distribution<int32_t> >   _bernoulli;
+  std::vector<std::uniform_int_distribution<uint32_t> > _uid;
+
+  ///////////////////////
+  // support for parallel init
+  ///////////////////////
+#ifdef RNG_FAST_DISCARD
+  static void Skip(RngEngine &eng,uint64_t site) // advances eng by site * 2^30 draws
+  {
+    /////////////////////////////////////////////////////////////////////////////////////
+    // Skip by 2^40 elements between successive lattice sites
+    // This goes by 10^12.
+    // Consider quenched updating; likely never exceeding rate of 1000 sweeps
+    // per second on any machine. This gives us of order 10^9 seconds, or 100 years
+    // skip ahead.
+    // For HMC unlikely to go at faster than a solve per second, and 
+    // tens of seconds per trajectory so this is clean in all reasonable cases,
+    // and margin of safety is orders of magnitude.
+    // We could hack Sitmo to skip in the higher order words of state if necessary
+      //
+      // Replace with 2^30 ; avoid problem on large volumes
+      // (the shift used below is 30, so the 2^40 estimate above describes the older choice)
+    /////////////////////////////////////////////////////////////////////////////////////
+    //      uint64_t skip = site+1;  //   Old init Skipped then drew.  Checked compat with faster init
+      const int shift = 30;
+
+    uint64_t skip = site;
+
+      skip = skip<<shift;
+
+      assert((skip >> shift)==site); // check for overflow
+
+    eng.discard(skip);
+    //      std::cout << " Engine  " <<site << " state " <<eng<<std::endl;
+  } 
+#endif
+  static RngEngine Reseed(RngEngine &eng) // convenience overload: uses a local seed buffer and a local distribution
+  {
+    std::vector<uint32_t> newseed;
+    std::uniform_int_distribution<uint32_t> uid;
+    return Reseed(eng,newseed,uid);
+  }
+  static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed, // newseed is overwritten with the words drawn
+			  std::uniform_int_distribution<uint32_t> &uid)
+  {
+    const int reseeds=4; // number of 32-bit words drawn from eng to seed the new engine
+      
+    newseed.resize(reseeds);
+    for(int i=0;i<reseeds;i++){
+      newseed[i] = uid(eng); // advances eng
+    }
+    std::seed_seq sseq(newseed.begin(),newseed.end());
+    return RngEngine(sseq); // a fresh engine seeded from the drawn words
+  }    
+
+  void GetState(std::vector<RngStateType> & saved,RngEngine &eng) { // serialises eng into RngStateCount words
+    saved.resize(RngStateCount);
+    std::stringstream ss;
+    ss<<eng; // the engine's stream output is used as the serialised form
+    ss.seekg(0,ss.beg);
+    for(int i=0;i<RngStateCount;i++){
+      ss>>saved[i];
+    }
+  }
+  void GetState(std::vector<RngStateType> & saved,int gen) { // same, for the engine stored at index gen
+    GetState(saved,_generators[gen]);
+  }
+  void SetState(std::vector<RngStateType> & saved,RngEngine &eng){ // inverse of GetState: rebuilds eng from the saved words
+    assert(saved.size()==RngStateCount);
+    std::stringstream ss;
+    for(int i=0;i<RngStateCount;i++){
+      ss<< saved[i]<<" "; // words are written space separated and read back by the engine's stream input
+    }
+    ss.seekg(0,ss.beg);
+    ss>>eng;
+  }
+  void SetState(std::vector<RngStateType> & saved,int gen){ // same, for the engine stored at index gen
+    SetState(saved,_generators[gen]);
+  }
+  void SetEngine(RngEngine &Eng, int gen){ // copies Eng into slot gen
+    _generators[gen]=Eng;
+  }
+  void GetEngine(RngEngine &Eng, int gen){ // copies slot gen into Eng
+    Eng=_generators[gen];
+  }
+  template<class source> void Seed(source &src, int gen) // replaces slot gen by an engine constructed from src
+  {
+    _generators[gen] = RngEngine(src);
+  }    
+};
+
+// RNG with a single engine (slot 0) and a single instance of each distribution.
+// Every fill() overload draws from slot 0 and then hands the filled object to
+// CartesianCommunicator::BroadcastWorld(0,...).
+class GridSerialRNG : public GridRNGbase {
+public:
+
+  // Give every per-generator container exactly one entry.
+  GridSerialRNG() : GridRNGbase() {
+    _generators.resize(1);
+    _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
+    _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
+    _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
+    _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
+  }
+
+  // Generic fill: treats l as sizeof(sobj)/sizeof(scalar_type) consecutive
+  // scalar_type words and draws each of them in turn from dist[0].
+  template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){
+
+    typedef typename sobj::scalar_type scalar_type;
+
+    int words = sizeof(sobj)/sizeof(scalar_type);
+
+    scalar_type *buf = (scalar_type *) & l;
+
+    dist[0].reset();  // reset the distribution before this batch of draws
+    for(int idx=0;idx<words;idx++){
+      fillScalar(buf[idx],dist[0],_generators[0]);
+    }
+
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+
+  }
+
+  // Scalar fills: one draw into l, then broadcast.
+  template <class distribution>  inline void fill(ComplexF &l,std::vector<distribution> &dist){
+    dist[0].reset();
+    fillScalar(l,dist[0],_generators[0]);
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+  }
+  template <class distribution>  inline void fill(ComplexD &l,std::vector<distribution> &dist){
+    dist[0].reset();
+    fillScalar(l,dist[0],_generators[0]);
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+  }
+  template <class distribution>  inline void fill(RealF &l,std::vector<distribution> &dist){
+    dist[0].reset();
+    fillScalar(l,dist[0],_generators[0]);
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+  }
+  template <class distribution>  inline void fill(RealD &l,std::vector<distribution> &dist){
+    dist[0].reset();
+    fillScalar(l,dist[0],_generators[0]);
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+  }
+
+  // Vector fills: the SIMD object is addressed as an array of real words.
+  // Complex types are filled with 2*Nsimd() reals, real types with Nsimd().
+  template <class distribution>  inline void fill(vComplexF &l,std::vector<distribution> &dist){
+    RealF *pointer=(RealF *)&l;
+    dist[0].reset();
+    for(int i=0;i<2*vComplexF::Nsimd();i++){
+      fillScalar(pointer[i],dist[0],_generators[0]);
+    }
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+  }
+  template <class distribution>  inline void fill(vComplexD &l,std::vector<distribution> &dist){
+    RealD *pointer=(RealD *)&l;
+    dist[0].reset();
+    for(int i=0;i<2*vComplexD::Nsimd();i++){
+      fillScalar(pointer[i],dist[0],_generators[0]);
+    }
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+  }
+  template <class distribution>  inline void fill(vRealF &l,std::vector<distribution> &dist){
+    RealF *pointer=(RealF *)&l;
+    dist[0].reset();
+    for(int i=0;i<vRealF::Nsimd();i++){
+      fillScalar(pointer[i],dist[0],_generators[0]);
+    }
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+  }
+  template <class distribution>  inline void fill(vRealD &l,std::vector<distribution> &dist){
+    RealD *pointer=(RealD *)&l;
+    dist[0].reset();
+    for(int i=0;i<vRealD::Nsimd();i++){
+      fillScalar(pointer[i],dist[0],_generators[0]);
+    }
+    CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
+  }
+
+  // Seed generator 0 from a list of integers. The seed buffer itself is passed
+  // through BroadcastWorld(0,...) first, so the const is cast away on purpose.
+  // data() is used instead of &seeds[0] because indexing an empty vector is
+  // undefined behaviour.
+  void SeedFixedIntegers(const std::vector<int> &seeds){
+    CartesianCommunicator::BroadcastWorld(0,(void *)seeds.data(),sizeof(int)*seeds.size());
+    std::seed_seq src(seeds.begin(),seeds.end());
+    Seed(src,0);
+  }
+
+  // Derive integer seeds from the SHA256 of s, log them in hex, then seed.
+  void SeedUniqueString(const std::string &s){
+    std::vector<int> seeds;
+    std::stringstream sha;
+    seeds = GridChecksum::sha256_seeds(s);
+    for(std::size_t i=0;i<seeds.size();i++) {
+      sha << std::hex << seeds[i];
+    }
+    std::cout << GridLogMessage << "Intialising serial RNG with unique string '"
+              << s << "'" << std::endl;
+    std::cout << GridLogMessage << "Seed SHA256: " << sha.str() << std::endl;
+    SeedFixedIntegers(seeds);
+  }
+};
+
+// Per-site RNG: one engine and one instance of each distribution for every
+// site held locally (iSites*oSites of the grid passed to the constructor).
+class GridParallelRNG : public GridRNGbase {
+private:
+  double _time_counter;  // accumulated usecond() spent in fill(); printed by Report()
+  GridBase *_grid;
+  unsigned int _vol;     // number of local generators = iSites*oSites
+
+public:
+  GridBase *Grid(void) const { return _grid; }
+
+  // Map (outer site index, inner index) to a slot in the generator and
+  // distribution arrays.
+  int generator_idx(int os,int is) {
+    return is*_grid->oSites()+os;
+  }
+
+  // _time_counter is zero-initialised here: fill() accumulates into it with +=
+  // and Report() prints it, so it must not start out indeterminate.
+  GridParallelRNG(GridBase *grid) : GridRNGbase(), _time_counter(0.0) {
+    _grid = grid;
+    _vol  =_grid->iSites()*_grid->oSites();
+
+    _generators.resize(_vol);
+    _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
+    _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
+    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
+    _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
+  }
+
+  // Fill every site of l. For each outer site of the RNG grid and each of the
+  // Nsimd inner indices, the generator at generator_idx(ss,si) supplies all
+  // scalar words of one scalar_object; the Nsimd objects are then merged into
+  // one vector site of l. Elapsed time is added to _time_counter.
+  template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
+
+    typedef typename vobj::scalar_object scalar_object;
+    typedef typename vobj::scalar_type scalar_type;
+    typedef typename vobj::vector_type vector_type;
+
+    double inner_time_counter = usecond();
+
+    int multiplicity = RNGfillable_general(_grid, l.Grid()); // l has finer or same grid
+    int Nsimd  = _grid->Nsimd();  // guaranteed to be the same for l.Grid() too
+    int osites = _grid->oSites();  // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
+    int words  = sizeof(scalar_object) / sizeof(scalar_type);
+
+    auto l_v = l.View();
+    // Iteration ss only uses generators with index si*oSites+ss, so distinct
+    // iterations never touch the same generator or distribution.
+    thread_loop( (int ss=0;ss<osites;ss++), {
+      ExtractBuffer<scalar_object> buf(Nsimd);
+      for (int m = 0; m < multiplicity; m++) {  // Draw from same generator multiplicity times
+
+        int sm = multiplicity * ss + m;  // Maps the generator site to the fine site
+
+        for (int si = 0; si < Nsimd; si++) {
+
+          int gdx = generator_idx(ss, si);  // index of generator state
+          scalar_type *pointer = (scalar_type *)&buf[si];
+          dist[gdx].reset();
+          for (int idx = 0; idx < words; idx++)
+            fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
+        }
+        // merge into SIMD lanes, FIXME suboptimal implementation
+        merge(l_v[sm], buf);
+      }
+      });
+
+    _time_counter += usecond()- inner_time_counter;
+  }
+
+  // Derive integer seeds from the SHA256 of s, log them, then seed.
+  void SeedUniqueString(const std::string &s){
+    std::vector<int> seeds;
+    seeds = GridChecksum::sha256_seeds(s);
+    std::cout << GridLogMessage << "Intialising parallel RNG with unique string '"
+              << s << "'" << std::endl;
+    std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
+    SeedFixedIntegers(seeds);
+  }
+
+  // Seed every local generator, starting from one master engine built from seeds.
+  void SeedFixedIntegers(const std::vector<int> &seeds){
+
+    // Everyone generates the same seed_seq based on input seeds.
+    // The const is cast away on purpose so the buffer can be broadcast into.
+    // data() is used instead of &seeds[0] because indexing an empty vector is
+    // undefined behaviour.
+    CartesianCommunicator::BroadcastWorld(0,(void *)seeds.data(),sizeof(int)*seeds.size());
+
+    std::seed_seq source(seeds.begin(),seeds.end());
+
+    RngEngine master_engine(source);
+
+#ifdef RNG_FAST_DISCARD
+    ////////////////////////////////////////////////
+    // Skip ahead through a single stream.
+    // Applicable to SITMO and other hash based/crypto RNGs
+    // Should be applicable to Mersenne Twister, but the C++11
+    // MT implementation does not implement fast discard even though
+    // in principle this is possible
+    ////////////////////////////////////////////////
+
+    // Everybody loops over global volume.
+    thread_loop( (int gidx=0;gidx<_grid->_gsites;gidx++) , {
+        // Where is it?
+        int rank;
+        int o_idx;
+        int i_idx;
+
+        Coordinate gcoor;
+        _grid->GlobalIndexToGlobalCoor(gidx,gcoor);
+        _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
+
+        // If this is one of mine we take it
+        if( rank == _grid->ThisRank() ){
+          int l_idx=generator_idx(o_idx,i_idx);
+          _generators[l_idx] = master_engine;
+          Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
+        }
+    });
+#else
+    ////////////////////////////////////////////////////////////////
+    // Machine and thread decomposition dependent seeding is efficient
+    // and maximally parallel; but NOT reproducible from machine to machine.
+    // Not ideal, but fastest way to reseed all nodes.
+    ////////////////////////////////////////////////////////////////
+    {
+      // Obtain one Reseed per processor
+      int Nproc = _grid->ProcessorCount();
+      std::vector<RngEngine> seeders(Nproc);
+      int me= _grid->ThisRank();
+      for(int p=0;p<Nproc;p++){
+        seeders[p] = Reseed(master_engine);
+      }
+      master_engine = seeders[me];
+    }
+
+    {
+      // Obtain one reseeded generator per thread
+      int Nthread = GridThread::GetThreads();
+      std::vector<RngEngine> seeders(Nthread);
+      for(int t=0;t<Nthread;t++){
+        seeders[t] = Reseed(master_engine);
+      }
+
+      thread_loop( (int t=0;t<Nthread;t++), {
+        // set up one per local site in threaded fashion
+        std::vector<uint32_t> newseeds;
+        std::uniform_int_distribution<uint32_t> uid;
+        for(int l=0;l<_grid->lSites();l++) {
+          if ( (l%Nthread)==t ) {
+            _generators[l] = Reseed(seeders[t],newseeds,uid);
+          }
+        }
+      });
+    }
+#endif
+  }
+
+  // Print the time accumulated in fill(), converted from microseconds to ms.
+  void Report(){
+    std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl;
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////
+  // Support for rigorous test of RNG's
+  // Return uniform random uint32_t from requested site generator
+  ////////////////////////////////////////////////////////////////////////
+  uint32_t GlobalU01(int gsite){
+
+    // Zero-initialised so ranks that do not own the site never hand an
+    // indeterminate value to Broadcast.
+    uint32_t the_number = 0;
+    // who
+    int rank,o_idx,i_idx;
+    Coordinate gcoor;
+    _grid->GlobalIndexToGlobalCoor(gsite,gcoor);
+    _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
+
+    // draw
+    int l_idx=generator_idx(o_idx,i_idx);
+    if( rank == _grid->ThisRank() ){
+      the_number = _uid[l_idx](_generators[l_idx]);
+    }
+
+    // share & return
+    _grid->Broadcast(rank,(void *)&the_number,sizeof(the_number));
+    return the_number;
+  }
+
+};
+
+// Fill l from the per-site uniform distributions of rng (constructed over {0,1}).
+template <class vobj>
+inline void random(GridParallelRNG &rng,Lattice<vobj> &l)
+{
+  rng.fill(l,rng._uniform);
+}
+// Fill l from the per-site normal distributions of rng (mean 0.0, stddev 1.0).
+template <class vobj>
+inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l)
+{
+  rng.fill(l,rng._gaussian);
+}
+// Fill l from the per-site discrete distributions of rng (two outcomes, weights {1,1}).
+template <class vobj>
+inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l)
+{
+  rng.fill(l,rng._bernoulli);
+}
+
+// Fill l from the serial RNG's uniform distribution (constructed over {0,1}).
+template <class sobj>
+inline void random(GridSerialRNG &rng,sobj &l)
+{
+  rng.fill(l,rng._uniform  );
+}
+// Fill l from the serial RNG's normal distribution (mean 0.0, stddev 1.0).
+template <class sobj>
+inline void gaussian(GridSerialRNG &rng,sobj &l)
+{
+  rng.fill(l,rng._gaussian );
+}
+// Fill l from the serial RNG's discrete distribution (two outcomes, weights {1,1}).
+template <class sobj>
+inline void bernoulli(GridSerialRNG &rng,sobj &l)
+{
+  rng.fill(l,rng._bernoulli);
+}
+
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/lattice/Lattice_trace.h
+++ b/Grid/lattice/Lattice_trace.h
@@ -0,0 +1,69 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_trace.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LATTICE_TRACE_H
+#define GRID_LATTICE_TRACE_H
+
+///////////////////////////////////////////////
+// Tracing, transposing, peeking, poking
+///////////////////////////////////////////////
+
+NAMESPACE_BEGIN(Grid);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Trace
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Site-by-site trace: each element of the result is trace() applied to the
+// corresponding element of lhs. The result is built on lhs's grid.
+template<class vobj>
+inline auto trace(const Lattice<vobj> &lhs)  -> Lattice<decltype(trace(vobj()))>
+{
+  typedef decltype(trace(vobj())) traced_t;
+  Lattice<traced_t> result(lhs.Grid());
+  auto out_v = result.View();
+  auto in_v  = lhs.View();
+  accelerator_loop( site, in_v, {
+    out_v[site] = trace(in_v[site]);
+  });
+  return result;
+}
+    
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Trace Index level dependent operation
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Site-by-site traceIndex<Index>: each element of the result is
+// traceIndex<Index>() applied to the corresponding element of lhs.
+template<int Index,class vobj>
+inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
+{
+  typedef decltype(traceIndex<Index>(vobj())) traced_t;
+  Lattice<traced_t> result(lhs.Grid());
+  auto out_v = result.View();
+  auto in_v  = lhs.View();
+  accelerator_loop( site, in_v, {
+    out_v[site] = traceIndex<Index>(in_v[site]);
+  });
+  return result;
+}
+
+NAMESPACE_END(Grid);
+#endif
+
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
--- a/Grid/lattice/Lattice_transpose.h
+++ b/Grid/lattice/Lattice_transpose.h
@@ -0,0 +1,68 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_transpose.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LATTICE_TRANSPOSE_H
+#define GRID_LATTICE_TRANSPOSE_H
+
+///////////////////////////////////////////////
+// Transpose
+///////////////////////////////////////////////
+
+NAMESPACE_BEGIN(Grid);
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Transpose
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Site-by-site transpose: each element of the result is transpose() applied
+// to the corresponding element of lhs. The result is built on lhs's grid.
+template<class vobj>
+inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
+  Lattice<vobj> result(lhs.Grid());
+  auto out_v = result.View();
+  auto in_v  = lhs.View();
+  accelerator_loop(site,in_v,{
+    out_v[site] = transpose(in_v[site]);
+  });
+  return result;
+}
+    
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Index level dependent transpose
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Site-by-site transposeIndex<Index>: each element of the result is
+// transposeIndex<Index>() applied to the corresponding element of lhs.
+template<int Index,class vobj>
+inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
+{
+  typedef decltype(transposeIndex<Index>(vobj())) transposed_t;
+  Lattice<transposed_t> result(lhs.Grid());
+  auto out_v = result.View();
+  auto in_v  = lhs.View();
+  accelerator_loop(site,in_v,{
+    out_v[site] = transposeIndex<Index>(in_v[site]);
+  });
+  return result;
+}
+
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/lattice/Lattice_unary.h
+++ b/Grid/lattice/Lattice_unary.h
@@ -0,0 +1,80 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_unary.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_LATTICE_UNARY_H
+#define GRID_LATTICE_UNARY_H
+
+NAMESPACE_BEGIN(Grid);
+
+// Element-wise power: each site of the result is pow(site of rhs_i, y).
+// NOTE(review): the checkerboard is copied between the two view objects, not
+// onto the returned lattice itself -- confirm that a view shares the lattice's
+// checkerboard, otherwise the returned lattice does not inherit it.
+template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
+  Lattice<obj> result(rhs_i.Grid());
+  auto src_v = rhs_i.View();
+  auto dst_v = result.View();
+  dst_v.Checkerboard() = src_v.Checkerboard();
+  accelerator_loop(site,src_v,{
+    dst_v[site] = pow(src_v[site],y);
+  });
+  return result;
+}
+// Element-wise modulus: each site of the result is mod(site of rhs_i, y).
+// NOTE(review): the checkerboard is copied between the two view objects, not
+// onto the returned lattice itself -- confirm that a view shares the lattice's
+// checkerboard, otherwise the returned lattice does not inherit it.
+template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
+  Lattice<obj> result(rhs_i.Grid());
+  auto src_v = rhs_i.View();
+  auto dst_v = result.View();
+  dst_v.Checkerboard() = src_v.Checkerboard();
+  accelerator_loop(site,src_v,{
+    dst_v[site] = mod(src_v[site],y);
+  });
+  return result;
+}
+
+// Element-wise integer division: each site of the result is div(site of rhs_i, y).
+// NOTE(review): the checkerboard of rhs_i is assigned onto the result's view
+// object, not onto the returned lattice itself -- confirm that a view shares
+// the lattice's checkerboard, otherwise the returned lattice does not inherit it.
+template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
+  Lattice<obj> result(rhs_i.Grid());
+  auto dst_v = result.View();
+  auto src_v = rhs_i.View();
+  dst_v.Checkerboard() = rhs_i.Checkerboard();
+  accelerator_loop(site,src_v,{
+    dst_v[site] = div(src_v[site],y);
+  });
+  return result;
+}
+
+// Element-wise exponentiation: each site of the result is
+// Exponentiate(site of rhs_i, alpha, Nexp). Nexp defaults to DEFAULT_MAT_EXP.
+// NOTE(review): the checkerboard is copied between the two view objects, not
+// onto the returned lattice itself -- confirm that a view shares the lattice's
+// checkerboard, otherwise the returned lattice does not inherit it.
+template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
+  Lattice<obj> result(rhs_i.Grid());
+  auto src_v = rhs_i.View();
+  auto dst_v = result.View();
+  dst_v.Checkerboard() = src_v.Checkerboard();
+  accelerator_loop(site,src_v,{
+    dst_v[site] = Exponentiate(src_v[site],alpha, Nexp);
+  });
+  return result;
+}
+
+NAMESPACE_END(Grid);
+#endif
--- a/Grid/lattice/Lattice_where.h
+++ b/Grid/lattice/Lattice_where.h
@@ -0,0 +1,86 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/lattice/Lattice_where.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef GRID_LATTICE_WHERE_H
+#define GRID_LATTICE_WHERE_H
+namespace Grid {
+// Must implement the predicate gating the choice between iftrue and iffalse.
+// Must be able to reduce the predicate down to a single vInteger per site.
+// Must be able to require the type be iScalar x iScalar x ....
+//                              give a GetVtype method in iScalar
+//                              and blow away the tensor structures.
+//
+// Per-lane select: for every outer site, each of the Nsimd lanes of ret takes
+// the iftrue value where the predicate lane is non-zero and the iffalse value
+// otherwise. All four lattices must be conformable.
+template<class vobj,class iobj>
+inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
+{
+  conformable(iftrue,iffalse);
+  conformable(iftrue,predicate);
+  conformable(iftrue,ret);
+
+  GridBase *grid=iftrue._grid;
+
+  typedef typename vobj::scalar_object scalar_object;
+
+  const int Nsimd = grid->Nsimd();
+
+  parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){
+
+    // Scratch buffers are declared inside the loop body so that each iteration
+    // has its own copy; sharing one set across parallel iterations is a data race.
+    std::vector<Integer> mask(Nsimd);
+    std::vector<scalar_object> truevals (Nsimd);
+    std::vector<scalar_object> falsevals(Nsimd);
+
+    extract(iftrue._odata[ss]   ,truevals);
+    extract(iffalse._odata[ss]  ,falsevals);
+    extract<vInteger,Integer>(TensorRemove(predicate._odata[ss]),mask);
+
+    // Overwrite the "false" lanes in place wherever the mask selects "true"
+    for(int s=0;s<Nsimd;s++){
+      if (mask[s]) falsevals[s]=truevals[s];
+    }
+
+    merge(ret._odata[ss],falsevals);
+  }
+}
+
+// Value-returning form: allocates the result on iftrue's grid and fills it with
+// the four-argument whereWolf. The original called where(ret,...), but the
+// four-argument overload in this file is named whereWolf.
+template<class vobj,class iobj>
+inline Lattice<vobj> whereWolf(const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
+{
+  conformable(iftrue,iffalse);
+  conformable(iftrue,predicate);
+
+  Lattice<vobj> ret(iftrue._grid);
+
+  whereWolf(ret,predicate,iftrue,iffalse);
+
+  return ret;
+}
+}
+#endif