Merge branch 'master' of https://github.com/paboyle/Grid

2025-09-18 09:11:04 +01:00 · 2015-06-14 01:27:07 +01:00
parent 1cc25837b2 353c9598c1
commit 3f7a66328a
33 changed files with 922 additions and 512 deletions
--- a/10
+++ b/10
@@ -1,11 +1,15 @@
 ================================================================
 *** Hacks and bug fixes to clean up and Audits
 ================================================================
-* Base class to share common code between vRealF, VComplexF etc... done
+
-  - Performance check on Guido's reimplementation strategy  - (GUIDO) tested and no difference was found, merged
+*  Extract/merge/set cleanup ; too many variants; rationalise and call simpler ones
 *  Used #define repetitive sequences to minimise code.
 *  Rewrite core tensor arithmetic support to be more systematic
 *  Ensure we ET as much as possible; move unop functions into ET framework.
   - tests with expression args to all functions
 * FIXME audit
 * const audit
 Insert/Extract
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@@ -33,7 +33,7 @@ public:
  ~alignedAllocator() throw() { }
  pointer       address(reference __x)       const { return &__x; }
-  const_pointer address(const_reference __x) const { return &__x; }
+  //  const_pointer address(const_reference __x) const { return &__x; }
  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
--- a/lib/Tensors.h
+++ b/lib/Tensors.h
@@ -14,5 +14,6 @@
 #include <tensors/Tensor_reality.h>
 #include <tensors/Tensor_unary.h>
 #include <tensors/Tensor_extract_merge.h>
-    
+#include <tensors/Tensor_logical.h>
 #endif
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -183,7 +183,7 @@ namespace Grid {
 	  int dir   = geom.directions[p];
 	  int disp  = geom.displacements[p];
-	  int block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
+	  Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
 	  LatticeCoordinate(coor,dir);
@@ -204,8 +204,8 @@ namespace Grid {
 	    oblock = where(mod(coor,block)==(block-1),Mphi,zz);
 	    iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
 	  } else if ( disp==-1 ) {
-	    oblock = where(mod(coor,block)==0,Mphi,zz);
+	    oblock = where(mod(coor,block)==(Integer)0,Mphi,zz);
-	    iblock = where(mod(coor,block)!=0,Mphi,zz);
+	    iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz);
 	  } else {
 	    assert(0);
 	  }
--- a/lib/lattice/Lattice_ET.h
+++ b/lib/lattice/Lattice_ET.h
@@ -9,6 +9,37 @@
 namespace Grid {
  ////////////////////////////////////////////////////
  // Predicated where support
  ////////////////////////////////////////////////////
  template<class iobj,class vobj,class robj>
    inline vobj predicatedWhere(const iobj &predicate,const vobj &iftrue,const robj &iffalse) {
    typename std::remove_const<vobj>::type ret;
    typedef typename vobj::scalar_object scalar_object;
    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;
    const int Nsimd = vobj::vector_type::Nsimd();
    const int words = sizeof(vobj)/sizeof(vector_type);
    std::vector<Integer> mask(Nsimd);
    std::vector<scalar_object> truevals (Nsimd);
    std::vector<scalar_object> falsevals(Nsimd);
    extract(iftrue   ,truevals);
    extract(iffalse  ,falsevals);
    extract<vInteger,Integer>(TensorRemove(predicate),mask);
    for(int s=0;s<Nsimd;s++){
      if (mask[s]) falsevals[s]=truevals[s];
    }
    merge(ret,falsevals);
    return ret;
  }
 ////////////////////////////////////////////
 // recursive evaluation of expressions; Could
 // switch to generic approach with variadics, a la
@@ -142,10 +173,23 @@ template <class arg> struct name\
 };
 GridUnopClass(UnarySub,-a);
 GridUnopClass(UnaryNot,Not(a));
 GridUnopClass(UnaryAdj,adj(a));
 GridUnopClass(UnaryConj,conjugate(a));
 GridUnopClass(UnaryTrace,trace(a));
 GridUnopClass(UnaryTranspose,transpose(a));
 GridUnopClass(UnaryTa,Ta(a));
 GridUnopClass(UnaryReal,real(a));
 GridUnopClass(UnaryImag,imag(a));
 GridUnopClass(UnaryToReal,toReal(a));
 GridUnopClass(UnaryToComplex,toComplex(a));
 GridUnopClass(UnaryAbs,abs(a));
 GridUnopClass(UnarySqrt,sqrt(a));
 GridUnopClass(UnaryRsqrt,rsqrt(a));
 GridUnopClass(UnarySin,sin(a));
 GridUnopClass(UnaryCos,cos(a));
 GridUnopClass(UnaryLog,log(a));
 GridUnopClass(UnaryExp,exp(a));
 ////////////////////////////////////////////
 // Binary operators
@@ -163,6 +207,28 @@ GridBinOpClass(BinaryAdd,lhs+rhs);
 GridBinOpClass(BinarySub,lhs-rhs);
 GridBinOpClass(BinaryMul,lhs*rhs);
 GridBinOpClass(BinaryAnd   ,lhs&rhs);
 GridBinOpClass(BinaryOr    ,lhs|rhs);
 GridBinOpClass(BinaryAndAnd,lhs&&rhs);
 GridBinOpClass(BinaryOrOr  ,lhs||rhs);
 ////////////////////////////////////////////////////
 // Trinary conditional op
 ////////////////////////////////////////////////////
 #define GridTrinOpClass(name,combination)\
 template <class predicate,class left, class right>	\
 struct name\
 {\
  static auto inline func(const predicate &pred,const left &lhs,const right &rhs)-> decltype(combination) const \
    {\
      return combination;\
    }\
 }
 GridTrinOpClass(TrinaryWhere,(predicatedWhere<predicate, \
 			       typename std::remove_reference<left>::type, \
 			       typename std::remove_reference<right>::type> (pred,lhs,rhs)));
 ////////////////////////////////////////////
 // Operator syntactical glue
 ////////////////////////////////////////////
@@ -218,15 +284,67 @@ template <typename T1,typename T2,typename T3> inline auto op(const T1 &pred,con
 ////////////////////////
 GRID_DEF_UNOP(operator -,UnarySub);
 GRID_DEF_UNOP(Not,UnaryNot);
 GRID_DEF_UNOP(operator !,UnaryNot);
 GRID_DEF_UNOP(adj,UnaryAdj);
 GRID_DEF_UNOP(conjugate,UnaryConj);
 GRID_DEF_UNOP(trace,UnaryTrace);
 GRID_DEF_UNOP(transpose,UnaryTranspose);
 GRID_DEF_UNOP(Ta,UnaryTa);
 GRID_DEF_UNOP(real,UnaryReal);
 GRID_DEF_UNOP(imag,UnaryImag);
 GRID_DEF_UNOP(toReal,UnaryToReal);
 GRID_DEF_UNOP(toComplex,UnaryToComplex);
 GRID_DEF_UNOP(abs  ,UnaryAbs); //abs overloaded in cmath C++98; DON'T do the abs-fabs-dabs-labs thing
 GRID_DEF_UNOP(sqrt ,UnarySqrt);
 GRID_DEF_UNOP(rsqrt,UnarySqrt);
 GRID_DEF_UNOP(sin  ,UnarySin);
 GRID_DEF_UNOP(cos  ,UnaryCos);
 GRID_DEF_UNOP(log  ,UnaryLog);
 GRID_DEF_UNOP(exp  ,UnaryExp);
 GRID_DEF_BINOP(operator+,BinaryAdd);
 GRID_DEF_BINOP(operator-,BinarySub);
 GRID_DEF_BINOP(operator*,BinaryMul);
 GRID_DEF_BINOP(operator&,BinaryAnd);
 GRID_DEF_BINOP(operator|,BinaryOr);
 GRID_DEF_BINOP(operator&&,BinaryAndAnd);
 GRID_DEF_BINOP(operator||,BinaryOrOr);
 GRID_DEF_TRINOP(where,TrinaryWhere);
 /////////////////////////////////////////////////////////////
 // Closure convenience to force expression to evaluate
 /////////////////////////////////////////////////////////////
 template<class Op,class T1>
  auto closure(const LatticeUnaryExpression<Op,T1> & expr)
  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))>
 {
  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))> ret(expr);
  return ret;
 }
 template<class Op,class T1, class T2>
  auto closure(const LatticeBinaryExpression<Op,T1,T2> & expr)
  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
 				      eval(0,std::get<1>(expr.second))))>
 {
  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
 				   eval(0,std::get<1>(expr.second))))> ret(expr);
  return ret;
 }
 template<class Op,class T1, class T2, class T3>
  auto closure(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
 				      eval(0,std::get<1>(expr.second)),
 				      eval(0,std::get<2>(expr.second))))>
 {
  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
 				   eval(0,std::get<1>(expr.second)),
 				   eval(0,std::get<2>(expr.second))))> ret(expr);
  return ret;
 }
 #undef GRID_UNOP
 #undef GRID_BINOP
 #undef GRID_TRINOP
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -93,7 +93,7 @@ PARALLEL_FOR_LOOP
    }
    return *this;
  }
-  template <typename Op, typename T1,typename T2>             strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
+  template <typename Op, typename T1,typename T2> strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
  {
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
@@ -131,8 +131,8 @@ PARALLEL_FOR_LOOP
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
-      vobj tmp = eval(ss,expr);
+      //vobj tmp = eval(ss,expr);
-      vstream(_odata[ss] ,tmp);
+      vstream(_odata[ss] ,eval(ss,expr));
 #else
      _odata[ss] = eval(ss,expr);
 #endif
@@ -196,12 +196,7 @@ PARALLEL_FOR_LOOP
    _odata.resize(_grid->oSites());
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
      vobj tmp = eval(ss,expr);
      vstream(_odata[ss] ,tmp);
 #else
      _odata[ss]=eval(ss,expr);
 #endif
    }
  };
@@ -254,7 +249,7 @@ PARALLEL_FOR_LOOP
        Lattice<vobj> ret(lhs._grid);
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            ret._odata[ss] = lhs._odata[ss]/rhs._odata[ss];
+	  ret._odata[ss] = lhs._odata[ss]*pow(rhs._odata[ss],-1.0);
        }
        return ret;
    };
--- a/lib/lattice/Lattice_comparison.h
+++ b/lib/lattice/Lattice_comparison.h
@@ -11,7 +11,11 @@ namespace Grid {
    //Query supporting bitwise &, |, ^, !
    //Query supporting logical &&, ||, 
    //////////////////////////////////////////////////////////////////////////
-    template<class vfunctor,class lobj,class robj> 
+
  //////////////////////////////////////////////////////////////////////////
  // compare lattice to lattice
  //////////////////////////////////////////////////////////////////////////
  template<class vfunctor,class lobj,class robj>  
    inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
    {
      Lattice<vInteger> ret(rhs._grid);
@@ -21,6 +25,9 @@ PARALLEL_FOR_LOOP
        }
        return ret;
    }
  //////////////////////////////////////////////////////////////////////////
  // compare lattice to scalar
  //////////////////////////////////////////////////////////////////////////
    template<class vfunctor,class lobj,class robj> 
    inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
    {
@@ -31,6 +38,9 @@ PARALLEL_FOR_LOOP
        }
        return ret;
    }
  //////////////////////////////////////////////////////////////////////////
  // compare scalar to lattice
  //////////////////////////////////////////////////////////////////////////
    template<class vfunctor,class lobj,class robj> 
    inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
    {
@@ -42,6 +52,9 @@ PARALLEL_FOR_LOOP
        return ret;
    }
  //////////////////////////////////////////////////////////////////////////
  // Map to functors
  //////////////////////////////////////////////////////////////////////////
    // Less than
   template<class lobj,class robj>
   inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
@@ -99,7 +112,6 @@ PARALLEL_FOR_LOOP
     return SLComparison(vge<lobj,robj>(),lhs,rhs);
   }
   // equal
   template<class lobj,class robj>
   inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
--- a/lib/lattice/Lattice_comparison_utils.h
+++ b/lib/lattice/Lattice_comparison_utils.h
@@ -5,140 +5,197 @@ namespace Grid {
  /////////////////////////////////////////
  // This implementation is a bit poor.
-  // Only support logical operations (== etc)
+  //
-  // on scalar objects. Strip any tensor structures.
+  // Only support relational logical operations (<, >  etc)
  // on scalar objects. Therefore can strip any tensor structures.
  //
  // Should guard this with isGridTensor<> enable if?
  /////////////////////////////////////////
-    // Generic list of functors
+  //
-    template<class lobj,class robj> class veq {
+  // Generic list of functors
  //
  template<class lobj,class robj> class veq {
  public:
    vInteger operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) == (rhs);
    }
  };
  template<class lobj,class robj> class vne {
  public:
    vInteger operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) != (rhs);
    }
  };
  template<class lobj,class robj> class vlt {
  public:
    vInteger operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) < (rhs);
    }
  };
  template<class lobj,class robj> class vle {
  public:
    vInteger operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) <= (rhs);
    }
  };
  template<class lobj,class robj> class vgt {
  public:
    vInteger operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) > (rhs);
    }
  };
  template<class lobj,class robj> class vge {
    public:
-      vInteger operator()(const lobj &lhs, const robj &rhs)
+    vInteger operator()(const lobj &lhs, const robj &rhs)
-	{ 
+    { 
-	  return TensorRemove(lhs) == TensorRemove(rhs);
+      return (lhs) >= (rhs);
-	}
+    }
-    };
+  };
-    template<class lobj,class robj> class vne {
+  
-    public:
+  // Generic list of functors
-      vInteger operator()(const lobj &lhs, const robj &rhs)
+  template<class lobj,class robj> class seq {
-	{ 
+  public:
-	  return TensorRemove(lhs) != TensorRemove(rhs);
+    Integer operator()(const lobj &lhs, const robj &rhs)
-	}
+    { 
-    };
+      return (lhs) == (rhs);
-    template<class lobj,class robj> class vlt {
+    }
-    public:
+  };
-      vInteger operator()(const lobj &lhs, const robj &rhs)
+  template<class lobj,class robj> class sne {
-	{ 
+  public:
-	  return TensorRemove(lhs) < TensorRemove(rhs);
+    Integer operator()(const lobj &lhs, const robj &rhs)
-	}
+    { 
-    };
+      return (lhs) != (rhs);
-    template<class lobj,class robj> class vle {
+    }
-    public:
+  };
-      vInteger operator()(const lobj &lhs, const robj &rhs)
+  template<class lobj,class robj> class slt {
-	{ 
+  public:
-	  return TensorRemove(lhs) <= TensorRemove(rhs);
+    Integer operator()(const lobj &lhs, const robj &rhs)
-	}
+    { 
-    };
+      return (lhs) < (rhs);
-    template<class lobj,class robj> class vgt {
+    }
-    public:
+  };
-      vInteger operator()(const lobj &lhs, const robj &rhs)
+  template<class lobj,class robj> class sle {
-	{ 
+  public:
-	  return TensorRemove(lhs) > TensorRemove(rhs);
+    Integer operator()(const lobj &lhs, const robj &rhs)
-	}
+    { 
-    };
+      return (lhs) <= (rhs);
-    template<class lobj,class robj> class vge {
+    }
-    public:
+  };
-      vInteger operator()(const lobj &lhs, const robj &rhs)
+  template<class lobj,class robj> class sgt {
-	{ 
+  public:
-	  return TensorRemove(lhs) >= TensorRemove(rhs);
+    Integer operator()(const lobj &lhs, const robj &rhs)
-	}
+    { 
-    };
+      return (lhs) > (rhs);
    }
  };
  template<class lobj,class robj> class sge {
  public:
    Integer operator()(const lobj &lhs, const robj &rhs)
    { 
      return (lhs) >= (rhs);
    }
  };
-    // Generic list of functors
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
-    template<class lobj,class robj> class seq {
+  // Integer and real get extra relational functions.
-    public:
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
-      Integer operator()(const lobj &lhs, const robj &rhs)
+  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> 
-	{ 
+    inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
 	  return TensorRemove(lhs) == TensorRemove(rhs);
 	}
    };
    template<class lobj,class robj> class sne {
    public:
      Integer operator()(const lobj &lhs, const robj &rhs)
 	{ 
 	  return TensorRemove(lhs) != TensorRemove(rhs);
 	}
    };
    template<class lobj,class robj> class slt {
    public:
      Integer operator()(const lobj &lhs, const robj &rhs)
 	{ 
 	  return TensorRemove(lhs) < TensorRemove(rhs);
 	}
    };
    template<class lobj,class robj> class sle {
    public:
      Integer operator()(const lobj &lhs, const robj &rhs)
 	{ 
 	  return TensorRemove(lhs) <= TensorRemove(rhs);
 	}
    };
    template<class lobj,class robj> class sgt {
    public:
      Integer operator()(const lobj &lhs, const robj &rhs)
 	{ 
 	  return TensorRemove(lhs) > TensorRemove(rhs);
 	}
    };
    template<class lobj,class robj> class sge {
    public:
      Integer operator()(const lobj &lhs, const robj &rhs)
 	{ 
 	  return TensorRemove(lhs) >= TensorRemove(rhs);
 	}
    };
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    // Integer gets extra relational functions. Could also implement these for RealF, RealD etc..
    //////////////////////////////////////////////////////////////////////////////////////////////////////
    template<class sfunctor> 
    inline vInteger Comparison(sfunctor sop,const vInteger & lhs, const vInteger & rhs)
    {
-      std::vector<Integer> vlhs(vInteger::Nsimd());   // Use functors to reduce this to single implementation
+      typedef typename vsimd::scalar_type scalar;
-      std::vector<Integer> vrhs(vInteger::Nsimd());
+      std::vector<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
      std::vector<scalar> vrhs(vsimd::Nsimd());
      std::vector<Integer> vpred(vsimd::Nsimd());
      vInteger ret;
-      extract<vInteger,Integer>(lhs,vlhs);
+      extract<vsimd,scalar>(lhs,vlhs);
-      extract<vInteger,Integer>(rhs,vrhs);
+      extract<vsimd,scalar>(rhs,vrhs);
-      for(int s=0;s<vInteger::Nsimd();s++){
+      for(int s=0;s<vsimd::Nsimd();s++){
-	vlhs[s] = sop(vlhs[s],vrhs[s]);
+	vpred[s] = sop(vlhs[s],vrhs[s]);
      }
-      merge<vInteger,Integer>(ret,vlhs);
+      merge<vInteger,Integer>(ret,vpred);
      return ret;
    }
-    inline vInteger operator < (const vInteger & lhs, const vInteger & rhs)
+
  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> 
    inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
    {
-      return Comparison(slt<Integer,Integer>(),lhs,rhs);
+      typedef typename vsimd::scalar_type scalar;
      std::vector<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
      std::vector<Integer> vpred(vsimd::Nsimd());
      vInteger ret;
      extract<vsimd,scalar>(lhs,vlhs);
      for(int s=0;s<vsimd::Nsimd();s++){
 	vpred[s] = sop(vlhs[s],rhs);
      }
      merge<vInteger,Integer>(ret,vpred);
      return ret;
    }
-    inline vInteger operator <= (const vInteger & lhs, const vInteger & rhs)
+
  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> 
    inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
    {
-      return Comparison(sle<Integer,Integer>(),lhs,rhs);
+      typedef typename vsimd::scalar_type scalar;
-    }
+      std::vector<scalar> vrhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
-    inline vInteger operator > (const vInteger & lhs, const vInteger & rhs)
+      std::vector<Integer> vpred(vsimd::Nsimd());
-    {
+      vInteger ret;
-      return Comparison(sgt<Integer,Integer>(),lhs,rhs);
+      extract<vsimd,scalar>(rhs,vrhs);
-    }
+      for(int s=0;s<vsimd::Nsimd();s++){
-    inline vInteger operator >= (const vInteger & lhs, const vInteger & rhs)
+	vpred[s] = sop(lhs,vrhs[s]);
-    {
+      }
-      return Comparison(sge<Integer,Integer>(),lhs,rhs);
+      merge<vInteger,Integer>(ret,vpred);
-    }
+      return ret;
    inline vInteger operator == (const vInteger & lhs, const vInteger & rhs)
    {
      return Comparison(seq<Integer,Integer>(),lhs,rhs);
    }
    inline vInteger operator != (const vInteger & lhs, const vInteger & rhs)
    {
      return Comparison(sne<Integer,Integer>(),lhs,rhs);
    }
 #define DECLARE_RELATIONAL(op,functor) \
  template<class vsimd,IfSimd<vsimd> = 0>\
    inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
    {\
      typedef typename vsimd::scalar_type scalar;\
      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
    }\
  template<class vsimd,IfSimd<vsimd> = 0>\
    inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
    {\
      typedef typename vsimd::scalar_type scalar;\
      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
    }\
  template<class vsimd,IfSimd<vsimd> = 0>\
    inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
    {\
      typedef typename vsimd::scalar_type scalar;\
      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
    }\
  template<class vsimd>\
    inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
    {									\
      return lhs._internal op rhs._internal;				\
    }									\
  template<class vsimd>\
    inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
    {									\
      return lhs._internal op rhs;					\
    }									\
  template<class vsimd>\
    inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
    {									\
      return lhs op rhs._internal;					\
    }									
 DECLARE_RELATIONAL(<,slt);
 DECLARE_RELATIONAL(<=,sle);
 DECLARE_RELATIONAL(>,sgt);
 DECLARE_RELATIONAL(>=,sge);
 DECLARE_RELATIONAL(==,seq);
 DECLARE_RELATIONAL(!=,sne);
 #undef DECLARE_RELATIONAL
 }
--- a/lib/lattice/Lattice_peekpoke.h
+++ b/lib/lattice/Lattice_peekpoke.h
@@ -117,7 +117,9 @@ PARALLEL_FOR_LOOP
      int Nsimd = grid->Nsimd();
      assert( l.checkerboard == l._grid->CheckerBoard(site));
-      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+
      // FIXME
      //      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
      int rank,odx,idx;
      grid->GlobalCoorToRankIndex(rank,odx,idx,site);
@@ -132,6 +134,7 @@ PARALLEL_FOR_LOOP
      return;
    };
    //////////////////////////////////////////////////////////
    // Peek a scalar object from the SIMD array
    //////////////////////////////////////////////////////////
--- a/lib/lattice/Lattice_reality.h
+++ b/lib/lattice/Lattice_reality.h
@@ -27,37 +27,6 @@ PARALLEL_FOR_LOOP
        return ret;
    };
    template<class vobj> inline auto real(const Lattice<vobj> &z) -> Lattice<vobj>
    {
      Lattice<vobj> ret(z._grid);
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<z._grid->oSites();ss++){
            ret._odata[ss] = real(z._odata[ss]);
        }
      return ret;
    }
    template<class vobj> inline auto imag(const Lattice<vobj> &z) -> Lattice<vobj>
    {
      Lattice<vobj> ret(z._grid);
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<z._grid->oSites();ss++){
            ret._odata[ss] = imag(z._odata[ss]);
        }
      return ret;
    }
    template<class vobj> inline auto Ta(const Lattice<vobj> &z) -> Lattice<decltype(Ta(z._odata[0]))>
    {
      Lattice<decltype(Ta(z._odata[0]))> ret(z._grid);
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<z._grid->oSites();ss++){
            ret._odata[ss] = Ta(z._odata[ss]);
        }
      return ret;
    }
 }
 #endif
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -51,6 +51,31 @@ PARALLEL_FOR_LOOP
      return nrm;
    }
    template<class Op,class T1>
      inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
    {
      return sum(closure(expr));
    }
    template<class Op,class T1,class T2>
      inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
    {
      return sum(closure(expr));
    }
    template<class Op,class T1,class T2,class T3>
      inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
 				 eval(0,std::get<1>(expr.second)),
 				 eval(0,std::get<2>(expr.second))
 				 ))::scalar_object
    {
      return sum(closure(expr));
    }
    template<class vobj>
    inline typename vobj::scalar_object sum(const Lattice<vobj> &arg){
--- a/lib/lattice/Lattice_transpose.h
+++ b/lib/lattice/Lattice_transpose.h
@@ -24,8 +24,7 @@ PARALLEL_FOR_LOOP
    // Index level dependent transpose
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    template<int Index,class vobj>
-    inline auto transposeIndex(const Lattice<vobj> &lhs)
+    inline auto transposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
      -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
    {
      Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
 PARALLEL_FOR_LOOP
--- a/lib/lattice/Lattice_unary.h
+++ b/lib/lattice/Lattice_unary.h
@@ -3,51 +3,6 @@
 namespace Grid {
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  //  avoid copy back routines for mult, mac, sub, add
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  template<class obj> Lattice<obj> sqrt(const Lattice<obj> &rhs){
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
      ret._odata[ss]=sqrt(rhs._odata[ss]);
    }
    return ret;
  }
  template<class obj> Lattice<obj> rsqrt(const Lattice<obj> &rhs){
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
      ret._odata[ss]=rsqrt(rhs._odata[ss]);
    }
    return ret;
  }
  template<class obj> Lattice<obj> sin(const Lattice<obj> &rhs){
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
      ret._odata[ss]=sin(rhs._odata[ss]);
    }
    return ret;
  }
  template<class obj> Lattice<obj> cos(const Lattice<obj> &rhs){
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<rhs._grid->oSites();ss++){
      ret._odata[ss]=cos(rhs._odata[ss]);
    }
    return ret;
  }
  template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs,RealD y){
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
--- a/lib/lattice/Lattice_where.h
+++ b/lib/lattice/Lattice_where.h
@@ -8,13 +8,14 @@ namespace Grid {
 //                              and blow away the tensor structures.
 //
 template<class vobj,class iobj>
-inline void where(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
+inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
 {
  conformable(iftrue,iffalse);
  conformable(iftrue,predicate);
  conformable(iftrue,ret);
  GridBase *grid=iftrue._grid;
  typedef typename vobj::scalar_object scalar_object;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
@@ -43,7 +44,7 @@ PARALLEL_FOR_LOOP
 }
 template<class vobj,class iobj>
-inline Lattice<vobj> where(const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
+inline Lattice<vobj> whereWolf(const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
 {
  conformable(iftrue,iffalse);
  conformable(iftrue,predicate);
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@@ -250,6 +250,7 @@ namespace QCD {
    //////////////////////////////////////////////////////////////////////////////
    // Peek and Poke named after physics attributes
    //////////////////////////////////////////////////////////////////////////////
    //spin
    template<class vobj> auto peekSpin(const vobj &rhs,int i) -> decltype(peekIndex<SpinIndex>(rhs,0))
    {
@@ -289,20 +290,117 @@ namespace QCD {
    {
      return peekIndex<LorentzIndex>(rhs,i);
    }
    template<class vobj> auto peekLorentz(const vobj &rhs,int i,int j) -> decltype(peekIndex<LorentzIndex>(rhs,0,0))
    {
      return peekIndex<LorentzIndex>(rhs,i,j);
    }
    template<class vobj> auto peekLorentz(const Lattice<vobj> &rhs,int i) -> decltype(peekIndex<LorentzIndex>(rhs,0))
    {
      return peekIndex<LorentzIndex>(rhs,i);
    }
-    template<class vobj> auto peekLorentz(const Lattice<vobj> &rhs,int i,int j) -> decltype(peekIndex<LorentzIndex>(rhs,0,0))
+
    //////////////////////////////////////////////
    // Poke lattice
    //////////////////////////////////////////////
    template<class vobj> 
      void pokeColour(Lattice<vobj> &lhs,
 		      const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0))> & rhs,
 		      int i)
    {
-      return peekIndex<LorentzIndex>(rhs,i,j);
+      pokeIndex<ColourIndex>(lhs,rhs,i);
    }
    template<class vobj> 
      void pokeColour(Lattice<vobj> &lhs,
 		      const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0,0))> & rhs,
 		      int i,int j)
    {
      pokeIndex<ColourIndex>(lhs,rhs,i,j);
    }
    template<class vobj> 
      void pokeSpin(Lattice<vobj> &lhs,
 		      const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0))> & rhs,
 		      int i)
    {
      pokeIndex<SpinIndex>(lhs,rhs,i);
    }
    template<class vobj> 
      void pokeSpin(Lattice<vobj> &lhs,
 		      const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0,0))> & rhs,
 		      int i,int j)
    {
      pokeIndex<SpinIndex>(lhs,rhs,i,j);
    }
    template<class vobj> 
      void pokeLorentz(Lattice<vobj> &lhs,
 		      const Lattice<decltype(peekIndex<LorentzIndex>(lhs._odata[0],0))> & rhs,
 		      int i)
    {
      pokeIndex<LorentzIndex>(lhs,rhs,i);
    }
-    // FIXME transpose Colour, transpose Spin, traceColour traceSpin
+    //////////////////////////////////////////////
    // Poke scalars
    //////////////////////////////////////////////
    template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
    {
      pokeIndex<SpinIndex>(lhs,rhs,i);
    }
    template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0,0)) & rhs,int i,int j)
    {
      pokeIndex<SpinIndex>(lhs,rhs,i,j);
    }
    template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0)) & rhs,int i)
    {
      pokeIndex<ColourIndex>(lhs,rhs,i);
    }
    template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0,0)) & rhs,int i,int j)
    {
      pokeIndex<ColourIndex>(lhs,rhs,i,j);
    }
    template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<LorentzIndex>(lhs,0)) & rhs,int i)
    {
      pokeIndex<LorentzIndex>(lhs,rhs,i);
    }
    //////////////////////////////////////////////
    // transpose array and scalar
    //////////////////////////////////////////////
    template<int Index,class vobj> inline Lattice<vobj> transposeSpin(const Lattice<vobj> &lhs){
      return transposeIndex<SpinIndex>(lhs);
    }
    template<int Index,class vobj> inline Lattice<vobj> transposeColour(const Lattice<vobj> &lhs){
      return transposeIndex<ColourIndex>(lhs);
    }
    template<int Index,class vobj> inline vobj transposeSpin(const vobj &lhs){
      return transposeIndex<SpinIndex>(lhs);
    }
    template<int Index,class vobj> inline vobj transposeColour(const vobj &lhs){
      return transposeIndex<ColourIndex>(lhs);
    }
    //////////////////////////////////////////
    // Trace lattice and non-lattice
    //////////////////////////////////////////
    template<int Index,class vobj>
    inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs._odata[0]))>
    {
      return traceIndex<SpinIndex>(lhs);
    }
    template<int Index,class vobj>
    inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs._odata[0]))>
    {
      return traceIndex<ColourIndex>(lhs);
    }
    template<int Index,class vobj>
    inline auto traceSpin(const vobj &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs))>
    {
      return traceIndex<SpinIndex>(lhs);
    }
    template<int Index,class vobj>
    inline auto traceColour(const vobj &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs))>
    {
      return traceIndex<ColourIndex>(lhs);
    }
 }   //namespace QCD
 } // Grid
--- a/lib/qcd/utils/WilsonLoops.h
+++ b/lib/qcd/utils/WilsonLoops.h
@@ -70,7 +70,7 @@ public:
  //////////////////////////////////////////////////
  // the sum over all staples on each site
  //////////////////////////////////////////////////
-  static void Staple(GaugeMat &staple,GaugeLorentz &Umu,int mu){
+  static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
    std::vector<GaugeMat> U(4,Umu._grid);
    for(int d=0;d<Nd;d++){
@@ -123,12 +123,10 @@ void siteRectangle(GaugeMat &plaq,const std::vector<GaugeMat> &U, const int mu,
 typedef WilsonLoops<LatticeColourMatrix,LatticeGaugeField> ColourWilsonLoops;
 typedef WilsonLoops<LatticeColourMatrix,LatticeGaugeField> U1WilsonLoops;
 typedef WilsonLoops<LatticeColourMatrix,LatticeGaugeField> SU2WilsonLoops;
 typedef WilsonLoops<LatticeColourMatrix,LatticeGaugeField> SU3WilsonLoops;
 }}
 #endif
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -28,13 +28,19 @@
 namespace Grid {
  //////////////////////////////////////
  // To take the floating point type of real/complex type
  //////////////////////////////////////
  template <typename T> struct RealPart {
    typedef T type;
  };
  template <typename T> struct RealPart< std::complex<T> >{
    typedef T type;
  };
  //////////////////////////////////////
  // demote a vector to real type
  //////////////////////////////////////
  // type alias used to simplify the syntax of std::enable_if
  template <typename T> using Invoke                                  =  typename T::type;
@@ -90,7 +96,7 @@ namespace Grid {
 	Vector_type v;
 	Scalar_type s[sizeof(Vector_type)/sizeof(Scalar_type)];
      conv_t_union(){};
-      } conv_t;
+    } conv_t;
    Vector_type v;
@@ -205,7 +211,6 @@ namespace Grid {
      return *this;
    }
    ///////////////////////////////////////
    // Not all functions are supported
    // through SIMD and must breakout to 
@@ -214,7 +219,6 @@ namespace Grid {
    ///////////////////////////////////////
    template<class functor> friend inline Grid_simd SimdApply (const functor &func,const Grid_simd &v) {
      Grid_simd ret;
      Grid_simd::conv_t conv;
@@ -225,6 +229,19 @@ namespace Grid {
      ret.v = conv.v;
      return ret;
    }
    template<class functor> friend inline Grid_simd SimdApplyBinop (const functor &func,const Grid_simd &x,const Grid_simd &y) {
      Grid_simd ret;
      Grid_simd::conv_t cx;
      Grid_simd::conv_t cy;
      cx.v = x.v;
      cy.v = y.v;
      for(int i=0;i<Nsimd();i++){
 	cx.s[i]=func(cx.s[i],cy.s[i]);
      }
      ret.v = cx.v;
      return ret;
    }
    ////////////////////////////////////////////////////////////////////
    // General permute; assumes vector length is same across 
@@ -235,6 +252,7 @@ namespace Grid {
    {
      Gpermute<Grid_simd>(y,b,perm);
    }
  };// end of Grid_simd class definition 
@@ -383,7 +401,6 @@ namespace Grid {
    return in;
  }
  /////////////////////
  // Inner, outer
  /////////////////////
@@ -405,6 +422,46 @@ namespace Grid {
    return arg;
  }
  ////////////////////////////////////////////////////////////
  // copy/splat complex real parts into real;
  // insert real into complex and zero imag;
  ////////////////////////////////////////////////////////////
  //real = toReal( complex )
  template<class S,class V,IfReal<S>  = 0>	
  inline Grid_simd<S,V> toReal(const Grid_simd<std::complex<S>,V> &in)
  {
    typedef Grid_simd<S,V> simd;
    simd ret;
    typename simd::conv_t conv;
    conv.v = in.v;
    for(int i=0;i<simd::Nsimd();i+=2){
      conv.s[i+1]=conv.s[i];    // duplicate (r,r);(r,r);(r,r); etc...
    }
    ret.v = conv.v;
    return ret;
  }
  //complex = toComplex( real )
  template<class R,class V,IfReal<R> = 0 >	// must be a real arg
  inline Grid_simd<std::complex<R>,V> toComplex (const Grid_simd<R,V> &in)
  {
    typedef Grid_simd<R,V> Rsimd;
    typedef Grid_simd<std::complex<R>,V> Csimd;
    typename Rsimd::conv_t conv;// address as real
    conv.v = in.v;
    for(int i=0;i<Rsimd::Nsimd();i+=2){
      assert(conv.s[i+1]==conv.s[i]); // trap any cases where real was not duplicated 
      // indicating the SIMD grids of real and imag assignment did not correctly match
      conv.s[i+1]=0.0;                // zero imaginary parts
    }
    Csimd ret;
    ret.v = conv.v;
    return ret;
  }
  ///////////////////////////////
  // Define available types
  ///////////////////////////////
@@ -413,6 +470,20 @@ namespace Grid {
  typedef Grid_simd< std::complex< float > , SIMD_Ftype > vComplexF;
  typedef Grid_simd< std::complex< double >, SIMD_Dtype > vComplexD;
  typedef Grid_simd< Integer               , SIMD_Itype > vInteger;
  /////////////////////////////////////////
  // Some traits to recognise the types
  /////////////////////////////////////////
  template <typename T> struct is_simd : public std::false_type{};
  template <> struct is_simd<vRealF>   : public std::true_type {};
  template <> struct is_simd<vRealD>   : public std::true_type {};
  template <> struct is_simd<vComplexF>: public std::true_type {};
  template <> struct is_simd<vComplexD>: public std::true_type {};
  template <> struct is_simd<vInteger> : public std::true_type {};
  template <typename T> using IfSimd     = Invoke<std::enable_if< is_simd<T>::value,int> > ;
  template <typename T> using IfNotSimd  = Invoke<std::enable_if<!is_simd<T>::value,unsigned> > ;
 }
 #endif
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -1,6 +1,8 @@
 #ifndef GRID_VECTOR_UNOPS
 #define GRID_VECTOR_UNOPS
 #include <cmath>
 namespace Grid { 
  template<class scalar> struct SqrtRealFunctor {
@@ -27,6 +29,28 @@ namespace Grid {
    }
  };
  template<class scalar> struct LogRealFunctor {
    scalar operator()(const scalar &a)  const {
      return log(real(a));
    }
  };
  template<class scalar> struct ExpRealFunctor {
    scalar operator()(const scalar &a)  const {
      return exp(real(a));
    }
  };
  template<class scalar> struct NotFunctor {
    scalar operator()(const scalar &a)  const {
      return (!a);
    }
  };
  template<class scalar> struct AbsRealFunctor {
    scalar operator()(const scalar &a)  const {
      return std::abs(real(a));
    }
  };
  template<class scalar> struct PowRealFunctor {
    double y;
  PowRealFunctor(double _y) : y(_y) {};
@@ -43,6 +67,25 @@ namespace Grid {
    }
  };
  template<class scalar> struct RealFunctor {
    scalar operator()(const scalar &a)  const {
      return real(a);
    }
  };
  template<class scalar> struct ImagFunctor {
    scalar operator()(const scalar &a)  const {
      return imag(a);
    }
  };
  template < class S, class V > 
  inline Grid_simd<S,V> real(const Grid_simd<S,V> &r) {
    return SimdApply(RealFunctor<S>(),r);
  }
  template < class S, class V > 
  inline Grid_simd<S,V> imag(const Grid_simd<S,V> &r) {
    return SimdApply(ImagFunctor<S>(),r);
  }
  template < class S, class V > 
  inline Grid_simd<S,V> sqrt(const Grid_simd<S,V> &r) {
    return SimdApply(SqrtRealFunctor<S>(),r);
@@ -60,6 +103,22 @@ namespace Grid {
    return SimdApply(CosRealFunctor<S>(),r);
  }
  template < class S, class V > 
  inline Grid_simd<S,V> log(const Grid_simd<S,V> &r) {
    return SimdApply(LogRealFunctor<S>(),r);
  }
  template < class S, class V > 
  inline Grid_simd<S,V> abs(const Grid_simd<S,V> &r) {
    return SimdApply(AbsRealFunctor<S>(),r);
  }
  template < class S, class V > 
  inline Grid_simd<S,V> exp(const Grid_simd<S,V> &r) {
    return SimdApply(ExpRealFunctor<S>(),r);
  }
  template < class S, class V > 
  inline Grid_simd<S,V> Not(const Grid_simd<S,V> &r) {
    return SimdApply(NotFunctor<S>(),r);
  }
  template < class S, class V > 
  inline Grid_simd<S,V> pow(const Grid_simd<S,V> &r,double y) {
    return SimdApply(PowRealFunctor<S>(y),r);
  }
@@ -67,6 +126,55 @@ namespace Grid {
  inline Grid_simd<S,V> mod(const Grid_simd<S,V> &r,Integer y) {
    return SimdApply(ModIntFunctor<S>(y),r);
  }
  ////////////////////////////////////////////////////////////////////////////
  // Allows us to assign into **conformable** real vectors from complex
  ////////////////////////////////////////////////////////////////////////////
  //  template < class S, class V > 
  //  inline auto ComplexRemove(const Grid_simd<S,V> &c) -> Grid_simd<Grid_simd<S,V>::Real,V> {
  //    Grid_simd<Grid_simd<S,V>::Real,V> ret;
  //    ret.v = c.v;
  //    return ret;
  //  }
  template<class scalar> struct AndFunctor {
    scalar operator()(const scalar &x, const scalar &y)  const {
      return x & y;
    }
  };
  template<class scalar> struct OrFunctor {
    scalar operator()(const scalar &x, const scalar &y)  const {
      return x | y;
    }
  };
  template<class scalar> struct AndAndFunctor {
    scalar operator()(const scalar &x, const scalar &y)  const {
      return x && y;
    }
  };
  template<class scalar> struct OrOrFunctor {
    scalar operator()(const scalar &x, const scalar &y)  const {
      return x || y;
    }
  };
  ////////////////////////////////
  // Calls to simd binop functors
  ////////////////////////////////
  template < class S, class V > 
  inline Grid_simd<S,V> operator &(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
    return SimdApplyBinop(AndFunctor<S>(),x,y);
  }
  template < class S, class V > 
  inline Grid_simd<S,V> operator &&(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
    return SimdApplyBinop(AndAndFunctor<S>(),x,y);
  }
  template < class S, class V > 
  inline Grid_simd<S,V> operator |(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
    return SimdApplyBinop(OrFunctor<S>(),x,y);
  }
  template < class S, class V > 
  inline Grid_simd<S,V> operator ||(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
    return SimdApplyBinop(OrOrFunctor<S>(),x,y);
  }
 }
 #endif
--- a/lib/tensors/Tensor_arith_scalar.h
+++ b/lib/tensors/Tensor_arith_scalar.h
@@ -9,12 +9,12 @@ namespace Grid {
 //////////////////////////////////////////////////////////////////////////////////////////
 // multiplication by fundamental scalar type
-template<class l,int N> strong_inline iScalar<l> operator * (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
+template<class l> strong_inline iScalar<l> operator * (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
  return lhs*srhs;
 }
-template<class l,int N> strong_inline iScalar<l> operator * (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs*lhs; }
+template<class l> strong_inline iScalar<l> operator * (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs*lhs; }
 template<class l,int N> strong_inline iVector<l,N> operator * (const iVector<l,N>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
@@ -118,12 +118,12 @@ template<class l,int N> strong_inline iMatrix<l,N> operator * (Integer lhs,const
 ///////////////////////////////////////////////////////////////////////////////////////////////
 // addition by fundamental scalar type applies to matrix(down diag) and scalar
 ///////////////////////////////////////////////////////////////////////////////////////////////
-template<class l,int N> strong_inline iScalar<l> operator + (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
+template<class l> strong_inline iScalar<l> operator + (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
  return lhs+srhs;
 }
-template<class l,int N> strong_inline iScalar<l> operator + (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs+lhs; }
+template<class l> strong_inline iScalar<l> operator + (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs+lhs; }
 template<class l,int N> strong_inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
@@ -176,12 +176,12 @@ template<class l,int N> strong_inline iMatrix<l,N> operator + (Integer lhs,const
 ///////////////////////////////////////////////////////////////////////////////////////////////
 // subtraction of fundamental scalar type applies to matrix(down diag) and scalar
 ///////////////////////////////////////////////////////////////////////////////////////////////
-template<class l,int N> strong_inline iScalar<l> operator - (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
+template<class l> strong_inline iScalar<l> operator - (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
  return lhs-srhs;
 }
-template<class l,int N> strong_inline iScalar<l> operator - (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) 
+template<class l> strong_inline iScalar<l> operator - (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) 
 {
  typename iScalar<l>::tensor_reduced slhs;slhs=lhs;
  return slhs-rhs;
--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -23,13 +23,17 @@ template<class vtype> class iScalar
 public:
  vtype _internal;
-  typedef typename GridTypeMapper<vtype>::scalar_type   scalar_type;
+  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
  typedef iScalar<tensor_reduced_v> tensor_reduced;
  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
  typedef iScalar<recurse_scalar_object> scalar_object;
  // substitutes a real or complex version with same tensor structure
  typedef iScalar<typename GridTypeMapper<vtype>::Complexified > Complexified;
  typedef iScalar<typename GridTypeMapper<vtype>::Realified >    Realified;
  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
  // Scalar no action
@@ -86,9 +90,19 @@ public:
  strong_inline const vtype & operator ()(void) const {
    return _internal;
  }
-  
+
-  operator ComplexD () const { return(TensorRemove(_internal)); };
+  // Type casts meta programmed
-  operator RealD () const { return(real(TensorRemove(_internal))); }
+  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = 0>  
    operator ComplexF () const { return(TensorRemove(_internal)); };
  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = 0>  
    operator ComplexD () const { return(TensorRemove(_internal)); };
  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = 0>  
    operator RealD () const { return(real(TensorRemove(_internal))); }
  template<class U=vtype,class V=scalar_type,IfReal<V>    = 0,IfNotSimd<U> = 0>  
    operator RealD    () const { return TensorRemove(_internal); }
  template<class U=vtype,class V=scalar_type,IfInteger<V> = 0,IfNotSimd<U> = 0>  
    operator Integer  () const { return Integer(TensorRemove(_internal)); }
  // convert from a something to a scalar via constructor of something arg
  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline iScalar<vtype> operator = (T arg)
@@ -123,6 +137,10 @@ public:
  typedef iScalar<tensor_reduced_v> tensor_reduced;
  typedef iVector<recurse_scalar_object,N> scalar_object;
  // substitutes a real or complex version with same tensor structure
  typedef iVector<typename GridTypeMapper<vtype>::Complexified,N > Complexified;
  typedef iVector<typename GridTypeMapper<vtype>::Realified,N >    Realified;
  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iVector<vtype,N>
    { 
      zeroit(*this);
@@ -211,6 +229,12 @@ public:
  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
  // substitutes a real or complex version with same tensor structure
  typedef iMatrix<typename GridTypeMapper<vtype>::Complexified,N > Complexified;
  typedef iMatrix<typename GridTypeMapper<vtype>::Realified,N >    Realified;
  // Tensure removal
  typedef iScalar<tensor_reduced_v> tensor_reduced;
  typedef iMatrix<recurse_scalar_object,N> scalar_object;
--- a/lib/tensors/Tensor_extract_merge.h
+++ b/lib/tensors/Tensor_extract_merge.h
@@ -31,18 +31,17 @@ inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::
 		  std::vector<scalar *> &extracted,int offset){
  int Nextr=extracted.size();
  int Nsimd=vsimd::Nsimd();
-  int s=Nsimd/Nextr;
+  int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it
-
+                     // replicate n-fold. Use to allow Integer masks to 
                     // predicate floating point of various width assignments and maintain conformable.
  scalar *buf =(scalar *) y;
  for(int i=0;i<Nextr;i++){
    for(int ii=0;ii<s;ii++){
      buf[i*s+ii]=extracted[i][offset];
    }
  }
 };
 ////////////////////////////////////////////////////////////////////////////////////////////////
 // Extract a fundamental vector type to scalar array 
 ////////////////////////////////////////////////////////////////////////////////////////////////
@@ -55,8 +54,17 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
  scalar *buf = (scalar *)&y;
  for(int i=0;i<Nextr;i++){
-    for(int ii=0;ii<s;ii++){
+    extracted[i]=buf[i*s];
-      extracted[i]=buf[i*s+ii];
+    for(int ii=1;ii<s;ii++){
      if ( buf[i*s]!=buf[i*s+ii] ){
 	std::cout << " SIMD extract failure splat="<<s<<" ii "<<ii<<" " <<Nextr<<" "<< Nsimd<<" "<<std::endl;
 	for(int vv=0;vv<Nsimd;vv++) {
 	  std::cout<< buf[vv]<<" ";
 	}
 	std::cout<<std::endl;
 	assert(0);
      }
      assert(buf[i*s]==buf[i*s+ii]);
    }
  }
@@ -74,21 +82,7 @@ inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::
  for(int i=0;i<Nextr;i++){
    for(int ii=0;ii<s;ii++){
-      buf[i*s+ii]=extracted[i];
+      buf[i*s+ii]=extracted[i]; // replicates value
    }
  }
 };
 template<class vsimd,class scalar>
 inline void AmergeA(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type  &y,std::vector<scalar> &extracted){
  int Nextr=extracted.size();
  int Nsimd=vsimd::Nsimd();
  int s=Nsimd/Nextr;
  scalar *buf = (scalar *)&y;
  for(int i=0;i<Nextr;i++){
    for(int ii=0;ii<s;ii++){
      buf[i*s+ii]=extracted[i];
    }
  }
 };
@@ -102,12 +96,12 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
  typedef typename vobj::vector_type vector_type ;
  const int Nsimd=vobj::vector_type::Nsimd();
  int Nextr=extracted.size();
  const int words=sizeof(vobj)/sizeof(vector_type);
  int s=Nsimd/Nextr;
-  extracted.resize(Nsimd);
+  std::vector<scalar_type *> pointers(Nextr);
-
+  for(int i=0;i<Nextr;i++) 
  std::vector<scalar_type *> pointers(Nsimd);
  for(int i=0;i<Nsimd;i++) 
    pointers[i] =(scalar_type *)& extracted[i];
  vector_type *vp = (vector_type *)&vec;
@@ -127,11 +121,11 @@ void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extrac
  const int words=sizeof(vobj)/sizeof(vector_type);
  const int Nsimd=vobj::vector_type::Nsimd();
-
+  int Nextr=extracted.size();
-  assert(extracted.size()==Nsimd);
+  int s = Nsimd/Nextr;
  std::vector<scalar_type *> pointers(Nsimd);
-  for(int i=0;i<Nsimd;i++) {
+  for(int i=0;i<Nextr;i++) {
    pointers[i] =(scalar_type *)& extracted[i][offset];
  }
@@ -153,10 +147,11 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
  const int Nsimd=vobj::vector_type::Nsimd();
  const int words=sizeof(vobj)/sizeof(vector_type);
-  assert(extracted.size()==Nsimd);
+  int Nextr = extracted.size();
  int splat=Nsimd/Nextr;
-  std::vector<scalar_type *> pointers(Nsimd);
+  std::vector<scalar_type *> pointers(Nextr);
-  for(int i=0;i<Nsimd;i++) 
+  for(int i=0;i<Nextr;i++) 
    pointers[i] =(scalar_type *)& extracted[i];
  vector_type *vp = (vector_type *)&vec;
@@ -177,14 +172,14 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
  const int Nsimd=vobj::vector_type::Nsimd();
  const int words=sizeof(vobj)/sizeof(vector_type);
-  assert(extracted.size()==Nsimd);
+  int Nextr=extracted.size();
-  std::vector<scalar_type *> pointers(Nsimd);
+  std::vector<scalar_type *> pointers(Nextr);
-  for(int i=0;i<Nsimd;i++) 
+  for(int i=0;i<Nextr;i++) 
    pointers[i] =(scalar_type *)& extracted[i][offset];
-  
+
  vector_type *vp = (vector_type *)&vec;
-  assert((void *)vp!=NULL);
+
  for(int w=0;w<words;w++){
    merge<vector_type,scalar_type>(&vp[w],pointers,w);
  }
--- a/lib/tensors/Tensor_inner.h
+++ b/lib/tensors/Tensor_inner.h
@@ -10,7 +10,8 @@ namespace Grid {
      typedef typename sobj::scalar_type scalar;
      decltype(innerProduct(arg,arg)) nrm;
      nrm = innerProduct(arg,arg);
-      return real(nrm);
+      RealD ret = real(nrm);
      return ret;
    }
    template<class l,class r,int N> inline
--- a/lib/tensors/Tensor_logical.h
+++ b/lib/tensors/Tensor_logical.h
@@ -0,0 +1,32 @@
 #ifndef GRID_TENSOR_LOGICAL_H
 #define GRID_TENSOR_LOGICAL_H
 namespace Grid {
 #define LOGICAL_BINOP(Op)\
 template<class v> strong_inline iScalar<v> operator Op (const iScalar<v>& lhs,const iScalar<v>& rhs) \
 {\
  iScalar<v> ret;\
  ret._internal = lhs._internal Op rhs._internal ;\
  return ret;\
 }\
 template<class l> strong_inline iScalar<l> operator Op (const iScalar<l>& lhs,Integer rhs) \
 {\
  typename iScalar<l>::scalar_type t; t=rhs;\
  typename iScalar<l>::tensor_reduced srhs; srhs=t;\
  return lhs Op srhs;\
 }\
 template<class l> strong_inline iScalar<l> operator Op (Integer lhs,const iScalar<l>& rhs) \
 {\
  typename iScalar<l>::scalar_type t;t=lhs;\
  typename iScalar<l>::tensor_reduced slhs;slhs=t;\
  return slhs Op rhs;\
 }
 LOGICAL_BINOP(|);
 LOGICAL_BINOP(&);
 LOGICAL_BINOP(||);
 LOGICAL_BINOP(&&);
 }
 #endif
--- a/lib/tensors/Tensor_traits.h
+++ b/lib/tensors/Tensor_traits.h
@@ -26,6 +26,8 @@ namespace Grid {
    typedef typename T::vector_type vector_type;
    typedef typename T::tensor_reduced tensor_reduced;
    typedef typename T::scalar_object scalar_object;
    typedef typename T::Complexified Complexified;
    typedef typename T::Realified Realified;
    enum { TensorLevel = T::TensorLevel };
  };
@@ -38,6 +40,8 @@ namespace Grid {
    typedef RealF vector_type;
    typedef RealF tensor_reduced ;
    typedef RealF scalar_object;
    typedef ComplexF Complexified;
    typedef RealF Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<RealD> {
@@ -46,6 +50,8 @@ namespace Grid {
    typedef RealD vector_type;
    typedef RealD tensor_reduced;
    typedef RealD scalar_object;
    typedef ComplexD Complexified;
    typedef RealD Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<ComplexF> {
@@ -54,6 +60,8 @@ namespace Grid {
    typedef ComplexF vector_type;
    typedef ComplexF tensor_reduced;
    typedef ComplexF scalar_object;
    typedef ComplexF Complexified;
    typedef RealF Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<ComplexD> {
@@ -62,6 +70,8 @@ namespace Grid {
    typedef ComplexD vector_type;
    typedef ComplexD tensor_reduced;
    typedef ComplexD scalar_object;
    typedef ComplexD Complexified;
    typedef RealD Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<Integer> {
@@ -70,6 +80,8 @@ namespace Grid {
    typedef Integer vector_type;
    typedef Integer tensor_reduced;
    typedef Integer scalar_object;
    typedef void Complexified;
    typedef void Realified;
    enum { TensorLevel = 0 };
  };
@@ -79,6 +91,8 @@ namespace Grid {
    typedef vRealF vector_type;
    typedef vRealF tensor_reduced;
    typedef RealF  scalar_object;
    typedef vComplexF Complexified;
    typedef vRealF Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vRealD> {
@@ -87,6 +101,8 @@ namespace Grid {
    typedef vRealD vector_type;
    typedef vRealD tensor_reduced;
    typedef RealD  scalar_object;
    typedef vComplexD Complexified;
    typedef vRealD Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vComplexF> {
@@ -95,6 +111,8 @@ namespace Grid {
    typedef vComplexF vector_type;
    typedef vComplexF tensor_reduced;
    typedef ComplexF  scalar_object;
    typedef vComplexF Complexified;
    typedef vRealF Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vComplexD> {
@@ -103,6 +121,8 @@ namespace Grid {
    typedef vComplexD vector_type;
    typedef vComplexD tensor_reduced;
    typedef ComplexD  scalar_object;
    typedef vComplexD Complexified;
    typedef vRealD Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vInteger> {
@@ -111,6 +131,8 @@ namespace Grid {
    typedef vInteger vector_type;
    typedef vInteger tensor_reduced;
    typedef  Integer scalar_object;
    typedef void Complexified;
    typedef void Realified;
    enum { TensorLevel = 0 };
  };
--- a/lib/tensors/Tensor_unary.h
+++ b/lib/tensors/Tensor_unary.h
@@ -2,7 +2,7 @@
 #define GRID_TENSOR_UNARY_H
 namespace Grid {
-#define UNARY_REAL(func)\
+#define UNARY(func)\
 template<class obj> inline auto func(const iScalar<obj> &z) -> iScalar<obj>\
 {\
    iScalar<obj> ret;\
@@ -53,14 +53,71 @@ template<class obj> inline iScalar<obj> func(const iScalar<obj> &z,scal y)	\
    return ret;\
 }
-UNARY_REAL(sqrt);
+UNARY(sqrt);
-UNARY_REAL(rsqrt);
+UNARY(rsqrt);
-UNARY_REAL(sin);
+UNARY(sin);
-UNARY_REAL(cos);
+UNARY(cos);
 UNARY(log);
 UNARY(exp);
 UNARY(abs);
 UNARY(Not);
 template<class obj> inline auto toReal(const iScalar<obj> &z) -> typename iScalar<obj>::Realified
 {
  typename iScalar<obj>::Realified ret;
  ret._internal = toReal(z._internal);
  return ret;
 }
 template<class obj,int N> inline auto toReal(const iVector<obj,N> &z) -> typename iVector<obj,N>::Realified
 {
  typename iVector<obj,N>::Realified ret;
  for(int c1=0;c1<N;c1++){  
    ret._internal[c1] = toReal(z._internal[c1]); 
  }
  return ret;
 }
 template<class obj,int N> inline auto toReal(const iMatrix<obj,N> &z) -> typename iMatrix<obj,N>::Realified
 {
  typename iMatrix<obj,N>::Realified ret;
  for(int c1=0;c1<N;c1++){
  for(int c2=0;c2<N;c2++){
    ret._internal[c1][c2] = toReal(z._internal[c1][c2]);
  }}
  return ret;
 }
 template<class obj> inline auto toComplex(const iScalar<obj> &z) -> typename iScalar<obj>::Complexified
 {
  typename iScalar<obj>::Complexified ret;
  ret._internal = toComplex(z._internal);
  return ret;
 }
 template<class obj,int N> inline auto toComplex(const iVector<obj,N> &z) -> typename iVector<obj,N>::Complexified
 {
  typename iVector<obj,N>::Complexified ret;
  for(int c1=0;c1<N;c1++){  
    ret._internal[c1] = toComplex(z._internal[c1]); 
  }
  return ret;
 }
 template<class obj,int N> inline auto toComplex(const iMatrix<obj,N> &z) -> typename iMatrix<obj,N>::Complexified
 {
  typename iMatrix<obj,N>::Complexified ret;
  for(int c1=0;c1<N;c1++){
  for(int c2=0;c2<N;c2++){
    ret._internal[c1][c2] = toComplex(z._internal[c1][c2]);
  }}
  return ret;
 }
 BINARY_RSCALAR(mod,Integer);
 BINARY_RSCALAR(pow,RealD);
 #undef UNARY
 #undef BINARY_RSCALAR
 }
 #endif
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,11 +1,15 @@
-bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cg Test_cayley_ldop_cr Test_cayley_ldop_cr_chebyshev Test_cf_coarsen_support Test_cf_cr_unprec Test_contfrac_cg Test_contfrac_even_odd Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_gamma Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_remez Test_rng Test_rng_fixed Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd
+bin_PROGRAMS = Test_GaugeAction Test_Metropolis Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cg Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_contfrac_cg Test_contfrac_even_odd Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_gamma Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_remez Test_rng Test_rng_fixed Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd
 Test_GaugeAction_SOURCES=Test_GaugeAction.cc
 Test_GaugeAction_LDADD=-lGrid
 Test_Metropolis_SOURCES=Test_Metropolis.cc
 Test_Metropolis_LDADD=-lGrid
 Test_cayley_cg_SOURCES=Test_cayley_cg.cc
 Test_cayley_cg_LDADD=-lGrid
@@ -26,10 +30,6 @@ Test_cayley_ldop_cr_SOURCES=Test_cayley_ldop_cr.cc
 Test_cayley_ldop_cr_LDADD=-lGrid
 Test_cayley_ldop_cr_chebyshev_SOURCES=Test_cayley_ldop_cr_chebyshev.cc
 Test_cayley_ldop_cr_chebyshev_LDADD=-lGrid
 Test_cf_coarsen_support_SOURCES=Test_cf_coarsen_support.cc
 Test_cf_coarsen_support_LDADD=-lGrid
--- a/tests/Test_GaugeAction.cc
+++ b/tests/Test_GaugeAction.cc
@@ -27,6 +27,7 @@ public:
  }
 };
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
--- a/tests/Test_cayley_coarsen_support.cc
+++ b/tests/Test_cayley_coarsen_support.cc
@@ -102,16 +102,11 @@ int main (int argc, char ** argv)
  CoarseVector c_res (Coarse5d);
  CoarseVector c_proj(Coarse5d);
  //  TODO
  // -- promote from subspace, check we get the vector we wanted
  // -- apply ldop; check we get the same as inner product of M times big vec
  // -- pick blocks one by one. Evaluate matrix elements.
  Complex one(1.0);
  c_src = one;  // 1 in every element for vector 1.
  blockPromote(c_src,err,subspace);
  prom=zero;
  for(int b=0;b<nbasis;b++){
    prom=prom+subspace[b];
--- a/tests/Test_cshift.cc
+++ b/tests/Test_cshift.cc
@@ -66,7 +66,8 @@ int main (int argc, char ** argv)
 	  nrm = abs(scm-cm()()());
 	  std::vector<int> peer(4);
-	  int index=real(cm);
+	  Complex tmp  =cm;
 	  Integer index=real(tmp);
 	  Fine.CoorFromIndex(peer,index,latt_size);
 	  if (nrm > 0){
--- a/tests/Test_cshift_red_black.cc
+++ b/tests/Test_cshift_red_black.cc
@@ -100,7 +100,8 @@ int main (int argc, char ** argv)
 	  double nrm = abs(scm-cm()()());
 	  std::vector<int> peer(4);
-	  int index=real(cm);
+	  Complex ctmp = cm;
 	  Integer index=real(ctmp);
 	  Fine.CoorFromIndex(peer,index,latt_size);
 	  if (nrm > 0){
@@ -138,7 +139,8 @@ int main (int argc, char ** argv)
 	  nrm = abs(scm-cm()()());
 	  std::vector<int> peer(4);
-	  int index=real(cm);
+	  Complex ctmp=cm;
 	  Integer index=real(ctmp);
 	  Fine.CoorFromIndex(peer,index,latt_size);
 	  if (nrm > 0){
--- a/tests/Test_gamma.cc
+++ b/tests/Test_gamma.cc
@@ -95,7 +95,7 @@ int main (int argc, char ** argv)
  for(int mu=0;mu<6;mu++){
    result =  Gamma(g[mu])* ident * Gamma(g[mu]);
    result = result - ident;
-    double mag = TensorRemove(norm2(result));
+    RealD mag = norm2(result);
    std::cout << list[mu]<<" " << mag<<std::endl;
  }
@@ -103,7 +103,7 @@ int main (int argc, char ** argv)
  for(int mu=0;mu<6;mu++){
    result =          rnd * Gamma(g[mu]);
    result = result + rnd * Gamma(g[mu+6]);
-    double mag = TensorRemove(norm2(result));
+    RealD mag = norm2(result);
    std::cout << list[mu]<<" " << mag<<std::endl;
  }
@@ -111,7 +111,7 @@ int main (int argc, char ** argv)
  for(int mu=0;mu<6;mu++){
    result =           Gamma(g[mu])  *rnd;
    result = result +  Gamma(g[mu+6])*rnd;
-    double mag = TensorRemove(norm2(result));
+    RealD mag = norm2(result);
    std::cout << list[mu]<<" " << mag<<std::endl;
  }
--- a/tests/Test_lie_generators.cc
+++ b/tests/Test_lie_generators.cc
@@ -2,226 +2,91 @@
 #include <qcd/utils/CovariantCshift.h>
 #include <qcd/utils/WilsonLoops.h>
 #include <qcd/utils/SUn.h>
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 class suN {
 public:
  static int generators(int ncolour)   { return ncolour*ncolour-1; }
  static int su2subgroups(int ncolour) { return (ncolour*(ncolour-1))/2; }
  template<typename CComplex,int N> using suNmatrix = iScalar<iScalar<iMatrix<CComplex,N> > > ;
  ////////////////////////////////////////////////////////////////////////
  // There are N^2-1 generators for SU(N).
  //
  // We take a traceless hermitian generator basis as follows
  //
  // * Normalisation: trace ta tb = 1/2 delta_ab
  // 
  // * Off diagonal
  //    - pairs of rows i1,i2 behaving like pauli matrices signma_x, sigma_y
  //     
  //    - there are (Nc-1-i1) slots for i2 on each row [ x  0  x ]                                   
  //      direct count off each row                                                                    
  //
  //    - Sum of all pairs is Nc(Nc-1)/2: proof arithmetic series
  //
  //      (Nc-1) + (Nc-2)+...  1      ==> Nc*(Nc-1)/2 
  //      1+ 2+          +   + Nc-1                        
  // 
  //    - There are 2 x Nc (Nc-1)/ 2 of these = Nc^2 - Nc
  //
  //    - We enumerate the row-col pairs.
  //    - for each row col pair there is a (sigma_x) and a (sigma_y) like generator
  //
  //
  //   t^a_ij = { in 0.. Nc(Nc-1)/2 -1} =>  delta_{i,i1} delta_{j,i2} +  delta_{i,i1} delta_{j,i2}  
  //   t^a_ij = { in Nc(Nc-1)/2 ... Nc^(Nc-1) -1} =>  i delta_{i,i1} delta_{j,i2} - i delta_{i,i1} delta_{j,i2}  
  //   
  // * Diagonal; must be traceless and normalised
  //   - Sequence is 
  //   N  (1,-1,0,0...)
  //   N  (1, 1,-2,0...)
  //   N  (1, 1, 1,-3,0...)
  //   N  (1, 1, 1, 1,-4,0...)
  //
  //   where 1/2 = N^2 (1+.. m^2)etc.... for the m-th diagonal generator
  //   NB this gives the famous SU3 result for su2 index 8
  //
  //   N= sqrt(1/2 . 1/6 ) = 1/2 . 1/sqrt(3) 
  //
  //   ( 1      )
  //   (    1   ) / sqrt(3) /2  = 1/2 lambda_8
  //   (      -2)
  ////////////////////////////////////////////////////////////////////////
  template<class CComplex,int Ncolour> 
  static void suNgenerator(int lieIndex,suNmatrix<CComplex,Ncolour> &ta){
    // map lie index to which type of generator
    int diagIndex;
    int su2Index;
    int sigxy;
    int NNm1 =  Ncolour*(Ncolour-1);
    if ( lieIndex>= NNm1 ) {
      diagIndex = lieIndex -NNm1;
      suNgeneratorDiagonal(diagIndex,ta);
      return;
    }
    sigxy   = lieIndex&0x1;
    su2Index= lieIndex>>1;
    if ( sigxy ) suNgeneratorSigmaY(su2Index,ta);
    else         suNgeneratorSigmaX(su2Index,ta);
  }
  template<class CComplex,int Ncolour> 
  static void suNgeneratorSigmaX(int su2Index,suNmatrix<CComplex,Ncolour> &ta){
    ta=zero;
    int i1,i2;
    su2SubGroupIndex<Ncolour>(i1,i2,su2Index);
    ta()()(i1,i2)=1.0;
    ta()()(i2,i1)=1.0;
    ta= ta *0.5;
  }
  template<class CComplex,int Ncolour> 
  static void suNgeneratorSigmaY(int su2Index,suNmatrix<CComplex,Ncolour> &ta){
    ta=zero;
    Complex i(0.0,1.0);
    int i1,i2;
    su2SubGroupIndex<Ncolour>(i1,i2,su2Index);
    ta()()(i1,i2)=-i;
    ta()()(i2,i1)= i;
    ta= ta *0.5;
  }
  template<class CComplex,int Ncolour> 
  static void suNgeneratorDiagonal(int diagIndex,suNmatrix<CComplex,Ncolour> &ta){
    ta=zero;
    int trsq=0;
    int last=diagIndex+1;
    for(int i=0;i<=diagIndex;i++){
      ta()()(i,i) = 1.0;
      trsq++;
    }
    ta()()(last,last) = -last;
    trsq+=last*last;
    RealD nrm = 1.0/std::sqrt(2.0*trsq);
    ta = ta *nrm;
  }
  ////////////////////////////////////////////////////////////////////////
  // Map a 
  //
  ////////////////////////////////////////////////////////////////////////
  template<int Ncolour>
  static void su2SubGroupIndex(int &i1,int &i2,int su2_index){
    assert( (su2_index>=0) && (su2_index< (Ncolour*(Ncolour-1))/2) );
    int spare=su2_index;
    for(i1=0;spare >= (Ncolour-1-i1);i1++ ){
      spare = spare - (Ncolour-1-i1);  // remove the Nc-1-i1 terms                                 
    }
    i2=i1+1+spare;
  }
  template<class CComplex,int Ncolour>
  static void su2Extract(std::vector<LatticeComplex> &r,const Lattice<suNmatrix<CComplex,Ncolour> > &source, int su2_index)
  {
    assert(r.size() == 4); // store in 4 real parts
    for(int i=0;i<4;i++){
      conformable(r[i],source);
    }
    int i1,i2;    
    su2SubGroupIndex<Ncolour>(i1,i2,su2_index);
    /* Compute the b(k) of A_SU(2) = b0 + i sum_k bk sigma_k */ 
    r[0] = real(source()()(i1,i1) + source()()(i2,i2));
    r[1] = imag(source()()(i1,i2) + source()()(i2,i1));
    r[2] = real(source()()(i1,i2) - source()()(i2,i1));
    r[3] = imag(source()()(i1,i1) - source()()(i2,i2));
  }
  template<int Ncolour> static void printGenerators(void)
  {
    for(int gen=0;gen<suN::generators(Ncolour);gen++){
      suN::suNmatrix<Complex,Ncolour> ta;
      suN::suNgenerator(gen,ta);
      std::cout<< "Nc = "<<Ncolour<<" t_"<<gen<<std::endl;
      std::cout<<ta<<std::endl;
    }
  }
  template<int Ncolour> static void testGenerators(void){
    suNmatrix<Complex,Ncolour> ta;
    suNmatrix<Complex,Ncolour> tb;
    std::cout<<"Checking trace ta tb is 0.5 delta_ab"<<std::endl;
    for(int a=0;a<generators(Ncolour);a++){
      for(int b=0;b<generators(Ncolour);b++){
 	suNgenerator(a,ta);
 	suNgenerator(b,tb);
 	Complex tr =TensorRemove(trace(ta*tb)); 
 	std::cout<<tr<<" ";
 	if(a==b) assert(abs(tr-Complex(0.5))<1.0e-6);
 	if(a!=b) assert(abs(tr)<1.0e-6);
      }
      std::cout<<std::endl;
    }
    std::cout<<"Checking hermitian"<<std::endl;
    for(int a=0;a<generators(Ncolour);a++){
      suNgenerator(a,ta);
      std::cout<<a<<" ";
      assert(norm2(ta-adj(ta))<1.0e-6);
    }    
    std::cout<<std::endl;
    std::cout<<"Checking traceless"<<std::endl;
    for(int a=0;a<generators(Ncolour);a++){
      suNgenerator(a,ta);
      Complex tr =TensorRemove(trace(ta)); 
      std::cout<<a<<" ";
      assert(abs(tr)<1.0e-6);
    }    
    std::cout<<std::endl;
  }
 };
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
-
+  std::vector<int> latt({4,4,4,8});
-  std::vector<int> simd_layout = GridDefaultSimd(4,vComplexF::Nsimd());
+  GridCartesian * grid = SpaceTimeGrid::makeFourDimGrid(latt, 
-  std::vector<int> mpi_layout  = GridDefaultMpi();
+							GridDefaultSimd(Nd,vComplexF::Nsimd()),
-  std::vector<int> latt_size  ({4,4,4,4});
+							GridDefaultMpi());
-
+  
-  GridCartesian     Fine(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(grid);
  LatticeGaugeField Umu(&Fine);
  std::cout<<"*********************************************"<<std::endl;
  std::cout<<"* Generators for SU(2)"<<std::endl;
  std::cout<<"*********************************************"<<std::endl;
-  suN::printGenerators<2>();
+  SU2::printGenerators();
-  suN::testGenerators<2>();
+  SU2::testGenerators();
  std::cout<<"*********************************************"<<std::endl;
  std::cout<<"* Generators for SU(3)"<<std::endl;
  std::cout<<"*********************************************"<<std::endl;
-  suN::printGenerators<3>();
+  SU3::printGenerators();
-  suN::testGenerators<3>();
+  SU3::testGenerators();
-  std::cout<<"*********************************************"<<std::endl;
+
-  std::cout<<"* Generators for SU(4)"<<std::endl;
+  //  std::cout<<"*********************************************"<<std::endl;
-  std::cout<<"*********************************************"<<std::endl;
+  //  std::cout<<"* Generators for SU(4)"<<std::endl;
-  suN::printGenerators<4>();
+  //  std::cout<<"*********************************************"<<std::endl;
-  suN::testGenerators<4>();
+  //  SU4::printGenerators();
-  std::cout<<"*********************************************"<<std::endl;
+  //  SU4::testGenerators();
-  std::cout<<"* Generators for SU(5)"<<std::endl;
+
-  std::cout<<"*********************************************"<<std::endl;
+  //  std::cout<<"*********************************************"<<std::endl;
-  suN::printGenerators<5>();
+  //  std::cout<<"* Generators for SU(5)"<<std::endl;
-  suN::testGenerators<5>();
+  //  std::cout<<"*********************************************"<<std::endl;
  //  SU5::printGenerators();
  //  SU5::testGenerators();
  ///////////////////////////////
  // Configuration of known size
  ///////////////////////////////
  NerscField header;
  std::string file("./ckpoint_lat.400");
  LatticeGaugeField Umu(grid);
  //  readNerscConfiguration(Umu,header,file);
  Umu=1.0; // Cold start
  // RNG set up for test
  std::vector<int> pseeds({1,2,3,4,5}); // once I caught a fish alive
  std::vector<int> sseeds({6,7,8,9,10});// then i let it go again
  GridParallelRNG  pRNG(grid); pRNG.SeedFixedIntegers(pseeds);
  GridSerialRNG    sRNG;       sRNG.SeedFixedIntegers(sseeds);
  // SU3 colour operatoions
  LatticeColourMatrix link(grid);
  LatticeColourMatrix staple(grid);
  int mu=0;
  // Get Staple
  ColourWilsonLoops::Staple(staple,Umu,mu);
  // Get Link
  link = peekIndex<LorentzIndex>(Umu,mu);
  // Apply heatbath to the link
  RealD beta=6.0;
  int subgroup=0;
  int nhb=1;
  int trials=0;
  int fails=0;
  LatticeInteger one(rbGrid);  one = 1; // fill with ones
  LatticeInteger mask(grid);   mask= zero;
  one.checkerboard=Even;
  setCheckerboard(mask,one);
  // update Even checkerboard
  SU3::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,
 			nhb,trials,fails,mask);
  Grid_finalize();
 }
--- a/tests/Test_main.cc
+++ b/tests/Test_main.cc
@@ -242,7 +242,7 @@ int main (int argc, char ** argv)
    { // Peek-ology and Poke-ology, with a little app-ology
-      TComplex      c;
+      Complex      c;
      ColourMatrix c_m;   
      SpinMatrix   s_m;   
      SpinColourMatrix sc_m; 
@@ -299,7 +299,7 @@ int main (int argc, char ** argv)
    }
    Bar = zero;
-    Bar = where(lex<10,Foo,Bar);
+    Bar = where(lex<Integer(10),Foo,Bar);
    {
      std::vector<int> coor(4);
      for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
@@ -467,7 +467,8 @@ int main (int argc, char ** argv)
        mdiff = shifted1-shifted2;
        amdiff=adj(mdiff);
        ColourMatrix prod = amdiff*mdiff;
-        Real Ttr=real(trace(prod));
+	Complex trprod = trace(prod);
        Real Ttr=real(trprod);
        double nn=Ttr;
        if ( nn > 0 )
            cout<<"Shift real trace fail "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<endl;