Corrected bug in integer multiplications for SSE4 and AVX2

Merge remote-tracking branch 'upstream/master' Conflicts: tests/Make.inc
2025-10-16 06:04:43 +01:00 · 2015-06-16 23:34:45 +09:00
parent f7a1cef15b 55d7483608
commit 6e5db0b1da
37 changed files with 1341 additions and 515 deletions
--- a/10
+++ b/10
@@ -1,11 +1,15 @@
 ================================================================
 *** Hacks and bug fixes to clean up and Audits
 ================================================================
-* Base class to share common code between vRealF, VComplexF etc... done
-  - Performance check on Guido's reimplementation strategy  - (GUIDO) tested and no difference was found, merged
+
+*  Extract/merge/set cleanup ; too many variants; rationalise and call simpler ones
+*  Used #define repetitive sequences to minimise code.
+*  Rewrite core tensor arithmetic support to be more systematic
+*  Ensure we ET as much as possible; move unop functions into ET framework.
+   - tests with expression args to all functions
+

 * FIXME audit
-
 * const audit

 Insert/Extract
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@@ -33,7 +33,7 @@ public:
  ~alignedAllocator() throw() { }

  pointer       address(reference __x)       const { return &__x; }
-  const_pointer address(const_reference __x) const { return &__x; }
+  //  const_pointer address(const_reference __x) const { return &__x; }

  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }

--- a/lib/Make.inc
+++ b/lib/Make.inc
@@ -1,4 +1,4 @@

-HFILES=./Cshift.h ./simd/Grid_avx.h ./simd/Grid_vector_types.h ./simd/Grid_sse4.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_vector_unops.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./Tensors.h ./Algorithms.h ./communicator/Communicator_base.h ./lattice/Lattice_rng.h ./lattice/Lattice_reduction.h ./lattice/Lattice_transfer.h ./lattice/Lattice_unary.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_comparison.h ./lattice/Lattice_overload.h ./lattice/Lattice_reality.h ./lattice/Lattice_local.h ./lattice/Lattice_conformable.h ./lattice/Lattice_where.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_ET.h ./lattice/Lattice_transpose.h ./lattice/Lattice_trace.h ./Stencil.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_poke.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_class.h ./tensors/Tensor_transpose.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_outer.h ./tensors/Tensor_inner.h ./tensors/Tensor_traits.h ./tensors/Tensor_Ta.h ./tensors/Tensor_unary.h ./tensors/Tensor_peek.h ./tensors/Tensor_arith.h ./tensors/Tensor_extract_merge.h ./Communicator.h ./Cartesian.h ./parallelIO/NerscIO.h ./qcd/QCD.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/LinalgUtils.h ./qcd/utils/CovariantCshift.h ./qcd/utils/WilsonLoops.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/gauge/GaugeActionBase.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/spin/TwoSpinor.h ./qcd/spin/Dirac.h ./cshift/Cshift_common.h ./cshift/Cshift_none.h ./cshift/Cshift_mpi.h ./Simd.h ./GridConfig.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_red_black.h ./cartesian/Cartesian_full.h ./AlignedAllocator.h ./Lattice.h ./Threads.h ./Grid.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Remez.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./algorithms/CoarsenedMatrix.h ./stencil/Lebesgue.h
+HFILES=./Cshift.h ./simd/Grid_avx.h ./simd/Grid_vector_types.h ./simd/Grid_sse4.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_vector_unops.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./Tensors.h ./Algorithms.h ./communicator/Communicator_base.h ./lattice/Lattice_rng.h ./lattice/Lattice_reduction.h ./lattice/Lattice_transfer.h ./lattice/Lattice_unary.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_comparison.h ./lattice/Lattice_overload.h ./lattice/Lattice_reality.h ./lattice/Lattice_local.h ./lattice/Lattice_conformable.h ./lattice/Lattice_where.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_ET.h ./lattice/Lattice_transpose.h ./lattice/Lattice_trace.h ./Stencil.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_poke.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_class.h ./tensors/Tensor_logical.h ./tensors/Tensor_transpose.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_outer.h ./tensors/Tensor_inner.h ./tensors/Tensor_traits.h ./tensors/Tensor_Ta.h ./tensors/Tensor_unary.h ./tensors/Tensor_peek.h ./tensors/Tensor_arith.h ./tensors/Tensor_extract_merge.h ./Communicator.h ./Cartesian.h ./parallelIO/NerscIO.h ./qcd/QCD.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/LinalgUtils.h ./qcd/utils/CovariantCshift.h ./qcd/utils/WilsonLoops.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/gauge/GaugeActionBase.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/spin/TwoSpinor.h ./qcd/spin/Dirac.h ./cshift/Cshift_common.h ./cshift/Cshift_none.h ./cshift/Cshift_mpi.h ./Simd.h ./GridConfig.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_red_black.h ./cartesian/Cartesian_full.h ./AlignedAllocator.h ./Lattice.h ./Threads.h ./Grid.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Remez.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./algorithms/CoarsenedMatrix.h ./stencil/Lebesgue.h

 CCFILES=./qcd/utils/SpaceTimeGrid.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/spin/Dirac.cc ./GridInit.cc ./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
--- a/lib/Tensors.h
+++ b/lib/Tensors.h
@@ -14,5 +14,6 @@
 #include <tensors/Tensor_reality.h>
 #include <tensors/Tensor_unary.h>
 #include <tensors/Tensor_extract_merge.h>
-    
+#include <tensors/Tensor_logical.h>
+
 #endif
--- a/lib/algorithms/CoarsenedMatrix.h
+++ b/lib/algorithms/CoarsenedMatrix.h
@@ -183,7 +183,7 @@ namespace Grid {
 	  int dir   = geom.directions[p];
 	  int disp  = geom.displacements[p];

-	  int block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
+	  Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);

 	  LatticeCoordinate(coor,dir);

@@ -204,8 +204,8 @@ namespace Grid {
 	    oblock = where(mod(coor,block)==(block-1),Mphi,zz);
 	    iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
 	  } else if ( disp==-1 ) {
-	    oblock = where(mod(coor,block)==0,Mphi,zz);
-	    iblock = where(mod(coor,block)!=0,Mphi,zz);
+	    oblock = where(mod(coor,block)==(Integer)0,Mphi,zz);
+	    iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz);
 	  } else {
 	    assert(0);
 	  }
--- a/lib/lattice/Lattice_ET.h
+++ b/lib/lattice/Lattice_ET.h
@@ -9,6 +9,37 @@

 namespace Grid {

+  ////////////////////////////////////////////////////
+  // Predicated where support
+  ////////////////////////////////////////////////////
+  template<class iobj,class vobj,class robj>
+    inline vobj predicatedWhere(const iobj &predicate,const vobj &iftrue,const robj &iffalse) {
+
+    typename std::remove_const<vobj>::type ret;
+
+    typedef typename vobj::scalar_object scalar_object;
+    typedef typename vobj::scalar_type scalar_type;
+    typedef typename vobj::vector_type vector_type;
+
+    const int Nsimd = vobj::vector_type::Nsimd();
+    const int words = sizeof(vobj)/sizeof(vector_type);
+
+    std::vector<Integer> mask(Nsimd);
+    std::vector<scalar_object> truevals (Nsimd);
+    std::vector<scalar_object> falsevals(Nsimd);
+
+    extract(iftrue   ,truevals);
+    extract(iffalse  ,falsevals);
+    extract<vInteger,Integer>(TensorRemove(predicate),mask);
+
+    for(int s=0;s<Nsimd;s++){
+      if (mask[s]) falsevals[s]=truevals[s];
+    }
+
+    merge(ret,falsevals);
+    return ret;
+  }
+
 ////////////////////////////////////////////
 // recursive evaluation of expressions; Could
 // switch to generic approach with variadics, a la
@@ -142,10 +173,23 @@ template <class arg> struct name\
 };

 GridUnopClass(UnarySub,-a);
+GridUnopClass(UnaryNot,Not(a));
 GridUnopClass(UnaryAdj,adj(a));
 GridUnopClass(UnaryConj,conjugate(a));
 GridUnopClass(UnaryTrace,trace(a));
 GridUnopClass(UnaryTranspose,transpose(a));
+GridUnopClass(UnaryTa,Ta(a));
+GridUnopClass(UnaryReal,real(a));
+GridUnopClass(UnaryImag,imag(a));
+GridUnopClass(UnaryToReal,toReal(a));
+GridUnopClass(UnaryToComplex,toComplex(a));
+GridUnopClass(UnaryAbs,abs(a));
+GridUnopClass(UnarySqrt,sqrt(a));
+GridUnopClass(UnaryRsqrt,rsqrt(a));
+GridUnopClass(UnarySin,sin(a));
+GridUnopClass(UnaryCos,cos(a));
+GridUnopClass(UnaryLog,log(a));
+GridUnopClass(UnaryExp,exp(a));

 ////////////////////////////////////////////
 // Binary operators
@@ -163,6 +207,28 @@ GridBinOpClass(BinaryAdd,lhs+rhs);
 GridBinOpClass(BinarySub,lhs-rhs);
 GridBinOpClass(BinaryMul,lhs*rhs);

+GridBinOpClass(BinaryAnd   ,lhs&rhs);
+GridBinOpClass(BinaryOr    ,lhs|rhs);
+GridBinOpClass(BinaryAndAnd,lhs&&rhs);
+GridBinOpClass(BinaryOrOr  ,lhs||rhs);
+
+////////////////////////////////////////////////////
+// Trinary conditional op
+////////////////////////////////////////////////////
+#define GridTrinOpClass(name,combination)\
+template <class predicate,class left, class right>	\
+struct name\
+{\
+  static auto inline func(const predicate &pred,const left &lhs,const right &rhs)-> decltype(combination) const \
+    {\
+      return combination;\
+    }\
+}
+
+GridTrinOpClass(TrinaryWhere,(predicatedWhere<predicate, \
+			       typename std::remove_reference<left>::type, \
+			       typename std::remove_reference<right>::type> (pred,lhs,rhs)));
+
 ////////////////////////////////////////////
 // Operator syntactical glue
 ////////////////////////////////////////////
@@ -218,15 +284,67 @@ template <typename T1,typename T2,typename T3> inline auto op(const T1 &pred,con
 ////////////////////////

 GRID_DEF_UNOP(operator -,UnarySub);
+GRID_DEF_UNOP(Not,UnaryNot);
+GRID_DEF_UNOP(operator !,UnaryNot);
 GRID_DEF_UNOP(adj,UnaryAdj);
 GRID_DEF_UNOP(conjugate,UnaryConj);
 GRID_DEF_UNOP(trace,UnaryTrace);
 GRID_DEF_UNOP(transpose,UnaryTranspose);
+GRID_DEF_UNOP(Ta,UnaryTa);
+GRID_DEF_UNOP(real,UnaryReal);
+GRID_DEF_UNOP(imag,UnaryImag);
+GRID_DEF_UNOP(toReal,UnaryToReal);
+GRID_DEF_UNOP(toComplex,UnaryToComplex);
+GRID_DEF_UNOP(abs  ,UnaryAbs); //abs overloaded in cmath C++98; DON'T do the abs-fabs-dabs-labs thing
+GRID_DEF_UNOP(sqrt ,UnarySqrt);
+GRID_DEF_UNOP(rsqrt,UnarySqrt);
+GRID_DEF_UNOP(sin  ,UnarySin);
+GRID_DEF_UNOP(cos  ,UnaryCos);
+GRID_DEF_UNOP(log  ,UnaryLog);
+GRID_DEF_UNOP(exp  ,UnaryExp);

 GRID_DEF_BINOP(operator+,BinaryAdd);
 GRID_DEF_BINOP(operator-,BinarySub);
 GRID_DEF_BINOP(operator*,BinaryMul);

+GRID_DEF_BINOP(operator&,BinaryAnd);
+GRID_DEF_BINOP(operator|,BinaryOr);
+GRID_DEF_BINOP(operator&&,BinaryAndAnd);
+GRID_DEF_BINOP(operator||,BinaryOrOr);
+
+GRID_DEF_TRINOP(where,TrinaryWhere);
+
+/////////////////////////////////////////////////////////////
+// Closure convenience to force expression to evaluate
+/////////////////////////////////////////////////////////////
+template<class Op,class T1>
+  auto closure(const LatticeUnaryExpression<Op,T1> & expr)
+  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))>
+{
+  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))> ret(expr);
+  return ret;
+}
+template<class Op,class T1, class T2>
+  auto closure(const LatticeBinaryExpression<Op,T1,T2> & expr)
+  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+				      eval(0,std::get<1>(expr.second))))>
+{
+  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+				   eval(0,std::get<1>(expr.second))))> ret(expr);
+  return ret;
+}
+template<class Op,class T1, class T2, class T3>
+  auto closure(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
+  -> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+				      eval(0,std::get<1>(expr.second)),
+				      eval(0,std::get<2>(expr.second))))>
+{
+  Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+				   eval(0,std::get<1>(expr.second)),
+				   eval(0,std::get<2>(expr.second))))> ret(expr);
+  return ret;
+}
+
 #undef GRID_UNOP
 #undef GRID_BINOP
 #undef GRID_TRINOP
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -93,7 +93,7 @@ PARALLEL_FOR_LOOP
    }
    return *this;
  }
-  template <typename Op, typename T1,typename T2>             strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
+  template <typename Op, typename T1,typename T2> strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
  {
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
@@ -131,8 +131,8 @@ PARALLEL_FOR_LOOP
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<_grid->oSites();ss++){
 #ifdef STREAMING_STORES
-      vobj tmp = eval(ss,expr);
-      vstream(_odata[ss] ,tmp);
+      //vobj tmp = eval(ss,expr);
+      vstream(_odata[ss] ,eval(ss,expr));
 #else
      _odata[ss] = eval(ss,expr);
 #endif
@@ -196,12 +196,7 @@ PARALLEL_FOR_LOOP
    _odata.resize(_grid->oSites());
 PARALLEL_FOR_LOOP
    for(int ss=0;ss<_grid->oSites();ss++){
-#ifdef STREAMING_STORES
-      vobj tmp = eval(ss,expr);
-      vstream(_odata[ss] ,tmp);
-#else
      _odata[ss]=eval(ss,expr);
-#endif
    }
  };

@@ -254,7 +249,7 @@ PARALLEL_FOR_LOOP
        Lattice<vobj> ret(lhs._grid);
 PARALLEL_FOR_LOOP
        for(int ss=0;ss<lhs._grid->oSites();ss++){
-            ret._odata[ss] = lhs._odata[ss]/rhs._odata[ss];
+	  ret._odata[ss] = lhs._odata[ss]*pow(rhs._odata[ss],-1.0);
        }
        return ret;
    };
--- a/lib/lattice/Lattice_comparison.h
+++ b/lib/lattice/Lattice_comparison.h
@@ -11,7 +11,11 @@ namespace Grid {
    //Query supporting bitwise &, |, ^, !
    //Query supporting logical &&, ||, 
    //////////////////////////////////////////////////////////////////////////
-    template<class vfunctor,class lobj,class robj> 
+
+  //////////////////////////////////////////////////////////////////////////
+  // compare lattice to lattice
+  //////////////////////////////////////////////////////////////////////////
+  template<class vfunctor,class lobj,class robj>  
    inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
    {
      Lattice<vInteger> ret(rhs._grid);
@@ -21,6 +25,9 @@ PARALLEL_FOR_LOOP
        }
        return ret;
    }
+  //////////////////////////////////////////////////////////////////////////
+  // compare lattice to scalar
+  //////////////////////////////////////////////////////////////////////////
    template<class vfunctor,class lobj,class robj> 
    inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
    {
@@ -31,6 +38,9 @@ PARALLEL_FOR_LOOP
        }
        return ret;
    }
+  //////////////////////////////////////////////////////////////////////////
+  // compare scalar to lattice
+  //////////////////////////////////////////////////////////////////////////
    template<class vfunctor,class lobj,class robj> 
    inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
    {
@@ -42,6 +52,9 @@ PARALLEL_FOR_LOOP
        return ret;
    }

+  //////////////////////////////////////////////////////////////////////////
+  // Map to functors
+  //////////////////////////////////////////////////////////////////////////
    // Less than
   template<class lobj,class robj>
   inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
@@ -99,7 +112,6 @@ PARALLEL_FOR_LOOP
     return SLComparison(vge<lobj,robj>(),lhs,rhs);
   }

-
   // equal
   template<class lobj,class robj>
   inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
--- a/lib/lattice/Lattice_comparison_utils.h
+++ b/lib/lattice/Lattice_comparison_utils.h
@@ -5,140 +5,197 @@ namespace Grid {

  /////////////////////////////////////////
  // This implementation is a bit poor.
-  // Only support logical operations (== etc)
-  // on scalar objects. Strip any tensor structures.
+  //
+  // Only support relational logical operations (<, >  etc)
+  // on scalar objects. Therefore can strip any tensor structures.
+  //
  // Should guard this with isGridTensor<> enable if?
  /////////////////////////////////////////
-    // Generic list of functors
-    template<class lobj,class robj> class veq {
+  //
+  // Generic list of functors
+  //
+  template<class lobj,class robj> class veq {
+  public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) == (rhs);
+    }
+  };
+  template<class lobj,class robj> class vne {
+  public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) != (rhs);
+    }
+  };
+  template<class lobj,class robj> class vlt {
+  public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) < (rhs);
+    }
+  };
+  template<class lobj,class robj> class vle {
+  public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) <= (rhs);
+    }
+  };
+  template<class lobj,class robj> class vgt {
+  public:
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) > (rhs);
+    }
+  };
+  template<class lobj,class robj> class vge {
    public:
-      vInteger operator()(const lobj &lhs, const robj &rhs)
-	{ 
-	  return TensorRemove(lhs) == TensorRemove(rhs);
-	}
-    };
-    template<class lobj,class robj> class vne {
-    public:
-      vInteger operator()(const lobj &lhs, const robj &rhs)
-	{ 
-	  return TensorRemove(lhs) != TensorRemove(rhs);
-	}
-    };
-    template<class lobj,class robj> class vlt {
-    public:
-      vInteger operator()(const lobj &lhs, const robj &rhs)
-	{ 
-	  return TensorRemove(lhs) < TensorRemove(rhs);
-	}
-    };
-    template<class lobj,class robj> class vle {
-    public:
-      vInteger operator()(const lobj &lhs, const robj &rhs)
-	{ 
-	  return TensorRemove(lhs) <= TensorRemove(rhs);
-	}
-    };
-    template<class lobj,class robj> class vgt {
-    public:
-      vInteger operator()(const lobj &lhs, const robj &rhs)
-	{ 
-	  return TensorRemove(lhs) > TensorRemove(rhs);
-	}
-    };
-    template<class lobj,class robj> class vge {
-    public:
-      vInteger operator()(const lobj &lhs, const robj &rhs)
-	{ 
-	  return TensorRemove(lhs) >= TensorRemove(rhs);
-	}
-    };
+    vInteger operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) >= (rhs);
+    }
+  };
+  
+  // Generic list of functors
+  template<class lobj,class robj> class seq {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) == (rhs);
+    }
+  };
+  template<class lobj,class robj> class sne {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) != (rhs);
+    }
+  };
+  template<class lobj,class robj> class slt {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) < (rhs);
+    }
+  };
+  template<class lobj,class robj> class sle {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) <= (rhs);
+    }
+  };
+  template<class lobj,class robj> class sgt {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) > (rhs);
+    }
+  };
+  template<class lobj,class robj> class sge {
+  public:
+    Integer operator()(const lobj &lhs, const robj &rhs)
+    { 
+      return (lhs) >= (rhs);
+    }
+  };

-    // Generic list of functors
-    template<class lobj,class robj> class seq {
-    public:
-      Integer operator()(const lobj &lhs, const robj &rhs)
-	{ 
-	  return TensorRemove(lhs) == TensorRemove(rhs);
-	}
-    };
-    template<class lobj,class robj> class sne {
-    public:
-      Integer operator()(const lobj &lhs, const robj &rhs)
-	{ 
-	  return TensorRemove(lhs) != TensorRemove(rhs);
-	}
-    };
-    template<class lobj,class robj> class slt {
-    public:
-      Integer operator()(const lobj &lhs, const robj &rhs)
-	{ 
-	  return TensorRemove(lhs) < TensorRemove(rhs);
-	}
-    };
-    template<class lobj,class robj> class sle {
-    public:
-      Integer operator()(const lobj &lhs, const robj &rhs)
-	{ 
-	  return TensorRemove(lhs) <= TensorRemove(rhs);
-	}
-    };
-    template<class lobj,class robj> class sgt {
-    public:
-      Integer operator()(const lobj &lhs, const robj &rhs)
-	{ 
-	  return TensorRemove(lhs) > TensorRemove(rhs);
-	}
-    };
-    template<class lobj,class robj> class sge {
-    public:
-      Integer operator()(const lobj &lhs, const robj &rhs)
-	{ 
-	  return TensorRemove(lhs) >= TensorRemove(rhs);
-	}
-    };
-
-
-    //////////////////////////////////////////////////////////////////////////////////////////////////////
-    // Integer gets extra relational functions. Could also implement these for RealF, RealD etc..
-    //////////////////////////////////////////////////////////////////////////////////////////////////////
-    template<class sfunctor> 
-    inline vInteger Comparison(sfunctor sop,const vInteger & lhs, const vInteger & rhs)
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Integer and real get extra relational functions.
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> 
+    inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
    {
-      std::vector<Integer> vlhs(vInteger::Nsimd());   // Use functors to reduce this to single implementation
-      std::vector<Integer> vrhs(vInteger::Nsimd());
+      typedef typename vsimd::scalar_type scalar;
+      std::vector<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
+      std::vector<scalar> vrhs(vsimd::Nsimd());
+      std::vector<Integer> vpred(vsimd::Nsimd());
      vInteger ret;
-      extract<vInteger,Integer>(lhs,vlhs);
-      extract<vInteger,Integer>(rhs,vrhs);
-      for(int s=0;s<vInteger::Nsimd();s++){
-	vlhs[s] = sop(vlhs[s],vrhs[s]);
+      extract<vsimd,scalar>(lhs,vlhs);
+      extract<vsimd,scalar>(rhs,vrhs);
+      for(int s=0;s<vsimd::Nsimd();s++){
+	vpred[s] = sop(vlhs[s],vrhs[s]);
      }
-      merge<vInteger,Integer>(ret,vlhs);
+      merge<vInteger,Integer>(ret,vpred);
      return ret;
    }
-    inline vInteger operator < (const vInteger & lhs, const vInteger & rhs)
+
+  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> 
+    inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
    {
-      return Comparison(slt<Integer,Integer>(),lhs,rhs);
+      typedef typename vsimd::scalar_type scalar;
+      std::vector<scalar> vlhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
+      std::vector<Integer> vpred(vsimd::Nsimd());
+      vInteger ret;
+      extract<vsimd,scalar>(lhs,vlhs);
+      for(int s=0;s<vsimd::Nsimd();s++){
+	vpred[s] = sop(vlhs[s],rhs);
+      }
+      merge<vInteger,Integer>(ret,vpred);
+      return ret;
    }
-    inline vInteger operator <= (const vInteger & lhs, const vInteger & rhs)
+
+  template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> 
+    inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
    {
-      return Comparison(sle<Integer,Integer>(),lhs,rhs);
-    }
-    inline vInteger operator > (const vInteger & lhs, const vInteger & rhs)
-    {
-      return Comparison(sgt<Integer,Integer>(),lhs,rhs);
-    }
-    inline vInteger operator >= (const vInteger & lhs, const vInteger & rhs)
-    {
-      return Comparison(sge<Integer,Integer>(),lhs,rhs);
-    }
-    inline vInteger operator == (const vInteger & lhs, const vInteger & rhs)
-    {
-      return Comparison(seq<Integer,Integer>(),lhs,rhs);
-    }
-    inline vInteger operator != (const vInteger & lhs, const vInteger & rhs)
-    {
-      return Comparison(sne<Integer,Integer>(),lhs,rhs);
+      typedef typename vsimd::scalar_type scalar;
+      std::vector<scalar> vrhs(vsimd::Nsimd());   // Use functors to reduce this to single implementation
+      std::vector<Integer> vpred(vsimd::Nsimd());
+      vInteger ret;
+      extract<vsimd,scalar>(rhs,vrhs);
+      for(int s=0;s<vsimd::Nsimd();s++){
+	vpred[s] = sop(lhs,vrhs[s]);
+      }
+      merge<vInteger,Integer>(ret,vpred);
+      return ret;
    }
+
+#define DECLARE_RELATIONAL(op,functor) \
+  template<class vsimd,IfSimd<vsimd> = 0>\
+    inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
+    {\
+      typedef typename vsimd::scalar_type scalar;\
+      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
+    }\
+  template<class vsimd,IfSimd<vsimd> = 0>\
+    inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
+    {\
+      typedef typename vsimd::scalar_type scalar;\
+      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
+    }\
+  template<class vsimd,IfSimd<vsimd> = 0>\
+    inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
+    {\
+      typedef typename vsimd::scalar_type scalar;\
+      return Comparison(functor<scalar,scalar>(),lhs,rhs);\
+    }\
+  template<class vsimd>\
+    inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
+    {									\
+      return lhs._internal op rhs._internal;				\
+    }									\
+  template<class vsimd>\
+    inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
+    {									\
+      return lhs._internal op rhs;					\
+    }									\
+  template<class vsimd>\
+    inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
+    {									\
+      return lhs op rhs._internal;					\
+    }									
+
+
+DECLARE_RELATIONAL(<,slt);
+DECLARE_RELATIONAL(<=,sle);
+DECLARE_RELATIONAL(>,sgt);
+DECLARE_RELATIONAL(>=,sge);
+DECLARE_RELATIONAL(==,seq);
+DECLARE_RELATIONAL(!=,sne);
+
+#undef DECLARE_RELATIONAL
+
 }


--- a/lib/lattice/Lattice_peekpoke.h
+++ b/lib/lattice/Lattice_peekpoke.h
@@ -117,7 +117,9 @@ PARALLEL_FOR_LOOP
      int Nsimd = grid->Nsimd();

      assert( l.checkerboard == l._grid->CheckerBoard(site));
-      assert( sizeof(sobj)*Nsimd == sizeof(vobj));
+
+      // FIXME
+      //      assert( sizeof(sobj)*Nsimd == sizeof(vobj));

      int rank,odx,idx;
      grid->GlobalCoorToRankIndex(rank,odx,idx,site);
@@ -132,6 +134,7 @@ PARALLEL_FOR_LOOP
      return;
    };

+
    //////////////////////////////////////////////////////////
    // Peek a scalar object from the SIMD array
    //////////////////////////////////////////////////////////
--- a/lib/lattice/Lattice_reality.h
+++ b/lib/lattice/Lattice_reality.h
@@ -27,37 +27,6 @@ PARALLEL_FOR_LOOP
        return ret;
    };

-    template<class vobj> inline auto real(const Lattice<vobj> &z) -> Lattice<vobj>
-    {
-      Lattice<vobj> ret(z._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<z._grid->oSites();ss++){
-            ret._odata[ss] = real(z._odata[ss]);
-        }
-      return ret;
-    }
-
-    template<class vobj> inline auto imag(const Lattice<vobj> &z) -> Lattice<vobj>
-    {
-      Lattice<vobj> ret(z._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<z._grid->oSites();ss++){
-            ret._odata[ss] = imag(z._odata[ss]);
-        }
-      return ret;
-    }
-
-
-    template<class vobj> inline auto Ta(const Lattice<vobj> &z) -> Lattice<decltype(Ta(z._odata[0]))>
-    {
-      Lattice<decltype(Ta(z._odata[0]))> ret(z._grid);
-PARALLEL_FOR_LOOP
-        for(int ss=0;ss<z._grid->oSites();ss++){
-            ret._odata[ss] = Ta(z._odata[ss]);
-        }
-      return ret;
-    }
-

 }
 #endif
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -51,6 +51,31 @@ PARALLEL_FOR_LOOP
      return nrm;
    }

+    template<class Op,class T1>
+      inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
+      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
+    {
+      return sum(closure(expr));
+    }
+
+    template<class Op,class T1,class T2>
+      inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
+      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
+    {
+      return sum(closure(expr));
+    }
+
+
+    template<class Op,class T1,class T2,class T3>
+      inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
+      ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
+				 eval(0,std::get<1>(expr.second)),
+				 eval(0,std::get<2>(expr.second))
+				 ))::scalar_object
+    {
+      return sum(closure(expr));
+    }
+
    template<class vobj>
    inline typename vobj::scalar_object sum(const Lattice<vobj> &arg){

--- a/lib/lattice/Lattice_transpose.h
+++ b/lib/lattice/Lattice_transpose.h
@@ -24,8 +24,7 @@ PARALLEL_FOR_LOOP
    // Index level dependent transpose
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    template<int Index,class vobj>
-    inline auto transposeIndex(const Lattice<vobj> &lhs)
-      -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
+    inline auto transposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
    {
      Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
 PARALLEL_FOR_LOOP
--- a/lib/lattice/Lattice_unary.h
+++ b/lib/lattice/Lattice_unary.h
@@ -3,51 +3,6 @@

 namespace Grid {

-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  //  avoid copy back routines for mult, mac, sub, add
-  //////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class obj> Lattice<obj> sqrt(const Lattice<obj> &rhs){
-    Lattice<obj> ret(rhs._grid);
-    ret.checkerboard = rhs.checkerboard;
-    conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
-      ret._odata[ss]=sqrt(rhs._odata[ss]);
-    }
-    return ret;
-  }
-  template<class obj> Lattice<obj> rsqrt(const Lattice<obj> &rhs){
-    Lattice<obj> ret(rhs._grid);
-    ret.checkerboard = rhs.checkerboard;
-    conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
-      ret._odata[ss]=rsqrt(rhs._odata[ss]);
-    }
-    return ret;
-  }
-  template<class obj> Lattice<obj> sin(const Lattice<obj> &rhs){
-    Lattice<obj> ret(rhs._grid);
-    ret.checkerboard = rhs.checkerboard;
-    conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
-      ret._odata[ss]=sin(rhs._odata[ss]);
-    }
-    return ret;
-  }
-  template<class obj> Lattice<obj> cos(const Lattice<obj> &rhs){
-    Lattice<obj> ret(rhs._grid);
-    ret.checkerboard = rhs.checkerboard;
-    conformable(ret,rhs);
-PARALLEL_FOR_LOOP
-    for(int ss=0;ss<rhs._grid->oSites();ss++){
-      ret._odata[ss]=cos(rhs._odata[ss]);
-    }
-    return ret;
-  }
-
-
  template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs,RealD y){
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
--- a/lib/lattice/Lattice_where.h
+++ b/lib/lattice/Lattice_where.h
@@ -8,13 +8,14 @@ namespace Grid {
 //                              and blow away the tensor structures.
 //
 template<class vobj,class iobj>
-inline void where(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
+inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
 {
  conformable(iftrue,iffalse);
  conformable(iftrue,predicate);
  conformable(iftrue,ret);

  GridBase *grid=iftrue._grid;
+
  typedef typename vobj::scalar_object scalar_object;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
@@ -43,7 +44,7 @@ PARALLEL_FOR_LOOP
 }

 template<class vobj,class iobj>
-inline Lattice<vobj> where(const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
+inline Lattice<vobj> whereWolf(const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
 {
  conformable(iftrue,iffalse);
  conformable(iftrue,predicate);
--- a/lib/qcd/QCD.h
+++ b/lib/qcd/QCD.h
@@ -250,6 +250,7 @@ namespace QCD {
    //////////////////////////////////////////////////////////////////////////////
    // Peek and Poke named after physics attributes
    //////////////////////////////////////////////////////////////////////////////
+
    //spin
    template<class vobj> auto peekSpin(const vobj &rhs,int i) -> decltype(peekIndex<SpinIndex>(rhs,0))
    {
@@ -289,20 +290,117 @@ namespace QCD {
    {
      return peekIndex<LorentzIndex>(rhs,i);
    }
-    template<class vobj> auto peekLorentz(const vobj &rhs,int i,int j) -> decltype(peekIndex<LorentzIndex>(rhs,0,0))
-    {
-      return peekIndex<LorentzIndex>(rhs,i,j);
-    }
    template<class vobj> auto peekLorentz(const Lattice<vobj> &rhs,int i) -> decltype(peekIndex<LorentzIndex>(rhs,0))
    {
      return peekIndex<LorentzIndex>(rhs,i);
    }
-    template<class vobj> auto peekLorentz(const Lattice<vobj> &rhs,int i,int j) -> decltype(peekIndex<LorentzIndex>(rhs,0,0))
+
+    //////////////////////////////////////////////
+    // Poke lattice
+    //////////////////////////////////////////////
+    template<class vobj> 
+      void pokeColour(Lattice<vobj> &lhs,
+		      const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0))> & rhs,
+		      int i)
    {
-      return peekIndex<LorentzIndex>(rhs,i,j);
+      pokeIndex<ColourIndex>(lhs,rhs,i);
+    }
+    template<class vobj> 
+      void pokeColour(Lattice<vobj> &lhs,
+		      const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0,0))> & rhs,
+		      int i,int j)
+    {
+      pokeIndex<ColourIndex>(lhs,rhs,i,j);
+    }
+    template<class vobj> 
+      void pokeSpin(Lattice<vobj> &lhs,
+		      const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0))> & rhs,
+		      int i)
+    {
+      pokeIndex<SpinIndex>(lhs,rhs,i);
+    }
+    template<class vobj> 
+      void pokeSpin(Lattice<vobj> &lhs,
+		      const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0,0))> & rhs,
+		      int i,int j)
+    {
+      pokeIndex<SpinIndex>(lhs,rhs,i,j);
+    }
+    template<class vobj> 
+      void pokeLorentz(Lattice<vobj> &lhs,
+		      const Lattice<decltype(peekIndex<LorentzIndex>(lhs._odata[0],0))> & rhs,
+		      int i)
+    {
+      pokeIndex<LorentzIndex>(lhs,rhs,i);
    }

-    // FIXME transpose Colour, transpose Spin, traceColour traceSpin
+    //////////////////////////////////////////////
+    // Poke scalars
+    //////////////////////////////////////////////
+
+    template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
+    {
+      pokeIndex<SpinIndex>(lhs,rhs,i);
+    }
+    template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0,0)) & rhs,int i,int j)
+    {
+      pokeIndex<SpinIndex>(lhs,rhs,i,j);
+    }
+
+    template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0)) & rhs,int i)
+    {
+      pokeIndex<ColourIndex>(lhs,rhs,i);
+    }
+    template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0,0)) & rhs,int i,int j)
+    {
+      pokeIndex<ColourIndex>(lhs,rhs,i,j);
+    }
+
+    template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<LorentzIndex>(lhs,0)) & rhs,int i)
+    {
+      pokeIndex<LorentzIndex>(lhs,rhs,i);
+    }
+
+
+    //////////////////////////////////////////////
+    // transpose array and scalar
+    //////////////////////////////////////////////
+    template<int Index,class vobj> inline Lattice<vobj> transposeSpin(const Lattice<vobj> &lhs){
+      return transposeIndex<SpinIndex>(lhs);
+    }
+    template<int Index,class vobj> inline Lattice<vobj> transposeColour(const Lattice<vobj> &lhs){
+      return transposeIndex<ColourIndex>(lhs);
+    }
+    template<int Index,class vobj> inline vobj transposeSpin(const vobj &lhs){
+      return transposeIndex<SpinIndex>(lhs);
+    }
+    template<int Index,class vobj> inline vobj transposeColour(const vobj &lhs){
+      return transposeIndex<ColourIndex>(lhs);
+    }
+
+    //////////////////////////////////////////
+    // Trace lattice and non-lattice
+    //////////////////////////////////////////
+    template<int Index,class vobj>
+    inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs._odata[0]))>
+    {
+      return traceIndex<SpinIndex>(lhs);
+    }
+    template<int Index,class vobj>
+    inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs._odata[0]))>
+    {
+      return traceIndex<ColourIndex>(lhs);
+    }
+    template<int Index,class vobj>
+    inline auto traceSpin(const vobj &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs))>
+    {
+      return traceIndex<SpinIndex>(lhs);
+    }
+    template<int Index,class vobj>
+    inline auto traceColour(const vobj &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs))>
+    {
+      return traceIndex<ColourIndex>(lhs);
+    }

 }   //namespace QCD
 } // Grid
--- a/lib/qcd/utils/SUn.h
+++ b/lib/qcd/utils/SUn.h
@@ -0,0 +1,414 @@
+#ifndef QCD_UTIL_SUN_H
+#define QCD_UTIL_SUN_H
+
+
+namespace Grid {
+  namespace QCD {
+
+template<int ncolour>
+class SU {
+public:
+
+  static int generators(void)   { return ncolour*ncolour-1; }
+  static int su2subgroups(void) { return (ncolour*(ncolour-1))/2; }
+
+  template<typename vtype> using iSUnMatrix              = iScalar<iScalar<iMatrix<vtype, ncolour> > > ;
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////
+  // Types can be accessed as SU<2>::Matrix , SU<2>::vSUnMatrix, SU<2>::LatticeMatrix etc...
+  //////////////////////////////////////////////////////////////////////////////////////////////////
+  typedef iSUnMatrix<Complex>         Matrix;
+  typedef iSUnMatrix<ComplexF>        MatrixF;
+  typedef iSUnMatrix<ComplexD>        MatrixD;
+
+  typedef iSUnMatrix<vComplex>       vMatrix;
+  typedef iSUnMatrix<vComplexF>      vMatrixF;
+  typedef iSUnMatrix<vComplexD>      vMatrixD;
+
+  typedef Lattice<vMatrix>     LatticeMatrix;
+  typedef Lattice<vMatrixF>    LatticeMatrixF;
+  typedef Lattice<vMatrixD>    LatticeMatrixD;
+
+  ////////////////////////////////////////////////////////////////////////
+  // There are N^2-1 generators for SU(N).
+  //
+  // We take a traceless hermitian generator basis as follows
+  //
+  // * Normalisation: trace ta tb = 1/2 delta_ab
+  // 
+  // * Off diagonal
+  //    - pairs of rows i1,i2 behaving like pauli matrices signma_x, sigma_y
+  //     
+  //    - there are (Nc-1-i1) slots for i2 on each row [ x  0  x ]                                   
+  //      direct count off each row                                                                    
+  //
+  //    - Sum of all pairs is Nc(Nc-1)/2: proof arithmetic series
+  //
+  //      (Nc-1) + (Nc-2)+...  1      ==> Nc*(Nc-1)/2 
+  //      1+ 2+          +   + Nc-1                        
+  // 
+  //    - There are 2 x Nc (Nc-1)/ 2 of these = Nc^2 - Nc
+  //
+  //    - We enumerate the row-col pairs.
+  //    - for each row col pair there is a (sigma_x) and a (sigma_y) like generator
+  //
+  //
+  //   t^a_ij = { in 0.. Nc(Nc-1)/2 -1} =>  delta_{i,i1} delta_{j,i2} +  delta_{i,i1} delta_{j,i2}  
+  //   t^a_ij = { in Nc(Nc-1)/2 ... Nc^(Nc-1) -1} =>  i delta_{i,i1} delta_{j,i2} - i delta_{i,i1} delta_{j,i2}  
+  //   
+  // * Diagonal; must be traceless and normalised
+  //   - Sequence is 
+  //   N  (1,-1,0,0...)
+  //   N  (1, 1,-2,0...)
+  //   N  (1, 1, 1,-3,0...)
+  //   N  (1, 1, 1, 1,-4,0...)
+  //
+  //   where 1/2 = N^2 (1+.. m^2)etc.... for the m-th diagonal generator
+  //   NB this gives the famous SU3 result for su2 index 8
+  //
+  //   N= sqrt(1/2 . 1/6 ) = 1/2 . 1/sqrt(3) 
+  //
+  //   ( 1      )
+  //   (    1   ) / sqrt(3) /2  = 1/2 lambda_8
+  //   (      -2)
+  ////////////////////////////////////////////////////////////////////////
+  template<class cplx>
+  static void generator(int lieIndex,iSUnMatrix<cplx> &ta){
+    // map lie index to which type of generator
+    int diagIndex;
+    int su2Index;
+    int sigxy;
+    int NNm1 =  ncolour*(ncolour-1);
+    if ( lieIndex>= NNm1 ) {
+      diagIndex = lieIndex -NNm1;
+      generatorDiagonal(diagIndex,ta);
+      return;
+    }
+    sigxy   = lieIndex&0x1;
+    su2Index= lieIndex>>1;
+    if ( sigxy ) generatorSigmaY(su2Index,ta);
+    else         generatorSigmaX(su2Index,ta);
+  }
+  template<class cplx>
+  static void generatorSigmaX(int su2Index,iSUnMatrix<cplx> &ta){
+    ta=zero;
+    int i1,i2;
+    su2SubGroupIndex(i1,i2,su2Index);
+    ta()()(i1,i2)=1.0;
+    ta()()(i2,i1)=1.0;
+    ta= ta *0.5;
+  }
+  template<class cplx>
+  static void generatorSigmaY(int su2Index,iSUnMatrix<cplx> &ta){
+    ta=zero;
+    cplx i(0.0,1.0);
+    int i1,i2;
+    su2SubGroupIndex(i1,i2,su2Index);
+    ta()()(i1,i2)=-i;
+    ta()()(i2,i1)= i;
+    ta= ta *0.5;
+  }
+  template<class cplx>
+  static void generatorDiagonal(int diagIndex,iSUnMatrix<cplx> &ta){
+    ta=zero;
+    int trsq=0;
+    int last=diagIndex+1;
+    for(int i=0;i<=diagIndex;i++){
+      ta()()(i,i) = 1.0;
+      trsq++;
+    }
+    ta()()(last,last) = -last;
+    trsq+=last*last;
+    RealD nrm = 1.0/std::sqrt(2.0*trsq);
+    ta = ta *nrm;
+  }
+  ////////////////////////////////////////////////////////////////////////
+  // Map a su2 subgroup number to the pair of rows that are non zero
+  ////////////////////////////////////////////////////////////////////////
+  static void su2SubGroupIndex(int &i1,int &i2,int su2_index){
+
+    assert( (su2_index>=0) && (su2_index< (ncolour*(ncolour-1))/2) );
+
+    int spare=su2_index;
+    for(i1=0;spare >= (ncolour-1-i1);i1++ ){
+      spare = spare - (ncolour-1-i1);  // remove the Nc-1-i1 terms  
+    }
+    i2=i1+1+spare;
+  }
+  template<class vreal,class vcplx>
+  static void su2Extract(std::vector<Lattice<iSinglet  <vreal> > > &r,
+			       const Lattice<iSUnMatrix<vcplx> > &source, 
+			 int su2_index)
+  {
+    GridBase *grid(source._grid);
+
+    assert(r.size() == 4); // store in 4 real parts
+    
+    for(int i=0;i<4;i++){
+      conformable(r[i],source);
+    }
+    
+    int i1,i2;    
+    su2SubGroupIndex(i1,i2,su2_index);
+
+    /* Compute the b(k) of A_SU(2) = b0 + i sum_k bk sigma_k */ 
+    
+    //    r[0] = toReal(real(peekColour(source,i1,i1)) + real(peekColour(source,i2,i2)));
+    //    r[1] = toReal(imag(peekColour(source,i1,i2)) + imag(peekColour(source,i2,i1)));
+    //    r[2] = toReal(real(peekColour(source,i1,i2)) - real(peekColour(source,i2,i1)));
+    //    r[3] = toReal(imag(peekColour(source,i1,i1)) - imag(peekColour(source,i2,i2)));
+    r[0] = toReal(real(peekColour(source,i1,i1)) + real(peekColour(source,i2,i2)));
+    r[1] = toReal(imag(peekColour(source,i1,i2)) + imag(peekColour(source,i2,i1)));
+    r[2] = toReal(real(peekColour(source,i1,i2)) - real(peekColour(source,i2,i1)));
+    r[3] = toReal(imag(peekColour(source,i1,i1)) - imag(peekColour(source,i2,i2)));
+    
+  }
+
+  template<class vreal,class vcplx>
+  static void su2Insert(const std::vector<Lattice<iSinglet<vreal> > > &r,
+			Lattice<iSUnMatrix<vcplx> > &dest,
+			int su2_index)
+  {
+    typedef typename Lattice<iSUnMatrix<vcplx> >::scalar_type cplx;
+    typedef Lattice<iSinglet<vcplx> > Lcomplex;
+    GridBase * grid = dest._grid;
+
+    assert(r.size() == 4); // store in 4 real parts
+
+    Lcomplex tmp(grid);
+
+    std::vector<Lcomplex > cr(4,grid);
+    for(int i=0;i<r.size();i++){
+      conformable(r[i],dest);
+      cr[i]=toComplex(r[i]);
+    }
+
+    int i1,i2;
+    su2SubGroupIndex(i1,i2,su2_index);
+
+    cplx one   (1.0,0.0);
+    cplx cplx_i(0.0,1.0);
+
+    tmp =   cr[0]*one+ cr[3]*cplx_i;   pokeColour(dest,tmp,i1,i2); 
+    tmp =   cr[2]*one+ cr[1]*cplx_i;   pokeColour(dest,tmp,i1,i2);
+    tmp =  -cr[2]*one+ cr[1]*cplx_i;   pokeColour(dest,tmp,i2,i1);
+    tmp =   cr[0]*one- cr[3]*cplx_i;   pokeColour(dest,tmp,i2,i2);
+  }
+
+  static void SubGroupHeatBath(      GridSerialRNG       &sRNG,
+				     GridParallelRNG     &pRNG,
+			             RealD beta,
+				     LatticeMatrix &link, 
+			       const LatticeMatrix &staple, 
+				     int su2_subgroup,
+				     int nheatbath,
+				     int& ntrials,
+				     int& nfails,
+				     LatticeInteger &wheremask)
+  {
+    GridBase *grid = link._grid;
+
+    LatticeMatrix V(grid);
+    V = link*staple;
+
+    std::vector<LatticeReal> r(4,grid);
+    std::vector<LatticeReal> a(4,grid);
+    su2Extract(r,V,su2_subgroup); // HERE
+
+    LatticeReal r_l(grid);
+    r_l  = r[0]*r[0]+r[1]*r[1]+r[2]*r[2]+r[3]*r[3];
+    r_l = sqrt(r_l);
+
+    LatticeReal ftmp(grid);
+    LatticeReal ftmp1(grid);
+    LatticeReal ftmp2(grid);
+    LatticeReal one (grid);    one = 1.0;
+    LatticeReal zz  (grid);    zz  = zero;
+    LatticeReal recip(grid);   recip = one/r_l;
+    
+    Real machine_epsilon= 1.0e-14;
+    
+    ftmp = where(r_l>machine_epsilon,recip,one);
+    a[0] = where(r_l>machine_epsilon,   r[0] * ftmp , one);
+    a[1] = where(r_l>machine_epsilon, -(r[1] * ftmp), zz);
+    a[2] = where(r_l>machine_epsilon, -(r[2] * ftmp), zz);
+    a[3] = where(r_l>machine_epsilon, -(r[3] * ftmp), zz);
+
+    r_l *=  beta / ncolour;
+    
+    ftmp1 = where(wheremask,one,zz);
+    Real num_sites = TensorRemove(sum(ftmp1));
+
+    Integer itrials = (Integer)num_sites;
+    ntrials = 0;
+    nfails = 0;
+
+    LatticeInteger lupdate(grid);
+    LatticeInteger lbtmp(grid);
+    LatticeInteger lbtmp2(grid); lbtmp2=zero;
+
+    int n_done = 0;
+    int nhb = 0;
+
+    r[0] = a[0];
+    lupdate = 1;
+
+    LatticeReal ones (grid); ones = 1.0;
+    LatticeReal zeros(grid); zeros=zero;
+
+    const RealD twopi=2.0*M_PI;
+    while ( nhb < nheatbath && n_done < num_sites ) {
+
+       ntrials += itrials;
+
+       random(pRNG,r[1]);
+       std::cout<<"RANDOM SPARSE FLOAT r[1]"<<std::endl;
+       std::cout<<r[1]<<std::endl;
+
+       random(pRNG,r[2]);
+       random(pRNG,ftmp);
+
+       r[1] = log(r[1]);
+       r[2] = log(r[2]);
+
+       ftmp = ftmp*twopi;
+       r[3] = cos(ftmp);
+       r[3] = r[3]*r[3];
+
+       r[1] += r[2] * r[3];
+       r[2]  = r[1] / r_l;
+
+       random(pRNG,ftmp);
+       r[1] = ftmp*ftmp;
+
+       {
+	 LatticeInteger mask_true (grid); mask_true = 1;
+	 LatticeInteger mask_false(grid); mask_false= 0;
+	 LatticeReal    thresh(grid); thresh = (1.0 + 0.5*r[2]);
+	 lbtmp = where(r[1] <= thresh,mask_true,mask_false);
+       }
+       lbtmp2= lbtmp && lupdate;       
+       r[0]  = where(lbtmp2, 1.0+r[2], r[0]);
+
+       ftmp1 = where(lbtmp2,ones,zeros);
+       RealD sitesum = sum(ftmp1);
+       Integer itmp = sitesum;
+
+       n_done += itmp;
+       itrials -= itmp;
+       nfails += itrials;
+       lbtmp   = !lbtmp;
+       lupdate = lupdate & lbtmp;
+       ++nhb;
+    }
+    // Now create r[1], r[2] and r[3] according to the spherical measure 
+    // Take absolute value to guard against round-off 
+    random(pRNG,ftmp1);
+    r[2] = 1.0 - 2.0*ftmp1;
+ 
+    ftmp1 = abs(1.0 - r[0]*r[0]);
+    r[3]   = -(sqrt(ftmp1) * r[2]);
+
+    // Take absolute value to guard against round-off 
+    r_l    = sqrt(abs(ftmp1 - r[3]*r[3]));
+ 
+    random(pRNG,ftmp1);
+    ftmp1 *= twopi;
+    r[1]    = r_l * cos(ftmp1);
+    r[2]    = r_l * sin(ftmp1);
+
+    // Update matrix is B = R * A, with B, R and A given by b_i, r_i and a_i 
+    std::vector<LatticeReal> b(4,grid);
+    b[0] = r[0]*a[0] - r[1]*a[1] - r[2]*a[2] - r[3]*a[3];
+    b[1] = r[0]*a[1] + r[1]*a[0] - r[2]*a[3] + r[3]*a[2];
+    b[2] = r[0]*a[2] + r[2]*a[0] - r[3]*a[1] + r[1]*a[3];
+    b[3] = r[0]*a[3] + r[3]*a[0] - r[1]*a[2] + r[2]*a[1];
+
+     //
+     // Now fill an SU(3) matrix V with the SU(2) submatrix su2_index
+     // parametrized by a_k in the sigma matrix basis.
+     //
+    su2Insert(b,V,su2_subgroup);
+
+    // U = V*U
+    LatticeMatrix tmp(grid);
+    tmp = V * link;
+
+    //mask the assignment back
+    link = where(wheremask,tmp,link);
+
+  }
+
+  static void printGenerators(void)
+  {
+    for(int gen=0;gen<generators();gen++){
+      Matrix ta;
+      generator(gen,ta);
+      std::cout<< "Nc = "<<ncolour<<" t_"<<gen<<std::endl;
+      std::cout<<ta<<std::endl;
+    }
+  }
+
+  static void testGenerators(void){
+    Matrix ta;
+    Matrix tb;
+    std::cout<<"Checking trace ta tb is 0.5 delta_ab"<<std::endl;
+    for(int a=0;a<generators();a++){
+      for(int b=0;b<generators();b++){
+	generator(a,ta);
+	generator(b,tb);
+	Complex tr =TensorRemove(trace(ta*tb)); 
+	std::cout<<tr<<" ";
+	if(a==b) assert(abs(tr-Complex(0.5))<1.0e-6);
+	if(a!=b) assert(abs(tr)<1.0e-6);
+      }
+      std::cout<<std::endl;
+    }
+    std::cout<<"Checking hermitian"<<std::endl;
+    for(int a=0;a<generators();a++){
+      generator(a,ta);
+      std::cout<<a<<" ";
+      assert(norm2(ta-adj(ta))<1.0e-6);
+    }    
+    std::cout<<std::endl;
+
+    std::cout<<"Checking traceless"<<std::endl;
+    for(int a=0;a<generators();a++){
+      generator(a,ta);
+      Complex tr =TensorRemove(trace(ta)); 
+      std::cout<<a<<" ";
+      assert(abs(tr)<1.0e-6);
+    }    
+    std::cout<<std::endl;
+  }
+
+  // reunitarise??
+
+  static void taProj( const LatticeMatrix &in,  LatticeMatrix &out){
+    out = Ta(in);
+  }
+  static void taExp( const LatticeMatrix &x,  LatticeMatrix &ex){ 
+    LatticeMatrix xn   = x;
+
+    RealD nfac = 1.0;
+    ex = 1+x; // 1+x
+
+    // Do a 12th order exponentiation
+    for(int i= 2; i <= 12; ++i)
+    {
+      nfac = nfac/i; 
+      xn   = xn * x; // x2, x3,x4....
+      ex  += xn*nfac;// x2/2!, x3/3!....
+    }
+  }
+
+};
+
+ typedef SU<2> SU2;
+ typedef SU<3> SU3;
+ typedef SU<4> SU4;
+ typedef SU<5> SU5;
+
+  }
+}
+#endif
--- a/lib/qcd/utils/WilsonLoops.h
+++ b/lib/qcd/utils/WilsonLoops.h
@@ -70,7 +70,7 @@ public:
  //////////////////////////////////////////////////
  // the sum over all staples on each site
  //////////////////////////////////////////////////
-  static void Staple(GaugeMat &staple,GaugeLorentz &Umu,int mu){
+  static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){

    std::vector<GaugeMat> U(4,Umu._grid);
    for(int d=0;d<Nd;d++){
@@ -123,12 +123,10 @@ void siteRectangle(GaugeMat &plaq,const std::vector<GaugeMat> &U, const int mu,


 typedef WilsonLoops<LatticeColourMatrix,LatticeGaugeField> ColourWilsonLoops;
+ typedef WilsonLoops<LatticeColourMatrix,LatticeGaugeField> U1WilsonLoops;
+ typedef WilsonLoops<LatticeColourMatrix,LatticeGaugeField> SU2WilsonLoops;
+ typedef WilsonLoops<LatticeColourMatrix,LatticeGaugeField> SU3WilsonLoops;

 }}

-
-
-
-
-
 #endif
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -4,7 +4,7 @@

  Using intrinsics
 */
-// Time-stamp: <2015-06-09 14:26:59 neo>
+// Time-stamp: <2015-06-16 23:30:41 neo>
 //----------------------------------------------------------------------

 #include <immintrin.h>
@@ -248,7 +248,7 @@ namespace Optimization {
      return _mm256_set_m128i(a1,a0);
 #endif
 #if defined (AVX2)
-      return _mm256_mul_epi32(a,b);
+      return _mm256_mullo_epi32(a,b);
 #endif

    }
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -4,7 +4,7 @@

  Using intrinsics
 */
-// Time-stamp: <2015-06-09 14:24:01 neo>
+// Time-stamp: <2015-06-16 23:27:54 neo>
 //----------------------------------------------------------------------

 #include <pmmintrin.h>
@@ -97,7 +97,7 @@ namespace Optimization {
    }
    // Integer
    inline __m128i operator()(Integer *a){
-      return _mm_set_epi32(a[0],a[1],a[2],a[3]);
+      return _mm_set_epi32(a[3],a[2],a[1],a[0]);
    }


@@ -181,7 +181,7 @@ namespace Optimization {
    }
    // Integer
    inline __m128i operator()(__m128i a, __m128i b){
-      return _mm_mul_epi32(a,b);
+      return _mm_mullo_epi32(a,b);
    }
  };

--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -28,13 +28,19 @@

 namespace Grid {

+  //////////////////////////////////////
  // To take the floating point type of real/complex type
+  //////////////////////////////////////
  template <typename T> struct RealPart {
    typedef T type;
  };
  template <typename T> struct RealPart< std::complex<T> >{
    typedef T type;
  };
+  
+  //////////////////////////////////////
+  // demote a vector to real type
+  //////////////////////////////////////

  // type alias used to simplify the syntax of std::enable_if
  template <typename T> using Invoke                                  =  typename T::type;
@@ -90,7 +96,7 @@ namespace Grid {
 	Vector_type v;
 	Scalar_type s[sizeof(Vector_type)/sizeof(Scalar_type)];
      conv_t_union(){};
-      } conv_t;
+    } conv_t;
    
   
    Vector_type v;
@@ -205,7 +211,6 @@ namespace Grid {
      return *this;
    }

-
    ///////////////////////////////////////
    // Not all functions are supported
    // through SIMD and must breakout to 
@@ -214,7 +219,6 @@ namespace Grid {
    ///////////////////////////////////////

    template<class functor> friend inline Grid_simd SimdApply (const functor &func,const Grid_simd &v) {
-
      Grid_simd ret;
      Grid_simd::conv_t conv;

@@ -225,6 +229,19 @@ namespace Grid {
      ret.v = conv.v;
      return ret;
    }
+    template<class functor> friend inline Grid_simd SimdApplyBinop (const functor &func,const Grid_simd &x,const Grid_simd &y) {
+      Grid_simd ret;
+      Grid_simd::conv_t cx;
+      Grid_simd::conv_t cy;
+
+      cx.v = x.v;
+      cy.v = y.v;
+      for(int i=0;i<Nsimd();i++){
+	cx.s[i]=func(cx.s[i],cy.s[i]);
+      }
+      ret.v = cx.v;
+      return ret;
+    }

    ////////////////////////////////////////////////////////////////////
    // General permute; assumes vector length is same across 
@@ -235,6 +252,7 @@ namespace Grid {
    {
      Gpermute<Grid_simd>(y,b,perm);
    }
+
    
  };// end of Grid_simd class definition 

@@ -383,7 +401,6 @@ namespace Grid {
    return in;
  }

-
  /////////////////////
  // Inner, outer
  /////////////////////
@@ -405,6 +422,46 @@ namespace Grid {
    return arg;
  }

+
+  ////////////////////////////////////////////////////////////
+  // copy/splat complex real parts into real;
+  // insert real into complex and zero imag;
+  ////////////////////////////////////////////////////////////
+
+  //real = toReal( complex )
+  template<class S,class V,IfReal<S>  = 0>	
+  inline Grid_simd<S,V> toReal(const Grid_simd<std::complex<S>,V> &in)
+  {
+    typedef Grid_simd<S,V> simd;
+    simd ret;
+    typename simd::conv_t conv;
+    conv.v = in.v;
+    for(int i=0;i<simd::Nsimd();i+=2){
+      conv.s[i+1]=conv.s[i];    // duplicate (r,r);(r,r);(r,r); etc...
+    }
+    ret.v = conv.v;
+    return ret;
+  }
+  
+  //complex = toComplex( real )
+  template<class R,class V,IfReal<R> = 0 >	// must be a real arg
+  inline Grid_simd<std::complex<R>,V> toComplex (const Grid_simd<R,V> &in)
+  {
+    typedef Grid_simd<R,V> Rsimd;
+    typedef Grid_simd<std::complex<R>,V> Csimd;
+    typename Rsimd::conv_t conv;// address as real
+    
+    conv.v = in.v;
+    for(int i=0;i<Rsimd::Nsimd();i+=2){
+      assert(conv.s[i+1]==conv.s[i]); // trap any cases where real was not duplicated 
+      // indicating the SIMD grids of real and imag assignment did not correctly match
+      conv.s[i+1]=0.0;                // zero imaginary parts
+    }
+    Csimd ret;
+    ret.v = conv.v;
+    return ret;
+  }
+
  ///////////////////////////////
  // Define available types
  ///////////////////////////////
@@ -413,6 +470,20 @@ namespace Grid {
  typedef Grid_simd< std::complex< float > , SIMD_Ftype > vComplexF;
  typedef Grid_simd< std::complex< double >, SIMD_Dtype > vComplexD;
  typedef Grid_simd< Integer               , SIMD_Itype > vInteger;
+
+  /////////////////////////////////////////
+  // Some traits to recognise the types
+  /////////////////////////////////////////
+  template <typename T> struct is_simd : public std::false_type{};
+  template <> struct is_simd<vRealF>   : public std::true_type {};
+  template <> struct is_simd<vRealD>   : public std::true_type {};
+  template <> struct is_simd<vComplexF>: public std::true_type {};
+  template <> struct is_simd<vComplexD>: public std::true_type {};
+  template <> struct is_simd<vInteger> : public std::true_type {};
+
+  template <typename T> using IfSimd     = Invoke<std::enable_if< is_simd<T>::value,int> > ;
+  template <typename T> using IfNotSimd  = Invoke<std::enable_if<!is_simd<T>::value,unsigned> > ;
+
 }

 #endif
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -1,6 +1,8 @@
 #ifndef GRID_VECTOR_UNOPS
 #define GRID_VECTOR_UNOPS

+#include <cmath>
+
 namespace Grid { 

  template<class scalar> struct SqrtRealFunctor {
@@ -27,6 +29,28 @@ namespace Grid {
    }
  };

+  template<class scalar> struct LogRealFunctor {
+    scalar operator()(const scalar &a)  const {
+      return log(real(a));
+    }
+  };
+
+  template<class scalar> struct ExpRealFunctor {
+    scalar operator()(const scalar &a)  const {
+      return exp(real(a));
+    }
+  };
+  template<class scalar> struct NotFunctor {
+    scalar operator()(const scalar &a)  const {
+      return (!a);
+    }
+  };
+  template<class scalar> struct AbsRealFunctor {
+    scalar operator()(const scalar &a)  const {
+      return std::abs(real(a));
+    }
+  };
+
  template<class scalar> struct PowRealFunctor {
    double y;
  PowRealFunctor(double _y) : y(_y) {};
@@ -43,6 +67,25 @@ namespace Grid {
    }
  };

+  template<class scalar> struct RealFunctor {
+    scalar operator()(const scalar &a)  const {
+      return real(a);
+    }
+  };
+  template<class scalar> struct ImagFunctor {
+    scalar operator()(const scalar &a)  const {
+      return imag(a);
+    }
+  };
+  template < class S, class V > 
+  inline Grid_simd<S,V> real(const Grid_simd<S,V> &r) {
+    return SimdApply(RealFunctor<S>(),r);
+  }
+  template < class S, class V > 
+  inline Grid_simd<S,V> imag(const Grid_simd<S,V> &r) {
+    return SimdApply(ImagFunctor<S>(),r);
+  }
+
  template < class S, class V > 
  inline Grid_simd<S,V> sqrt(const Grid_simd<S,V> &r) {
    return SimdApply(SqrtRealFunctor<S>(),r);
@@ -60,6 +103,22 @@ namespace Grid {
    return SimdApply(CosRealFunctor<S>(),r);
  }
  template < class S, class V > 
+  inline Grid_simd<S,V> log(const Grid_simd<S,V> &r) {
+    return SimdApply(LogRealFunctor<S>(),r);
+  }
+  template < class S, class V > 
+  inline Grid_simd<S,V> abs(const Grid_simd<S,V> &r) {
+    return SimdApply(AbsRealFunctor<S>(),r);
+  }
+  template < class S, class V > 
+  inline Grid_simd<S,V> exp(const Grid_simd<S,V> &r) {
+    return SimdApply(ExpRealFunctor<S>(),r);
+  }
+  template < class S, class V > 
+  inline Grid_simd<S,V> Not(const Grid_simd<S,V> &r) {
+    return SimdApply(NotFunctor<S>(),r);
+  }
+  template < class S, class V > 
  inline Grid_simd<S,V> pow(const Grid_simd<S,V> &r,double y) {
    return SimdApply(PowRealFunctor<S>(y),r);
  }
@@ -67,6 +126,55 @@ namespace Grid {
  inline Grid_simd<S,V> mod(const Grid_simd<S,V> &r,Integer y) {
    return SimdApply(ModIntFunctor<S>(y),r);
  }
+  ////////////////////////////////////////////////////////////////////////////
+  // Allows us to assign into **conformable** real vectors from complex
+  ////////////////////////////////////////////////////////////////////////////
+  //  template < class S, class V > 
+  //  inline auto ComplexRemove(const Grid_simd<S,V> &c) -> Grid_simd<Grid_simd<S,V>::Real,V> {
+  //    Grid_simd<Grid_simd<S,V>::Real,V> ret;
+  //    ret.v = c.v;
+  //    return ret;
+  //  }
+  template<class scalar> struct AndFunctor {
+    scalar operator()(const scalar &x, const scalar &y)  const {
+      return x & y;
+    }
+  };
+  template<class scalar> struct OrFunctor {
+    scalar operator()(const scalar &x, const scalar &y)  const {
+      return x | y;
+    }
+  };
+  template<class scalar> struct AndAndFunctor {
+    scalar operator()(const scalar &x, const scalar &y)  const {
+      return x && y;
+    }
+  };
+  template<class scalar> struct OrOrFunctor {
+    scalar operator()(const scalar &x, const scalar &y)  const {
+      return x || y;
+    }
+  };
+
+  ////////////////////////////////
+  // Calls to simd binop functors
+  ////////////////////////////////
+  template < class S, class V > 
+  inline Grid_simd<S,V> operator &(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
+    return SimdApplyBinop(AndFunctor<S>(),x,y);
+  }
+  template < class S, class V > 
+  inline Grid_simd<S,V> operator &&(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
+    return SimdApplyBinop(AndAndFunctor<S>(),x,y);
+  }
+  template < class S, class V > 
+  inline Grid_simd<S,V> operator |(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
+    return SimdApplyBinop(OrFunctor<S>(),x,y);
+  }
+  template < class S, class V > 
+  inline Grid_simd<S,V> operator ||(const Grid_simd<S,V> &x,const Grid_simd<S,V> &y) {
+    return SimdApplyBinop(OrOrFunctor<S>(),x,y);
+  }

 }
 #endif
--- a/lib/tensors/Tensor_Ta.h
+++ b/lib/tensors/Tensor_Ta.h
@@ -102,10 +102,10 @@ namespace Grid {
    }

  template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr> 
-    inline auto Determinant(const iMatrix<vtype,N> &arg)-> iScalar<decltype(Determinant(arg._internal[0][0]))>
+    inline iScalar<vtype> Determinant(const iMatrix<vtype,N> &arg)
    {
      iMatrix<vtype,N> ret(arg);
-      iScalar<decltype(Determinant(arg._internal[0][0]))> det = 1.0;
+      iScalar<vtype> det = vtype(1.0);
      /* Conversion of matrix to upper triangular */
      for(int i = 0; i < N; i++){
        for(int j = 0; j < N; j++){
--- a/lib/tensors/Tensor_arith_scalar.h
+++ b/lib/tensors/Tensor_arith_scalar.h
@@ -9,12 +9,12 @@ namespace Grid {
 //////////////////////////////////////////////////////////////////////////////////////////

 // multiplication by fundamental scalar type
-template<class l,int N> strong_inline iScalar<l> operator * (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
+template<class l> strong_inline iScalar<l> operator * (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
  return lhs*srhs;
 }
-template<class l,int N> strong_inline iScalar<l> operator * (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs*lhs; }
+template<class l> strong_inline iScalar<l> operator * (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs*lhs; }

 template<class l,int N> strong_inline iVector<l,N> operator * (const iVector<l,N>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
@@ -118,12 +118,12 @@ template<class l,int N> strong_inline iMatrix<l,N> operator * (Integer lhs,const
 ///////////////////////////////////////////////////////////////////////////////////////////////
 // addition by fundamental scalar type applies to matrix(down diag) and scalar
 ///////////////////////////////////////////////////////////////////////////////////////////////
-template<class l,int N> strong_inline iScalar<l> operator + (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
+template<class l> strong_inline iScalar<l> operator + (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
  return lhs+srhs;
 }
-template<class l,int N> strong_inline iScalar<l> operator + (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs+lhs; }
+template<class l> strong_inline iScalar<l> operator + (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs+lhs; }

 template<class l,int N> strong_inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
@@ -176,12 +176,12 @@ template<class l,int N> strong_inline iMatrix<l,N> operator + (Integer lhs,const
 ///////////////////////////////////////////////////////////////////////////////////////////////
 // subtraction of fundamental scalar type applies to matrix(down diag) and scalar
 ///////////////////////////////////////////////////////////////////////////////////////////////
-template<class l,int N> strong_inline iScalar<l> operator - (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
+template<class l> strong_inline iScalar<l> operator - (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
 {
  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
  return lhs-srhs;
 }
-template<class l,int N> strong_inline iScalar<l> operator - (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) 
+template<class l> strong_inline iScalar<l> operator - (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) 
 {
  typename iScalar<l>::tensor_reduced slhs;slhs=lhs;
  return slhs-rhs;
--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -23,13 +23,17 @@ template<class vtype> class iScalar
 public:
  vtype _internal;

-  typedef typename GridTypeMapper<vtype>::scalar_type   scalar_type;
+  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
  typedef iScalar<tensor_reduced_v> tensor_reduced;
  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
  typedef iScalar<recurse_scalar_object> scalar_object;

+  // substitutes a real or complex version with same tensor structure
+  typedef iScalar<typename GridTypeMapper<vtype>::Complexified > Complexified;
+  typedef iScalar<typename GridTypeMapper<vtype>::Realified >    Realified;
+
  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};

  // Scalar no action
@@ -86,9 +90,19 @@ public:
  strong_inline const vtype & operator ()(void) const {
    return _internal;
  }
-  
-  operator ComplexD () const { return(TensorRemove(_internal)); };
-  operator RealD () const { return(real(TensorRemove(_internal))); }
+
+  // Type casts meta programmed
+  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = 0>  
+    operator ComplexF () const { return(TensorRemove(_internal)); };
+  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = 0>  
+    operator ComplexD () const { return(TensorRemove(_internal)); };
+  template<class U=vtype,class V=scalar_type,IfComplex<V> = 0,IfNotSimd<U> = 0>  
+    operator RealD () const { return(real(TensorRemove(_internal))); }
+  template<class U=vtype,class V=scalar_type,IfReal<V>    = 0,IfNotSimd<U> = 0>  
+    operator RealD    () const { return TensorRemove(_internal); }
+  template<class U=vtype,class V=scalar_type,IfInteger<V> = 0,IfNotSimd<U> = 0>  
+    operator Integer  () const { return Integer(TensorRemove(_internal)); }
+
  
  // convert from a something to a scalar via constructor of something arg
  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline iScalar<vtype> operator = (T arg)
@@ -123,6 +137,10 @@ public:
  typedef iScalar<tensor_reduced_v> tensor_reduced;
  typedef iVector<recurse_scalar_object,N> scalar_object;

+  // substitutes a real or complex version with same tensor structure
+  typedef iVector<typename GridTypeMapper<vtype>::Complexified,N > Complexified;
+  typedef iVector<typename GridTypeMapper<vtype>::Realified,N >    Realified;
+
  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iVector<vtype,N>
    { 
      zeroit(*this);
@@ -211,6 +229,12 @@ public:
  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
+
+  // substitutes a real or complex version with same tensor structure
+  typedef iMatrix<typename GridTypeMapper<vtype>::Complexified,N > Complexified;
+  typedef iMatrix<typename GridTypeMapper<vtype>::Realified,N >    Realified;
+
+  // Tensure removal
  typedef iScalar<tensor_reduced_v> tensor_reduced;
  typedef iMatrix<recurse_scalar_object,N> scalar_object;

--- a/lib/tensors/Tensor_extract_merge.h
+++ b/lib/tensors/Tensor_extract_merge.h
@@ -31,18 +31,17 @@ inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::
 		  std::vector<scalar *> &extracted,int offset){
  int Nextr=extracted.size();
  int Nsimd=vsimd::Nsimd();
-  int s=Nsimd/Nextr;
-
+  int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it
+                     // replicate n-fold. Use to allow Integer masks to 
+                     // predicate floating point of various width assignments and maintain conformable.
  scalar *buf =(scalar *) y;
  for(int i=0;i<Nextr;i++){
    for(int ii=0;ii<s;ii++){
      buf[i*s+ii]=extracted[i][offset];
    }
  }
-
 };

-
 ////////////////////////////////////////////////////////////////////////////////////////////////
 // Extract a fundamental vector type to scalar array 
 ////////////////////////////////////////////////////////////////////////////////////////////////
@@ -55,8 +54,17 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v

  scalar *buf = (scalar *)&y;
  for(int i=0;i<Nextr;i++){
-    for(int ii=0;ii<s;ii++){
-      extracted[i]=buf[i*s+ii];
+    extracted[i]=buf[i*s];
+    for(int ii=1;ii<s;ii++){
+      if ( buf[i*s]!=buf[i*s+ii] ){
+	std::cout << " SIMD extract failure splat = "<<s<<" ii "<<ii<<" " <<Nextr<<" "<< Nsimd<<" "<<std::endl;
+	for(int vv=0;vv<Nsimd;vv++) {
+	  std::cout<< buf[vv]<<" ";
+	}
+	std::cout<<std::endl;
+	assert(0);
+      }
+      assert(buf[i*s]==buf[i*s+ii]);
    }
  }

@@ -74,21 +82,7 @@ inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::

  for(int i=0;i<Nextr;i++){
    for(int ii=0;ii<s;ii++){
-      buf[i*s+ii]=extracted[i];
-    }
-  }
-
-};
-template<class vsimd,class scalar>
-inline void AmergeA(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type  &y,std::vector<scalar> &extracted){
-  int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
-  int s=Nsimd/Nextr;
-
-  scalar *buf = (scalar *)&y;
-  for(int i=0;i<Nextr;i++){
-    for(int ii=0;ii<s;ii++){
-      buf[i*s+ii]=extracted[i];
+      buf[i*s+ii]=extracted[i]; // replicates value
    }
  }
 };
@@ -102,12 +96,12 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
  typedef typename vobj::vector_type vector_type ;

  const int Nsimd=vobj::vector_type::Nsimd();
+  int Nextr=extracted.size();
  const int words=sizeof(vobj)/sizeof(vector_type);
+  int s=Nsimd/Nextr;

-  extracted.resize(Nsimd);
-
-  std::vector<scalar_type *> pointers(Nsimd);
-  for(int i=0;i<Nsimd;i++) 
+  std::vector<scalar_type *> pointers(Nextr);
+  for(int i=0;i<Nextr;i++) 
    pointers[i] =(scalar_type *)& extracted[i];

  vector_type *vp = (vector_type *)&vec;
@@ -127,11 +121,11 @@ void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extrac

  const int words=sizeof(vobj)/sizeof(vector_type);
  const int Nsimd=vobj::vector_type::Nsimd();
-
-  assert(extracted.size()==Nsimd);
+  int Nextr=extracted.size();
+  int s = Nsimd/Nextr;

  std::vector<scalar_type *> pointers(Nsimd);
-  for(int i=0;i<Nsimd;i++) {
+  for(int i=0;i<Nextr;i++) {
    pointers[i] =(scalar_type *)& extracted[i][offset];
  }

@@ -153,10 +147,11 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
  const int Nsimd=vobj::vector_type::Nsimd();
  const int words=sizeof(vobj)/sizeof(vector_type);

-  assert(extracted.size()==Nsimd);
+  int Nextr = extracted.size();
+  int splat=Nsimd/Nextr;

-  std::vector<scalar_type *> pointers(Nsimd);
-  for(int i=0;i<Nsimd;i++) 
+  std::vector<scalar_type *> pointers(Nextr);
+  for(int i=0;i<Nextr;i++) 
    pointers[i] =(scalar_type *)& extracted[i];
  
  vector_type *vp = (vector_type *)&vec;
@@ -177,14 +172,14 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
  const int Nsimd=vobj::vector_type::Nsimd();
  const int words=sizeof(vobj)/sizeof(vector_type);

-  assert(extracted.size()==Nsimd);
+  int Nextr=extracted.size();

-  std::vector<scalar_type *> pointers(Nsimd);
-  for(int i=0;i<Nsimd;i++) 
+  std::vector<scalar_type *> pointers(Nextr);
+  for(int i=0;i<Nextr;i++) 
    pointers[i] =(scalar_type *)& extracted[i][offset];
-  
+
  vector_type *vp = (vector_type *)&vec;
-  assert((void *)vp!=NULL);
+
  for(int w=0;w<words;w++){
    merge<vector_type,scalar_type>(&vp[w],pointers,w);
  }
--- a/lib/tensors/Tensor_inner.h
+++ b/lib/tensors/Tensor_inner.h
@@ -10,7 +10,8 @@ namespace Grid {
      typedef typename sobj::scalar_type scalar;
      decltype(innerProduct(arg,arg)) nrm;
      nrm = innerProduct(arg,arg);
-      return real(nrm);
+      RealD ret = real(nrm);
+      return ret;
    }

    template<class l,class r,int N> inline
--- a/lib/tensors/Tensor_logical.h
+++ b/lib/tensors/Tensor_logical.h
@@ -0,0 +1,32 @@
+#ifndef GRID_TENSOR_LOGICAL_H
+#define GRID_TENSOR_LOGICAL_H
+
+namespace Grid {
+
+#define LOGICAL_BINOP(Op)\
+template<class v> strong_inline iScalar<v> operator Op (const iScalar<v>& lhs,const iScalar<v>& rhs) \
+{\
+  iScalar<v> ret;\
+  ret._internal = lhs._internal Op rhs._internal ;\
+  return ret;\
+}\
+template<class l> strong_inline iScalar<l> operator Op (const iScalar<l>& lhs,Integer rhs) \
+{\
+  typename iScalar<l>::scalar_type t; t=rhs;\
+  typename iScalar<l>::tensor_reduced srhs; srhs=t;\
+  return lhs Op srhs;\
+}\
+template<class l> strong_inline iScalar<l> operator Op (Integer lhs,const iScalar<l>& rhs) \
+{\
+  typename iScalar<l>::scalar_type t;t=lhs;\
+  typename iScalar<l>::tensor_reduced slhs;slhs=t;\
+  return slhs Op rhs;\
+}
+
+LOGICAL_BINOP(|);
+LOGICAL_BINOP(&);
+LOGICAL_BINOP(||);
+LOGICAL_BINOP(&&);
+
+}
+#endif
--- a/lib/tensors/Tensor_traits.h
+++ b/lib/tensors/Tensor_traits.h
@@ -26,6 +26,8 @@ namespace Grid {
    typedef typename T::vector_type vector_type;
    typedef typename T::tensor_reduced tensor_reduced;
    typedef typename T::scalar_object scalar_object;
+    typedef typename T::Complexified Complexified;
+    typedef typename T::Realified Realified;
    enum { TensorLevel = T::TensorLevel };
  };

@@ -38,6 +40,8 @@ namespace Grid {
    typedef RealF vector_type;
    typedef RealF tensor_reduced ;
    typedef RealF scalar_object;
+    typedef ComplexF Complexified;
+    typedef RealF Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<RealD> {
@@ -46,6 +50,8 @@ namespace Grid {
    typedef RealD vector_type;
    typedef RealD tensor_reduced;
    typedef RealD scalar_object;
+    typedef ComplexD Complexified;
+    typedef RealD Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<ComplexF> {
@@ -54,6 +60,8 @@ namespace Grid {
    typedef ComplexF vector_type;
    typedef ComplexF tensor_reduced;
    typedef ComplexF scalar_object;
+    typedef ComplexF Complexified;
+    typedef RealF Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<ComplexD> {
@@ -62,6 +70,8 @@ namespace Grid {
    typedef ComplexD vector_type;
    typedef ComplexD tensor_reduced;
    typedef ComplexD scalar_object;
+    typedef ComplexD Complexified;
+    typedef RealD Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<Integer> {
@@ -70,6 +80,8 @@ namespace Grid {
    typedef Integer vector_type;
    typedef Integer tensor_reduced;
    typedef Integer scalar_object;
+    typedef void Complexified;
+    typedef void Realified;
    enum { TensorLevel = 0 };
  };

@@ -79,6 +91,8 @@ namespace Grid {
    typedef vRealF vector_type;
    typedef vRealF tensor_reduced;
    typedef RealF  scalar_object;
+    typedef vComplexF Complexified;
+    typedef vRealF Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vRealD> {
@@ -87,6 +101,8 @@ namespace Grid {
    typedef vRealD vector_type;
    typedef vRealD tensor_reduced;
    typedef RealD  scalar_object;
+    typedef vComplexD Complexified;
+    typedef vRealD Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vComplexF> {
@@ -95,6 +111,8 @@ namespace Grid {
    typedef vComplexF vector_type;
    typedef vComplexF tensor_reduced;
    typedef ComplexF  scalar_object;
+    typedef vComplexF Complexified;
+    typedef vRealF Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vComplexD> {
@@ -103,6 +121,8 @@ namespace Grid {
    typedef vComplexD vector_type;
    typedef vComplexD tensor_reduced;
    typedef ComplexD  scalar_object;
+    typedef vComplexD Complexified;
+    typedef vRealD Realified;
    enum { TensorLevel = 0 };
  };
  template<> class GridTypeMapper<vInteger> {
@@ -111,6 +131,8 @@ namespace Grid {
    typedef vInteger vector_type;
    typedef vInteger tensor_reduced;
    typedef  Integer scalar_object;
+    typedef void Complexified;
+    typedef void Realified;
    enum { TensorLevel = 0 };
  };

--- a/lib/tensors/Tensor_unary.h
+++ b/lib/tensors/Tensor_unary.h
@@ -2,7 +2,7 @@
 #define GRID_TENSOR_UNARY_H
 namespace Grid {

-#define UNARY_REAL(func)\
+#define UNARY(func)\
 template<class obj> inline auto func(const iScalar<obj> &z) -> iScalar<obj>\
 {\
    iScalar<obj> ret;\
@@ -53,14 +53,71 @@ template<class obj> inline iScalar<obj> func(const iScalar<obj> &z,scal y)	\
    return ret;\
 }

-UNARY_REAL(sqrt);
-UNARY_REAL(rsqrt);
-UNARY_REAL(sin);
-UNARY_REAL(cos);
+UNARY(sqrt);
+UNARY(rsqrt);
+UNARY(sin);
+UNARY(cos);
+UNARY(log);
+UNARY(exp);
+UNARY(abs);
+UNARY(Not);
+
+
+template<class obj> inline auto toReal(const iScalar<obj> &z) -> typename iScalar<obj>::Realified
+{
+  typename iScalar<obj>::Realified ret;
+  ret._internal = toReal(z._internal);
+  return ret;
+}
+ template<class obj,int N> inline auto toReal(const iVector<obj,N> &z) -> typename iVector<obj,N>::Realified
+{
+  typename iVector<obj,N>::Realified ret;
+  for(int c1=0;c1<N;c1++){  
+    ret._internal[c1] = toReal(z._internal[c1]); 
+  }
+  return ret;
+}
+template<class obj,int N> inline auto toReal(const iMatrix<obj,N> &z) -> typename iMatrix<obj,N>::Realified
+{
+  typename iMatrix<obj,N>::Realified ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    ret._internal[c1][c2] = toReal(z._internal[c1][c2]);
+  }}
+  return ret;
+}
+
+template<class obj> inline auto toComplex(const iScalar<obj> &z) -> typename iScalar<obj>::Complexified
+{
+  typename iScalar<obj>::Complexified ret;
+  ret._internal = toComplex(z._internal);
+  return ret;
+}
+ template<class obj,int N> inline auto toComplex(const iVector<obj,N> &z) -> typename iVector<obj,N>::Complexified
+{
+  typename iVector<obj,N>::Complexified ret;
+  for(int c1=0;c1<N;c1++){  
+    ret._internal[c1] = toComplex(z._internal[c1]); 
+  }
+  return ret;
+}
+template<class obj,int N> inline auto toComplex(const iMatrix<obj,N> &z) -> typename iMatrix<obj,N>::Complexified
+{
+  typename iMatrix<obj,N>::Complexified ret;
+  for(int c1=0;c1<N;c1++){
+  for(int c2=0;c2<N;c2++){
+    ret._internal[c1][c2] = toComplex(z._internal[c1][c2]);
+  }}
+  return ret;
+}
+

 BINARY_RSCALAR(mod,Integer);
 BINARY_RSCALAR(pow,RealD);

+#undef UNARY
+#undef BINARY_RSCALAR
+

 }
 #endif
--- a/tests/Test_GaugeAction.cc
+++ b/tests/Test_GaugeAction.cc
@@ -27,6 +27,7 @@ public:
  }
 };

+
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
--- a/tests/Test_cayley_coarsen_support.cc
+++ b/tests/Test_cayley_coarsen_support.cc
@@ -102,16 +102,11 @@ int main (int argc, char ** argv)
  CoarseVector c_res (Coarse5d);
  CoarseVector c_proj(Coarse5d);
  
-  //  TODO
-  // -- promote from subspace, check we get the vector we wanted
-  // -- apply ldop; check we get the same as inner product of M times big vec
-  // -- pick blocks one by one. Evaluate matrix elements.
  Complex one(1.0);
  c_src = one;  // 1 in every element for vector 1.
  
  blockPromote(c_src,err,subspace);

-
  prom=zero;
  for(int b=0;b<nbasis;b++){
    prom=prom+subspace[b];
--- a/tests/Test_cshift.cc
+++ b/tests/Test_cshift.cc
@@ -66,7 +66,8 @@ int main (int argc, char ** argv)
 	  
 	  nrm = abs(scm-cm()()());
 	  std::vector<int> peer(4);
-	  int index=real(cm);
+	  Complex tmp  =cm;
+	  Integer index=real(tmp);
 	  Fine.CoorFromIndex(peer,index,latt_size);

 	  if (nrm > 0){
--- a/tests/Test_cshift_red_black.cc
+++ b/tests/Test_cshift_red_black.cc
@@ -100,7 +100,8 @@ int main (int argc, char ** argv)
 	  
 	  double nrm = abs(scm-cm()()());
 	  std::vector<int> peer(4);
-	  int index=real(cm);
+	  Complex ctmp = cm;
+	  Integer index=real(ctmp);
 	  Fine.CoorFromIndex(peer,index,latt_size);

 	  if (nrm > 0){
@@ -138,7 +139,8 @@ int main (int argc, char ** argv)
 	  
 	  nrm = abs(scm-cm()()());
 	  std::vector<int> peer(4);
-	  int index=real(cm);
+	  Complex ctmp=cm;
+	  Integer index=real(ctmp);
 	  Fine.CoorFromIndex(peer,index,latt_size);

 	  if (nrm > 0){
--- a/tests/Test_gamma.cc
+++ b/tests/Test_gamma.cc
@@ -95,7 +95,7 @@ int main (int argc, char ** argv)
  for(int mu=0;mu<6;mu++){
    result =  Gamma(g[mu])* ident * Gamma(g[mu]);
    result = result - ident;
-    double mag = TensorRemove(norm2(result));
+    RealD mag = norm2(result);
    std::cout << list[mu]<<" " << mag<<std::endl;
  }

@@ -103,7 +103,7 @@ int main (int argc, char ** argv)
  for(int mu=0;mu<6;mu++){
    result =          rnd * Gamma(g[mu]);
    result = result + rnd * Gamma(g[mu+6]);
-    double mag = TensorRemove(norm2(result));
+    RealD mag = norm2(result);
    std::cout << list[mu]<<" " << mag<<std::endl;
  }

@@ -111,7 +111,7 @@ int main (int argc, char ** argv)
  for(int mu=0;mu<6;mu++){
    result =           Gamma(g[mu])  *rnd;
    result = result +  Gamma(g[mu+6])*rnd;
-    double mag = TensorRemove(norm2(result));
+    RealD mag = norm2(result);
    std::cout << list[mu]<<" " << mag<<std::endl;
  }

--- a/tests/Test_lie_generators.cc
+++ b/tests/Test_lie_generators.cc
@@ -2,226 +2,91 @@

 #include <qcd/utils/CovariantCshift.h>
 #include <qcd/utils/WilsonLoops.h>
+#include <qcd/utils/SUn.h>

 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;

-class suN {
-public:
-
-  static int generators(int ncolour)   { return ncolour*ncolour-1; }
-  static int su2subgroups(int ncolour) { return (ncolour*(ncolour-1))/2; }
-  
-  template<typename CComplex,int N> using suNmatrix = iScalar<iScalar<iMatrix<CComplex,N> > > ;
-
-  ////////////////////////////////////////////////////////////////////////
-  // There are N^2-1 generators for SU(N).
-  //
-  // We take a traceless hermitian generator basis as follows
-  //
-  // * Normalisation: trace ta tb = 1/2 delta_ab
-  // 
-  // * Off diagonal
-  //    - pairs of rows i1,i2 behaving like pauli matrices signma_x, sigma_y
-  //     
-  //    - there are (Nc-1-i1) slots for i2 on each row [ x  0  x ]                                   
-  //      direct count off each row                                                                    
-  //
-  //    - Sum of all pairs is Nc(Nc-1)/2: proof arithmetic series
-  //
-  //      (Nc-1) + (Nc-2)+...  1      ==> Nc*(Nc-1)/2 
-  //      1+ 2+          +   + Nc-1                        
-  // 
-  //    - There are 2 x Nc (Nc-1)/ 2 of these = Nc^2 - Nc
-  //
-  //    - We enumerate the row-col pairs.
-  //    - for each row col pair there is a (sigma_x) and a (sigma_y) like generator
-  //
-  //
-  //   t^a_ij = { in 0.. Nc(Nc-1)/2 -1} =>  delta_{i,i1} delta_{j,i2} +  delta_{i,i1} delta_{j,i2}  
-  //   t^a_ij = { in Nc(Nc-1)/2 ... Nc^(Nc-1) -1} =>  i delta_{i,i1} delta_{j,i2} - i delta_{i,i1} delta_{j,i2}  
-  //   
-  // * Diagonal; must be traceless and normalised
-  //   - Sequence is 
-  //   N  (1,-1,0,0...)
-  //   N  (1, 1,-2,0...)
-  //   N  (1, 1, 1,-3,0...)
-  //   N  (1, 1, 1, 1,-4,0...)
-  //
-  //   where 1/2 = N^2 (1+.. m^2)etc.... for the m-th diagonal generator
-  //   NB this gives the famous SU3 result for su2 index 8
-  //
-  //   N= sqrt(1/2 . 1/6 ) = 1/2 . 1/sqrt(3) 
-  //
-  //   ( 1      )
-  //   (    1   ) / sqrt(3) /2  = 1/2 lambda_8
-  //   (      -2)
-  ////////////////////////////////////////////////////////////////////////
-  template<class CComplex,int Ncolour> 
-  static void suNgenerator(int lieIndex,suNmatrix<CComplex,Ncolour> &ta){
-    // map lie index to which type of generator
-    int diagIndex;
-    int su2Index;
-    int sigxy;
-    int NNm1 =  Ncolour*(Ncolour-1);
-    if ( lieIndex>= NNm1 ) {
-      diagIndex = lieIndex -NNm1;
-      suNgeneratorDiagonal(diagIndex,ta);
-      return;
-    }
-    sigxy   = lieIndex&0x1;
-    su2Index= lieIndex>>1;
-    if ( sigxy ) suNgeneratorSigmaY(su2Index,ta);
-    else         suNgeneratorSigmaX(su2Index,ta);
-  }
-  template<class CComplex,int Ncolour> 
-  static void suNgeneratorSigmaX(int su2Index,suNmatrix<CComplex,Ncolour> &ta){
-    ta=zero;
-    int i1,i2;
-    su2SubGroupIndex<Ncolour>(i1,i2,su2Index);
-    ta()()(i1,i2)=1.0;
-    ta()()(i2,i1)=1.0;
-    ta= ta *0.5;
-  }
-  template<class CComplex,int Ncolour> 
-  static void suNgeneratorSigmaY(int su2Index,suNmatrix<CComplex,Ncolour> &ta){
-    ta=zero;
-    Complex i(0.0,1.0);
-    int i1,i2;
-    su2SubGroupIndex<Ncolour>(i1,i2,su2Index);
-    ta()()(i1,i2)=-i;
-    ta()()(i2,i1)= i;
-    ta= ta *0.5;
-  }
-  template<class CComplex,int Ncolour> 
-  static void suNgeneratorDiagonal(int diagIndex,suNmatrix<CComplex,Ncolour> &ta){
-    ta=zero;
-    int trsq=0;
-    int last=diagIndex+1;
-    for(int i=0;i<=diagIndex;i++){
-      ta()()(i,i) = 1.0;
-      trsq++;
-    }
-    ta()()(last,last) = -last;
-    trsq+=last*last;
-    RealD nrm = 1.0/std::sqrt(2.0*trsq);
-    ta = ta *nrm;
-  }
-  ////////////////////////////////////////////////////////////////////////
-  // Map a 
-  //
-  ////////////////////////////////////////////////////////////////////////
-  template<int Ncolour>
-  static void su2SubGroupIndex(int &i1,int &i2,int su2_index){
-
-    assert( (su2_index>=0) && (su2_index< (Ncolour*(Ncolour-1))/2) );
-
-    int spare=su2_index;
-    for(i1=0;spare >= (Ncolour-1-i1);i1++ ){
-      spare = spare - (Ncolour-1-i1);  // remove the Nc-1-i1 terms                                 
-    }
-    i2=i1+1+spare;
-  }
-  template<class CComplex,int Ncolour>
-  static void su2Extract(std::vector<LatticeComplex> &r,const Lattice<suNmatrix<CComplex,Ncolour> > &source, int su2_index)
-  {
-    assert(r.size() == 4); // store in 4 real parts
-    
-    for(int i=0;i<4;i++){
-      conformable(r[i],source);
-    }
-    
-    int i1,i2;    
-    su2SubGroupIndex<Ncolour>(i1,i2,su2_index);
-    
-    /* Compute the b(k) of A_SU(2) = b0 + i sum_k bk sigma_k */ 
-    r[0] = real(source()()(i1,i1) + source()()(i2,i2));
-    r[1] = imag(source()()(i1,i2) + source()()(i2,i1));
-    r[2] = real(source()()(i1,i2) - source()()(i2,i1));
-    r[3] = imag(source()()(i1,i1) - source()()(i2,i2));
-  }
-
-
-  template<int Ncolour> static void printGenerators(void)
-  {
-    for(int gen=0;gen<suN::generators(Ncolour);gen++){
-      suN::suNmatrix<Complex,Ncolour> ta;
-      suN::suNgenerator(gen,ta);
-      std::cout<< "Nc = "<<Ncolour<<" t_"<<gen<<std::endl;
-      std::cout<<ta<<std::endl;
-    }
-  }
-
-  template<int Ncolour> static void testGenerators(void){
-    suNmatrix<Complex,Ncolour> ta;
-    suNmatrix<Complex,Ncolour> tb;
-    std::cout<<"Checking trace ta tb is 0.5 delta_ab"<<std::endl;
-    for(int a=0;a<generators(Ncolour);a++){
-      for(int b=0;b<generators(Ncolour);b++){
-	suNgenerator(a,ta);
-	suNgenerator(b,tb);
-	Complex tr =TensorRemove(trace(ta*tb)); 
-	std::cout<<tr<<" ";
-	if(a==b) assert(abs(tr-Complex(0.5))<1.0e-6);
-	if(a!=b) assert(abs(tr)<1.0e-6);
-      }
-      std::cout<<std::endl;
-    }
-    std::cout<<"Checking hermitian"<<std::endl;
-    for(int a=0;a<generators(Ncolour);a++){
-      suNgenerator(a,ta);
-      std::cout<<a<<" ";
-      assert(norm2(ta-adj(ta))<1.0e-6);
-    }    
-    std::cout<<std::endl;
-
-    std::cout<<"Checking traceless"<<std::endl;
-    for(int a=0;a<generators(Ncolour);a++){
-      suNgenerator(a,ta);
-      Complex tr =TensorRemove(trace(ta)); 
-      std::cout<<a<<" ";
-      assert(abs(tr)<1.0e-6);
-    }    
-    std::cout<<std::endl;
-  }
-
-};
-

 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

-
-  std::vector<int> simd_layout = GridDefaultSimd(4,vComplexF::Nsimd());
-  std::vector<int> mpi_layout  = GridDefaultMpi();
-  std::vector<int> latt_size  ({4,4,4,4});
-
-  GridCartesian     Fine(latt_size,simd_layout,mpi_layout);
-
-  LatticeGaugeField Umu(&Fine);
+  std::vector<int> latt({4,4,4,8});
+  GridCartesian * grid = SpaceTimeGrid::makeFourDimGrid(latt, 
+							GridDefaultSimd(Nd,vComplexF::Nsimd()),
+							GridDefaultMpi());
+  
+  GridRedBlackCartesian * rbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(grid);

  std::cout<<"*********************************************"<<std::endl;
  std::cout<<"* Generators for SU(2)"<<std::endl;
  std::cout<<"*********************************************"<<std::endl;
-  suN::printGenerators<2>();
-  suN::testGenerators<2>();
+  SU2::printGenerators();
+  SU2::testGenerators();
+
  std::cout<<"*********************************************"<<std::endl;
  std::cout<<"* Generators for SU(3)"<<std::endl;
  std::cout<<"*********************************************"<<std::endl;
-  suN::printGenerators<3>();
-  suN::testGenerators<3>();
-  std::cout<<"*********************************************"<<std::endl;
-  std::cout<<"* Generators for SU(4)"<<std::endl;
-  std::cout<<"*********************************************"<<std::endl;
-  suN::printGenerators<4>();
-  suN::testGenerators<4>();
-  std::cout<<"*********************************************"<<std::endl;
-  std::cout<<"* Generators for SU(5)"<<std::endl;
-  std::cout<<"*********************************************"<<std::endl;
-  suN::printGenerators<5>();
-  suN::testGenerators<5>();
+  SU3::printGenerators();
+  SU3::testGenerators();
+
+  //  std::cout<<"*********************************************"<<std::endl;
+  //  std::cout<<"* Generators for SU(4)"<<std::endl;
+  //  std::cout<<"*********************************************"<<std::endl;
+  //  SU4::printGenerators();
+  //  SU4::testGenerators();
+
+  //  std::cout<<"*********************************************"<<std::endl;
+  //  std::cout<<"* Generators for SU(5)"<<std::endl;
+  //  std::cout<<"*********************************************"<<std::endl;
+  //  SU5::printGenerators();
+  //  SU5::testGenerators();
+
+  ///////////////////////////////
+  // Configuration of known size
+  ///////////////////////////////
+  NerscField header;
+  std::string file("./ckpoint_lat.400");
+  LatticeGaugeField Umu(grid);
+  //  readNerscConfiguration(Umu,header,file);
+  Umu=1.0; // Cold start
+
+  // RNG set up for test
+  std::vector<int> pseeds({1,2,3,4,5}); // once I caught a fish alive
+  std::vector<int> sseeds({6,7,8,9,10});// then i let it go again
+  GridParallelRNG  pRNG(grid); pRNG.SeedFixedIntegers(pseeds);
+  GridSerialRNG    sRNG;       sRNG.SeedFixedIntegers(sseeds);
+
+  // SU3 colour operatoions
+  LatticeColourMatrix link(grid);
+  LatticeColourMatrix staple(grid);
+  int mu=0;
+
+  // Get Staple
+  ColourWilsonLoops::Staple(staple,Umu,mu);
+  // Get Link
+  link = peekIndex<LorentzIndex>(Umu,mu);
+
+  // Apply heatbath to the link
+  RealD beta=6.0;
+  int subgroup=0;
+  int nhb=1;
+  int trials=0;
+  int fails=0;
+
+  LatticeInteger one(rbGrid);  one = 1; // fill with ones
+  LatticeInteger mask(grid);   mask= zero;
+  one.checkerboard=Even;
+  setCheckerboard(mask,one);
+
+  // update Even checkerboard
+
+  SU3::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,
+			nhb,trials,fails,mask);
+
+

  Grid_finalize();
 }
--- a/tests/Test_main.cc
+++ b/tests/Test_main.cc
@@ -248,7 +248,7 @@ int main (int argc, char ** argv)
    

    { // Peek-ology and Poke-ology, with a little app-ology
-      TComplex      c;
+      Complex      c;
      ColourMatrix c_m;   
      SpinMatrix   s_m;   
      SpinColourMatrix sc_m; 
@@ -302,10 +302,12 @@ int main (int argc, char ** argv)
      LatticeInteger coor(&Fine);
      LatticeCoordinate(coor,d);
      lex = lex + coor*mm[d];
+      
    }

    Bar = zero;
-    Bar = where(lex<10,Foo,Bar);
+    Bar = where(lex<Integer(10),Foo,Bar);
+    cout << "peeking sites..\n";
    {
      std::vector<int> coor(4);
      for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
@@ -473,7 +475,8 @@ int main (int argc, char ** argv)
        mdiff = shifted1-shifted2;
        amdiff=adj(mdiff);
        ColourMatrix prod = amdiff*mdiff;
-        Real Ttr=real(trace(prod));
+	Complex trprod = trace(prod);
+        Real Ttr=real(trprod);
        double nn=Ttr;
        if ( nn > 0 )
            cout<<"Shift real trace fail "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<endl;