diff --git a/lib/AlignedAllocator.h b/lib/AlignedAllocator.h
index fd7bbb78..3e90371a 100644
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@@ -49,9 +49,9 @@ public:
 
   void deallocate(pointer __p, size_type) { 
 #ifdef HAVE_MM_MALLOC_H
-    _mm_free(__p); 
+    _mm_free((void *)__p); 
 #else
-    free(__p);
+    free((void *)__p);
 #endif
   }
   void construct(pointer __p, const _Tp& __val) { };
diff --git a/lib/Make.inc b/lib/Make.inc
index 11362e27..dab78b37 100644
--- a/lib/Make.inc
+++ b/lib/Make.inc
@@ -1,4 +1,4 @@
 
-HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./GridConfig.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./parallelIO/NerscIO.h ./qcd/action/Actions.h ./qcd/action/DiffAction.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/GaugeActionBase.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/WilsonLoops.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_inner.h ./tensors/Tensor_outer.h ./tensors/Tensor_peek.h ./tensors/Tensor_poke.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h
+HFILES=./Cshift.h ./simd/Grid_avx.h ./simd/Grid_vector_types.h ./simd/Grid_sse4.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_vector_unops.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./Tensors.h ./Algorithms.h ./communicator/Communicator_base.h ./lattice/Lattice_rng.h ./lattice/Lattice_reduction.h ./lattice/Lattice_transfer.h ./lattice/Lattice_unary.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_comparison.h ./lattice/Lattice_overload.h ./lattice/Lattice_reality.h ./lattice/Lattice_local.h ./lattice/Lattice_conformable.h ./lattice/Lattice_where.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_ET.h ./lattice/Lattice_transpose.h ./lattice/Lattice_trace.h ./Stencil.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_poke.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_class.h ./tensors/Tensor_logical.h ./tensors/Tensor_transpose.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_outer.h ./tensors/Tensor_inner.h ./tensors/Tensor_traits.h ./tensors/Tensor_Ta.h ./tensors/Tensor_unary.h ./tensors/Tensor_determinant.h ./tensors/Tensor_peek.h ./tensors/Tensor_arith.h ./tensors/Tensor_extract_merge.h ./Communicator.h ./Cartesian.h ./parallelIO/NerscIO.h ./qcd/QCD.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/LinalgUtils.h ./qcd/utils/CovariantCshift.h ./qcd/utils/WilsonLoops.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/gauge/GaugeActionBase.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/spin/TwoSpinor.h ./qcd/spin/Dirac.h ./cshift/Cshift_common.h ./cshift/Cshift_none.h ./cshift/Cshift_mpi.h ./Simd.h ./GridConfig.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_red_black.h ./cartesian/Cartesian_full.h ./AlignedAllocator.h ./Lattice.h ./Threads.h ./Grid.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Remez.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./algorithms/CoarsenedMatrix.h ./stencil/Lebesgue.h
 
-CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./GridInit.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
+CCFILES=./qcd/utils/SpaceTimeGrid.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/spin/Dirac.cc ./GridInit.cc ./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
diff --git a/lib/Simd.h b/lib/Simd.h
index f93c3070..7da0d7ad 100644
--- a/lib/Simd.h
+++ b/lib/Simd.h
@@ -46,6 +46,18 @@ namespace Grid {
   inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conjugate(l)*r; }
   inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
   inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
+
+  inline ComplexD Reduce(const ComplexD& r){ return r; }
+  inline ComplexF Reduce(const ComplexF& r){ return r; }
+  inline RealD Reduce(const RealD& r){ return r; }
+  inline RealF Reduce(const RealF& r){ return r; }
+
+  inline RealD toReal(const ComplexD& r){ return real(r); }
+  inline RealF toReal(const ComplexF& r){ return real(r); }
+  inline RealD toReal(const RealD& r){ return r; }
+  inline RealF toReal(const RealF& r){ return r; }
+
+  
   
   ////////////////////////////////////////////////////////////////////////////////
   //Provide support functions for basic real and complex data types required by Grid
diff --git a/lib/Tensors.h b/lib/Tensors.h
index a2246f4a..90c9d7a8 100644
--- a/lib/Tensors.h
+++ b/lib/Tensors.h
@@ -9,6 +9,8 @@
 #include <tensors/Tensor_transpose.h>
 #include <tensors/Tensor_trace.h>
 #include <tensors/Tensor_Ta.h>
+#include <tensors/Tensor_determinant.h>
+#include <tensors/Tensor_exp.h>
 #include <tensors/Tensor_peek.h>
 #include <tensors/Tensor_poke.h>
 #include <tensors/Tensor_reality.h>
diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h
index 3a84ed49..2d885d36 100644
--- a/lib/cartesian/Cartesian_red_black.h
+++ b/lib/cartesian/Cartesian_red_black.h
@@ -142,7 +142,10 @@ public:
 	// Use a reduced simd grid
 	_simd_layout[d] = simd_layout[d];
 	_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
-	
+
+	// all elements of a simd vector must have same checkerboard.
+	if ( simd_layout[d]>1 ) assert((_rdimensions[d]&0x1)==0); 
+
 	_osites *= _rdimensions[d];
 	_isites *= _simd_layout[d];
         
diff --git a/lib/lattice/Lattice_ET.h b/lib/lattice/Lattice_ET.h
index caa45e69..82d11218 100644
--- a/lib/lattice/Lattice_ET.h
+++ b/lib/lattice/Lattice_ET.h
@@ -1,4 +1,3 @@
-
 #ifndef GRID_LATTICE_ET_H
 #define GRID_LATTICE_ET_H
 
diff --git a/lib/lattice/Lattice_base.h b/lib/lattice/Lattice_base.h
index cd2fa855..0503732c 100644
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -158,7 +158,7 @@ PARALLEL_FOR_LOOP
       vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
 #else
-      _odata[ss]=_eval(ss,expr);
+      _odata[ss]=eval(ss,expr);
 #endif
     }
   };
@@ -196,7 +196,7 @@ PARALLEL_FOR_LOOP
     _odata.resize(_grid->oSites());
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
-      _odata[ss]=eval(ss,expr);
+      vstream(_odata[ss] ,eval(ss,expr));
     }
   };
 
diff --git a/lib/lattice/Lattice_unary.h b/lib/lattice/Lattice_unary.h
index 91b39007..6d72ef31 100644
--- a/lib/lattice/Lattice_unary.h
+++ b/lib/lattice/Lattice_unary.h
@@ -24,5 +24,18 @@ PARALLEL_FOR_LOOP
     return ret;
   }
 
+  template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, ComplexD alpha, Integer Nexp = DEFAULT_MAT_EXP){
+    Lattice<obj> ret(rhs._grid);
+    ret.checkerboard = rhs.checkerboard;
+    conformable(ret,rhs);
+PARALLEL_FOR_LOOP
+    for(int ss=0;ss<rhs._grid->oSites();ss++){
+      ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp);
+    }
+    return ret;
+  }
+
+
+
 }
 #endif
diff --git a/lib/qcd/utils/SUn.h b/lib/qcd/utils/SUn.h
index 3a8b5f4d..0ede7965 100644
--- a/lib/qcd/utils/SUn.h
+++ b/lib/qcd/utils/SUn.h
@@ -1,7 +1,6 @@
 #ifndef QCD_UTIL_SUN_H
 #define QCD_UTIL_SUN_H
 
-
 namespace Grid {
   namespace QCD {
 
@@ -13,6 +12,7 @@ public:
   static int su2subgroups(void) { return (ncolour*(ncolour-1))/2; }
 
   template<typename vtype> using iSUnMatrix              = iScalar<iScalar<iMatrix<vtype, ncolour> > > ;
+  template<typename vtype> using iSU2Matrix              = iScalar<iScalar<iMatrix<vtype, 2> > > ;
 
   //////////////////////////////////////////////////////////////////////////////////////////////////
   // Types can be accessed as SU<2>::Matrix , SU<2>::vSUnMatrix, SU<2>::LatticeMatrix etc...
@@ -29,6 +29,19 @@ public:
   typedef Lattice<vMatrixF>    LatticeMatrixF;
   typedef Lattice<vMatrixD>    LatticeMatrixD;
 
+  typedef iSU2Matrix<Complex>         SU2Matrix;
+  typedef iSU2Matrix<ComplexF>        SU2MatrixF;
+  typedef iSU2Matrix<ComplexD>        SU2MatrixD;
+
+  typedef iSU2Matrix<vComplex>       vSU2Matrix;
+  typedef iSU2Matrix<vComplexF>      vSU2MatrixF;
+  typedef iSU2Matrix<vComplexD>      vSU2MatrixD;
+
+  typedef Lattice<vSU2Matrix>     LatticeSU2Matrix;
+  typedef Lattice<vSU2MatrixF>    LatticeSU2MatrixF;
+  typedef Lattice<vSU2MatrixD>    LatticeSU2MatrixD;
+
+
   ////////////////////////////////////////////////////////////////////////
   // There are N^2-1 generators for SU(N).
   //
@@ -122,6 +135,7 @@ public:
     RealD nrm = 1.0/std::sqrt(2.0*trsq);
     ta = ta *nrm;
   }
+
   ////////////////////////////////////////////////////////////////////////
   // Map a su2 subgroup number to the pair of rows that are non zero
   ////////////////////////////////////////////////////////////////////////
@@ -135,208 +149,333 @@ public:
     }
     i2=i1+1+spare;
   }
-  template<class vreal,class vcplx>
-  static void su2Extract(std::vector<Lattice<iSinglet  <vreal> > > &r,
-			       const Lattice<iSUnMatrix<vcplx> > &source, 
-			 int su2_index)
+
+  //////////////////////////////////////////////////////////////////////////////////////////
+  // Pull out a subgroup and project on to real coeffs x pauli basis
+  //////////////////////////////////////////////////////////////////////////////////////////
+  template<class vcplx>
+  static void su2Extract(    Lattice<iSinglet<vcplx> > &Determinant,
+			     Lattice<iSU2Matrix<vcplx> > &subgroup, 
+			     const Lattice<iSUnMatrix<vcplx> > &source, 
+			     int su2_index)
   {
     GridBase *grid(source._grid);
+    conformable(subgroup,source);
+    conformable(subgroup,Determinant);
+    int i0,i1;    
+    su2SubGroupIndex(i0,i1,su2_index);
+
+PARALLEL_FOR_LOOP
+    for(int ss=0;ss!=grid->oSites();ss++){
+      subgroup._odata[ss]()()(0,0) = source._odata[ss]()()(i0,i0);
+      subgroup._odata[ss]()()(0,1) = source._odata[ss]()()(i0,i1);
+      subgroup._odata[ss]()()(1,0) = source._odata[ss]()()(i1,i0);
+      subgroup._odata[ss]()()(1,1) = source._odata[ss]()()(i1,i1);
+
+      iSU2Matrix<vcplx> Sigma = subgroup._odata[ss];
+
+      Sigma = Sigma-adj(Sigma)+trace(adj(Sigma));
+
+      subgroup._odata[ss] = Sigma;
+
+      // this should be purely real
+      Determinant._odata[ss] = Sigma()()(0,0)*Sigma()()(1,1)
+   	                     - Sigma()()(0,1)*Sigma()()(1,0);
 
-    assert(r.size() == 4); // store in 4 real parts
-    
-    for(int i=0;i<4;i++){
-      conformable(r[i],source);
     }
-    
-    int i1,i2;    
-    su2SubGroupIndex(i1,i2,su2_index);
 
-    /* Compute the b(k) of A_SU(2) = b0 + i sum_k bk sigma_k */ 
-    
-    //    r[0] = toReal(real(peekColour(source,i1,i1)) + real(peekColour(source,i2,i2)));
-    //    r[1] = toReal(imag(peekColour(source,i1,i2)) + imag(peekColour(source,i2,i1)));
-    //    r[2] = toReal(real(peekColour(source,i1,i2)) - real(peekColour(source,i2,i1)));
-    //    r[3] = toReal(imag(peekColour(source,i1,i1)) - imag(peekColour(source,i2,i2)));
-    r[0] = toReal(real(peekColour(source,i1,i1)) + real(peekColour(source,i2,i2)));
-    r[1] = toReal(imag(peekColour(source,i1,i2)) + imag(peekColour(source,i2,i1)));
-    r[2] = toReal(real(peekColour(source,i1,i2)) - real(peekColour(source,i2,i1)));
-    r[3] = toReal(imag(peekColour(source,i1,i1)) - imag(peekColour(source,i2,i2)));
-    
   }
 
-  template<class vreal,class vcplx>
-  static void su2Insert(const std::vector<Lattice<iSinglet<vreal> > > &r,
-			Lattice<iSUnMatrix<vcplx> > &dest,
+  //////////////////////////////////////////////////////////////////////////////////////////
+  // Set matrix to one and insert a pauli subgroup
+  //////////////////////////////////////////////////////////////////////////////////////////
+  template<class vcplx>
+  static void su2Insert( const Lattice<iSU2Matrix<vcplx> > &subgroup, 
+			       Lattice<iSUnMatrix<vcplx> > &dest,
 			int su2_index)
   {
-    typedef typename Lattice<iSUnMatrix<vcplx> >::scalar_type cplx;
-    typedef Lattice<iSinglet<vcplx> > Lcomplex;
-    GridBase * grid = dest._grid;
+    GridBase *grid(dest._grid);
+    conformable(subgroup,dest);
+    int i0,i1;    
+    su2SubGroupIndex(i0,i1,su2_index);
 
-    assert(r.size() == 4); // store in 4 real parts
-
-    Lcomplex tmp(grid);
-
-    std::vector<Lcomplex > cr(4,grid);
-    for(int i=0;i<r.size();i++){
-      conformable(r[i],dest);
-      cr[i]=toComplex(r[i]);
+    dest = 1.0; // start out with identity
+PARALLEL_FOR_LOOP
+    for(int ss=0;ss!=grid->oSites();ss++){
+      dest._odata[ss]()()(i0,i0) = subgroup._odata[ss]()()(0,0);
+      dest._odata[ss]()()(i0,i1) = subgroup._odata[ss]()()(0,1);
+      dest._odata[ss]()()(i1,i0) = subgroup._odata[ss]()()(1,0);
+      dest._odata[ss]()()(i1,i1) = subgroup._odata[ss]()()(1,1);
     }
-
-    int i1,i2;
-    su2SubGroupIndex(i1,i2,su2_index);
-
-    cplx one   (1.0,0.0);
-    cplx cplx_i(0.0,1.0);
-
-    tmp =   cr[0]*one+ cr[3]*cplx_i;   pokeColour(dest,tmp,i1,i2); 
-    tmp =   cr[2]*one+ cr[1]*cplx_i;   pokeColour(dest,tmp,i1,i2);
-    tmp =  -cr[2]*one+ cr[1]*cplx_i;   pokeColour(dest,tmp,i2,i1);
-    tmp =   cr[0]*one- cr[3]*cplx_i;   pokeColour(dest,tmp,i2,i2);
   }
 
+
+  ///////////////////////////////////////////////
+  // Generate e^{ Re Tr Staple Link} dlink 
+  //
+  // *** Note Staple should be appropriate linear compbination between all staples.
+  // *** If already by beta pass coefficient 1.0.
+  // *** This routine applies the additional 1/Nc factor that comes after trace in action.
+  //
+  ///////////////////////////////////////////////
   static void SubGroupHeatBath(      GridSerialRNG       &sRNG,
 				     GridParallelRNG     &pRNG,
-			             RealD beta,
+				     RealD beta,//coeff multiplying staple in action (with no 1/Nc)
 				     LatticeMatrix &link, 
-			       const LatticeMatrix &staple, 
+				     const LatticeMatrix &barestaple, // multiplied by action coeffs so th
 				     int su2_subgroup,
 				     int nheatbath,
-				     int& ntrials,
-				     int& nfails,
 				     LatticeInteger &wheremask)
   {
     GridBase *grid = link._grid;
 
-    LatticeMatrix V(grid);
-    V = link*staple;
-
-    std::vector<LatticeReal> r(4,grid);
-    std::vector<LatticeReal> a(4,grid);
-    su2Extract(r,V,su2_subgroup); // HERE
-
-    LatticeReal r_l(grid);
-    r_l  = r[0]*r[0]+r[1]*r[1]+r[2]*r[2]+r[3]*r[3];
-    r_l = sqrt(r_l);
-
-    LatticeReal ftmp(grid);
-    LatticeReal ftmp1(grid);
-    LatticeReal ftmp2(grid);
-    LatticeReal one (grid);    one = 1.0;
-    LatticeReal zz  (grid);    zz  = zero;
-    LatticeReal recip(grid);   recip = one/r_l;
-    
-    Real machine_epsilon= 1.0e-14;
-    
-    ftmp = where(r_l>machine_epsilon,recip,one);
-    a[0] = where(r_l>machine_epsilon,   r[0] * ftmp , one);
-    a[1] = where(r_l>machine_epsilon, -(r[1] * ftmp), zz);
-    a[2] = where(r_l>machine_epsilon, -(r[2] * ftmp), zz);
-    a[3] = where(r_l>machine_epsilon, -(r[3] * ftmp), zz);
-
-    r_l *=  beta / ncolour;
-    
-    ftmp1 = where(wheremask,one,zz);
-    Real num_sites = TensorRemove(sum(ftmp1));
-
-    Integer itrials = (Integer)num_sites;
-    ntrials = 0;
-    nfails = 0;
-
-    LatticeInteger lupdate(grid);
-    LatticeInteger lbtmp(grid);
-    LatticeInteger lbtmp2(grid); lbtmp2=zero;
-
-    int n_done = 0;
-    int nhb = 0;
-
-    r[0] = a[0];
-    lupdate = 1;
-
-    LatticeReal ones (grid); ones = 1.0;
-    LatticeReal zeros(grid); zeros=zero;
-
+    int ntrials=0;
+    int nfails=0;
     const RealD twopi=2.0*M_PI;
-    while ( nhb < nheatbath && n_done < num_sites ) {
 
-       ntrials += itrials;
+    LatticeMatrix staple(grid); 
 
-       random(pRNG,r[1]);
-       std::cout<<"RANDOM SPARSE FLOAT r[1]"<<std::endl;
-       std::cout<<r[1]<<std::endl;
+    staple = barestaple * (beta/ncolour);
 
-       random(pRNG,r[2]);
-       random(pRNG,ftmp);
+    LatticeMatrix    V(grid);    V = link*staple;
 
-       r[1] = log(r[1]);
-       r[2] = log(r[2]);
+    // Subgroup manipulation in the lie algebra space
+    LatticeSU2Matrix    u(grid);   // Kennedy pendleton "u" real projected normalised Sigma
+    LatticeSU2Matrix uinv(grid);
+    LatticeSU2Matrix   ua(grid);   // a in pauli form
+    LatticeSU2Matrix    b(grid);   // rotated matrix after hb
 
-       ftmp = ftmp*twopi;
-       r[3] = cos(ftmp);
-       r[3] = r[3]*r[3];
+    // Some handy constant fields
+    LatticeComplex ones (grid); ones = 1.0;
+    LatticeComplex zeros(grid); zeros=zero;
+    LatticeReal rones (grid); rones = 1.0;
+    LatticeReal rzeros(grid); rzeros=zero;
+    LatticeComplex udet(grid); // determinant of real(staple)
+    LatticeInteger mask_true (grid); mask_true = 1;
+    LatticeInteger mask_false(grid); mask_false= 0;
 
-       r[1] += r[2] * r[3];
-       r[2]  = r[1] / r_l;
+  /*
+PLB 156 P393 (1985) (Kennedy and Pendleton)
 
-       random(pRNG,ftmp);
-       r[1] = ftmp*ftmp;
+Note: absorb "beta" into the def of sigma compared to KP paper; staple
+passed to this routine has "beta" already multiplied in
 
-       {
-	 LatticeInteger mask_true (grid); mask_true = 1;
-	 LatticeInteger mask_false(grid); mask_false= 0;
-	 LatticeReal    thresh(grid); thresh = (1.0 + 0.5*r[2]);
-	 lbtmp = where(r[1] <= thresh,mask_true,mask_false);
-       }
-       lbtmp2= lbtmp && lupdate;       
-       r[0]  = where(lbtmp2, 1.0+r[2], r[0]);
+Action linear in links h and of form:
 
-       ftmp1 = where(lbtmp2,ones,zeros);
-       RealD sitesum = sum(ftmp1);
-       Integer itmp = sitesum;
+    beta S = beta  Sum_p (1 - 1/Nc Re Tr Plaq )
 
-       n_done += itmp;
-       itrials -= itmp;
-       nfails += itrials;
-       lbtmp   = !lbtmp;
-       lupdate = lupdate & lbtmp;
-       ++nhb;
-    }
-    // Now create r[1], r[2] and r[3] according to the spherical measure 
-    // Take absolute value to guard against round-off 
-    random(pRNG,ftmp1);
-    r[2] = 1.0 - 2.0*ftmp1;
- 
-    ftmp1 = abs(1.0 - r[0]*r[0]);
-    r[3]   = -(sqrt(ftmp1) * r[2]);
+Writing Sigma = 1/Nc (beta Sigma') where sum over staples is "Sigma' "
 
-    // Take absolute value to guard against round-off 
-    r_l    = sqrt(abs(ftmp1 - r[3]*r[3]));
- 
-    random(pRNG,ftmp1);
-    ftmp1 *= twopi;
-    r[1]    = r_l * cos(ftmp1);
-    r[2]    = r_l * sin(ftmp1);
+     beta S = const - beta/Nc Re Tr h Sigma'
+            = const - Re Tr h Sigma
+      
+Decompose h and Sigma into (1, sigma_j) ; h_i real, h^2=1, Sigma_i complex arbitrary.
 
-    // Update matrix is B = R * A, with B, R and A given by b_i, r_i and a_i 
-    std::vector<LatticeReal> b(4,grid);
-    b[0] = r[0]*a[0] - r[1]*a[1] - r[2]*a[2] - r[3]*a[3];
-    b[1] = r[0]*a[1] + r[1]*a[0] - r[2]*a[3] + r[3]*a[2];
-    b[2] = r[0]*a[2] + r[2]*a[0] - r[3]*a[1] + r[1]*a[3];
-    b[3] = r[0]*a[3] + r[3]*a[0] - r[1]*a[2] + r[2]*a[1];
+    Tr h Sigma = h_i Sigma_j Tr (sigma_i sigma_j)  = h_i Sigma_j 2 delta_ij
+ Re Tr h Sigma = 2 h_j Re Sigma_j
 
-     //
-     // Now fill an SU(3) matrix V with the SU(2) submatrix su2_index
-     // parametrized by a_k in the sigma matrix basis.
-     //
+Normalised re Sigma_j = xi u_j 
+
+With u_j a unit vector and U can be in SU(2);
+
+Re Tr h Sigma = 2 h_j Re Sigma_j = 2 xi (h.u)
+
+4xi^2 = Det [ Sig - Sig^dag  + 1 Tr Sigdag]
+ u   = 1/2xi [ Sig - Sig^dag  + 1 Tr Sigdag]
+
+ xi = sqrt(Det)/2;
+
+Write a= u h in SU(2); a has pauli decomp a_j;
+
+Note: Product b' xi is unvariant because scaling Sigma leaves
+      normalised vector "u" fixed; Can rescale Sigma so b' = 1.
+  */
+
+    ////////////////////////////////////////////////////////
+    // Real part of Pauli decomposition
+    // Note a subgroup can project to zero in cold start
+    ////////////////////////////////////////////////////////
+    su2Extract(udet,u,V,su2_subgroup); 
+
+    //////////////////////////////////////////////////////
+    // Normalising this vector if possible; else identity
+    //////////////////////////////////////////////////////
+    LatticeComplex xi(grid);  
+
+    LatticeSU2Matrix lident(grid);
+    
+    SU2Matrix ident = Complex(1.0); 
+    SU2Matrix pauli1; SU<2>::generator(0,pauli1);
+    SU2Matrix pauli2; SU<2>::generator(1,pauli2);
+    SU2Matrix pauli3; SU<2>::generator(2,pauli3);
+    pauli1 = timesI(pauli1)*2.0;
+    pauli2 = timesI(pauli2)*2.0;
+    pauli3 = timesI(pauli3)*2.0;
+
+    LatticeComplex cone(grid);
+    LatticeReal adet(grid);
+    adet = abs(toReal(udet));
+    lident=Complex(1.0);
+    cone  =Complex(1.0);
+    Real machine_epsilon=1.0e-7;
+    u   = where(adet>machine_epsilon,u,lident);
+    udet= where(adet>machine_epsilon,udet,cone);
+
+    xi  = 0.5*sqrt(udet);     //4xi^2 = Det [ Sig - Sig^dag  + 1 Tr Sigdag]
+    u   = 0.5*u*pow(xi,-1.0); //  u   = 1/2xi [ Sig - Sig^dag  + 1 Tr Sigdag]
+
+    // Debug test for sanity
+    uinv=adj(u);
+    b=u*uinv-1.0;
+    assert(norm2(b)<1.0e-4);
+
+  /*
+Measure: Haar measure dh has d^4a delta(1-|a^2|)
+In polars:
+  da = da0 r^2 sin theta dr dtheta dphi delta( 1 - r^2 -a0^2) 
+     = da0 r^2 sin theta dr dtheta dphi delta( (sqrt(1-a0^) - r)(sqrt(1-a0^) + r) )
+     = da0 r/2 sin theta dr dtheta dphi delta( (sqrt(1-a0^) - r) )
+
+Action factor Q(h) dh  = e^-S[h]  dh =  e^{  xi Tr uh} dh    // beta enters through xi
+                                     =  e^{2 xi (h.u)} dh
+                                     =  e^{2 xi h0u0}.e^{2 xi h1u1}.e^{2 xi h2u2}.e^{2 xi h3u3} dh
+
+Therefore for each site, take xi for that site
+i) generate  |a0|<1 with dist 
+   (1-a0^2)^0.5 e^{2 xi a0 } da0
+
+Take alpha = 2 xi  = 2 xi [ recall 2 beta/Nc unmod staple norm]; hence 2.0/Nc factor in Chroma ]
+A. Generate two uniformly distributed pseudo-random numbers R and R', R'', R''' in the unit interval; 
+B. Set X = -(ln R)/alpha, X' =-(ln R')/alpha;
+C. Set C = cos^2(2pi R"), with R" another uniform random number in [0,1] ;
+D. Set A = XC;
+E. Let d  = X'+A;
+F. If R'''^2 :> 1 - 0.5 d,  go back to A;
+G. Set a0 = 1 - d;
+
+Note that in step D setting B ~ X - A and using B in place of A in step E will generate a second independent a 0 value.
+  */
+
+    /////////////////////////////////////////////////////////
+    // count the number of sites by picking "1"'s out of hat
+    /////////////////////////////////////////////////////////
+    Integer hit=0;
+    LatticeReal rtmp(grid);
+    rtmp=where(wheremask,rones,rzeros);
+    RealD numSites = sum(rtmp);
+    RealD numAccepted;
+    LatticeInteger      Accepted(grid); Accepted = zero;
+    LatticeInteger newlyAccepted(grid);
+   
+    std::vector<LatticeReal> xr(4,grid);
+    std::vector<LatticeReal>  a(4,grid);
+    LatticeReal d(grid); d=zero;
+    LatticeReal alpha(grid);
+
+    //    std::cout<<"xi "<<xi <<std::endl;
+    alpha = toReal(2.0*xi);
+
+    do { 
+
+      // A. Generate two uniformly distributed pseudo-random numbers R and R', R'', R''' in the unit interval; 
+      random(pRNG,xr[0]); 
+      random(pRNG,xr[1]); 
+      random(pRNG,xr[2]); 
+      random(pRNG,xr[3]); 
+
+      // B. Set X = - ln R/alpha, X' = -ln R'/alpha
+      xr[1] = -log(xr[1])/alpha;  
+      xr[2] = -log(xr[2])/alpha;
+
+      // C. Set C = cos^2(2piR'')
+      xr[3] = cos(xr[3]*twopi);
+      xr[3] = xr[3]*xr[3];
+
+      LatticeReal xrsq(grid);
+
+      //D. Set A = XC;
+      //E. Let d  = X'+A;
+      xrsq = xr[2]+xr[1]*xr[3];
+
+      d = where(Accepted,d,xr[2]+xr[1]*xr[3]);
+	      
+      //F. If R'''^2 :> 1 - 0.5 d,  go back to A;
+      LatticeReal thresh(grid); thresh = 1.0-d*0.5;
+      xrsq = xr[0]*xr[0];
+      LatticeInteger ione(grid); ione = 1;
+      LatticeInteger izero(grid); izero=zero;
+
+      newlyAccepted = where ( xrsq < thresh,ione,izero);
+      Accepted = where ( newlyAccepted, newlyAccepted,Accepted);
+      Accepted = where ( wheremask, Accepted,izero);
+
+      // FIXME need an iSum for integer to avoid overload on return type??
+      rtmp=where(Accepted,rones,rzeros);
+      numAccepted = sum(rtmp);
+
+      hit++;
+
+    } while ( (numAccepted < numSites) && ( hit < nheatbath) );
+
+    // G. Set a0 = 1 - d;
+    a[0]=zero;
+    a[0]=where(wheremask,1.0-d,a[0]);
+
+    //////////////////////////////////////////
+    //    ii) generate a_i uniform on two sphere radius (1-a0^2)^0.5
+    //////////////////////////////////////////
+
+    LatticeReal a123mag(grid);
+    a123mag = sqrt(abs(1.0-a[0]*a[0]));
+
+    LatticeReal cos_theta (grid);
+    LatticeReal sin_theta (grid);
+    LatticeReal       phi (grid);
+
+    random(pRNG,phi);            phi = phi * twopi;       // uniform in [0,2pi]
+    random(pRNG,cos_theta); cos_theta=(cos_theta*2.0)-1.0;  // uniform in [-1,1]
+    sin_theta = sqrt(abs(1.0-cos_theta*cos_theta));
+
+    a[1]    = a123mag * sin_theta * cos(phi);
+    a[2]    = a123mag * sin_theta * sin(phi);
+    a[3]    = a123mag * cos_theta;
+    
+    ua = toComplex(a[0])*ident 
+       + toComplex(a[1])*pauli1 
+       + toComplex(a[2])*pauli2 
+       + toComplex(a[3])*pauli3;
+
+    b = 1.0;
+    b = where(wheremask,uinv*ua,b);
     su2Insert(b,V,su2_subgroup);
 
-    // U = V*U
-    LatticeMatrix tmp(grid);
-    tmp = V * link;
+    //mask the assignment back based on Accptance
+    link = where(Accepted,V * link,link);
 
-    //mask the assignment back
-    link = where(wheremask,tmp,link);
+    //////////////////////////////
+    // Debug Checks
+    // SU2 check
+    LatticeSU2Matrix    check(grid);   // rotated matrix after hb
+    u = zero;
+    check = ua * adj(ua) - 1.0;
+    check = where(Accepted,check,u);
+    assert(norm2(check)<1.0e-4);
 
+    check = b*adj(b)-1.0;
+    check = where(Accepted,check,u);
+    assert(norm2(check)<1.0e-4);
+    
+    LatticeMatrix Vcheck(grid);
+    Vcheck = zero;
+    Vcheck = where(Accepted,V*adj(V) - 1.0,Vcheck);
+    //    std::cout << "SU3 check " <<norm2(Vcheck)<<std::endl;
+    assert(norm2(Vcheck)<1.0e-4);
+    
+    // Verify the link stays in SU(3)
+    //    std::cout <<"Checking the modified link"<<std::endl;
+    Vcheck = link*adj(link) - 1.0;
+    assert(norm2(Vcheck)<1.0e-4);
+    /////////////////////////////////
   }
 
   static void printGenerators(void)
diff --git a/lib/qcd/utils/WilsonLoops.h b/lib/qcd/utils/WilsonLoops.h
index 69b86859..e3885432 100644
--- a/lib/qcd/utils/WilsonLoops.h
+++ b/lib/qcd/utils/WilsonLoops.h
@@ -30,6 +30,7 @@ public:
   static void sitePlaquette(LatticeComplex &Plaq,const std::vector<GaugeMat> &U)
   {
     LatticeComplex sitePlaq(U[0]._grid);
+    Plaq=zero;
     for(int mu=1;mu<Nd;mu++){
       for(int nu=0;nu<mu;nu++){
 	traceDirPlaquette(sitePlaq,U,mu,nu);
@@ -42,6 +43,7 @@ public:
   //////////////////////////////////////////////////
   static RealD sumPlaquette(const GaugeLorentz &Umu){
     std::vector<GaugeMat> U(4,Umu._grid);
+
     for(int mu=0;mu<Nd;mu++){
       U[mu] = peekIndex<LorentzIndex>(Umu,mu);
     }
@@ -72,13 +74,15 @@ public:
   //////////////////////////////////////////////////
   static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
 
-    std::vector<GaugeMat> U(4,Umu._grid);
+    GridBase *grid = Umu._grid;
+
+    std::vector<GaugeMat> U(4,grid);
     for(int d=0;d<Nd;d++){
       U[d] = peekIndex<LorentzIndex>(Umu,d);
     }
 
     staple = zero;
-    GaugeMat tmp(U[0]._grid);
+    GaugeMat tmp(grid);
 
     for(int nu=0;nu<Nd;nu++){
 
diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index 2dc38cca..a00e29eb 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -4,7 +4,7 @@
 
   Using intrinsics
 */
-// Time-stamp: <2015-06-09 14:26:59 neo>
+// Time-stamp: <2015-06-16 23:30:41 neo>
 //----------------------------------------------------------------------
 
 #include <immintrin.h>
@@ -248,7 +248,7 @@ namespace Optimization {
       return _mm256_set_m128i(a1,a0);
 #endif
 #if defined (AVX2)
-      return _mm256_mul_epi32(a,b);
+      return _mm256_mullo_epi32(a,b);
 #endif
 
     }
diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h
index 3a54c0b3..62516201 100644
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -4,7 +4,7 @@
 
   Using intrinsics
 */
-// Time-stamp: <2015-06-09 14:24:01 neo>
+// Time-stamp: <2015-06-16 23:27:54 neo>
 //----------------------------------------------------------------------
 
 #include <pmmintrin.h>
@@ -97,7 +97,7 @@ namespace Optimization {
     }
     // Integer
     inline __m128i operator()(Integer *a){
-      return _mm_set_epi32(a[0],a[1],a[2],a[3]);
+      return _mm_set_epi32(a[3],a[2],a[1],a[0]);
     }
 
 
@@ -181,7 +181,7 @@ namespace Optimization {
     }
     // Integer
     inline __m128i operator()(__m128i a, __m128i b){
-      return _mm_mul_epi32(a,b);
+      return _mm_mullo_epi32(a,b);
     }
   };
 
diff --git a/lib/simd/Grid_vector_unops.h b/lib/simd/Grid_vector_unops.h
index 3ba77d82..37d1f6de 100644
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -100,7 +100,7 @@ namespace Grid {
   }
   template < class S, class V > 
   inline Grid_simd<S,V> sin(const Grid_simd<S,V> &r) {
-    return SimdApply(CosRealFunctor<S>(),r);
+    return SimdApply(SinRealFunctor<S>(),r);
   }
   template < class S, class V > 
   inline Grid_simd<S,V> log(const Grid_simd<S,V> &r) {
diff --git a/lib/tensors/Tensor_Ta.h b/lib/tensors/Tensor_Ta.h
index f06dbf04..e49ede35 100644
--- a/lib/tensors/Tensor_Ta.h
+++ b/lib/tensors/Tensor_Ta.h
@@ -1,5 +1,7 @@
 #ifndef GRID_MATH_TA_H
 #define GRID_MATH_TA_H
+
+
 namespace Grid {
 
   /////////////////////////////////////////////// 
@@ -36,7 +38,8 @@ namespace Grid {
 
 
   /////////////////////////////////////////////// 
-  // ProjectOnGroup function for scalar, vector, matrix
+  // ProjectOnGroup function for scalar, vector, matrix 
+  // Projects on orthogonal, unitary group
   /////////////////////////////////////////////// 
 
 
@@ -59,19 +62,22 @@ namespace Grid {
     {
       // need a check for the group type?
       iMatrix<vtype,N> ret(arg);
-      double nrm;
+      RealD nrm;
+      vtype inner;
       for(int c1=0;c1<N;c1++){
-	nrm = 0.0; 
+	zeroit(inner);	
 	for(int c2=0;c2<N;c2++)
-	  nrm = real(innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]));
-	nrm = 1.0/sqrt(nrm);
+	  inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
+
+	nrm = 1.0/sqrt(Reduce(toReal(inner)));
 	for(int c2=0;c2<N;c2++)
 	  ret._internal[c1][c2]*= nrm;
       
 	for (int b=c1+1; b<N; ++b){
-	  decltype(ret._internal[b][b]*ret._internal[b][b]) pr = 0.0;
+	  decltype(ret._internal[b][b]*ret._internal[b][b]) pr;
+	  zeroit(pr);
 	  for(int c=0; c<N; ++c)
-	    pr += ret._internal[c1][c]*ret._internal[b][c];
+	    pr += conjugate(ret._internal[c1][c])*ret._internal[b][c];
 	  
 	  for(int c=0; c<N; ++c){
 	    ret._internal[b][c] -= pr * ret._internal[c1][c];
@@ -79,37 +85,11 @@ namespace Grid {
 	}
 	  
       }
+      // assuming the determinant is ok
       return ret;
     }
 
 
-  /////////////////////////////////////////////// 
-  // Exponentiate function for scalar, vector, matrix
-  /////////////////////////////////////////////// 
-
-
-  template<class vtype> inline iScalar<vtype> Exponentiate(const iScalar<vtype>&r, double alpha,  int Nexp)
-    {
-      iScalar<vtype> ret;
-      ret._internal = Exponentiate(r._internal, alpha, Nexp);
-      return ret;
-    }
-
-
-  template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr> 
-    inline iMatrix<vtype,N> Exponentiate(const iMatrix<vtype,N> &arg, double alpha, int Nexp)
-    {
-      iMatrix<vtype,N> unit(1.0);
-      iMatrix<vtype,N> temp(unit);
-      
-      for(int i=Nexp; i>=1;--i){
-	temp *= alpha/double(i);
-	temp = unit + temp*arg;
-      }
-      return ProjectOnGroup(temp);
-    }
-
-
 
 }
 #endif
diff --git a/lib/tensors/Tensor_determinant.h b/lib/tensors/Tensor_determinant.h
new file mode 100644
index 00000000..7c8af9ed
--- /dev/null
+++ b/lib/tensors/Tensor_determinant.h
@@ -0,0 +1,45 @@
+#ifndef GRID_MATH_DET_H
+#define GRID_MATH_DET_H
+namespace Grid {
+  /////////////////////////////////////////////// 
+  // Determinant function for scalar, vector, matrix
+  /////////////////////////////////////////////// 
+  inline ComplexF Determinant( const ComplexF &arg){    return arg;}
+  inline ComplexD Determinant( const ComplexD &arg){    return arg;}
+  inline RealF Determinant( const RealF &arg){    return arg;}
+  inline RealD Determinant( const RealD &arg){    return arg;}
+
+  template<class vtype> inline auto Determinant(const iScalar<vtype>&r) -> iScalar<decltype(Determinant(r._internal))>
+    {
+      iScalar<decltype(Determinant(r._internal))> ret;
+      ret._internal = Determinant(r._internal);
+      return ret;
+    }
+
+  template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr> 
+    inline iScalar<vtype> Determinant(const iMatrix<vtype,N> &arg)
+    {
+      iMatrix<vtype,N> ret(arg);
+      iScalar<vtype> det = vtype(1.0);
+      /* Conversion of matrix to upper triangular */
+      for(int i = 0; i < N; i++){
+        for(int j = 0; j < N; j++){
+	  if(j>i){
+	    vtype ratio = ret._internal[j][i]/ret._internal[i][i];
+	    for(int k = 0; k < N; k++){
+	      ret._internal[j][k] -= ratio * ret._internal[i][k];
+	    }
+	  }
+        }
+      }      
+
+      for(int i = 0; i < N; i++)
+	det *= ret._internal[i][i];   
+
+      return det;
+    }
+
+
+
+}
+#endif
diff --git a/lib/tensors/Tensor_exp.h b/lib/tensors/Tensor_exp.h
new file mode 100644
index 00000000..f8437116
--- /dev/null
+++ b/lib/tensors/Tensor_exp.h
@@ -0,0 +1,37 @@
+#ifndef GRID_MATH_EXP_H
+#define GRID_MATH_EXP_H
+
+#define DEFAULT_MAT_EXP 12
+
+namespace Grid {
+
+  /////////////////////////////////////////////// 
+  // Exponentiate function for scalar, vector, matrix
+  /////////////////////////////////////////////// 
+
+
+  template<class vtype> inline iScalar<vtype> Exponentiate(const iScalar<vtype>&r, ComplexD alpha ,  Integer Nexp = DEFAULT_MAT_EXP)
+    {
+      iScalar<vtype> ret;
+      ret._internal = Exponentiate(r._internal, alpha, Nexp);
+      return ret;
+    }
+
+
+  template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr> 
+    inline iMatrix<vtype,N> Exponentiate(const iMatrix<vtype,N> &arg, ComplexD alpha  , Integer Nexp = DEFAULT_MAT_EXP )
+    {
+      iMatrix<vtype,N> unit(1.0);
+      iMatrix<vtype,N> temp(unit);
+      
+      for(int i=Nexp; i>=1;--i){
+	temp *= alpha/ComplexD(i);
+	temp = unit + temp*arg;
+      }
+      return ProjectOnGroup(temp);//maybe not strictly necessary
+    }
+
+
+
+}
+#endif
diff --git a/lib/tensors/Tensor_extract_merge.h b/lib/tensors/Tensor_extract_merge.h
index cea9bb64..21ffd778 100644
--- a/lib/tensors/Tensor_extract_merge.h
+++ b/lib/tensors/Tensor_extract_merge.h
@@ -57,7 +57,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
     extracted[i]=buf[i*s];
     for(int ii=1;ii<s;ii++){
       if ( buf[i*s]!=buf[i*s+ii] ){
-	std::cout << " SIMD extract failure splat="<<s<<" ii "<<ii<<" " <<Nextr<<" "<< Nsimd<<" "<<std::endl;
+	std::cout << " SIMD extract failure splat = "<<s<<" ii "<<ii<<" " <<Nextr<<" "<< Nsimd<<" "<<std::endl;
 	for(int vv=0;vv<Nsimd;vv++) {
 	  std::cout<< buf[vv]<<" ";
 	}
diff --git a/tests/Make.inc b/tests/Make.inc
index d04bf0ac..5bc90cd9 100644
--- a/tests/Make.inc
+++ b/tests/Make.inc
@@ -1,13 +1,5 @@
 
-bin_PROGRAMS = Test_GaugeAction Test_Metropolis Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cg Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_contfrac_cg Test_contfrac_even_odd Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_gamma Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_remez Test_rng Test_rng_fixed Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd
-
-
-Test_GaugeAction_SOURCES=Test_GaugeAction.cc
-Test_GaugeAction_LDADD=-lGrid
-
-
-Test_Metropolis_SOURCES=Test_Metropolis.cc
-Test_Metropolis_LDADD=-lGrid
+bin_PROGRAMS = Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cg Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_contfrac_cg Test_contfrac_even_odd Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_gamma Test_GaugeAction Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_remez Test_rng Test_rng_fixed Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd
 
 
 Test_cayley_cg_SOURCES=Test_cayley_cg.cc
@@ -78,6 +70,10 @@ Test_gamma_SOURCES=Test_gamma.cc
 Test_gamma_LDADD=-lGrid
 
 
+Test_GaugeAction_SOURCES=Test_GaugeAction.cc
+Test_GaugeAction_LDADD=-lGrid
+
+
 Test_lie_generators_SOURCES=Test_lie_generators.cc
 Test_lie_generators_LDADD=-lGrid
 
diff --git a/tests/Test_lie_generators.cc b/tests/Test_lie_generators.cc
index 91f2d33f..d36e463e 100644
--- a/tests/Test_lie_generators.cc
+++ b/tests/Test_lie_generators.cc
@@ -44,49 +44,6 @@ int main (int argc, char ** argv)
   //  SU5::printGenerators();
   //  SU5::testGenerators();
 
-  ///////////////////////////////
-  // Configuration of known size
-  ///////////////////////////////
-  NerscField header;
-  std::string file("./ckpoint_lat.400");
-  LatticeGaugeField Umu(grid);
-  //  readNerscConfiguration(Umu,header,file);
-  Umu=1.0; // Cold start
-
-  // RNG set up for test
-  std::vector<int> pseeds({1,2,3,4,5}); // once I caught a fish alive
-  std::vector<int> sseeds({6,7,8,9,10});// then i let it go again
-  GridParallelRNG  pRNG(grid); pRNG.SeedFixedIntegers(pseeds);
-  GridSerialRNG    sRNG;       sRNG.SeedFixedIntegers(sseeds);
-
-  // SU3 colour operatoions
-  LatticeColourMatrix link(grid);
-  LatticeColourMatrix staple(grid);
-  int mu=0;
-
-  // Get Staple
-  ColourWilsonLoops::Staple(staple,Umu,mu);
-  // Get Link
-  link = peekIndex<LorentzIndex>(Umu,mu);
-
-  // Apply heatbath to the link
-  RealD beta=6.0;
-  int subgroup=0;
-  int nhb=1;
-  int trials=0;
-  int fails=0;
-
-  LatticeInteger one(rbGrid);  one = 1; // fill with ones
-  LatticeInteger mask(grid);   mask= zero;
-  one.checkerboard=Even;
-  setCheckerboard(mask,one);
-
-  // update Even checkerboard
-
-  SU3::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,
-			nhb,trials,fails,mask);
-
-
 
   Grid_finalize();
 }
diff --git a/tests/Test_main.cc b/tests/Test_main.cc
index daa6c3aa..8eecf0c3 100644
--- a/tests/Test_main.cc
+++ b/tests/Test_main.cc
@@ -1,9 +1,5 @@
 #include "Grid.h"
 
-//DEBUG
-#ifdef SSE4
-#include "simd/Grid_vector_types.h"
-#endif
 
 using namespace std;
 using namespace Grid;
@@ -216,16 +212,27 @@ int main (int argc, char ** argv)
     scm=transposeIndex<1>(scm);
     
    
-    //random(SerialRNG, cm);
-    //std::cout << cm << std::endl;
+    random(SerialRNG, cm);
+    std::cout << cm << std::endl;
 
     cm = Ta(cm);
-    //TComplex tracecm= trace(cm);      
-    //std::cout << cm << "  "<< tracecm << std::endl;
+    TComplex tracecm= trace(cm);      
+    std::cout << cm << std::endl;
+
+
+    cm = Exponentiate(cm, 1.0, 12);
+    std::cout << cm << "  " << std::endl;
+    Complex det = Determinant(cm);
+    std::cout << "determinant: " << det <<  std::endl;
 
     cm = ProjectOnGroup(cm);
+    std::cout << cm << "  " << std::endl;
+    cm = ProjectOnGroup(cm);
+    std::cout << cm << "  " << std::endl;
+
+    det = Determinant(cm);
+    std::cout << "determinant: " << det <<  std::endl;
 
-    cm = Exponentiate(cm, 1.0, 10);
 
 //    Foo = Foo+scalar; // LatticeColourMatrix+Scalar
 //    Foo = Foo*scalar; // LatticeColourMatrix*Scalar
@@ -237,6 +244,9 @@ int main (int argc, char ** argv)
     LatticeComplex trscMat(&Fine);
     trscMat = trace(scMat); // Trace
 
+    // Exponentiate test
+    cMat = expMat(cMat, ComplexD(1.0, 0.0));
+
     // LatticeComplex trlcMat(&Fine);
     // trlcMat = trace(lcMat); // Trace involving iVector - now generates error
     
@@ -296,10 +306,12 @@ int main (int argc, char ** argv)
       LatticeInteger coor(&Fine);
       LatticeCoordinate(coor,d);
       lex = lex + coor*mm[d];
+      
     }
 
     Bar = zero;
     Bar = where(lex<Integer(10),Foo,Bar);
+    cout << "peeking sites..\n";
     {
       std::vector<int> coor(4);
       for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){