diff --git a/lib/AlignedAllocator.h b/lib/AlignedAllocator.h
index fd7bbb78..3e90371a 100644
--- a/lib/AlignedAllocator.h
+++ b/lib/AlignedAllocator.h
@@ -49,9 +49,9 @@ public:
 
   void deallocate(pointer __p, size_type) { 
 #ifdef HAVE_MM_MALLOC_H
-    _mm_free(__p); 
+    _mm_free((void *)__p); 
 #else
-    free(__p);
+    free((void *)__p);
 #endif
   }
   void construct(pointer __p, const _Tp& __val) { };
diff --git a/lib/cartesian/Cartesian_red_black.h b/lib/cartesian/Cartesian_red_black.h
index 3a84ed49..2d885d36 100644
--- a/lib/cartesian/Cartesian_red_black.h
+++ b/lib/cartesian/Cartesian_red_black.h
@@ -142,7 +142,10 @@ public:
 	// Use a reduced simd grid
 	_simd_layout[d] = simd_layout[d];
 	_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
-	
+
+	// all elements of a simd vector must have same checkerboard.
+	if ( simd_layout[d]>1 ) assert((_rdimensions[d]&0x1)==0); 
+
 	_osites *= _rdimensions[d];
 	_isites *= _simd_layout[d];
         
diff --git a/lib/lattice/Lattice_base.h b/lib/lattice/Lattice_base.h
index cd2fa855..0503732c 100644
--- a/lib/lattice/Lattice_base.h
+++ b/lib/lattice/Lattice_base.h
@@ -158,7 +158,7 @@ PARALLEL_FOR_LOOP
       vobj tmp = eval(ss,expr);
       vstream(_odata[ss] ,tmp);
 #else
-      _odata[ss]=_eval(ss,expr);
+      _odata[ss]=eval(ss,expr);
 #endif
     }
   };
@@ -196,7 +196,7 @@ PARALLEL_FOR_LOOP
     _odata.resize(_grid->oSites());
 PARALLEL_FOR_LOOP
     for(int ss=0;ss<_grid->oSites();ss++){
-      _odata[ss]=eval(ss,expr);
+      vstream(_odata[ss] ,eval(ss,expr));
     }
   };
 
diff --git a/lib/qcd/utils/SUn.h b/lib/qcd/utils/SUn.h
index 3a8b5f4d..0ede7965 100644
--- a/lib/qcd/utils/SUn.h
+++ b/lib/qcd/utils/SUn.h
@@ -1,7 +1,6 @@
 #ifndef QCD_UTIL_SUN_H
 #define QCD_UTIL_SUN_H
 
-
 namespace Grid {
   namespace QCD {
 
@@ -13,6 +12,7 @@ public:
   static int su2subgroups(void) { return (ncolour*(ncolour-1))/2; }
 
   template<typename vtype> using iSUnMatrix              = iScalar<iScalar<iMatrix<vtype, ncolour> > > ;
+  template<typename vtype> using iSU2Matrix              = iScalar<iScalar<iMatrix<vtype, 2> > > ;
 
   //////////////////////////////////////////////////////////////////////////////////////////////////
   // Types can be accessed as SU<2>::Matrix , SU<2>::vSUnMatrix, SU<2>::LatticeMatrix etc...
@@ -29,6 +29,19 @@ public:
   typedef Lattice<vMatrixF>    LatticeMatrixF;
   typedef Lattice<vMatrixD>    LatticeMatrixD;
 
+  typedef iSU2Matrix<Complex>         SU2Matrix;
+  typedef iSU2Matrix<ComplexF>        SU2MatrixF;
+  typedef iSU2Matrix<ComplexD>        SU2MatrixD;
+
+  typedef iSU2Matrix<vComplex>       vSU2Matrix;
+  typedef iSU2Matrix<vComplexF>      vSU2MatrixF;
+  typedef iSU2Matrix<vComplexD>      vSU2MatrixD;
+
+  typedef Lattice<vSU2Matrix>     LatticeSU2Matrix;
+  typedef Lattice<vSU2MatrixF>    LatticeSU2MatrixF;
+  typedef Lattice<vSU2MatrixD>    LatticeSU2MatrixD;
+
+
   ////////////////////////////////////////////////////////////////////////
   // There are N^2-1 generators for SU(N).
   //
@@ -122,6 +135,7 @@ public:
     RealD nrm = 1.0/std::sqrt(2.0*trsq);
     ta = ta *nrm;
   }
+
   ////////////////////////////////////////////////////////////////////////
   // Map a su2 subgroup number to the pair of rows that are non zero
   ////////////////////////////////////////////////////////////////////////
@@ -135,208 +149,333 @@ public:
     }
     i2=i1+1+spare;
   }
-  template<class vreal,class vcplx>
-  static void su2Extract(std::vector<Lattice<iSinglet  <vreal> > > &r,
-			       const Lattice<iSUnMatrix<vcplx> > &source, 
-			 int su2_index)
+
+  //////////////////////////////////////////////////////////////////////////////////////////
+  // Pull out a subgroup and project on to real coeffs x pauli basis
+  //////////////////////////////////////////////////////////////////////////////////////////
+  template<class vcplx>
+  static void su2Extract(    Lattice<iSinglet<vcplx> > &Determinant,
+			     Lattice<iSU2Matrix<vcplx> > &subgroup, 
+			     const Lattice<iSUnMatrix<vcplx> > &source, 
+			     int su2_index)
   {
     GridBase *grid(source._grid);
+    conformable(subgroup,source);
+    conformable(subgroup,Determinant);
+    int i0,i1;    
+    su2SubGroupIndex(i0,i1,su2_index);
+
+PARALLEL_FOR_LOOP
+    for(int ss=0;ss!=grid->oSites();ss++){
+      subgroup._odata[ss]()()(0,0) = source._odata[ss]()()(i0,i0);
+      subgroup._odata[ss]()()(0,1) = source._odata[ss]()()(i0,i1);
+      subgroup._odata[ss]()()(1,0) = source._odata[ss]()()(i1,i0);
+      subgroup._odata[ss]()()(1,1) = source._odata[ss]()()(i1,i1);
+
+      iSU2Matrix<vcplx> Sigma = subgroup._odata[ss];
+
+      Sigma = Sigma-adj(Sigma)+trace(adj(Sigma));
+
+      subgroup._odata[ss] = Sigma;
+
+      // this should be purely real
+      Determinant._odata[ss] = Sigma()()(0,0)*Sigma()()(1,1)
+   	                     - Sigma()()(0,1)*Sigma()()(1,0);
 
-    assert(r.size() == 4); // store in 4 real parts
-    
-    for(int i=0;i<4;i++){
-      conformable(r[i],source);
     }
-    
-    int i1,i2;    
-    su2SubGroupIndex(i1,i2,su2_index);
 
-    /* Compute the b(k) of A_SU(2) = b0 + i sum_k bk sigma_k */ 
-    
-    //    r[0] = toReal(real(peekColour(source,i1,i1)) + real(peekColour(source,i2,i2)));
-    //    r[1] = toReal(imag(peekColour(source,i1,i2)) + imag(peekColour(source,i2,i1)));
-    //    r[2] = toReal(real(peekColour(source,i1,i2)) - real(peekColour(source,i2,i1)));
-    //    r[3] = toReal(imag(peekColour(source,i1,i1)) - imag(peekColour(source,i2,i2)));
-    r[0] = toReal(real(peekColour(source,i1,i1)) + real(peekColour(source,i2,i2)));
-    r[1] = toReal(imag(peekColour(source,i1,i2)) + imag(peekColour(source,i2,i1)));
-    r[2] = toReal(real(peekColour(source,i1,i2)) - real(peekColour(source,i2,i1)));
-    r[3] = toReal(imag(peekColour(source,i1,i1)) - imag(peekColour(source,i2,i2)));
-    
   }
 
-  template<class vreal,class vcplx>
-  static void su2Insert(const std::vector<Lattice<iSinglet<vreal> > > &r,
-			Lattice<iSUnMatrix<vcplx> > &dest,
+  //////////////////////////////////////////////////////////////////////////////////////////
+  // Set matrix to one and insert a pauli subgroup
+  //////////////////////////////////////////////////////////////////////////////////////////
+  template<class vcplx>
+  static void su2Insert( const Lattice<iSU2Matrix<vcplx> > &subgroup, 
+			       Lattice<iSUnMatrix<vcplx> > &dest,
 			int su2_index)
   {
-    typedef typename Lattice<iSUnMatrix<vcplx> >::scalar_type cplx;
-    typedef Lattice<iSinglet<vcplx> > Lcomplex;
-    GridBase * grid = dest._grid;
+    GridBase *grid(dest._grid);
+    conformable(subgroup,dest);
+    int i0,i1;    
+    su2SubGroupIndex(i0,i1,su2_index);
 
-    assert(r.size() == 4); // store in 4 real parts
-
-    Lcomplex tmp(grid);
-
-    std::vector<Lcomplex > cr(4,grid);
-    for(int i=0;i<r.size();i++){
-      conformable(r[i],dest);
-      cr[i]=toComplex(r[i]);
+    dest = 1.0; // start out with identity
+PARALLEL_FOR_LOOP
+    for(int ss=0;ss!=grid->oSites();ss++){
+      dest._odata[ss]()()(i0,i0) = subgroup._odata[ss]()()(0,0);
+      dest._odata[ss]()()(i0,i1) = subgroup._odata[ss]()()(0,1);
+      dest._odata[ss]()()(i1,i0) = subgroup._odata[ss]()()(1,0);
+      dest._odata[ss]()()(i1,i1) = subgroup._odata[ss]()()(1,1);
     }
-
-    int i1,i2;
-    su2SubGroupIndex(i1,i2,su2_index);
-
-    cplx one   (1.0,0.0);
-    cplx cplx_i(0.0,1.0);
-
-    tmp =   cr[0]*one+ cr[3]*cplx_i;   pokeColour(dest,tmp,i1,i2); 
-    tmp =   cr[2]*one+ cr[1]*cplx_i;   pokeColour(dest,tmp,i1,i2);
-    tmp =  -cr[2]*one+ cr[1]*cplx_i;   pokeColour(dest,tmp,i2,i1);
-    tmp =   cr[0]*one- cr[3]*cplx_i;   pokeColour(dest,tmp,i2,i2);
   }
 
+
+  ///////////////////////////////////////////////
+  // Generate e^{ Re Tr Staple Link} dlink 
+  //
+  // *** Note Staple should be appropriate linear compbination between all staples.
+  // *** If already by beta pass coefficient 1.0.
+  // *** This routine applies the additional 1/Nc factor that comes after trace in action.
+  //
+  ///////////////////////////////////////////////
   static void SubGroupHeatBath(      GridSerialRNG       &sRNG,
 				     GridParallelRNG     &pRNG,
-			             RealD beta,
+				     RealD beta,//coeff multiplying staple in action (with no 1/Nc)
 				     LatticeMatrix &link, 
-			       const LatticeMatrix &staple, 
+				     const LatticeMatrix &barestaple, // multiplied by action coeffs so th
 				     int su2_subgroup,
 				     int nheatbath,
-				     int& ntrials,
-				     int& nfails,
 				     LatticeInteger &wheremask)
   {
     GridBase *grid = link._grid;
 
-    LatticeMatrix V(grid);
-    V = link*staple;
-
-    std::vector<LatticeReal> r(4,grid);
-    std::vector<LatticeReal> a(4,grid);
-    su2Extract(r,V,su2_subgroup); // HERE
-
-    LatticeReal r_l(grid);
-    r_l  = r[0]*r[0]+r[1]*r[1]+r[2]*r[2]+r[3]*r[3];
-    r_l = sqrt(r_l);
-
-    LatticeReal ftmp(grid);
-    LatticeReal ftmp1(grid);
-    LatticeReal ftmp2(grid);
-    LatticeReal one (grid);    one = 1.0;
-    LatticeReal zz  (grid);    zz  = zero;
-    LatticeReal recip(grid);   recip = one/r_l;
-    
-    Real machine_epsilon= 1.0e-14;
-    
-    ftmp = where(r_l>machine_epsilon,recip,one);
-    a[0] = where(r_l>machine_epsilon,   r[0] * ftmp , one);
-    a[1] = where(r_l>machine_epsilon, -(r[1] * ftmp), zz);
-    a[2] = where(r_l>machine_epsilon, -(r[2] * ftmp), zz);
-    a[3] = where(r_l>machine_epsilon, -(r[3] * ftmp), zz);
-
-    r_l *=  beta / ncolour;
-    
-    ftmp1 = where(wheremask,one,zz);
-    Real num_sites = TensorRemove(sum(ftmp1));
-
-    Integer itrials = (Integer)num_sites;
-    ntrials = 0;
-    nfails = 0;
-
-    LatticeInteger lupdate(grid);
-    LatticeInteger lbtmp(grid);
-    LatticeInteger lbtmp2(grid); lbtmp2=zero;
-
-    int n_done = 0;
-    int nhb = 0;
-
-    r[0] = a[0];
-    lupdate = 1;
-
-    LatticeReal ones (grid); ones = 1.0;
-    LatticeReal zeros(grid); zeros=zero;
-
+    int ntrials=0;
+    int nfails=0;
     const RealD twopi=2.0*M_PI;
-    while ( nhb < nheatbath && n_done < num_sites ) {
 
-       ntrials += itrials;
+    LatticeMatrix staple(grid); 
 
-       random(pRNG,r[1]);
-       std::cout<<"RANDOM SPARSE FLOAT r[1]"<<std::endl;
-       std::cout<<r[1]<<std::endl;
+    staple = barestaple * (beta/ncolour);
 
-       random(pRNG,r[2]);
-       random(pRNG,ftmp);
+    LatticeMatrix    V(grid);    V = link*staple;
 
-       r[1] = log(r[1]);
-       r[2] = log(r[2]);
+    // Subgroup manipulation in the lie algebra space
+    LatticeSU2Matrix    u(grid);   // Kennedy pendleton "u" real projected normalised Sigma
+    LatticeSU2Matrix uinv(grid);
+    LatticeSU2Matrix   ua(grid);   // a in pauli form
+    LatticeSU2Matrix    b(grid);   // rotated matrix after hb
 
-       ftmp = ftmp*twopi;
-       r[3] = cos(ftmp);
-       r[3] = r[3]*r[3];
+    // Some handy constant fields
+    LatticeComplex ones (grid); ones = 1.0;
+    LatticeComplex zeros(grid); zeros=zero;
+    LatticeReal rones (grid); rones = 1.0;
+    LatticeReal rzeros(grid); rzeros=zero;
+    LatticeComplex udet(grid); // determinant of real(staple)
+    LatticeInteger mask_true (grid); mask_true = 1;
+    LatticeInteger mask_false(grid); mask_false= 0;
 
-       r[1] += r[2] * r[3];
-       r[2]  = r[1] / r_l;
+  /*
+PLB 156 P393 (1985) (Kennedy and Pendleton)
 
-       random(pRNG,ftmp);
-       r[1] = ftmp*ftmp;
+Note: absorb "beta" into the def of sigma compared to KP paper; staple
+passed to this routine has "beta" already multiplied in
 
-       {
-	 LatticeInteger mask_true (grid); mask_true = 1;
-	 LatticeInteger mask_false(grid); mask_false= 0;
-	 LatticeReal    thresh(grid); thresh = (1.0 + 0.5*r[2]);
-	 lbtmp = where(r[1] <= thresh,mask_true,mask_false);
-       }
-       lbtmp2= lbtmp && lupdate;       
-       r[0]  = where(lbtmp2, 1.0+r[2], r[0]);
+Action linear in links h and of form:
 
-       ftmp1 = where(lbtmp2,ones,zeros);
-       RealD sitesum = sum(ftmp1);
-       Integer itmp = sitesum;
+    beta S = beta  Sum_p (1 - 1/Nc Re Tr Plaq )
 
-       n_done += itmp;
-       itrials -= itmp;
-       nfails += itrials;
-       lbtmp   = !lbtmp;
-       lupdate = lupdate & lbtmp;
-       ++nhb;
-    }
-    // Now create r[1], r[2] and r[3] according to the spherical measure 
-    // Take absolute value to guard against round-off 
-    random(pRNG,ftmp1);
-    r[2] = 1.0 - 2.0*ftmp1;
- 
-    ftmp1 = abs(1.0 - r[0]*r[0]);
-    r[3]   = -(sqrt(ftmp1) * r[2]);
+Writing Sigma = 1/Nc (beta Sigma') where sum over staples is "Sigma' "
 
-    // Take absolute value to guard against round-off 
-    r_l    = sqrt(abs(ftmp1 - r[3]*r[3]));
- 
-    random(pRNG,ftmp1);
-    ftmp1 *= twopi;
-    r[1]    = r_l * cos(ftmp1);
-    r[2]    = r_l * sin(ftmp1);
+     beta S = const - beta/Nc Re Tr h Sigma'
+            = const - Re Tr h Sigma
+      
+Decompose h and Sigma into (1, sigma_j) ; h_i real, h^2=1, Sigma_i complex arbitrary.
 
-    // Update matrix is B = R * A, with B, R and A given by b_i, r_i and a_i 
-    std::vector<LatticeReal> b(4,grid);
-    b[0] = r[0]*a[0] - r[1]*a[1] - r[2]*a[2] - r[3]*a[3];
-    b[1] = r[0]*a[1] + r[1]*a[0] - r[2]*a[3] + r[3]*a[2];
-    b[2] = r[0]*a[2] + r[2]*a[0] - r[3]*a[1] + r[1]*a[3];
-    b[3] = r[0]*a[3] + r[3]*a[0] - r[1]*a[2] + r[2]*a[1];
+    Tr h Sigma = h_i Sigma_j Tr (sigma_i sigma_j)  = h_i Sigma_j 2 delta_ij
+ Re Tr h Sigma = 2 h_j Re Sigma_j
 
-     //
-     // Now fill an SU(3) matrix V with the SU(2) submatrix su2_index
-     // parametrized by a_k in the sigma matrix basis.
-     //
+Normalised re Sigma_j = xi u_j 
+
+With u_j a unit vector and U can be in SU(2);
+
+Re Tr h Sigma = 2 h_j Re Sigma_j = 2 xi (h.u)
+
+4xi^2 = Det [ Sig - Sig^dag  + 1 Tr Sigdag]
+ u   = 1/2xi [ Sig - Sig^dag  + 1 Tr Sigdag]
+
+ xi = sqrt(Det)/2;
+
+Write a= u h in SU(2); a has pauli decomp a_j;
+
+Note: Product b' xi is unvariant because scaling Sigma leaves
+      normalised vector "u" fixed; Can rescale Sigma so b' = 1.
+  */
+
+    ////////////////////////////////////////////////////////
+    // Real part of Pauli decomposition
+    // Note a subgroup can project to zero in cold start
+    ////////////////////////////////////////////////////////
+    su2Extract(udet,u,V,su2_subgroup); 
+
+    //////////////////////////////////////////////////////
+    // Normalising this vector if possible; else identity
+    //////////////////////////////////////////////////////
+    LatticeComplex xi(grid);  
+
+    LatticeSU2Matrix lident(grid);
+    
+    SU2Matrix ident = Complex(1.0); 
+    SU2Matrix pauli1; SU<2>::generator(0,pauli1);
+    SU2Matrix pauli2; SU<2>::generator(1,pauli2);
+    SU2Matrix pauli3; SU<2>::generator(2,pauli3);
+    pauli1 = timesI(pauli1)*2.0;
+    pauli2 = timesI(pauli2)*2.0;
+    pauli3 = timesI(pauli3)*2.0;
+
+    LatticeComplex cone(grid);
+    LatticeReal adet(grid);
+    adet = abs(toReal(udet));
+    lident=Complex(1.0);
+    cone  =Complex(1.0);
+    Real machine_epsilon=1.0e-7;
+    u   = where(adet>machine_epsilon,u,lident);
+    udet= where(adet>machine_epsilon,udet,cone);
+
+    xi  = 0.5*sqrt(udet);     //4xi^2 = Det [ Sig - Sig^dag  + 1 Tr Sigdag]
+    u   = 0.5*u*pow(xi,-1.0); //  u   = 1/2xi [ Sig - Sig^dag  + 1 Tr Sigdag]
+
+    // Debug test for sanity
+    uinv=adj(u);
+    b=u*uinv-1.0;
+    assert(norm2(b)<1.0e-4);
+
+  /*
+Measure: Haar measure dh has d^4a delta(1-|a^2|)
+In polars:
+  da = da0 r^2 sin theta dr dtheta dphi delta( 1 - r^2 -a0^2) 
+     = da0 r^2 sin theta dr dtheta dphi delta( (sqrt(1-a0^) - r)(sqrt(1-a0^) + r) )
+     = da0 r/2 sin theta dr dtheta dphi delta( (sqrt(1-a0^) - r) )
+
+Action factor Q(h) dh  = e^-S[h]  dh =  e^{  xi Tr uh} dh    // beta enters through xi
+                                     =  e^{2 xi (h.u)} dh
+                                     =  e^{2 xi h0u0}.e^{2 xi h1u1}.e^{2 xi h2u2}.e^{2 xi h3u3} dh
+
+Therefore for each site, take xi for that site
+i) generate  |a0|<1 with dist 
+   (1-a0^2)^0.5 e^{2 xi a0 } da0
+
+Take alpha = 2 xi  = 2 xi [ recall 2 beta/Nc unmod staple norm]; hence 2.0/Nc factor in Chroma ]
+A. Generate two uniformly distributed pseudo-random numbers R and R', R'', R''' in the unit interval; 
+B. Set X = -(ln R)/alpha, X' =-(ln R')/alpha;
+C. Set C = cos^2(2pi R"), with R" another uniform random number in [0,1] ;
+D. Set A = XC;
+E. Let d  = X'+A;
+F. If R'''^2 :> 1 - 0.5 d,  go back to A;
+G. Set a0 = 1 - d;
+
+Note that in step D setting B ~ X - A and using B in place of A in step E will generate a second independent a 0 value.
+  */
+
+    /////////////////////////////////////////////////////////
+    // count the number of sites by picking "1"'s out of hat
+    /////////////////////////////////////////////////////////
+    Integer hit=0;
+    LatticeReal rtmp(grid);
+    rtmp=where(wheremask,rones,rzeros);
+    RealD numSites = sum(rtmp);
+    RealD numAccepted;
+    LatticeInteger      Accepted(grid); Accepted = zero;
+    LatticeInteger newlyAccepted(grid);
+   
+    std::vector<LatticeReal> xr(4,grid);
+    std::vector<LatticeReal>  a(4,grid);
+    LatticeReal d(grid); d=zero;
+    LatticeReal alpha(grid);
+
+    //    std::cout<<"xi "<<xi <<std::endl;
+    alpha = toReal(2.0*xi);
+
+    do { 
+
+      // A. Generate two uniformly distributed pseudo-random numbers R and R', R'', R''' in the unit interval; 
+      random(pRNG,xr[0]); 
+      random(pRNG,xr[1]); 
+      random(pRNG,xr[2]); 
+      random(pRNG,xr[3]); 
+
+      // B. Set X = - ln R/alpha, X' = -ln R'/alpha
+      xr[1] = -log(xr[1])/alpha;  
+      xr[2] = -log(xr[2])/alpha;
+
+      // C. Set C = cos^2(2piR'')
+      xr[3] = cos(xr[3]*twopi);
+      xr[3] = xr[3]*xr[3];
+
+      LatticeReal xrsq(grid);
+
+      //D. Set A = XC;
+      //E. Let d  = X'+A;
+      xrsq = xr[2]+xr[1]*xr[3];
+
+      d = where(Accepted,d,xr[2]+xr[1]*xr[3]);
+	      
+      //F. If R'''^2 :> 1 - 0.5 d,  go back to A;
+      LatticeReal thresh(grid); thresh = 1.0-d*0.5;
+      xrsq = xr[0]*xr[0];
+      LatticeInteger ione(grid); ione = 1;
+      LatticeInteger izero(grid); izero=zero;
+
+      newlyAccepted = where ( xrsq < thresh,ione,izero);
+      Accepted = where ( newlyAccepted, newlyAccepted,Accepted);
+      Accepted = where ( wheremask, Accepted,izero);
+
+      // FIXME need an iSum for integer to avoid overload on return type??
+      rtmp=where(Accepted,rones,rzeros);
+      numAccepted = sum(rtmp);
+
+      hit++;
+
+    } while ( (numAccepted < numSites) && ( hit < nheatbath) );
+
+    // G. Set a0 = 1 - d;
+    a[0]=zero;
+    a[0]=where(wheremask,1.0-d,a[0]);
+
+    //////////////////////////////////////////
+    //    ii) generate a_i uniform on two sphere radius (1-a0^2)^0.5
+    //////////////////////////////////////////
+
+    LatticeReal a123mag(grid);
+    a123mag = sqrt(abs(1.0-a[0]*a[0]));
+
+    LatticeReal cos_theta (grid);
+    LatticeReal sin_theta (grid);
+    LatticeReal       phi (grid);
+
+    random(pRNG,phi);            phi = phi * twopi;       // uniform in [0,2pi]
+    random(pRNG,cos_theta); cos_theta=(cos_theta*2.0)-1.0;  // uniform in [-1,1]
+    sin_theta = sqrt(abs(1.0-cos_theta*cos_theta));
+
+    a[1]    = a123mag * sin_theta * cos(phi);
+    a[2]    = a123mag * sin_theta * sin(phi);
+    a[3]    = a123mag * cos_theta;
+    
+    ua = toComplex(a[0])*ident 
+       + toComplex(a[1])*pauli1 
+       + toComplex(a[2])*pauli2 
+       + toComplex(a[3])*pauli3;
+
+    b = 1.0;
+    b = where(wheremask,uinv*ua,b);
     su2Insert(b,V,su2_subgroup);
 
-    // U = V*U
-    LatticeMatrix tmp(grid);
-    tmp = V * link;
+    //mask the assignment back based on Accptance
+    link = where(Accepted,V * link,link);
 
-    //mask the assignment back
-    link = where(wheremask,tmp,link);
+    //////////////////////////////
+    // Debug Checks
+    // SU2 check
+    LatticeSU2Matrix    check(grid);   // rotated matrix after hb
+    u = zero;
+    check = ua * adj(ua) - 1.0;
+    check = where(Accepted,check,u);
+    assert(norm2(check)<1.0e-4);
 
+    check = b*adj(b)-1.0;
+    check = where(Accepted,check,u);
+    assert(norm2(check)<1.0e-4);
+    
+    LatticeMatrix Vcheck(grid);
+    Vcheck = zero;
+    Vcheck = where(Accepted,V*adj(V) - 1.0,Vcheck);
+    //    std::cout << "SU3 check " <<norm2(Vcheck)<<std::endl;
+    assert(norm2(Vcheck)<1.0e-4);
+    
+    // Verify the link stays in SU(3)
+    //    std::cout <<"Checking the modified link"<<std::endl;
+    Vcheck = link*adj(link) - 1.0;
+    assert(norm2(Vcheck)<1.0e-4);
+    /////////////////////////////////
   }
 
   static void printGenerators(void)
diff --git a/lib/qcd/utils/WilsonLoops.h b/lib/qcd/utils/WilsonLoops.h
index 69b86859..e3885432 100644
--- a/lib/qcd/utils/WilsonLoops.h
+++ b/lib/qcd/utils/WilsonLoops.h
@@ -30,6 +30,7 @@ public:
   static void sitePlaquette(LatticeComplex &Plaq,const std::vector<GaugeMat> &U)
   {
     LatticeComplex sitePlaq(U[0]._grid);
+    Plaq=zero;
     for(int mu=1;mu<Nd;mu++){
       for(int nu=0;nu<mu;nu++){
 	traceDirPlaquette(sitePlaq,U,mu,nu);
@@ -42,6 +43,7 @@ public:
   //////////////////////////////////////////////////
   static RealD sumPlaquette(const GaugeLorentz &Umu){
     std::vector<GaugeMat> U(4,Umu._grid);
+
     for(int mu=0;mu<Nd;mu++){
       U[mu] = peekIndex<LorentzIndex>(Umu,mu);
     }
@@ -72,13 +74,15 @@ public:
   //////////////////////////////////////////////////
   static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
 
-    std::vector<GaugeMat> U(4,Umu._grid);
+    GridBase *grid = Umu._grid;
+
+    std::vector<GaugeMat> U(4,grid);
     for(int d=0;d<Nd;d++){
       U[d] = peekIndex<LorentzIndex>(Umu,d);
     }
 
     staple = zero;
-    GaugeMat tmp(U[0]._grid);
+    GaugeMat tmp(grid);
 
     for(int nu=0;nu<Nd;nu++){
 
diff --git a/lib/simd/Grid_vector_unops.h b/lib/simd/Grid_vector_unops.h
index 3ba77d82..37d1f6de 100644
--- a/lib/simd/Grid_vector_unops.h
+++ b/lib/simd/Grid_vector_unops.h
@@ -100,7 +100,7 @@ namespace Grid {
   }
   template < class S, class V > 
   inline Grid_simd<S,V> sin(const Grid_simd<S,V> &r) {
-    return SimdApply(CosRealFunctor<S>(),r);
+    return SimdApply(SinRealFunctor<S>(),r);
   }
   template < class S, class V > 
   inline Grid_simd<S,V> log(const Grid_simd<S,V> &r) {
diff --git a/tests/Test_lie_generators.cc b/tests/Test_lie_generators.cc
index 91f2d33f..d36e463e 100644
--- a/tests/Test_lie_generators.cc
+++ b/tests/Test_lie_generators.cc
@@ -44,49 +44,6 @@ int main (int argc, char ** argv)
   //  SU5::printGenerators();
   //  SU5::testGenerators();
 
-  ///////////////////////////////
-  // Configuration of known size
-  ///////////////////////////////
-  NerscField header;
-  std::string file("./ckpoint_lat.400");
-  LatticeGaugeField Umu(grid);
-  //  readNerscConfiguration(Umu,header,file);
-  Umu=1.0; // Cold start
-
-  // RNG set up for test
-  std::vector<int> pseeds({1,2,3,4,5}); // once I caught a fish alive
-  std::vector<int> sseeds({6,7,8,9,10});// then i let it go again
-  GridParallelRNG  pRNG(grid); pRNG.SeedFixedIntegers(pseeds);
-  GridSerialRNG    sRNG;       sRNG.SeedFixedIntegers(sseeds);
-
-  // SU3 colour operatoions
-  LatticeColourMatrix link(grid);
-  LatticeColourMatrix staple(grid);
-  int mu=0;
-
-  // Get Staple
-  ColourWilsonLoops::Staple(staple,Umu,mu);
-  // Get Link
-  link = peekIndex<LorentzIndex>(Umu,mu);
-
-  // Apply heatbath to the link
-  RealD beta=6.0;
-  int subgroup=0;
-  int nhb=1;
-  int trials=0;
-  int fails=0;
-
-  LatticeInteger one(rbGrid);  one = 1; // fill with ones
-  LatticeInteger mask(grid);   mask= zero;
-  one.checkerboard=Even;
-  setCheckerboard(mask,one);
-
-  // update Even checkerboard
-
-  SU3::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,
-			nhb,trials,fails,mask);
-
-
 
   Grid_finalize();
 }