Merge branch 'develop' of github.com:fionnoh/Grid into feature/A2A_current_insertion

Peter's GPU branch changes merged with the A2A current-insertion (CI) code
2025-06-19 16:27:05 +01:00 · 2019-09-30 16:53:44 +01:00
parent 25150eb2e0 b473405652
commit d1daab601a
785 changed files with 41312 additions and 51680 deletions
--- a/Grid/qcd/utils/A2Autils.h
+++ b/Grid/qcd/utils/A2Autils.h
@ -1,9 +1,8 @@
 #pragma once
 //#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Eigen/unsupported/CXX11/Tensor>
+#include <Grid/Grid_Eigen_Tensor.h>

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 #undef DELTA_F_EQ_2

@ -116,7 +115,7 @@ private:
  inline static void OuterProductWWVV(std::vector<PropagatorField> &WWVV,
                               const vobj &lhs,
                               const vobj &rhs,
-                               const int Ns, const int ss, const int t);
+                               const int Ns, const int ss);
 };

 template <class FImpl>
@ -140,7 +139,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
  int Lblock = mat.dimension(3); 
  int Rblock = mat.dimension(4);

-  GridBase *grid = lhs_wi[0]._grid;
+  GridBase *grid = lhs_wi[0].Grid();
  
  const int    Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
@ -160,14 +159,14 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
  int MFlvol = ld*Lblock*Rblock*Nmom;

  Vector<SpinMatrix_v > lvSum(MFrvol);
-  parallel_for (int r = 0; r < MFrvol; r++){
-    lvSum[r] = zero;
-  }
+  thread_for( r, MFrvol,{
+    lvSum[r] = Zero();
+  });

  Vector<SpinMatrix_s > lsSum(MFlvol);             
-  parallel_for (int r = 0; r < MFlvol; r++){
+  thread_for(r,MFlvol,{
    lsSum[r]=scalar_type(0.0);
-  }
+  });

  int e1=    grid->_slice_nblock[orthogdim];
  int e2=    grid->_slice_block [orthogdim];
@ -175,7 +174,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,

  // potentially wasting cores here if local time extent too small
  if (t_kernel) *t_kernel = -usecond();
-  parallel_for(int r=0;r<rd;r++){
+  thread_for(r,rd,{

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 

@ -186,12 +185,14 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,

 	for(int i=0;i<Lblock;i++){

-	  auto left = conjugate(lhs_wi[i]._odata[ss]);
+	  auto lhs_v = lhs_wi[i].View();
+	  auto left = conjugate(lhs_v[ss]);

 	  for(int j=0;j<Rblock;j++){

 	    SpinMatrix_v vv;
-	    auto right = rhs_vj[j]._odata[ss];
+	    auto rhs_v = rhs_vj[j].View();
+	    auto right = rhs_v[ss];
 	    for(int s1=0;s1<Ns;s1++){
 	    for(int s2=0;s2<Ns;s2++){
 	      vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
@ -203,7 +204,8 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
 	    int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
 	    for ( int m=0;m<Nmom;m++){
 	      int idx = m+base;
-	      auto phase = mom[m]._odata[ss];
+	      auto mom_v = mom[m].View();
+	      auto phase = mom_v[ss];
 	      mac(&lvSum[idx],&vv,&phase);
 	    }
 	  
@ -211,14 +213,13 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
 	}
      }
    }
-  }
-
+  });

  // Sum across simd lanes in the plane, breaking out orthog dir.
-  parallel_for(int rt=0;rt<rd;rt++){
+  thread_for(rt,rd,{

-    std::vector<int> icoor(Nd);
-    std::vector<SpinMatrix_s> extracted(Nsimd);               
+    Coordinate icoor(Nd);
+    ExtractBuffer<SpinMatrix_s> extracted(Nsimd);               

    for(int i=0;i<Lblock;i++){
    for(int j=0;j<Rblock;j++){
@ -240,7 +241,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,

      }
    }}}
-  }
+  });
  if (t_kernel) *t_kernel += usecond();
  assert(mat.dimension(0) == Nmom);
  assert(mat.dimension(1) == Ngamma);
@ -249,8 +250,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
  // ld loop and local only??
  int pd = grid->_processors[orthogdim];
  int pc = grid->_processor_coor[orthogdim];
-  parallel_for_nest2(int lt=0;lt<ld;lt++)
-  {
+  thread_for_collapse(2,lt,ld,{
    for(int pt=0;pt<pd;pt++){
      int t = lt + pt*ld;
      if (pt == pc){
@ -278,7 +278,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
 	}
      }
    }
-  }
+  });

  ////////////////////////////////////////////////////////////////////
  // This global sum is taking as much as 50% of time on 16 nodes
@ -329,7 +329,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
  int Lblock = mat.dimension(1); 
  int Rblock = mat.dimension(2);

-  GridBase *grid = wi[0]._grid;
+  GridBase *grid = wi[0].Grid();
  
  const int    nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
@ -347,20 +347,20 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
  int MFlvol = ld*Lblock*Rblock;

  Vector<vector_type > lvSum(MFrvol);
-  parallel_for (int r = 0; r < MFrvol; r++){
-    lvSum[r] = zero;
-  }
+  thread_for(r,MFrvol,{
+    lvSum[r] = Zero();
+  });

  Vector<scalar_type > lsSum(MFlvol);             
-  parallel_for (int r = 0; r < MFlvol; r++){
+  thread_for(r,MFlvol,{
    lsSum[r]=scalar_type(0.0);
-  }
+  });

  int e1=    grid->_slice_nblock[orthogdim];
  int e2=    grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];

-  parallel_for(int r=0;r<rd;r++){
+  thread_for(r,rd,{

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 

@ -371,7 +371,8 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,

 	for(int i=0;i<Lblock;i++){

-	  auto w = conjugate(wi[i]._odata[ss]);
+	  auto wi_v = wi[i].View();
+	  auto w = conjugate(wi_v[ss]);
 	  if (g5) {
 	    w()(2)(0) = - w()(2)(0);
 	    w()(2)(1) = - w()(2)(1);
@ -381,8 +382,9 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
 	    w()(3)(2) = - w()(3)(2);
 	  }
 	  for(int j=0;j<Rblock;j++){
-
-	    auto v = vj[j]._odata[ss];
+	    
+	    auto vj_v=vj[j].View();
+	    auto v  = vj_v[ss];
 	    auto vv = v()(0)(0);

 	    vv =      w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out
@ -404,14 +406,14 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
 	}
      }
    }
-  }
+  });

  // Sum across simd lanes in the plane, breaking out orthog dir.
-  parallel_for(int rt=0;rt<rd;rt++){
+  thread_for(rt,rd,{

-    std::vector<int> icoor(nd);
+      Coordinate icoor(nd);
    iScalar<vector_type> temp; 
-    std::vector<iScalar<scalar_type> > extracted(Nsimd);               
+    ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);               

    for(int i=0;i<Lblock;i++){
    for(int j=0;j<Rblock;j++){
@ -433,14 +435,13 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,

      }
    }}
-  }
+  });

  assert(mat.dimension(0) == Nt);
  // ld loop and local only??
  int pd = grid->_processors[orthogdim];
  int pc = grid->_processor_coor[orthogdim];
-  parallel_for_nest2(int lt=0;lt<ld;lt++)
-  {
+  thread_for_collapse(2,lt,ld,{
    for(int pt=0;pt<pd;pt++){
      int t = lt + pt*ld;
      if (pt == pc){
@ -459,7 +460,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
 	}
      }
    }
-  }
+  });

  grid->GlobalSumVector(&mat(0,0,0),Nt*Lblock*Rblock);
 }
@ -474,7 +475,7 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
  int Lblock = mat.dimension(2); 
  int Rblock = mat.dimension(3);

-  GridBase *grid = wi[0]._grid;
+  GridBase *grid = wi[0].Grid();
  
  const int    nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
@ -493,20 +494,20 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
  int MFlvol = ld*Lblock*Rblock*Nmom;

  Vector<vector_type > lvSum(MFrvol);
-  parallel_for (int r = 0; r < MFrvol; r++){
-    lvSum[r] = zero;
-  }
+  thread_for(r,MFrvol,{
+    lvSum[r] = Zero();
+  });

  Vector<scalar_type > lsSum(MFlvol);             
-  parallel_for (int r = 0; r < MFlvol; r++){
+  thread_for(r,MFlvol,{
    lsSum[r]=scalar_type(0.0);
-  }
+  });

  int e1=    grid->_slice_nblock[orthogdim];
  int e2=    grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];

-  parallel_for(int r=0;r<rd;r++){
+  thread_for(r,rd,{

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 

@ -517,11 +518,13 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,

 	for(int i=0;i<Lblock;i++){

-	  auto w = conjugate(wi[i]._odata[ss]);
+	  auto wi_v = wi[i].View();
+	  auto w = conjugate(wi_v[ss]);

 	  for(int j=0;j<Rblock;j++){
-
-	    auto v = vj[j]._odata[ss];
+	    
+	    auto vj_v = vj[j].View();
+	    auto v = vj_v[ss];

 	    auto vv = w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out
 	      +       w()(0)(1) * v()(0)(1)
@ -541,22 +544,23 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
 	    int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
 	    for ( int m=0;m<Nmom;m++){
 	      int idx = m+base;
-	      auto phase = mom[m]._odata[ss];
+	      auto mom_v = mom[m].View();
+	      auto phase = mom_v[ss];
 	      mac(&lvSum[idx],&vv,&phase()()());
 	    }
 	  }
 	}
      }
    }
-  }
+  });


  // Sum across simd lanes in the plane, breaking out orthog dir.
-  parallel_for(int rt=0;rt<rd;rt++){
+  thread_for(rt,rd,{

-    std::vector<int> icoor(nd);
+    Coordinate icoor(nd);
    iScalar<vector_type> temp; 
-    std::vector<iScalar<scalar_type> > extracted(Nsimd);               
+    ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);               

    for(int i=0;i<Lblock;i++){
    for(int j=0;j<Rblock;j++){
@ -579,15 +583,14 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,

      }
    }}}
-  }
+  });

  assert(mat.dimension(0) == Nmom);
  assert(mat.dimension(1) == Nt);
-
+ 
  int pd = grid->_processors[orthogdim];
  int pc = grid->_processor_coor[orthogdim];
-  parallel_for_nest2(int lt=0;lt<ld;lt++)
-  {
+  thread_for_collapse(2,lt,ld,{
    for(int pt=0;pt<pd;pt++){
      int t = lt + pt*ld;
      if (pt == pc){
@ -610,7 +613,7 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
 	}
      }
    }
-  }
+  });

  grid->GlobalSumVector(&mat(0,0,0,0),Nmom*Nt*Lblock*Rblock);
 }
@ -678,7 +681,7 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
    int Lblock = mat.dimension(3); 
    int Rblock = mat.dimension(4);

-    GridBase *grid = lhs_wi[0]._grid;
+    GridBase *grid = lhs_wi[0].Grid();
    
    const int    Nd = grid->_ndimension;
    const int Nsimd = grid->Nsimd();
@ -698,16 +701,16 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
    int MFlvol = ld*Lblock*Rblock*Nem;

    Vector<vector_type> lvSum(MFrvol);
-    parallel_for (int r = 0; r < MFrvol; r++)
+    thread_for(r,MFrvol,
    {
-        lvSum[r] = zero;
-    }
+      lvSum[r] = Zero();
+    });

    Vector<scalar_type> lsSum(MFlvol);             
-    parallel_for (int r = 0; r < MFlvol; r++)
+    thread_for(r,MFlvol,
    {
        lsSum[r] = scalar_type(0.0);
-    }
+    });

    int e1=    grid->_slice_nblock[orthogdim];
    int e2=    grid->_slice_block [orthogdim];
@ -716,7 +719,7 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
    // Nested parallelism would be ok
    // Wasting cores here. Test case r
    if (t_kernel) *t_kernel = -usecond();
-    parallel_for(int r=0;r<rd;r++)
+    thread_for(r,rd,
    {
        int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 

@ -727,17 +730,19 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,

            for(int i=0;i<Lblock;i++)
            {
-                auto left = conjugate(lhs_wi[i]._odata[ss]);
+  	        auto wi_v = lhs_wi[i].View();
+                auto left = conjugate(wi_v[ss]);

                for(int j=0;j<Rblock;j++)
                {
                    SpinMatrix_v vv;
-                    auto right = rhs_vj[j]._odata[ss];
+		    auto vj_v  = rhs_vj[j].View();
+                    auto right = vj_v[ss];

                    for(int s1=0;s1<Ns;s1++)
                    for(int s2=0;s2<Ns;s2++)
                    {
-                        vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
+		          vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
                                        + left()(s2)(1) * right()(s1)(1)
                                        + left()(s2)(2) * right()(s1)(2);
                    }
@ -747,9 +752,11 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,

                    for ( int m=0;m<Nem;m++)
                    {
+  		        auto emB0_v = emB0[m].View();
+  		        auto emB1_v = emB1[m].View();
                        int idx  = m+base;
-                        auto b0  = emB0[m]._odata[ss];
-                        auto b1  = emB1[m]._odata[ss];
+                        auto b0  = emB0_v[ss];
+                        auto b1  = emB1_v[ss];
                        auto cb0 = conjugate(b0);
                        auto cb1 = conjugate(b1);

@ -761,13 +768,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
                }
            }
        }
-    }
+    });

    // Sum across simd lanes in the plane, breaking out orthog dir.
-    parallel_for(int rt=0;rt<rd;rt++)
+    thread_for(rt,rd,
    {
-        std::vector<int> icoor(Nd);
-        std::vector<scalar_type> extracted(Nsimd);               
+        Coordinate icoor(Nd);
+        ExtractBuffer<scalar_type> extracted(Nsimd);               

        for(int i=0;i<Lblock;i++)
        for(int j=0;j<Rblock;j++)
@ -787,13 +794,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
                lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
            }
        }
-    }
+    });
    if (t_kernel) *t_kernel += usecond();

    // ld loop and local only??
    int pd = grid->_processors[orthogdim];
    int pc = grid->_processor_coor[orthogdim];
-    parallel_for_nest2(int lt=0;lt<ld;lt++)
+    thread_for_collapse(2,lt,ld,
    {
        for(int pt=0;pt<pd;pt++)
        {
@ -821,7 +828,7 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
                }
            }
        }
-    }
+    });
    if (t_gsum) *t_gsum = -usecond();
    grid->GlobalSumVector(&mat(0,0,0,0,0),Nem*Nt*Lblock*Rblock);
    if (t_gsum) *t_gsum += usecond();
@ -989,10 +996,9 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
 				   const FermionField *vs,
 				   const FermionField *vd)
 {
-  std::cout << "Start contraction" << std::endl;
-  GridBase *grid = vs[0]._grid;
+  GridBase *grid = vs[0].Grid();

-  int nd    = grid->_ndimension;
+  //  int nd    = grid->_ndimension;
  int Nsimd = grid->Nsimd();
  int N_t   = WW_sd.dimension(0);
  int N_s = WW_sd.dimension(1); 
@ -1001,30 +1007,33 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
  int d_unroll = 32;// Empirical optimisation

  for(int t=0;t<N_t;t++){
-    WWVV[t] = zero;
+    WWVV[t] = Zero();
  }

-  parallel_for(int ss=0;ss<grid->oSites();ss++){
+  thread_for(ss,grid->oSites(),{
    for(int d_o=0;d_o<N_d;d_o+=d_unroll){
      for(int t=0;t<N_t;t++){
      for(int s=0;s<N_s;s++){
-	auto tmp1 = vs[s]._odata[ss];
-  vobj tmp2 = zero;
-  vobj tmp3 = zero;
-
+  auto vs_v = vs[s].View();
+  auto tmp1 = vs_v[ss];
+  vobj tmp2 = Zero();
+  vobj tmp3 = Zero();
  for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
-	  Scalar_v coeff = WW_sd(t,s,d);
-    tmp3 = conjugate(vd[d]._odata[ss]);
-    mac(&tmp2 ,& coeff, &tmp3 );
-	}
+    auto vd_v = vd[d].View();
+    Scalar_v coeff = WW_sd(t,s,d);
+    tmp3 = conjugate(vd_v[ss]);
+    mac(&tmp2, &coeff, &tmp3);
+  }
+
+  //////////////////////////
+  // Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
+  //////////////////////////
+  auto WWVV_v = WWVV[t].View();
+  OuterProductWWVV(WWVV_v, tmp1, tmp2, Ns, ss);

-	//////////////////////////
-	// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
-	//////////////////////////
-  OuterProductWWVV(WWVV, tmp1, tmp2, Ns, ss, t);
      }}
    }
-  }
+  });
 }

 template <class FImpl>
@ -1056,45 +1065,46 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
  for (int t = 0; t < N_t; t++){
    std::cout << GridLogMessage << "Contraction t = " << t << std::endl;
    buf = WW_sd[t];
-    parallel_for(int ss=0;ss<grid->oSites();ss++){
+    thread_for(ss,grid->oSites(),{
      for(int d_o=0;d_o<N_d;d_o+=d_unroll){
        for(int s=0;s<N_s;s++){
-    auto tmp1 = vs[s]._odata[ss];
-    vobj tmp2 = zero;
-    vobj tmp3 = zero;
-
+    auto vs_v = vs[s].View();
+    auto tmp1 = vs_v[ss];
+    vobj tmp2 = Zero();
+    vobj tmp3 = Zero();
    for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
+      auto vd_v = vd[d].View();
      Scalar_v coeff = buf(s,d);
-      tmp3 = conjugate(vd[d]._odata[ss]);
-      mac(&tmp2 ,& coeff, &tmp3 );
+      tmp3 = conjugate(vd_v[ss]);
+      mac(&tmp2, &coeff, &tmp3);
    }

    //////////////////////////
    // Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
    //////////////////////////
-    OuterProductWWVV(WWVV, tmp1, tmp2, Ns, ss, t);
+    OuterProductWWVV(WWVV, tmp1, tmp2, Ns, ss);
      }}
-    }
+    });
  }
 }

 template <class FImpl>
-inline void A2Autils<FImpl>::OuterProductWWVV(std::vector<PropagatorField> &WWVV,
+inline void A2Autils<FImpl>::OuterProductWWVV(PropagatorField &WWVV,
                                             const vobj &lhs,
                                             const vobj &rhs,
                                             const int Ns, const int ss, const int t)
 {
  for (int s1 = 0; s1 < Ns; s1++){
    for (int s2 = 0; s2 < Ns; s2++){
-      WWVV[t]._odata[ss]()(s1, s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0);
-      WWVV[t]._odata[ss]()(s1, s2)(0, 1) += lhs()(s1)(0) * rhs()(s2)(1);
-      WWVV[t]._odata[ss]()(s1, s2)(0, 2) += lhs()(s1)(0) * rhs()(s2)(2);
-      WWVV[t]._odata[ss]()(s1, s2)(1, 0) += lhs()(s1)(1) * rhs()(s2)(0);
-      WWVV[t]._odata[ss]()(s1, s2)(1, 1) += lhs()(s1)(1) * rhs()(s2)(1);
-      WWVV[t]._odata[ss]()(s1, s2)(1, 2) += lhs()(s1)(1) * rhs()(s2)(2);
-      WWVV[t]._odata[ss]()(s1, s2)(2, 0) += lhs()(s1)(2) * rhs()(s2)(0);
-      WWVV[t]._odata[ss]()(s1, s2)(2, 1) += lhs()(s1)(2) * rhs()(s2)(1);
-      WWVV[t]._odata[ss]()(s1, s2)(2, 2) += lhs()(s1)(2) * rhs()(s2)(2);
+      WWVV[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0);
+      WWVV[ss]()(s1,s2)(0, 1) += lhs()(s1)(0) * rhs()(s2)(1);
+      WWVV[ss]()(s1,s2)(0, 2) += lhs()(s1)(0) * rhs()(s2)(2);
+      WWVV[ss]()(s1,s2)(1, 0) += lhs()(s1)(1) * rhs()(s2)(0);
+      WWVV[ss]()(s1,s2)(1, 1) += lhs()(s1)(1) * rhs()(s2)(1);
+      WWVV[ss]()(s1,s2)(1, 2) += lhs()(s1)(1) * rhs()(s2)(2);
+      WWVV[ss]()(s1,s2)(2, 0) += lhs()(s1)(2) * rhs()(s2)(0);
+      WWVV[ss]()(s1,s2)(2, 1) += lhs()(s1)(2) * rhs()(s2)(1);
+      WWVV[ss]()(s1,s2)(2, 2) += lhs()(s1)(2) * rhs()(s2)(2);
    }
  }
 }
@ -1110,17 +1120,21 @@ void A2Autils<FImpl>::ContractFourQuarkColourDiagonal(const PropagatorField &WWV
  assert(gamma0.size()==gamma1.size());
  int Ng = gamma0.size();

-  GridBase *grid = WWVV0._grid;
+  GridBase *grid = WWVV0.Grid();

-  parallel_for(int ss=0;ss<grid->oSites();ss++){
+  auto WWVV0_v = WWVV0.View();
+  auto WWVV1_v = WWVV1.View();
+  auto O_trtr_v= O_trtr.View();
+  auto O_fig8_v= O_fig8.View();
+  thread_for(ss,grid->oSites(),{

    typedef typename ComplexField::vector_object vobj;

    vobj v_trtr;
    vobj v_fig8;

-    auto VV0 = WWVV0._odata[ss];
-    auto VV1 = WWVV1._odata[ss];
+    auto VV0 = WWVV0_v[ss];
+    auto VV1 = WWVV1_v[ss];
    
    for(int g=0;g<Ng;g++){

@ -1128,15 +1142,15 @@ void A2Autils<FImpl>::ContractFourQuarkColourDiagonal(const PropagatorField &WWV
      v_fig8 = trace(VV0 * gamma0[g] * VV1 * gamma1[g]);

      if ( g==0 ) {
-	O_trtr._odata[ss] = v_trtr; 
-	O_fig8._odata[ss] = v_fig8;
+	O_trtr_v[ss] = v_trtr; 
+	O_fig8_v[ss] = v_fig8;
      } else { 
-	O_trtr._odata[ss]+= v_trtr; 
-	O_fig8._odata[ss]+= v_fig8;
+	O_trtr_v[ss]+= v_trtr; 
+	O_fig8_v[ss]+= v_fig8;
      }
      
    }
-  }
+  });
 }

 template<class FImpl>
@ -1150,22 +1164,27 @@ void A2Autils<FImpl>::ContractFourQuarkColourMix(const PropagatorField &WWVV0,
  assert(gamma0.size()==gamma1.size());
  int Ng = gamma0.size();

-  GridBase *grid = WWVV0._grid;
+  GridBase *grid = WWVV0.Grid();

-  parallel_for(int ss=0;ss<grid->oSites();ss++){
+  auto WWVV0_v = WWVV0.View();
+  auto WWVV1_v = WWVV1.View();
+  auto O_trtr_v= O_trtr.View();
+  auto O_fig8_v= O_fig8.View();
+
+  thread_for(ss,grid->oSites(),{

    typedef typename ComplexField::vector_object vobj;

-    auto VV0 = WWVV0._odata[ss];
-    auto VV1 = WWVV1._odata[ss];
+    auto VV0 = WWVV0_v[ss];
+    auto VV1 = WWVV1_v[ss];
    
    for(int g=0;g<Ng;g++){

      auto VV0G = VV0 * gamma0[g];  // Spin multiply
      auto VV1G = VV1 * gamma1[g];

-      vobj v_trtr=zero;
-      vobj v_fig8=zero;
+      vobj v_trtr=Zero();
+      vobj v_fig8=Zero();

      /////////////////////////////////////////
      // Colour mixed
@ -1216,15 +1235,15 @@ Bag [8,4]  fig8 (-227.58,3.58808e-17) trtr (-32.5776,1.83286e-17)     //  - 1602
      }}}}

      if ( g==0 ) {
-	O_trtr._odata[ss] = v_trtr; 
-	O_fig8._odata[ss] = v_fig8;
+	O_trtr_v[ss] = v_trtr; 
+	O_fig8_v[ss] = v_fig8;
      } else { 
-	O_trtr._odata[ss]+= v_trtr; 
-	O_fig8._odata[ss]+= v_fig8;
+	O_trtr_v[ss]+= v_trtr; 
+	O_fig8_v[ss]+= v_fig8;
      }
      
    }
-  }
+  });
 }

 #ifdef DELTA_F_EQ_2
@ -1246,7 +1265,7 @@ void A2Autils<FImpl>::DeltaFeq2(int dt_min,int dt_max,
 				const FermionField *vd,
 				int orthogdim)
 {
-  GridBase *grid = vs[0]._grid;
+  GridBase *grid = vs[0].Grid();

  LOG(Message) << "Computing A2A DeltaF=2 graph" << std::endl;

@ -1298,32 +1317,32 @@ void A2Autils<FImpl>::DeltaFeq2(int dt_min,int dt_max,
    denom_P(t) =ComplexD(0.0);
  }

-  ComplexField D0(grid);   D0 = zero; // <P|A0> correlator from each wall
-  ComplexField D1(grid);   D1 = zero;
+  ComplexField D0(grid);   D0 = Zero(); // <P|A0> correlator from each wall
+  ComplexField D1(grid);   D1 = Zero();

-  ComplexField O1_trtr(grid);  O1_trtr = zero;
-  ComplexField O2_trtr(grid);  O2_trtr = zero;
-  ComplexField O3_trtr(grid);  O3_trtr = zero;
-  ComplexField O4_trtr(grid);  O4_trtr = zero;
-  ComplexField O5_trtr(grid);  O5_trtr = zero;
+  ComplexField O1_trtr(grid);  O1_trtr = Zero();
+  ComplexField O2_trtr(grid);  O2_trtr = Zero();
+  ComplexField O3_trtr(grid);  O3_trtr = Zero();
+  ComplexField O4_trtr(grid);  O4_trtr = Zero();
+  ComplexField O5_trtr(grid);  O5_trtr = Zero();

-  ComplexField O1_fig8(grid);  O1_fig8 = zero;
-  ComplexField O2_fig8(grid);  O2_fig8 = zero;
-  ComplexField O3_fig8(grid);  O3_fig8 = zero;
-  ComplexField O4_fig8(grid);  O4_fig8 = zero;
-  ComplexField O5_fig8(grid);  O5_fig8 = zero;
+  ComplexField O1_fig8(grid);  O1_fig8 = Zero();
+  ComplexField O2_fig8(grid);  O2_fig8 = Zero();
+  ComplexField O3_fig8(grid);  O3_fig8 = Zero();
+  ComplexField O4_fig8(grid);  O4_fig8 = Zero();
+  ComplexField O5_fig8(grid);  O5_fig8 = Zero();

-  ComplexField VV_trtr(grid);  VV_trtr = zero;
-  ComplexField AA_trtr(grid);  AA_trtr = zero;
-  ComplexField SS_trtr(grid);  SS_trtr = zero;
-  ComplexField PP_trtr(grid);  PP_trtr = zero;
-  ComplexField TT_trtr(grid);  TT_trtr = zero;
+  ComplexField VV_trtr(grid);  VV_trtr = Zero();
+  ComplexField AA_trtr(grid);  AA_trtr = Zero();
+  ComplexField SS_trtr(grid);  SS_trtr = Zero();
+  ComplexField PP_trtr(grid);  PP_trtr = Zero();
+  ComplexField TT_trtr(grid);  TT_trtr = Zero();

-  ComplexField VV_fig8(grid);  VV_fig8 = zero;
-  ComplexField AA_fig8(grid);  AA_fig8 = zero;
-  ComplexField SS_fig8(grid);  SS_fig8 = zero;
-  ComplexField PP_fig8(grid);  PP_fig8 = zero;
-  ComplexField TT_fig8(grid);  TT_fig8 = zero;
+  ComplexField VV_fig8(grid);  VV_fig8 = Zero();
+  ComplexField AA_fig8(grid);  AA_fig8 = Zero();
+  ComplexField SS_fig8(grid);  SS_fig8 = Zero();
+  ComplexField PP_fig8(grid);  PP_fig8 = Zero();
+  ComplexField TT_fig8(grid);  TT_fig8 = Zero();

  //////////////////////////////////////////////////
  // Used to store appropriate correlation funcs
@ -1458,5 +1477,5 @@ void A2Autils<FImpl>::DeltaFeq2(int dt_min,int dt_max,
 }
 #endif 

-}}
+NAMESPACE_END(Grid);

--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -24,13 +24,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef QCD_UTILS_COVARIANT_CSHIFT_H
 #define QCD_UTILS_COVARIANT_CSHIFT_H

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);
+
 ////////////////////////////////////////////////////////////////////////
 // Low performance implementation of CovariantCshift API
 ////////////////////////////////////////////////////////////////////////
@ -39,8 +39,8 @@ namespace QCD {
 namespace PeriodicBC { 

  template<class covariant,class gauge> Lattice<covariant> CovShiftForward(const Lattice<gauge> &Link, 
-									    int mu,
-									    const Lattice<covariant> &field)
+									   int mu,
+									   const Lattice<covariant> &field)
  {
    return Link*Cshift(field,mu,1);// moves towards negative mu
  }
@ -48,7 +48,7 @@ namespace PeriodicBC {
 									    int mu,
 									    const Lattice<covariant> &field)
  {
-    Lattice<covariant> tmp(field._grid);
+    Lattice<covariant> tmp(field.Grid());
    tmp = adj(Link)*field;
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }
@ -73,21 +73,21 @@ namespace ConjugateBC {
  //                      U  U^* U^* U^T U^adj =  U  (U U U^dag U^T )^*
  //                                           =  U  (U U U^dag)^* ( U^T )^*
  //
-  // So covariant shift rule: conjugate inward shifted plane when crossing boundary applies.
+  // So covariant shift rule: conjugate inward shifted plane when crossing boundary applies.
  //
-  // This conjugate should be applied to BOTH the link and the covariant field on backward shift
+  // This conjugation should be applied to BOTH the link and the covariant field on backward shift
  // boundary wrap.
  // 
  //      |  |              
  // xxxxxxxxxxxxxxxxx         
-  //      |  | <---- this link is conjugated, and the path leading into it. Segment crossing in and out is double conjugated.
+  //      |  | <---- this link is conjugated, and the path leading into it. Segment crossing in and out is double conjugated.
  //       -- 
  //    ------->
  template<class covariant,class gauge> Lattice<covariant> CovShiftForward(const Lattice<gauge> &Link, 
-									    int mu,
-									    const Lattice<covariant> &field)
+									   int mu,
+									   const Lattice<covariant> &field)
  {
-    GridBase * grid = Link._grid;
+    GridBase * grid = Link.Grid();

    int Lmu = grid->GlobalDimensions()[mu]-1;

@ -106,7 +106,7 @@ namespace ConjugateBC {
 									    int mu,
 									    const Lattice<covariant> &field)
  {
-    GridBase * grid = field._grid;
+    GridBase * grid = field.Grid();

    int Lmu = grid->GlobalDimensions()[mu]-1;

@ -122,9 +122,8 @@ namespace ConjugateBC {
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }

-
 }


-}}
+NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/utils/CovariantLaplacian.h
+++ b/Grid/qcd/utils/CovariantLaplacian.h
@ -25,13 +25,10 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
+			   /*  END LEGAL */
+#pragma once 

-#ifndef COVARIANT_LAPLACIAN_H
-#define COVARIANT_LAPLACIAN_H
-
-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 struct LaplacianParams : Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LaplacianParams, 
@ -80,19 +77,19 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> {
  MultiShiftFunction PowerHalf;    
  MultiShiftFunction PowerInvHalf;    

- public:
+public:
  INHERIT_GIMPL_TYPES(Impl);

  LaplacianAdjointField(GridBase* grid, OperatorFunction<GaugeField>& S, LaplacianParams& p, const RealD k = 1.0)
-      : U(Nd, grid), Solver(S), param(p), kappa(k){
-        AlgRemez remez(param.lo,param.hi,param.precision);
-        std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
-        remez.generateApprox(param.degree,1,2);
-        PowerHalf.Init(remez,param.tolerance,false);
-        PowerInvHalf.Init(remez,param.tolerance,true);
+    : U(Nd, grid), Solver(S), param(p), kappa(k){
+    AlgRemez remez(param.lo,param.hi,param.precision);
+    std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
+    remez.generateApprox(param.degree,1,2);
+    PowerHalf.Init(remez,param.tolerance,false);
+    PowerInvHalf.Init(remez,param.tolerance,true);
        

-      };
+  };

  void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);}
  void Mdiag(const GaugeField&, GaugeField&){ assert(0);}
@ -109,14 +106,14 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> {
    //GaugeField herm = in + adj(in);
    //std::cout << "AHermiticity: " << norm2(herm) << std::endl;

-    GaugeLinkField tmp(in._grid);
-    GaugeLinkField tmp2(in._grid);
-    GaugeLinkField sum(in._grid);
+    GaugeLinkField tmp(in.Grid());
+    GaugeLinkField tmp2(in.Grid());
+    GaugeLinkField sum(in.Grid());

    for (int nu = 0; nu < Nd; nu++) {
-      sum = zero;
+      sum = Zero();
      GaugeLinkField in_nu = PeekIndex<LorentzIndex>(in, nu);
-      GaugeLinkField out_nu(out._grid);
+      GaugeLinkField out_nu(out.Grid());
      for (int mu = 0; mu < Nd; mu++) {
        tmp = U[mu] * Cshift(in_nu, mu, +1) * adj(U[mu]);
        tmp2 = adj(U[mu]) * in_nu * U[mu];
@ -132,8 +129,8 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> {
    RealD factor = -kappa / (double(4 * Nd));
    
    for (int mu = 0; mu < Nd; mu++){
-      GaugeLinkField der_mu(der._grid);
-      der_mu = zero;
+      GaugeLinkField der_mu(der.Grid());
+      der_mu = Zero();
      for (int nu = 0; nu < Nd; nu++){
        GaugeLinkField in_nu = PeekIndex<LorentzIndex>(in, nu);
        der_mu += U[mu] * Cshift(in_nu, mu, 1) * adj(U[mu]) * in_nu;
@ -151,8 +148,8 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> {
    RealD factor = -kappa / (double(4 * Nd));

    for (int mu = 0; mu < Nd; mu++) {
-      GaugeLinkField der_mu(der._grid);
-      der_mu = zero;
+      GaugeLinkField der_mu(der.Grid());
+      der_mu = Zero();
      for (int nu = 0; nu < Nd; nu++) {
        GaugeLinkField left_nu = PeekIndex<LorentzIndex>(left, nu);
        GaugeLinkField right_nu = PeekIndex<LorentzIndex>(right, nu);
@ -169,7 +166,7 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> {
  }

  void MSquareRoot(GaugeField& P){
-    GaugeField Gp(P._grid);
+    GaugeField Gp(P.Grid());
    HermitianLinearOperator<LaplacianAdjointField<Impl>,GaugeField> HermOp(*this);
    ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter,PowerHalf);
    msCG(HermOp,P,Gp);
@ -177,7 +174,7 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> {
  }

  void MInvSquareRoot(GaugeField& P){
-    GaugeField Gp(P._grid);
+    GaugeField Gp(P.Grid());
    HermitianLinearOperator<LaplacianAdjointField<Impl>,GaugeField> HermOp(*this);
    ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter,PowerInvHalf);
    msCG(HermOp,P,Gp);
@ -186,12 +183,9 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> {



- private:
+private:
  RealD kappa;
  std::vector<GaugeLinkField> U;
 };

-}
-}
-
-#endif
+NAMESPACE_END(Grid);
--- a/Grid/qcd/utils/CovariantSmearing.h
+++ b/Grid/qcd/utils/CovariantSmearing.h
@ -43,7 +43,7 @@ public:
 			    T& chi, 
 			    const Real& width, int Iterations, int orthog)
  {
-    GridBase *grid = chi._grid;
+    GridBase *grid = chi.Grid();
    T psi(grid);

    ////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/qcd/utils/GaugeFix.h
+++ b/Grid/qcd/utils/GaugeFix.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    grid` physics library, www.github.com/paboyle/Grid 

@ -22,19 +22,19 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 //#include <Grid/Grid.h>

 #ifndef GRID_QCD_GAUGE_FIX_H
 #define GRID_QCD_GAUGE_FIX_H
-namespace Grid {
-namespace QCD {
+
+NAMESPACE_BEGIN(Grid);


 template <class Gimpl> 
 class FourierAcceleratedGaugeFixer  : public Gimpl {
- public:
+public:
  INHERIT_GIMPL_TYPES(Gimpl);

  typedef typename Gimpl::GaugeLinkField GaugeMat;
@ -47,7 +47,7 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
    }
  }
  static void DmuAmu(const std::vector<GaugeMat> &A,GaugeMat &dmuAmu,int orthog) {
-    dmuAmu=zero;
+    dmuAmu=Zero();
    for(int mu=0;mu<Nd;mu++){
      if ( mu != orthog ) {
 	dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
@ -56,13 +56,13 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
  }  

  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
-    GridBase *grid = Umu._grid;
+    GridBase *grid = Umu.Grid();
    GaugeMat xform(grid);
    SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog);
  }
  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {

-    GridBase *grid = Umu._grid;
+    GridBase *grid = Umu.Grid();

    Real org_plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
    Real org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
@ -72,7 +72,6 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
    xform=1.0;

    std::vector<GaugeMat> U(Nd,grid);
-
    GaugeMat dmuAmu(grid);

    {
@ -125,7 +124,7 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
    }
  };
  static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {
-    GridBase *grid = U[0]._grid;
+    GridBase *grid = U[0].Grid();

    std::vector<GaugeMat> A(Nd,grid);
    GaugeMat g(grid);
@ -145,14 +144,14 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {

  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {

-    GridBase *grid = U[0]._grid;
+    GridBase *grid = U[0].Grid();

    Real vol = grid->gSites();

    FFT theFFT((GridCartesian *)grid);

    LatticeComplex  Fp(grid);
-    LatticeComplex  psq(grid); psq=zero;
+    LatticeComplex  psq(grid); psq=Zero();
    LatticeComplex  pmu(grid); 
    LatticeComplex   one(grid); one = Complex(1.0,0.0);

@ -172,8 +171,8 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
    // Work out Fp = psq_max/ psq...
    // Avoid singularities in Fp
    //////////////////////////////////
-    std::vector<int> latt_size = grid->GlobalDimensions();
-    std::vector<int> coor(grid->_ndimension,0);
+    Coordinate latt_size = grid->GlobalDimensions();
+    Coordinate coor(grid->_ndimension,0);
    for(int mu=0;mu<Nd;mu++) {
      if ( mu != orthog ) { 
 	Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
@ -212,7 +211,7 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
  }

  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu,int orthog) {
-    GridBase *grid = g._grid;
+    GridBase *grid = g.Grid();
    Complex cialpha(0.0,-alpha);
    GaugeMat ciadmam(grid);
    DmuAmu(A,dmuAmu,orthog);
@ -221,6 +220,6 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
  }  
 };

-}
-}
+NAMESPACE_END(Grid);
+
 #endif
--- a/Grid/qcd/utils/LinalgUtils.h
+++ b/Grid/qcd/utils/LinalgUtils.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -25,13 +25,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_QCD_LINALG_UTILS_H
-#define GRID_QCD_LINALG_UTILS_H
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);

-namespace Grid{
-namespace QCD{
 ////////////////////////////////////////////////////////////////////////
 //This file brings additional linear combination assist that is helpful
 //to QCD such as chiral projectors and spin matrices applied to one of the inputs.
@ -42,170 +41,200 @@ namespace QCD{
 template<class vobj,class Coeff>
 void axpibg5x(Lattice<vobj> &z,const Lattice<vobj> &x,Coeff a,Coeff b)
 {
-  z.checkerboard = x.checkerboard;
+  z.Checkerboard() = x.Checkerboard();
  conformable(x,z);

-  GridBase *grid=x._grid;
+  GridBase *grid=x.Grid();

  Gamma G5(Gamma::Algebra::Gamma5);
-  parallel_for(int ss=0;ss<grid->oSites();ss++){
-    vobj tmp;
-    tmp = a*x._odata[ss];
-    tmp = tmp + G5*(b*timesI(x._odata[ss]));
-    vstream(z._odata[ss],tmp);
-  }
+  auto x_v = x.View();
+  auto z_v = z.View();
+  accelerator_for( ss, x_v.size(),vobj::Nsimd(), {
+    auto tmp = a*x_v(ss) + G5*(b*timesI(x_v(ss)));
+    coalescedWrite(z_v[ss],tmp);
+  });
 }

 template<class vobj,class Coeff> 
 void axpby_ssp(Lattice<vobj> &z, Coeff a,const Lattice<vobj> &x,Coeff b,const Lattice<vobj> &y,int s,int sp)
 {
-  z.checkerboard = x.checkerboard;
+  z.Checkerboard() = x.Checkerboard();
  conformable(x,y);
  conformable(x,z);
-  GridBase *grid=x._grid;
+  GridBase *grid=x.Grid();
  int Ls = grid->_rdimensions[0];
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
-    vobj tmp = a*x._odata[ss+s]+b*y._odata[ss+sp];
-    vstream(z._odata[ss+s],tmp);
-  }
+  auto x_v = x.View();
+  auto y_v = y.View();
+  auto z_v = z.View();
+  // FIXME -- need a new class of accelerator_loop to implement this
+  //
+  uint64_t nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,vobj::Nsimd(),{
+    uint64_t ss = sss*Ls;
+    auto tmp = a*x_v(ss+s)+b*y_v(ss+sp);
+    coalescedWrite(z_v[ss+s],tmp);
+  });
 }

 template<class vobj,class Coeff> 
 void ag5xpby_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const Lattice<vobj> &y,int s,int sp)
 {
-  z.checkerboard = x.checkerboard;
+  z.Checkerboard() = x.Checkerboard();
  conformable(x,y);
  conformable(x,z);
-  GridBase *grid=x._grid;
+  GridBase *grid=x.Grid();
  int Ls = grid->_rdimensions[0];
-
  Gamma G5(Gamma::Algebra::Gamma5);
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
-    vobj tmp;
-    tmp = G5*x._odata[ss+s]*a;
-    tmp = tmp + b*y._odata[ss+sp];
-    vstream(z._odata[ss+s],tmp);
-  }
+  auto x_v = x.View();
+  auto y_v = y.View();
+  auto z_v = z.View();
+  uint64_t nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,vobj::Nsimd(),{
+    uint64_t ss = sss*Ls;
+    auto tmp = G5*x_v(ss+s)*a + b*y_v(ss+sp);
+    coalescedWrite(z_v[ss+s],tmp);
+  });
 }

 template<class vobj,class Coeff> 
 void axpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const Lattice<vobj> &y,int s,int sp)
 {
-  z.checkerboard = x.checkerboard;
+  z.Checkerboard() = x.Checkerboard();
  conformable(x,y);
  conformable(x,z);
-  GridBase *grid=x._grid;
+  GridBase *grid=x.Grid();
  int Ls = grid->_rdimensions[0];
+  auto x_v = x.View();
+  auto y_v = y.View();
+  auto z_v = z.View();
  Gamma G5(Gamma::Algebra::Gamma5);
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
-    vobj tmp;
-    tmp = G5*y._odata[ss+sp]*b;
-    tmp = tmp + a*x._odata[ss+s];
-    vstream(z._odata[ss+s],tmp);
-  }
+  uint64_t nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,vobj::Nsimd(),{
+    uint64_t ss = sss*Ls;
+    auto tmp = G5*y_v(ss+sp)*b + a*x_v(ss+s);
+    coalescedWrite(z_v[ss+s],tmp);
+  });
 }

 template<class vobj,class Coeff> 
 void ag5xpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const Lattice<vobj> &y,int s,int sp)
 {
-  z.checkerboard = x.checkerboard;
+  z.Checkerboard() = x.Checkerboard();
  conformable(x,y);
  conformable(x,z);
-  GridBase *grid=x._grid;
+  GridBase *grid=x.Grid();
  int Ls = grid->_rdimensions[0];

+  auto x_v = x.View();
+  auto y_v = y.View();
+  auto z_v = z.View();
  Gamma G5(Gamma::Algebra::Gamma5);
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
-    vobj tmp1;
-    vobj tmp2;
-    tmp1 = a*x._odata[ss+s]+b*y._odata[ss+sp];
-    tmp2 = G5*tmp1;
-    vstream(z._odata[ss+s],tmp2);
-  }
+  uint64_t nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,vobj::Nsimd(),{
+    uint64_t ss = sss*Ls;
+    auto tmp1 = a*x_v(ss+s)+b*y_v(ss+sp);
+    auto tmp2 = G5*tmp1;
+    coalescedWrite(z_v[ss+s],tmp2);
+  });
 }

 template<class vobj,class Coeff> 
 void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const Lattice<vobj> &y,int s,int sp)
 {
-  z.checkerboard = x.checkerboard;
+  z.Checkerboard() = x.Checkerboard();
  conformable(x,y);
  conformable(x,z);
-  GridBase *grid=x._grid;
+  GridBase *grid=x.Grid();
  int Ls = grid->_rdimensions[0];
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
-    vobj tmp;
-    spProj5m(tmp,y._odata[ss+sp]);
-    tmp = a*x._odata[ss+s]+b*tmp;
-    vstream(z._odata[ss+s],tmp);
-  }
+
+  auto x_v = x.View();
+  auto y_v = y.View();
+  auto z_v = z.View();
+  uint64_t nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,vobj::Nsimd(),{
+    uint64_t ss = sss*Ls;
+    decltype(coalescedRead(y_v[ss+sp])) tmp;
+    spProj5m(tmp,y_v(ss+sp));
+    tmp = a*x_v(ss+s)+b*tmp;
+    coalescedWrite(z_v[ss+s],tmp);
+  });
 }

 template<class vobj,class Coeff> 
 void axpby_ssp_pplus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const Lattice<vobj> &y,int s,int sp)
 {
-  z.checkerboard = x.checkerboard;
+  z.Checkerboard() = x.Checkerboard();
  conformable(x,y);
  conformable(x,z);
-  GridBase *grid=x._grid;
+  GridBase *grid=x.Grid();
  int Ls = grid->_rdimensions[0];
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
-    vobj tmp;
-    spProj5p(tmp,y._odata[ss+sp]);
-    tmp = a*x._odata[ss+s]+b*tmp;
-    vstream(z._odata[ss+s],tmp);
-  }
+  auto x_v = x.View();
+  auto y_v = y.View();
+  auto z_v = z.View();
+  uint64_t nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,vobj::Nsimd(),{
+    uint64_t ss = sss*Ls;
+    decltype(coalescedRead(y_v[ss+sp])) tmp;
+    spProj5p(tmp,y_v(ss+sp));
+    tmp = a*x_v(ss+s)+b*tmp;
+    coalescedWrite(z_v[ss+s],tmp);
+  });
 }

 template<class vobj> 
 void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
 {
-  GridBase *grid=x._grid;
-  z.checkerboard = x.checkerboard;
+  GridBase *grid=x.Grid();
+  z.Checkerboard() = x.Checkerboard();
  conformable(x,z);
  int Ls = grid->_rdimensions[0];
  Gamma G5(Gamma::Algebra::Gamma5);
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls) {
-    vobj tmp;
+  auto x_v = x.View();
+  auto z_v = z.View();
+  uint64_t nloop = grid->oSites()/Ls;
+  accelerator_for(sss,nloop,vobj::Nsimd(),{
+    uint64_t ss = sss*Ls;
    for(int s=0;s<Ls;s++){
      int sp = Ls-1-s;
-      tmp = G5*x._odata[ss+s];
-      vstream(z._odata[ss+sp],tmp);
+      coalescedWrite(z_v[ss+sp],G5*x_v(ss+s));
    }
-  }
-}
+  });
 }

 // I explicitly need these outside the QCD namespace
 template<typename vobj>
 void G5C(Lattice<vobj> &z, const Lattice<vobj> &x)
 {
-  GridBase *grid = x._grid;
-  z.checkerboard = x.checkerboard;
+  GridBase *grid = x.Grid();
+  z.Checkerboard() = x.Checkerboard();
  conformable(x, z);

-  QCD::Gamma G5(QCD::Gamma::Algebra::Gamma5);
+  Gamma G5(Gamma::Algebra::Gamma5);
  z = G5 * x;
 }

 template<class CComplex, int nbasis>
 void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex, nbasis>> &x)
 {
-  GridBase *grid = x._grid;
-  z.checkerboard = x.checkerboard;
+  GridBase *grid = x.Grid();
+  z.Checkerboard() = x.Checkerboard();
  conformable(x, z);

  static_assert(nbasis % 2 == 0, "");
  int nb = nbasis / 2;

-  parallel_for(int ss = 0; ss < grid->oSites(); ss++) {
+  auto z_v = z.View();
+  auto x_v = x.View();
+  accelerator_for(ss,grid->oSites(),CComplex::Nsimd(),
+  {
    for(int n = 0; n < nb; ++n) {
-      z._odata[ss](n) = x._odata[ss](n);
+      coalescedWrite(z_v[ss](n), x_v(ss)(n));
    }
    for(int n = nb; n < nbasis; ++n) {
-      z._odata[ss](n) = -x._odata[ss](n);
+      coalescedWrite(z_v[ss](n), -x_v(ss)(n));
    }
-  }
+  });
 }

-}
-#endif 
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/utils/Metric.h
+++ b/Grid/qcd/utils/Metric.h
@ -25,13 +25,11 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
-//--------------------------------------------------------------------
-#ifndef METRIC_H
-#define METRIC_H
+			   /*  END LEGAL */
+			   //--------------------------------------------------------------------
+#pragma once

-namespace Grid{
-namespace QCD{
+NAMESPACE_BEGIN(Grid);

 template <typename Field> 
 class Metric{
@ -64,10 +62,10 @@ public:
    // do nothing
  }
  virtual void MDeriv(const Field& in, Field& out){
-    out = zero;
+    out = Zero();
  }
  virtual void MDeriv(const Field& left, const Field& right, Field& out){
-    out = zero;
+    out = Zero();
  }

 };
@ -108,7 +106,7 @@ public:
    if (1) {
      // Auxiliary momenta
      // do nothing if trivial, so hide in the metric
-      MomentaField AuxMomTemp(Mom._grid);
+      MomentaField AuxMomTemp(Mom.Grid());
      Implementation::generate_momenta(AuxMom, pRNG);
      Implementation::generate_momenta(AuxField, pRNG);
      // Modify the distribution with the metric
@ -119,11 +117,11 @@ public:

  // Correct
  RealD MomentaAction(){
-    MomentaField inv(Mom._grid);
-    inv = zero;
+    MomentaField inv(Mom.Grid());
+    inv = Zero();
    M.Minv(Mom, inv);
-    LatticeComplex Hloc(Mom._grid);
-    Hloc = zero;
+    LatticeComplex Hloc(Mom.Grid());
+    Hloc = Zero();
    for (int mu = 0; mu < Nd; mu++) {
      // This is not very general
      // hide in the metric
@ -147,7 +145,7 @@ public:
      }
    }

-    Complex Hsum = sum(Hloc);
+    auto Hsum = TensorRemove(sum(Hloc));
    return Hsum.real();
  }

@ -156,9 +154,9 @@ public:

    // Compute the derivative of the kinetic term
    // with respect to the gauge field
-    MomentaField MDer(in._grid);
-    MomentaField X(in._grid);
-    X = zero;
+    MomentaField MDer(in.Grid());
+    MomentaField X(in.Grid());
+    X = Zero();
    M.Minv(in, X);  // X = G in
    M.MDeriv(X, MDer);  // MDer = U * dS/dU
    der = Implementation::projectForce(MDer);  // Ta if gauge fields
@ -166,27 +164,27 @@ public:
  }

  void AuxiliaryFieldsDerivative(MomentaField& der){
-    der = zero;
+    der = Zero();
    if (1){
-    // Auxiliary fields
-    MomentaField der_temp(der._grid);
-    MomentaField X(der._grid);
-    X=zero;
-    //M.M(AuxMom, X); // X = M Aux
-    // Two derivative terms
-    // the Mderiv need separation of left and right terms
-    M.MDeriv(AuxMom, der); 
+      // Auxiliary fields
+      MomentaField der_temp(der.Grid());
+      MomentaField X(der.Grid());
+      X=Zero();
+      //M.M(AuxMom, X); // X = M Aux
+      // Two derivative terms
+      // the Mderiv need separation of left and right terms
+      M.MDeriv(AuxMom, der); 


-    // this one should not be necessary (identical to the previous one)
-    //M.MDeriv(X, AuxMom, der_temp); der += der_temp;
+      // this one should not be necessary (identical to the previous one)
+      //M.MDeriv(X, AuxMom, der_temp); der += der_temp;

-    der = -1.0*Implementation::projectForce(der);
+      der = -1.0*Implementation::projectForce(der);
    }
  }

  void DerivativeP(MomentaField& der){
-    der = zero;
+    der = Zero();
    M.Minv(Mom, der);
    // is the projection necessary here?
    // no for fields in the algebra
@ -201,8 +199,8 @@ public:

  void update_auxiliary_fields(RealD ep){
    if (1) {
-      MomentaField tmp(AuxMom._grid);
-      MomentaField tmp2(AuxMom._grid);
+      MomentaField tmp(AuxMom.Grid());
+      MomentaField tmp2(AuxMom.Grid());
      M.M(AuxMom, tmp);
      // M.M(tmp, tmp2);
      AuxField += ep * tmp;  // M^2 AuxMom
@ -212,15 +210,5 @@ public:

 };

+NAMESPACE_END(Grid);

-
-
-
-
-
-
-}
-}
-
-
-#endif //METRIC_H
--- a/Grid/qcd/utils/SUn.h
+++ b/Grid/qcd/utils/SUn.h
@ -28,16 +28,15 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
+			   /*  END LEGAL */
 #ifndef QCD_UTIL_SUN_H
 #define QCD_UTIL_SUN_H

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template <int ncolour>
 class SU {
- public:
+public:
  static const int Dimension = ncolour;
  static const int AdjointDimension = ncolour * ncolour - 1;
  static int su2subgroups(void) { return (ncolour * (ncolour - 1)) / 2; }
@ -48,7 +47,7 @@ class SU {
  using iSU2Matrix = iScalar<iScalar<iMatrix<vtype, 2> > >;
  template <typename vtype>
  using iSUnAlgebraVector =
-      iScalar<iScalar<iVector<vtype, AdjointDimension> > >;
+    iScalar<iScalar<iVector<vtype, AdjointDimension> > >;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  // Types can be accessed as SU<2>::Matrix , SU<2>::vSUnMatrix,
@ -163,7 +162,7 @@ class SU {
  
  template <class cplx>
  static void generatorSigmaY(int su2Index, iSUnMatrix<cplx> &ta) {
-    ta = zero;
+    ta = Zero();
    int i1, i2;
    su2SubGroupIndex(i1, i2, su2Index);
    ta()()(i1, i2) = 1.0;
@ -173,7 +172,7 @@ class SU {
  
  template <class cplx>
  static void generatorSigmaX(int su2Index, iSUnMatrix<cplx> &ta) {
-    ta = zero;
+    ta = Zero();
    cplx i(0.0, 1.0);
    int i1, i2;
    su2SubGroupIndex(i1, i2, su2Index);
@ -185,7 +184,7 @@ class SU {
  template <class cplx>
  static void generatorDiagonal(int diagIndex, iSUnMatrix<cplx> &ta) {
    // diag ({1, 1, ..., 1}(k-times), -k, 0, 0, ...)
-    ta = zero;
+    ta = Zero();
    int k = diagIndex + 1;                  // diagIndex starts from 0
    for (int i = 0; i <= diagIndex; i++) {  // k iterations
      ta()()(i, i) = 1.0;
@ -218,28 +217,32 @@ class SU {
                         Lattice<iSU2Matrix<vcplx> > &subgroup,
                         const Lattice<iSUnMatrix<vcplx> > &source,
                         int su2_index) {
-    GridBase *grid(source._grid);
+    GridBase *grid(source.Grid());
    conformable(subgroup, source);
    conformable(subgroup, Determinant);
    int i0, i1;
    su2SubGroupIndex(i0, i1, su2_index);
+    auto subgroup_v = subgroup.View();
+    auto source_v   = source.View();
+    auto Determinant_v = Determinant.View();

-    parallel_for (int ss = 0; ss < grid->oSites(); ss++) {
-      subgroup._odata[ss]()()(0, 0) = source._odata[ss]()()(i0, i0);
-      subgroup._odata[ss]()()(0, 1) = source._odata[ss]()()(i0, i1);
-      subgroup._odata[ss]()()(1, 0) = source._odata[ss]()()(i1, i0);
-      subgroup._odata[ss]()()(1, 1) = source._odata[ss]()()(i1, i1);
+    thread_for(ss, grid->oSites(), {

-      iSU2Matrix<vcplx> Sigma = subgroup._odata[ss];
+      subgroup_v[ss]()()(0, 0) = source_v[ss]()()(i0, i0);
+      subgroup_v[ss]()()(0, 1) = source_v[ss]()()(i0, i1);
+      subgroup_v[ss]()()(1, 0) = source_v[ss]()()(i1, i0);
+      subgroup_v[ss]()()(1, 1) = source_v[ss]()()(i1, i1);
+
+      iSU2Matrix<vcplx> Sigma = subgroup_v[ss];

      Sigma = Sigma - adj(Sigma) + trace(adj(Sigma));

-      subgroup._odata[ss] = Sigma;
+      subgroup_v[ss] = Sigma;

      // this should be purely real
-      Determinant._odata[ss] =
-          Sigma()()(0, 0) * Sigma()()(1, 1) - Sigma()()(0, 1) * Sigma()()(1, 0);
-    }
+      Determinant_v[ss] =
+	Sigma()()(0, 0) * Sigma()()(1, 1) - Sigma()()(0, 1) * Sigma()()(1, 0);
+    });
  }

  //////////////////////////////////////////////////////////////////////////////////////////
@ -248,18 +251,21 @@ class SU {
  template <class vcplx>
  static void su2Insert(const Lattice<iSU2Matrix<vcplx> > &subgroup,
                        Lattice<iSUnMatrix<vcplx> > &dest, int su2_index) {
-    GridBase *grid(dest._grid);
+    GridBase *grid(dest.Grid());
    conformable(subgroup, dest);
    int i0, i1;
    su2SubGroupIndex(i0, i1, su2_index);

    dest = 1.0;  // start out with identity
-    parallel_for (int ss = 0; ss < grid->oSites(); ss++) {
-      dest._odata[ss]()()(i0, i0) = subgroup._odata[ss]()()(0, 0);
-      dest._odata[ss]()()(i0, i1) = subgroup._odata[ss]()()(0, 1);
-      dest._odata[ss]()()(i1, i0) = subgroup._odata[ss]()()(1, 0);
-      dest._odata[ss]()()(i1, i1) = subgroup._odata[ss]()()(1, 1);
-    }
+    auto dest_v = dest.View();
+    auto subgroup_v = subgroup.View();
+    thread_for(ss, grid->oSites(),
+    {
+      dest_v[ss]()()(i0, i0) = subgroup_v[ss]()()(0, 0);
+      dest_v[ss]()()(i0, i1) = subgroup_v[ss]()()(0, 1);
+      dest_v[ss]()()(i1, i0) = subgroup_v[ss]()()(1, 0);
+      dest_v[ss]()()(i1, i1) = subgroup_v[ss]()()(1, 1);
+    });
  }

  ///////////////////////////////////////////////
@ -272,16 +278,14 @@ class SU {
  // in action.
  //
  ///////////////////////////////////////////////
-  static void SubGroupHeatBath(
-      GridSerialRNG &sRNG, GridParallelRNG &pRNG,
-      RealD beta,  // coeff multiplying staple in action (with no 1/Nc)
-      LatticeMatrix &link,
-      const LatticeMatrix &barestaple,  // multiplied by action coeffs so th
-      int su2_subgroup, int nheatbath, LatticeInteger &wheremask) {
-    GridBase *grid = link._grid;
+  static void SubGroupHeatBath(GridSerialRNG &sRNG, GridParallelRNG &pRNG,
+			       RealD beta,  // coeff multiplying staple in action (with no 1/Nc)
+			       LatticeMatrix &link,
+			       const LatticeMatrix &barestaple,  // multiplied by action coeffs so th
+			       int su2_subgroup, int nheatbath, LatticeInteger &wheremask) 
+  {
+    GridBase *grid = link.Grid();

-    int ntrials = 0;
-    int nfails = 0;
    const RealD twopi = 2.0 * M_PI;

    LatticeMatrix staple(grid);
@ -292,8 +296,7 @@ class SU {
    V = link * staple;

    // Subgroup manipulation in the lie algebra space
-    LatticeSU2Matrix u(
-        grid);  // Kennedy pendleton "u" real projected normalised Sigma
+    LatticeSU2Matrix u(grid);  // Kennedy pendleton "u" real projected normalised Sigma
    LatticeSU2Matrix uinv(grid);
    LatticeSU2Matrix ua(grid);  // a in pauli form
    LatticeSU2Matrix b(grid);   // rotated matrix after hb
@ -302,11 +305,11 @@ class SU {
    LatticeComplex ones(grid);
    ones = 1.0;
    LatticeComplex zeros(grid);
-    zeros = zero;
+    zeros = Zero();
    LatticeReal rones(grid);
    rones = 1.0;
    LatticeReal rzeros(grid);
-    rzeros = zero;
+    rzeros = Zero();
    LatticeComplex udet(grid);  // determinant of real(staple)
    LatticeInteger mask_true(grid);
    mask_true = 1;
@ -314,41 +317,41 @@ class SU {
    mask_false = 0;

    /*
-  PLB 156 P393 (1985) (Kennedy and Pendleton)
+      PLB 156 P393 (1985) (Kennedy and Pendleton)

-  Note: absorb "beta" into the def of sigma compared to KP paper; staple
-  passed to this routine has "beta" already multiplied in
+      Note: absorb "beta" into the def of sigma compared to KP paper; staple
+      passed to this routine has "beta" already multiplied in

-  Action linear in links h and of form:
+      Action linear in links h and of form:

      beta S = beta  Sum_p (1 - 1/Nc Re Tr Plaq )

-  Writing Sigma = 1/Nc (beta Sigma') where sum over staples is "Sigma' "
+      Writing Sigma = 1/Nc (beta Sigma') where sum over staples is "Sigma' "

-       beta S = const - beta/Nc Re Tr h Sigma'
-              = const - Re Tr h Sigma
+      beta S = const - beta/Nc Re Tr h Sigma'
+      = const - Re Tr h Sigma

-  Decompose h and Sigma into (1, sigma_j) ; h_i real, h^2=1, Sigma_i complex
-  arbitrary.
+      Decompose h and Sigma into (1, sigma_j) ; h_i real, h^2=1, Sigma_i complex
+      arbitrary.

      Tr h Sigma = h_i Sigma_j Tr (sigma_i sigma_j)  = h_i Sigma_j 2 delta_ij
-   Re Tr h Sigma = 2 h_j Re Sigma_j
+      Re Tr h Sigma = 2 h_j Re Sigma_j

-  Normalised re Sigma_j = xi u_j
+      Normalised re Sigma_j = xi u_j

-  With u_j a unit vector and U can be in SU(2);
+      With u_j a unit vector and U can be in SU(2);

-  Re Tr h Sigma = 2 h_j Re Sigma_j = 2 xi (h.u)
+      Re Tr h Sigma = 2 h_j Re Sigma_j = 2 xi (h.u)

-  4xi^2 = Det [ Sig - Sig^dag  + 1 Tr Sigdag]
-   u   = 1/2xi [ Sig - Sig^dag  + 1 Tr Sigdag]
+      4xi^2 = Det [ Sig - Sig^dag  + 1 Tr Sigdag]
+      u   = 1/2xi [ Sig - Sig^dag  + 1 Tr Sigdag]

-   xi = sqrt(Det)/2;
+      xi = sqrt(Det)/2;

-  Write a= u h in SU(2); a has pauli decomp a_j;
+      Write a= u h in SU(2); a has pauli decomp a_j;

-  Note: Product b' xi is unvariant because scaling Sigma leaves
-        normalised vector "u" fixed; Can rescale Sigma so b' = 1.
+      Note: Product b' xi is invariant because scaling Sigma leaves
+      normalised vector "u" fixed; Can rescale Sigma so b' = 1.
    */

    ////////////////////////////////////////////////////////
@ -386,7 +389,7 @@ class SU {

    xi = 0.5 * sqrt(udet);  // 4xi^2 = Det [ Sig - Sig^dag  + 1 Tr Sigdag]
    u = 0.5 * u *
-        pow(xi, -1.0);  //  u   = 1/2xi [ Sig - Sig^dag  + 1 Tr Sigdag]
+      pow(xi, -1.0);  //  u   = 1/2xi [ Sig - Sig^dag  + 1 Tr Sigdag]

    // Debug test for sanity
    uinv = adj(u);
@ -394,36 +397,36 @@ class SU {
    assert(norm2(b) < 1.0e-4);

    /*
-  Measure: Haar measure dh has d^4a delta(1-|a^2|)
-  In polars:
-    da = da0 r^2 sin theta dr dtheta dphi delta( 1 - r^2 -a0^2)
-       = da0 r^2 sin theta dr dtheta dphi delta( (sqrt(1-a0^) - r)(sqrt(1-a0^) +
-  r) )
-       = da0 r/2 sin theta dr dtheta dphi delta( (sqrt(1-a0^) - r) )
+      Measure: Haar measure dh has d^4a delta(1-|a^2|)
+      In polars:
+      da = da0 r^2 sin theta dr dtheta dphi delta( 1 - r^2 -a0^2)
+      = da0 r^2 sin theta dr dtheta dphi delta( (sqrt(1-a0^2) - r)(sqrt(1-a0^2) +
+      r) )
+      = da0 r/2 sin theta dr dtheta dphi delta( (sqrt(1-a0^2) - r) )

-  Action factor Q(h) dh  = e^-S[h]  dh =  e^{  xi Tr uh} dh    // beta enters
-  through xi
-                                       =  e^{2 xi (h.u)} dh
-                                       =  e^{2 xi h0u0}.e^{2 xi h1u1}.e^{2 xi
-  h2u2}.e^{2 xi h3u3} dh
+      Action factor Q(h) dh  = e^-S[h]  dh =  e^{  xi Tr uh} dh    // beta enters
+      through xi
+      =  e^{2 xi (h.u)} dh
+      =  e^{2 xi h0u0}.e^{2 xi h1u1}.e^{2 xi
+      h2u2}.e^{2 xi h3u3} dh

-  Therefore for each site, take xi for that site
-  i) generate  |a0|<1 with dist
-     (1-a0^2)^0.5 e^{2 xi a0 } da0
+      Therefore for each site, take xi for that site
+      i) generate  |a0|<1 with dist
+      (1-a0^2)^0.5 e^{2 xi a0 } da0

-  Take alpha = 2 xi  = 2 xi [ recall 2 beta/Nc unmod staple norm]; hence 2.0/Nc
-  factor in Chroma ]
-  A. Generate two uniformly distributed pseudo-random numbers R and R', R'',
-  R''' in the unit interval;
-  B. Set X = -(ln R)/alpha, X' =-(ln R')/alpha;
-  C. Set C = cos^2(2pi R"), with R" another uniform random number in [0,1] ;
-  D. Set A = XC;
-  E. Let d  = X'+A;
-  F. If R'''^2 :> 1 - 0.5 d,  go back to A;
-  G. Set a0 = 1 - d;
+      Take alpha = 2 xi  = 2 xi [ recall 2 beta/Nc unmod staple norm]; hence 2.0/Nc
+      factor in Chroma ]
+      A. Generate four uniformly distributed pseudo-random numbers R, R', R'',
+      R''' in the unit interval;
+      B. Set X = -(ln R)/alpha, X' =-(ln R')/alpha;
+      C. Set C = cos^2(2pi R"), with R" another uniform random number in [0,1] ;
+      D. Set A = XC;
+      E. Let d  = X'+A;
+      F. If R'''^2 > 1 - 0.5 d,  go back to A;
+      G. Set a0 = 1 - d;

-  Note that in step D setting B ~ X - A and using B in place of A in step E will
-  generate a second independent a 0 value.
+      Note that in step D setting B ~ X - A and using B in place of A in step E will
+      generate a second independent a0 value.
    */

    /////////////////////////////////////////////////////////
@ -435,13 +438,13 @@ class SU {
    RealD numSites = sum(rtmp);
    RealD numAccepted;
    LatticeInteger Accepted(grid);
-    Accepted = zero;
+    Accepted = Zero();
    LatticeInteger newlyAccepted(grid);

    std::vector<LatticeReal> xr(4, grid);
    std::vector<LatticeReal> a(4, grid);
    LatticeReal d(grid);
-    d = zero;
+    d = Zero();
    LatticeReal alpha(grid);

    //    std::cout<<GridLogMessage<<"xi "<<xi <<std::endl;
@ -478,7 +481,7 @@ class SU {
      LatticeInteger ione(grid);
      ione = 1;
      LatticeInteger izero(grid);
-      izero = zero;
+      izero = Zero();

      newlyAccepted = where(xrsq < thresh, ione, izero);
      Accepted = where(newlyAccepted, newlyAccepted, Accepted);
@ -493,7 +496,7 @@ class SU {
    } while ((numAccepted < numSites) && (hit < nheatbath));

    // G. Set a0 = 1 - d;
-    a[0] = zero;
+    a[0] = Zero();
    a[0] = where(wheremask, 1.0 - d, a[0]);

    //////////////////////////////////////////
@ -517,7 +520,7 @@ class SU {
    a[2] = a123mag * sin_theta * sin(phi);
    a[3] = a123mag * cos_theta;

-    ua = toComplex(a[0]) * ident + toComplex(a[1]) * pauli1 +
+    ua = toComplex(a[0]) * ident  + toComplex(a[1]) * pauli1 +
         toComplex(a[2]) * pauli2 + toComplex(a[3]) * pauli3;

    b = 1.0;
@ -531,7 +534,7 @@ class SU {
    // Debug Checks
    // SU2 check
    LatticeSU2Matrix check(grid);  // rotated matrix after hb
-    u = zero;
+    u = Zero();
    check = ua * adj(ua) - 1.0;
    check = where(Accepted, check, u);
    assert(norm2(check) < 1.0e-4);
@ -541,7 +544,7 @@ class SU {
    assert(norm2(check) < 1.0e-4);

    LatticeMatrix Vcheck(grid);
-    Vcheck = zero;
+    Vcheck = Zero();
    Vcheck = where(Accepted, V * adj(V) - 1.0, Vcheck);
    //    std::cout<<GridLogMessage << "SU3 check " <<norm2(Vcheck)<<std::endl;
    assert(norm2(Vcheck) < 1.0e-4);
@ -607,7 +610,7 @@ class SU {
  template <typename LatticeMatrixType>
  static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out,
                           double scale = 1.0) {
-    GridBase *grid = out._grid;
+    GridBase *grid = out.Grid();

    typedef typename LatticeMatrixType::vector_type vector_type;
    typedef typename LatticeMatrixType::scalar_type scalar_type;
@ -616,16 +619,16 @@ class SU {

    typedef Lattice<vTComplexType> LatticeComplexType;
    typedef typename GridTypeMapper<
-        typename LatticeMatrixType::vector_object>::scalar_object MatrixType;
+      typename LatticeMatrixType::vector_object>::scalar_object MatrixType;

    LatticeComplexType ca(grid);
    LatticeMatrixType lie(grid);
    LatticeMatrixType la(grid);
    ComplexD ci(0.0, scale);
-    ComplexD cone(1.0, 0.0);
+    //    ComplexD cone(1.0, 0.0);
    MatrixType ta;

-    lie = zero;
+    lie = Zero();
    for (int a = 0; a < AdjointDimension; a++) {
      random(pRNG, ca);

@ -644,13 +647,13 @@ class SU {
  static void GaussianFundamentalLieAlgebraMatrix(GridParallelRNG &pRNG,
                                                  LatticeMatrix &out,
                                                  Real scale = 1.0) {
-    GridBase *grid = out._grid;
+    GridBase *grid = out.Grid();
    LatticeReal ca(grid);
    LatticeMatrix la(grid);
    Complex ci(0.0, scale);
    Matrix ta;

-    out = zero;
+    out = Zero();
    for (int a = 0; a < AdjointDimension; a++) {
      gaussian(pRNG, ca);
      generator(a, ta);
@ -664,11 +667,11 @@ class SU {
                                          LatticeMatrix &out,
                                          Real scale = 1.0) {
    conformable(h, out);
-    GridBase *grid = out._grid;
+    GridBase *grid = out.Grid();
    LatticeMatrix la(grid);
    Matrix ta;

-    out = zero;
+    out = Zero();
    for (int a = 0; a < AdjointDimension; a++) {
      generator(a, ta);
      la = peekColour(h, a) * timesI(ta) * scale;
@ -687,10 +690,11 @@ class SU {
 /*
 * Adjoint rep gauge xform
 */
+
  template<typename GaugeField,typename GaugeMat>
  static void GaugeTransform( GaugeField &Umu, GaugeMat &g){
-    GridBase *grid = Umu._grid;
-    conformable(grid,g._grid);
+    GridBase *grid = Umu.Grid();
+    conformable(grid,g.Grid());

    GaugeMat U(grid);
    GaugeMat ag(grid); ag = adj(g);
@ -702,8 +706,8 @@ class SU {
    }
  }
  template<typename GaugeMat>
-    static void GaugeTransform( std::vector<GaugeMat> &U, GaugeMat &g){
-    GridBase *grid = g._grid;
+  static void GaugeTransform( std::vector<GaugeMat> &U, GaugeMat &g){
+    GridBase *grid = g.Grid();
    GaugeMat ag(grid); ag = adj(g);
    for(int mu=0;mu<Nd;mu++){
      U[mu] = g*U[mu]*Cshift(ag, mu, 1);
@ -719,7 +723,7 @@ class SU {
  // inverse operation: FundamentalLieAlgebraMatrix
  static void projectOnAlgebra(LatticeAlgebraVector &h_out, const LatticeMatrix &in, Real scale = 1.0) {
    conformable(h_out, in);
-    h_out = zero;
+    h_out = Zero();
    Matrix Ta;

    for (int a = 0; a < AdjointDimension; a++) {
@ -735,7 +739,7 @@ class SU {
    typedef iSUnMatrix<vector_type> vMatrixType;
    typedef Lattice<vMatrixType> LatticeMatrixType;

-    LatticeMatrixType Umu(out._grid);
+    LatticeMatrixType Umu(out.Grid());
    for (int mu = 0; mu < Nd; mu++) {
      LieRandomize(pRNG, Umu, 1.0);
      PokeIndex<LorentzIndex>(out, Umu, mu);
@ -747,7 +751,7 @@ class SU {
    typedef iSUnMatrix<vector_type> vMatrixType;
    typedef Lattice<vMatrixType> LatticeMatrixType;

-    LatticeMatrixType Umu(out._grid);
+    LatticeMatrixType Umu(out.Grid());
    for(int mu=0;mu<Nd;mu++){
      LieRandomize(pRNG,Umu,0.01);
      PokeIndex<LorentzIndex>(out,Umu,mu);
@ -759,7 +763,7 @@ class SU {
    typedef iSUnMatrix<vector_type> vMatrixType;
    typedef Lattice<vMatrixType> LatticeMatrixType;

-    LatticeMatrixType Umu(out._grid);
+    LatticeMatrixType Umu(out.Grid());
    Umu=1.0;
    for(int mu=0;mu<Nd;mu++){
      PokeIndex<LorentzIndex>(out,Umu,mu);
@ -778,7 +782,7 @@ class SU {
  static void taExp(const LatticeMatrixType &x, LatticeMatrixType &ex) {
    typedef typename LatticeMatrixType::scalar_type ComplexType;

-    LatticeMatrixType xn(x._grid);
+    LatticeMatrixType xn(x.Grid());
    RealD nfac = 1.0;

    xn = x;
@ -801,6 +805,5 @@ typedef SU<5> SU5;

 typedef SU<Nc> FundamentalMatrices;

-}
-}
+NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/utils/SUnAdjoint.h
+++ b/Grid/qcd/utils/SUnAdjoint.h
@ -22,17 +22,16 @@
 //
 ////////////////////////////////////////////////////////////////////////

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template <int ncolour>
 class SU_Adjoint : public SU<ncolour> {
- public:
+public:
  static const int Dimension = ncolour * ncolour - 1;

  template <typename vtype>
  using iSUnAdjointMatrix =
-      iScalar<iScalar<iMatrix<vtype, Dimension > > >;
+    iScalar<iScalar<iMatrix<vtype, Dimension > > >;

  // Actually the adjoint matrices are real...
  // Consider this overhead... FIXME
@ -49,11 +48,11 @@ class SU_Adjoint : public SU<ncolour> {
  typedef Lattice<vAMatrixD> LatticeAdjMatrixD;

  typedef Lattice<iVector<iScalar<iMatrix<vComplex, Dimension> >, Nd> >
-      LatticeAdjField;
+  LatticeAdjField;
  typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> >
-      LatticeAdjFieldF;
+  LatticeAdjFieldF;
  typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> >
-      LatticeAdjFieldD;
+  LatticeAdjFieldD;



@ -62,7 +61,7 @@ class SU_Adjoint : public SU<ncolour> {
  static void generator(int Index, iSUnAdjointMatrix<cplx> &iAdjTa) {
    // returns i(T_Adj)^index necessary for the projectors
    // see definitions above
-    iAdjTa = zero;
+    iAdjTa = Zero();
    Vector<typename SU<ncolour>::template iSUnMatrix<cplx> > ta(ncolour * ncolour - 1);
    typename SU<ncolour>::template iSUnMatrix<cplx> tmp;

@ -73,7 +72,7 @@ class SU_Adjoint : public SU<ncolour> {
      tmp = ta[a] * ta[Index] - ta[Index] * ta[a];
      for (int b = 0; b < (ncolour * ncolour - 1); b++) {
        typename SU<ncolour>::template iSUnMatrix<cplx> tmp1 =
-            2.0 * tmp * ta[b];  // 2.0 from the normalization
+	  2.0 * tmp * ta[b];  // 2.0 from the normalization
        Complex iTr = TensorRemove(timesI(trace(tmp1)));
        //iAdjTa()()(b, a) = iTr;
        iAdjTa()()(a, b) = iTr;
@ -112,14 +111,14 @@ class SU_Adjoint : public SU<ncolour> {
  }

  static void AdjointLieAlgebraMatrix(
-      const typename SU<ncolour>::LatticeAlgebraVector &h,
-      LatticeAdjMatrix &out, Real scale = 1.0) {
+				      const typename SU<ncolour>::LatticeAlgebraVector &h,
+				      LatticeAdjMatrix &out, Real scale = 1.0) {
    conformable(h, out);
-    GridBase *grid = out._grid;
+    GridBase *grid = out.Grid();
    LatticeAdjMatrix la(grid);
    AMatrix iTa;

-    out = zero;
+    out = Zero();
    for (int a = 0; a < Dimension; a++) {
      generator(a, iTa);
      la = peekColour(h, a) * iTa;
@ -131,7 +130,7 @@ class SU_Adjoint : public SU<ncolour> {
  // Projects the algebra components a lattice matrix (of dimension ncol*ncol -1 )
  static void projectOnAlgebra(typename SU<ncolour>::LatticeAlgebraVector &h_out, const LatticeAdjMatrix &in, Real scale = 1.0) {
    conformable(h_out, in);
-    h_out = zero;
+    h_out = Zero();
    AMatrix iTa;
    Real coefficient = - 1.0/(ncolour) * scale;// 1/Nc for the normalization of the trace in the adj rep

@ -146,15 +145,15 @@ class SU_Adjoint : public SU<ncolour> {
  static void projector(typename SU<ncolour>::LatticeAlgebraVector &h_out, const LatticeAdjMatrix &in, Real scale = 1.0) {
    conformable(h_out, in);
    static std::vector<AMatrix> iTa(Dimension);  // to store the generators
-    h_out = zero;
+    h_out = Zero();
    static bool precalculated = false; 
    if (!precalculated){
      precalculated = true;
-        for (int a = 0; a < Dimension; a++) generator(a, iTa[a]);
+      for (int a = 0; a < Dimension; a++) generator(a, iTa[a]);
    }

    Real coefficient = -1.0 / (ncolour) * scale;  // 1/Nc for the normalization of
-                                                // the trace in the adj rep
+    // the trace in the adj rep

    for (int a = 0; a < Dimension; a++) {
      auto tmp = real(trace(iTa[a] * in)) * coefficient; 
@ -176,7 +175,7 @@ typedef SU_Adjoint<4> SU4Adjoint;
 typedef SU_Adjoint<5> SU5Adjoint;

 typedef SU_Adjoint<Nc> AdjointMatrices;
-}
-}
+
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/utils/SUnTwoIndex.h
+++ b/Grid/qcd/utils/SUnTwoIndex.h
@ -26,8 +26,7 @@
 #define QCD_UTIL_SUN2INDEX_H


-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 enum TwoIndexSymmetry { Symmetric = 1, AntiSymmetric = -1 };

@ -35,7 +34,7 @@ inline Real delta(int a, int b) { return (a == b) ? 1.0 : 0.0; }

 template <int ncolour, TwoIndexSymmetry S>
 class SU_TwoIndex : public SU<ncolour> {
- public:
+public:
  static const int Dimension = ncolour * (ncolour + S) / 2;
  static const int NumGenerators = SU<ncolour>::AdjointDimension;

@ -55,11 +54,11 @@ class SU_TwoIndex : public SU<ncolour> {
  typedef Lattice<vTIMatrixD> LatticeTwoIndexMatrixD;

  typedef Lattice<iVector<iScalar<iMatrix<vComplex, Dimension> >, Nd> >
-      LatticeTwoIndexField;
+  LatticeTwoIndexField;
  typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> >
-      LatticeTwoIndexFieldF;
+  LatticeTwoIndexFieldF;
  typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> >
-      LatticeTwoIndexFieldD;
+  LatticeTwoIndexFieldD;

  template <typename vtype>
  using iSUnMatrix = iScalar<iScalar<iMatrix<vtype, ncolour> > >;
@ -72,7 +71,7 @@ class SU_TwoIndex : public SU<ncolour> {
  static void base(int Index, iSUnMatrix<cplx> &eij) {
    // returns (e)^(ij)_{kl} necessary for change of base U_F -> U_R
    assert(Index < NumGenerators);
-    eij = zero;
+    eij = Zero();

    // for the linearisation of the 2 indexes 
    static int a[ncolour * (ncolour - 1) / 2][2]; // store the a <-> i,j
@ -98,18 +97,18 @@ class SU_TwoIndex : public SU<ncolour> {

  template <class cplx>
  static void baseDiagonal(int Index, iSUnMatrix<cplx> &eij) {
-    eij = zero;
+    eij = Zero();
    eij()()(Index - ncolour * (ncolour - 1) / 2,
            Index - ncolour * (ncolour - 1) / 2) = 1.0;
  }

  template <class cplx>
  static void baseOffDiagonal(int i, int j, iSUnMatrix<cplx> &eij) {
-    eij = zero;
+    eij = Zero();
    for (int k = 0; k < ncolour; k++)
      for (int l = 0; l < ncolour; l++)
        eij()()(l, k) = delta(i, k) * delta(j, l) +
-                        S * delta(j, k) * delta(i, l);
+	  S * delta(j, k) * delta(i, l);

    RealD nrm = 1. / std::sqrt(2.0);
    eij = eij * nrm;
@ -128,10 +127,10 @@ class SU_TwoIndex : public SU<ncolour> {
  template <class cplx>
  static void generator(int Index, iSUnTwoIndexMatrix<cplx> &i2indTa) {
    Vector<typename SU<ncolour>::template iSUnMatrix<cplx> > ta(
-        ncolour * ncolour - 1);
+								ncolour * ncolour - 1);
    Vector<typename SU<ncolour>::template iSUnMatrix<cplx> > eij(Dimension);
    typename SU<ncolour>::template iSUnMatrix<cplx> tmp;
-    i2indTa = zero;
+    i2indTa = Zero();
    
    for (int a = 0; a < ncolour * ncolour - 1; a++)
      SU<ncolour>::generator(a, ta[a]);
@ -142,7 +141,7 @@ class SU_TwoIndex : public SU<ncolour> {
      tmp = transpose(ta[Index]) * adj(eij[a]) + adj(eij[a]) * ta[Index];
      for (int b = 0; b < Dimension; b++) {
        typename SU<ncolour>::template iSUnMatrix<cplx> tmp1 =
-            tmp * eij[b]; 
+	  tmp * eij[b]; 
        Complex iTr = TensorRemove(timesI(trace(tmp1)));
        i2indTa()()(a, b) = iTr;
      }
@ -197,14 +196,14 @@ class SU_TwoIndex : public SU<ncolour> {
  }

  static void TwoIndexLieAlgebraMatrix(
-      const typename SU<ncolour>::LatticeAlgebraVector &h,
-      LatticeTwoIndexMatrix &out, Real scale = 1.0) {
+				       const typename SU<ncolour>::LatticeAlgebraVector &h,
+				       LatticeTwoIndexMatrix &out, Real scale = 1.0) {
    conformable(h, out);
-    GridBase *grid = out._grid;
+    GridBase *grid = out.Grid();
    LatticeTwoIndexMatrix la(grid);
    TIMatrix i2indTa;

-    out = zero;
+    out = Zero();
    for (int a = 0; a < ncolour * ncolour - 1; a++) {
      generator(a, i2indTa);
      la = peekColour(h, a) * i2indTa;
@ -216,10 +215,10 @@ class SU_TwoIndex : public SU<ncolour> {
  // Projects the algebra components 
  // of a lattice matrix ( of dimension ncol*ncol -1 )
  static void projectOnAlgebra(
-      typename SU<ncolour>::LatticeAlgebraVector &h_out,
-      const LatticeTwoIndexMatrix &in, Real scale = 1.0) {
+			       typename SU<ncolour>::LatticeAlgebraVector &h_out,
+			       const LatticeTwoIndexMatrix &in, Real scale = 1.0) {
    conformable(h_out, in);
-    h_out = zero;
+    h_out = Zero();
    TIMatrix i2indTa;
    Real coefficient = -2.0 / (ncolour + 2 * S) * scale;
    // 2/(Nc +/- 2) for the normalization of the trace in the two index rep
@ -237,7 +236,7 @@ class SU_TwoIndex : public SU<ncolour> {
    conformable(h_out, in);
    // to store the generators
    static std::vector<TIMatrix> i2indTa(ncolour * ncolour -1); 
-    h_out = zero;
+    h_out = Zero();
    static bool precalculated = false;
    if (!precalculated) {
      precalculated = true;
@ -245,8 +244,8 @@ class SU_TwoIndex : public SU<ncolour> {
    }

    Real coefficient =
-        -2.0 / (ncolour + 2 * S) * scale;  // 2/(Nc +/- 2) for the normalization
-                                           // of the trace in the two index rep
+      -2.0 / (ncolour + 2 * S) * scale;  // 2/(Nc +/- 2) for the normalization
+    // of the trace in the two index rep

    for (int a = 0; a < ncolour * ncolour - 1; a++) {
      auto tmp = real(trace(i2indTa[a] * in)) * coefficient;
@ -269,8 +268,6 @@ typedef SU_TwoIndex<3, AntiSymmetric> SU3TwoIndexAntiSymm;
 typedef SU_TwoIndex<4, AntiSymmetric> SU4TwoIndexAntiSymm;
 typedef SU_TwoIndex<5, AntiSymmetric> SU5TwoIndexAntiSymm;

-
-}
-}
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/utils/ScalarObjs.h
+++ b/Grid/qcd/utils/ScalarObjs.h
@ -28,15 +28,13 @@ directory
 /*  END LEGAL */
 #ifndef SCALAR_OBJS_H
 #define SCALAR_OBJS_H
-namespace Grid {

-  // FIXME drop the QCD namespace in Nd
-  
+NAMESPACE_BEGIN(Grid);

 // Scalar field obs
 template <class Impl>
 class ScalarObs {
- public:
+public:
  //////////////////////////////////////////////////
  // squared field
  //////////////////////////////////////////////////
@ -61,7 +59,7 @@ class ScalarObs {
  static void phider(typename Impl::Field &fsq,
                     const typename Impl::Field &f) {
    fsq = Cshift(f, 0, -1) * f;
-    for (int mu = 1; mu < QCD::Nd; mu++) fsq += Cshift(f, mu, -1) * f;
+    for (int mu = 1; mu < Nd; mu++) fsq += Cshift(f, mu, -1) * f;
  }

  //////////////////////////////////////////////////
@ -69,28 +67,27 @@ class ScalarObs {
  //////////////////////////////////////////////////

  static RealD sumphider(const typename Impl::Field &f) {
-    typename Impl::Field tmp(f._grid);
+    typename Impl::Field tmp(f.Grid());
    tmp = Cshift(f, 0, -1) * f;
-    for (int mu = 1; mu < QCD::Nd; mu++) {
+    for (int mu = 1; mu < Nd; mu++) {
      tmp += Cshift(f, mu, -1) * f;
    }
    return -sum(trace(tmp));
  }

  static RealD sumphisquared(const typename Impl::Field &f) {
-    typename Impl::Field tmp(f._grid);
+    typename Impl::Field tmp(f.Grid());
    tmp = f * f;
    return sum(trace(tmp));
  }

  static RealD sumphifourth(const typename Impl::Field &f) {
-    typename Impl::Field tmp(f._grid);
+    typename Impl::Field tmp(f.Grid());
    phifourth(tmp, f);
    return sum(trace(tmp));
  }
 };

-
-}
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/utils/SpaceTimeGrid.cc
+++ b/Grid/qcd/utils/SpaceTimeGrid.cc
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -23,18 +23,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <Grid/GridQCDcore.h>

-namespace Grid { 
-  namespace QCD {
+NAMESPACE_BEGIN(Grid); 

 /////////////////////////////////////////////////////////////////
 // Public interface
 /////////////////////////////////////////////////////////////////
-GridCartesian *SpaceTimeGrid::makeFourDimGrid(const std::vector<int> & latt,const std::vector<int> &simd,const std::vector<int> &mpi)
+GridCartesian *SpaceTimeGrid::makeFourDimGrid(const Coordinate & latt,const Coordinate &simd,const Coordinate &mpi)
 {
  return new GridCartesian(latt,simd,mpi); 
 }
@ -42,23 +41,23 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFourDimRedBlackGrid(const GridCartesia
 {
  return new GridRedBlackCartesian(FourDimGrid); 
 }
-GridCartesian *SpaceTimeGrid::makeFourDimDWFGrid(const std::vector<int> & latt,const std::vector<int> &mpi)
+GridCartesian *SpaceTimeGrid::makeFourDimDWFGrid(const Coordinate & latt,const Coordinate &mpi)
 {
-  std::vector<int> simd(4,1);
+  Coordinate simd(4,1);
  return makeFourDimGrid(latt,simd,mpi);
 }
 GridCartesian         *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian *FourDimGrid)
 {
  int N4=FourDimGrid->_ndimension;

-  std::vector<int> latt5(1,Ls);
-  std::vector<int> simd5(1,1);
-  std::vector<int>  mpi5(1,1);
+  Coordinate latt5(1,Ls);
+  Coordinate simd5(1,1);
+  Coordinate  mpi5(1,1);
  
  for(int d=0;d<N4;d++){
    latt5.push_back(FourDimGrid->_fdimensions[d]);
    simd5.push_back(FourDimGrid->_simd_layout[d]);
-     mpi5.push_back(FourDimGrid->_processors[d]);
+    mpi5.push_back(FourDimGrid->_processors[d]);
  }
  return new GridCartesian(latt5,simd5,mpi5,*FourDimGrid); 
 }
@ -68,9 +67,9 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC
 {
  int N4=FourDimGrid->_ndimension;
  int cbd=1;
-  std::vector<int>   cb5(1,0);
+  Coordinate   cb5(1,0);
  for(int d=0;d<N4;d++){
-      cb5.push_back(  1);
+    cb5.push_back(  1);
  }
  GridCartesian *tmp = makeFiveDimGrid(Ls,FourDimGrid);
  GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,cb5,cbd); 
@ -84,14 +83,14 @@ GridCartesian         *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartes
  int N4    = FourDimGrid->_ndimension;
  int nsimd = FourDimGrid->Nsimd();

-  std::vector<int> latt5(1,Ls);
-  std::vector<int> simd5(1,nsimd);
-  std::vector<int>  mpi5(1,1);
+  Coordinate latt5(1,Ls);
+  Coordinate simd5(1,nsimd);
+  Coordinate  mpi5(1,1);
  
  for(int d=0;d<N4;d++){
    latt5.push_back(FourDimGrid->_fdimensions[d]);
    simd5.push_back(1);
-     mpi5.push_back(FourDimGrid->_processors[d]);
+    mpi5.push_back(FourDimGrid->_processors[d]);
  }
  return new GridCartesian(latt5,simd5,mpi5,*FourDimGrid); 
 }
@ -103,9 +102,9 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const Gr
 {
  int N4=FourDimGrid->_ndimension;
  int cbd=1;
-  std::vector<int>   cb5(1,0);
+  Coordinate   cb5(1,0);
  for(int d=0;d<N4;d++){
-      cb5.push_back(1);
+    cb5.push_back(1);
  }
  GridCartesian *tmp         = makeFiveDimDWFGrid(Ls,FourDimGrid);
  GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,cb5,cbd); 
@ -113,5 +112,4 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const Gr
  return ret;
 }

-
-}}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/utils/SpaceTimeGrid.h
+++ b/Grid/qcd/utils/SpaceTimeGrid.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -23,17 +23,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef GRID_QCD_SPACE_TIME_GRID_H
 #define GRID_QCD_SPACE_TIME_GRID_H
-namespace Grid {
-namespace QCD {
+
+NAMESPACE_BEGIN(Grid);

 class SpaceTimeGrid {
- public:
+public:

-  static GridCartesian         *makeFourDimGrid(const std::vector<int> & latt,const std::vector<int> &simd,const std::vector<int> &mpi);
+  static GridCartesian         *makeFourDimGrid(const Coordinate & latt,const Coordinate &simd,const Coordinate &mpi);
  static GridRedBlackCartesian *makeFourDimRedBlackGrid       (const GridCartesian *FourDimGrid);

  static GridCartesian         *makeFiveDimGrid        (int Ls,const GridCartesian *FourDimGrid);
@ -41,10 +41,10 @@ class SpaceTimeGrid {

  static GridCartesian         *makeFiveDimDWFGrid        (int Ls,const GridCartesian *FourDimGrid);
  static GridRedBlackCartesian *makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
-  static GridCartesian         *makeFourDimDWFGrid        (const std::vector<int> & latt,const std::vector<int> &mpi);
+  static GridCartesian         *makeFourDimDWFGrid        (const Coordinate & latt,const Coordinate &mpi);

 };

-}}
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@ -33,8 +33,8 @@ directory
 /*  END LEGAL */
 #ifndef QCD_UTILS_WILSON_LOOPS_H
 #define QCD_UTILS_WILSON_LOOPS_H
-namespace Grid {
-namespace QCD {
+
+NAMESPACE_BEGIN(Grid);

 // Common wilson loop observables
 template <class Gimpl> class WilsonLoops : public Gimpl {
@ -57,16 +57,16 @@ public:
    // purpose of deriving
    // from Gimpl.
    /*
-    plaq = Gimpl::CovShiftBackward(
-        U[mu], mu, Gimpl::CovShiftBackward(
-                       U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[nu])));
-                       */
+      plaq = Gimpl::CovShiftBackward(
+      U[mu], mu, Gimpl::CovShiftBackward(
+      U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[nu])));
+    */
    // _
    //|< _|
    plaq = Gimpl::CovShiftForward(U[mu],mu,
-           Gimpl::CovShiftForward(U[nu],nu,
-           Gimpl::CovShiftBackward(U[mu],mu,
-           Gimpl::CovShiftIdentityBackward(U[nu], nu))));
+				  Gimpl::CovShiftForward(U[nu],nu,
+							 Gimpl::CovShiftBackward(U[mu],mu,
+										 Gimpl::CovShiftIdentityBackward(U[nu], nu))));



@ -78,7 +78,7 @@ public:
  static void traceDirPlaquette(ComplexField &plaq,
                                const std::vector<GaugeMat> &U, const int mu,
                                const int nu) {
-    GaugeMat sp(U[0]._grid);
+    GaugeMat sp(U[0].Grid());
    dirPlaquette(sp, U, mu, nu);
    plaq = trace(sp);
  }
@ -87,8 +87,8 @@ public:
  //////////////////////////////////////////////////
  static void sitePlaquette(ComplexField &Plaq,
                            const std::vector<GaugeMat> &U) {
-    ComplexField sitePlaq(U[0]._grid);
-    Plaq = zero;
+    ComplexField sitePlaq(U[0].Grid());
+    Plaq = Zero();
    for (int mu = 1; mu < Nd; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceDirPlaquette(sitePlaq, U, mu, nu);
@ -100,13 +100,13 @@ public:
  // sum over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
  static RealD sumPlaquette(const GaugeLorentz &Umu) {
-    std::vector<GaugeMat> U(Nd, Umu._grid);
+    std::vector<GaugeMat> U(Nd, Umu.Grid());
    // inefficient here
    for (int mu = 0; mu < Nd; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

-    ComplexField Plaq(Umu._grid);
+    ComplexField Plaq(Umu.Grid());

    sitePlaquette(Plaq, U);
    auto Tp = sum(Plaq);
@ -120,7 +120,7 @@ public:
  //////////////////////////////////////////////////
  static RealD avgPlaquette(const GaugeLorentz &Umu) {
    RealD sumplaq = sumPlaquette(Umu);
-    double vol = Umu._grid->gSites();
+    double vol = Umu.Grid()->gSites();
    double faces = (1.0 * Nd * (Nd - 1)) / 2.0;
    return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
  }
@ -130,12 +130,12 @@ public:
  // average over all x,y,z the temporal loop
  //////////////////////////////////////////////////
  static ComplexD avgPolyakovLoop(const GaugeField &Umu) {  //assume Nd=4
-    GaugeMat Ut(Umu._grid), P(Umu._grid);
+    GaugeMat Ut(Umu.Grid()), P(Umu.Grid());
    ComplexD out;
-    int T = Umu._grid->GlobalDimensions()[3];
-    int X = Umu._grid->GlobalDimensions()[0];
-    int Y = Umu._grid->GlobalDimensions()[1];
-    int Z = Umu._grid->GlobalDimensions()[2];
+    int T = Umu.Grid()->GlobalDimensions()[3];
+    int X = Umu.Grid()->GlobalDimensions()[0];
+    int Y = Umu.Grid()->GlobalDimensions()[1];
+    int Z = Umu.Grid()->GlobalDimensions()[2];

    Ut = peekLorentz(Umu,3); //Select temporal direction
    P = Ut;
@ -151,10 +151,10 @@ public:
  // average over traced single links
  //////////////////////////////////////////////////
  static RealD linkTrace(const GaugeLorentz &Umu) {
-    std::vector<GaugeMat> U(Nd, Umu._grid);
+    std::vector<GaugeMat> U(Nd, Umu.Grid());

-    ComplexField Tr(Umu._grid);
-    Tr = zero;
+    ComplexField Tr(Umu.Grid());
+    Tr = Zero();
    for (int mu = 0; mu < Nd; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
      Tr = Tr + trace(U[mu]);
@ -163,7 +163,7 @@ public:
    auto Tp = sum(Tr);
    auto p = TensorRemove(Tp);

-    double vol = Umu._grid->gSites();
+    double vol = Umu.Grid()->gSites();

    return p.real() / vol / 4.0 / 3.0;
  };
@ -174,13 +174,13 @@ public:
  static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
                     int nu) {

-    GridBase *grid = Umu._grid;
+    GridBase *grid = Umu.Grid();

    std::vector<GaugeMat> U(Nd, grid);
    for (int d = 0; d < Nd; d++) {
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
-    staple = zero;
+    staple = Zero();

    if (nu != mu) {

@ -194,11 +194,11 @@ public:
      //

      staple += Gimpl::ShiftStaple(
-          Gimpl::CovShiftForward(
-              U[nu], nu,
-              Gimpl::CovShiftBackward(
-                  U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
-          mu);
+				   Gimpl::CovShiftForward(
+							  U[nu], nu,
+							  Gimpl::CovShiftBackward(
+										  U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
+				   mu);

      //  __
      // |
@ -206,23 +206,23 @@ public:
      //
      //
      staple += Gimpl::ShiftStaple(
-          Gimpl::CovShiftBackward(U[nu], nu,
-                                  Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
-          mu);
+				   Gimpl::CovShiftBackward(U[nu], nu,
+							   Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
+				   mu);
    }
  }


-// For the force term
+  // For the force term
 /*
-static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
-    GridBase *grid = Umu._grid;
+  static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
+    GridBase *grid = Umu.Grid();
    std::vector<GaugeMat> U(Nd, grid);
    for (int d = 0; d < Nd; d++) {
      // this operation is taking too much time
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
-    staple = zero;
+    staple = Zero();
    GaugeMat tmp1(grid);
    GaugeMat tmp2(grid);

@ -237,20 +237,20 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
      }
    }
    staple = U[mu]*staple;
-}
+  }
 */
  //////////////////////////////////////////////////
  // the sum over all staples on each site
  //////////////////////////////////////////////////
  static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {

-    GridBase *grid = Umu._grid;
+    GridBase *grid = Umu.Grid();

    std::vector<GaugeMat> U(Nd, grid);
    for (int d = 0; d < Nd; d++) {
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
-    staple = zero;
+    staple = Zero();

    for (int nu = 0; nu < Nd; nu++) {

@ -266,11 +266,11 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
        //
     
        staple += Gimpl::ShiftStaple(
-            Gimpl::CovShiftForward(
-                U[nu], nu,
-                Gimpl::CovShiftBackward(
-                    U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
-            mu);
+				     Gimpl::CovShiftForward(
+							    U[nu], nu,
+							    Gimpl::CovShiftBackward(
+										    U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
+				     mu);

        //  __
        // |
@ -279,8 +279,8 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
        //

        staple += Gimpl::ShiftStaple(
-            Gimpl::CovShiftBackward(U[nu], nu,
-                                    Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+				     Gimpl::CovShiftBackward(U[nu], nu,
+							     Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
      }
    }
  }
@ -291,7 +291,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  static void StapleUpper(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
                          int nu) {
    if (nu != mu) {
-      GridBase *grid = Umu._grid;
+      GridBase *grid = Umu.Grid();

      std::vector<GaugeMat> U(Nd, grid);
      for (int d = 0; d < Nd; d++) {
@ -308,11 +308,11 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
      //

      staple = Gimpl::ShiftStaple(
-          Gimpl::CovShiftForward(
-              U[nu], nu,
-              Gimpl::CovShiftBackward(
-                  U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
-          mu);
+				  Gimpl::CovShiftForward(
+							 U[nu], nu,
+							 Gimpl::CovShiftBackward(
+										 U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
+				  mu);
    }
  }

@ -322,7 +322,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  static void StapleLower(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
                          int nu) {
    if (nu != mu) {
-      GridBase *grid = Umu._grid;
+      GridBase *grid = Umu.Grid();

      std::vector<GaugeMat> U(Nd, grid);
      for (int d = 0; d < Nd; d++) {
@ -339,7 +339,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
      //
      //
      staple = Gimpl::ShiftStaple(
-          Gimpl::CovShiftBackward(U[nu], nu,
+				  Gimpl::CovShiftBackward(U[nu], nu,
                                  Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
          mu);

@ -350,18 +350,18 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  //  Field Strength
  //////////////////////////////////////////////////////
  static void FieldStrength(GaugeMat &FS, const GaugeLorentz &Umu, int mu, int nu){
-      // Fmn +--<--+  Ut +--<--+
-      //     |     |     |     |
+    // Fmn +--<--+  Ut +--<--+
+    //     |     |     |     |
      //  (x)+-->--+     +-->--+(x)  - h.c.
-      //     |     |     |     |
-      //     +--<--+     +--<--+
+    //     |     |     |     |
+    //     +--<--+     +--<--+

-      GaugeMat Vup(Umu._grid), Vdn(Umu._grid);
-      StapleUpper(Vup, Umu, mu, nu);
-      StapleLower(Vdn, Umu, mu, nu);
-      GaugeMat v = Vup - Vdn;
-      GaugeMat u = PeekIndex<LorentzIndex>(Umu, mu);  // some redundant copies
-      GaugeMat vu = v*u;
+    GaugeMat Vup(Umu.Grid()), Vdn(Umu.Grid());
+    StapleUpper(Vup, Umu, mu, nu);
+    StapleLower(Vdn, Umu, mu, nu);
+    GaugeMat v = Vup - Vdn;
+    GaugeMat u = PeekIndex<LorentzIndex>(Umu, mu);  // some redundant copies
+    GaugeMat vu = v*u;
      //FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
      FS = (u*v + Cshift(vu, mu, -1));
      FS = 0.125*(FS - adj(FS));
@ -371,13 +371,13 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
    // 4d topological charge
    assert(Nd==4);
    // Bx = -iF(y,z), By = -iF(z,y), Bz = -iF(x,y)
-    GaugeMat Bx(U._grid), By(U._grid), Bz(U._grid);
+    GaugeMat Bx(U.Grid()), By(U.Grid()), Bz(U.Grid());
    FieldStrength(Bx, U, Ydir, Zdir);
    FieldStrength(By, U, Zdir, Xdir);
    FieldStrength(Bz, U, Xdir, Ydir);

    // Ex = -iF(t,x), Ey = -iF(t,y), Ez = -iF(t,z)
-    GaugeMat Ex(U._grid), Ey(U._grid), Ez(U._grid);
+    GaugeMat Ex(U.Grid()), Ey(U.Grid()), Ez(U.Grid());
    FieldStrength(Ex, U, Tdir, Xdir);
    FieldStrength(Ey, U, Tdir, Ydir);
    FieldStrength(Ez, U, Tdir, Zdir);
@ -396,26 +396,26 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  static void dirRectangle(GaugeMat &rect, const std::vector<GaugeMat> &U,
                           const int mu, const int nu) {
    rect = Gimpl::CovShiftForward(
-               U[mu], mu, Gimpl::CovShiftForward(U[mu], mu, U[nu])) * // ->->|
-           adj(Gimpl::CovShiftForward(
-               U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[mu])));
+				  U[mu], mu, Gimpl::CovShiftForward(U[mu], mu, U[nu])) * // ->->|
+      adj(Gimpl::CovShiftForward(
+				 U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[mu])));
    rect = rect +
-           Gimpl::CovShiftForward(
-               U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])) * // ->||
-               adj(Gimpl::CovShiftForward(
-                   U[nu], nu, Gimpl::CovShiftForward(U[nu], nu, U[mu])));
+      Gimpl::CovShiftForward(
+			     U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])) * // ->||
+      adj(Gimpl::CovShiftForward(
+				 U[nu], nu, Gimpl::CovShiftForward(U[nu], nu, U[mu])));
  }
  static void traceDirRectangle(ComplexField &rect,
                                const std::vector<GaugeMat> &U, const int mu,
                                const int nu) {
-    GaugeMat sp(U[0]._grid);
+    GaugeMat sp(U[0].Grid());
    dirRectangle(sp, U, mu, nu);
    rect = trace(sp);
  }
  static void siteRectangle(ComplexField &Rect,
                            const std::vector<GaugeMat> &U) {
-    ComplexField siteRect(U[0]._grid);
-    Rect = zero;
+    ComplexField siteRect(U[0].Grid());
+    Rect = Zero();
    for (int mu = 1; mu < Nd; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceDirRectangle(siteRect, U, mu, nu);
@ -428,13 +428,13 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  // sum over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
  static RealD sumRectangle(const GaugeLorentz &Umu) {
-    std::vector<GaugeMat> U(Nd, Umu._grid);
+    std::vector<GaugeMat> U(Nd, Umu.Grid());

    for (int mu = 0; mu < Nd; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

-    ComplexField Rect(Umu._grid);
+    ComplexField Rect(Umu.Grid());

    siteRectangle(Rect, U);

@ -449,7 +449,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {

    RealD sumrect = sumRectangle(Umu);

-    double vol = Umu._grid->gSites();
+    double vol = Umu.Grid()->gSites();

    double faces = (1.0 * Nd * (Nd - 1)); // 2 distinct orientations summed

@ -473,9 +473,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  static void RectStapleOptimised(GaugeMat &Stap, std::vector<GaugeMat> &U2,
                                  std::vector<GaugeMat> &U, int mu) {

-    Stap = zero;
+    Stap = Zero();

-    GridBase *grid = U[0]._grid;
+    GridBase *grid = U[0].Grid();

    GaugeMat Staple2x1(grid);
    GaugeMat tmp(grid);
@ -552,14 +552,14 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {

  static void RectStapleUnoptimised(GaugeMat &Stap, const GaugeLorentz &Umu,
                                    int mu) {
-    GridBase *grid = Umu._grid;
+    GridBase *grid = Umu.Grid();

    std::vector<GaugeMat> U(Nd, grid);
    for (int d = 0; d < Nd; d++) {
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }

-    Stap = zero;
+    Stap = Zero();

    for (int nu = 0; nu < Nd; nu++) {
      if (nu != mu) {
@ -567,52 +567,52 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
        //          |    __ |
        //
        Stap += Gimpl::ShiftStaple(
-            Gimpl::CovShiftForward(
-                U[mu], mu,
-                Gimpl::CovShiftForward(
-                    U[nu], nu,
-                    Gimpl::CovShiftBackward(
-                        U[mu], mu,
-                        Gimpl::CovShiftBackward(
-                            U[mu], mu,
-                            Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
-            mu);
+				   Gimpl::CovShiftForward(
+							  U[mu], mu,
+							  Gimpl::CovShiftForward(
+										 U[nu], nu,
+										 Gimpl::CovShiftBackward(
+													 U[mu], mu,
+													 Gimpl::CovShiftBackward(
+																 U[mu], mu,
+																 Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
+				   mu);

        //              __
        //          |__ __ |

        Stap += Gimpl::ShiftStaple(
-            Gimpl::CovShiftForward(
-                U[mu], mu,
-                Gimpl::CovShiftBackward(
-                    U[nu], nu,
-                    Gimpl::CovShiftBackward(
-                        U[mu], mu, Gimpl::CovShiftBackward(U[mu], mu, U[nu])))),
-            mu);
+				   Gimpl::CovShiftForward(
+							  U[mu], mu,
+							  Gimpl::CovShiftBackward(
+										  U[nu], nu,
+										  Gimpl::CovShiftBackward(
+													  U[mu], mu, Gimpl::CovShiftBackward(U[mu], mu, U[nu])))),
+				   mu);

        //           __
        //          |__ __ |

        Stap += Gimpl::ShiftStaple(
-            Gimpl::CovShiftBackward(
-                U[nu], nu,
-                Gimpl::CovShiftBackward(
-                    U[mu], mu,
-                    Gimpl::CovShiftBackward(
-                        U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[mu])))),
-            mu);
+				   Gimpl::CovShiftBackward(
+							   U[nu], nu,
+							   Gimpl::CovShiftBackward(
+										   U[mu], mu,
+										   Gimpl::CovShiftBackward(
+													   U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[mu])))),
+				   mu);

        //           __ ___
        //          |__    |

        Stap += Gimpl::ShiftStaple(
-            Gimpl::CovShiftForward(
-                U[nu], nu,
-                Gimpl::CovShiftBackward(
-                    U[mu], mu,
-                    Gimpl::CovShiftBackward(
-                        U[mu], mu, Gimpl::CovShiftBackward(U[nu], nu, U[mu])))),
-            mu);
+				   Gimpl::CovShiftForward(
+							  U[nu], nu,
+							  Gimpl::CovShiftBackward(
+										  U[mu], mu,
+										  Gimpl::CovShiftBackward(
+													  U[mu], mu, Gimpl::CovShiftBackward(U[nu], nu, U[mu])))),
+				   mu);

        //       --
        //      |  |
@ -620,16 +620,16 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
        //      |  |

        Stap += Gimpl::ShiftStaple(
-            Gimpl::CovShiftForward(
-                U[nu], nu,
-                Gimpl::CovShiftForward(
-                    U[nu], nu,
-                    Gimpl::CovShiftBackward(
-                        U[mu], mu,
-                        Gimpl::CovShiftBackward(
-                            U[nu], nu,
-                            Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
-            mu);
+				   Gimpl::CovShiftForward(
+							  U[nu], nu,
+							  Gimpl::CovShiftForward(
+										 U[nu], nu,
+										 Gimpl::CovShiftBackward(
+													 U[mu], mu,
+													 Gimpl::CovShiftBackward(
+																 U[nu], nu,
+																 Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
+				   mu);

        //      |  |
        //
@ -637,13 +637,13 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
        //       --

        Stap += Gimpl::ShiftStaple(
-            Gimpl::CovShiftBackward(
-                U[nu], nu,
-                Gimpl::CovShiftBackward(
-                    U[nu], nu,
-                    Gimpl::CovShiftBackward(
-                        U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])))),
-            mu);
+				   Gimpl::CovShiftBackward(
+							   U[nu], nu,
+							   Gimpl::CovShiftBackward(
+										   U[nu], nu,
+										   Gimpl::CovShiftBackward(
+													   U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])))),
+				   mu);
      }
    }
  }
@ -679,7 +679,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
                                const std::vector<GaugeMat> &U,
                                const int Rmu, const int Rnu,
                                const int mu, const int nu) {
-    GaugeMat sp(U[0]._grid);
+    GaugeMat sp(U[0].Grid());
    wilsonLoop(sp, U, Rmu, Rnu, mu, nu);
    wl = trace(sp);
  }
@ -689,9 +689,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  static void siteWilsonLoop(LatticeComplex &Wl,
                            const std::vector<GaugeMat> &U,
                            const int R1, const int R2) {
-    LatticeComplex siteWl(U[0]._grid);
-    Wl = zero;
-    for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
+    LatticeComplex siteWl(U[0].Grid());
+    Wl = Zero();
+    for (int mu = 1; mu < U[0].Grid()->_ndimension; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
        Wl = Wl + siteWl;
@ -707,11 +707,11 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  static void siteTimelikeWilsonLoop(LatticeComplex &Wl,
                            const std::vector<GaugeMat> &U,
                            const int R1, const int R2) {
-    LatticeComplex siteWl(U[0]._grid);
+    LatticeComplex siteWl(U[0].Grid());

-    int ndim = U[0]._grid->_ndimension;
+    int ndim = U[0].Grid()->_ndimension;

-    Wl = zero;
+    Wl = Zero();
    for (int nu = 0; nu < ndim - 1; nu++) {
      traceWilsonLoop(siteWl, U, R1, R2, ndim-1, nu);
      Wl = Wl + siteWl;
@ -723,10 +723,10 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  static void siteSpatialWilsonLoop(LatticeComplex &Wl,
                            const std::vector<GaugeMat> &U,
                            const int R1, const int R2) {
-    LatticeComplex siteWl(U[0]._grid);
+    LatticeComplex siteWl(U[0].Grid());

-    Wl = zero;
-    for (int mu = 1; mu < U[0]._grid->_ndimension - 1; mu++) {
+    Wl = Zero();
+    for (int mu = 1; mu < U[0].Grid()->_ndimension - 1; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
        Wl = Wl + siteWl;
@ -740,13 +740,13 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  //////////////////////////////////////////////////
  static Real sumWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
-    std::vector<GaugeMat> U(4, Umu._grid);
+    std::vector<GaugeMat> U(4, Umu.Grid());

-    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
+    for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

-    LatticeComplex Wl(Umu._grid);
+    LatticeComplex Wl(Umu.Grid());

    siteWilsonLoop(Wl, U, R1, R2);

@ -759,13 +759,13 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  //////////////////////////////////////////////////
  static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
-    std::vector<GaugeMat> U(4, Umu._grid);
+    std::vector<GaugeMat> U(4, Umu.Grid());

-    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
+    for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

-    LatticeComplex Wl(Umu._grid);
+    LatticeComplex Wl(Umu.Grid());

    siteTimelikeWilsonLoop(Wl, U, R1, R2);

@ -778,13 +778,13 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  //////////////////////////////////////////////////
  static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
-    std::vector<GaugeMat> U(4, Umu._grid);
+    std::vector<GaugeMat> U(4, Umu.Grid());

-    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
+    for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

-    LatticeComplex Wl(Umu._grid);
+    LatticeComplex Wl(Umu.Grid());

    siteSpatialWilsonLoop(Wl, U, R1, R2);

@ -797,9 +797,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  //////////////////////////////////////////////////
  static Real avgWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
-    int ndim = Umu._grid->_ndimension;
+    int ndim = Umu.Grid()->_ndimension;
    Real sumWl = sumWilsonLoop(Umu, R1, R2);
-    Real vol = Umu._grid->gSites();
+    Real vol = Umu.Grid()->gSites();
    Real faces = 1.0 * ndim * (ndim - 1);
    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
  }
@ -808,9 +808,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  //////////////////////////////////////////////////
  static Real avgTimelikeWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
-    int ndim = Umu._grid->_ndimension;
+    int ndim = Umu.Grid()->_ndimension;
    Real sumWl = sumTimelikeWilsonLoop(Umu, R1, R2);
-    Real vol = Umu._grid->gSites();
+    Real vol = Umu.Grid()->gSites();
    Real faces = 1.0 * (ndim - 1);
    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
  }
@ -819,9 +819,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  //////////////////////////////////////////////////
  static Real avgSpatialWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
-    int ndim = Umu._grid->_ndimension;
+    int ndim = Umu.Grid()->_ndimension;
    Real sumWl = sumSpatialWilsonLoop(Umu, R1, R2);
-    Real vol = Umu._grid->gSites();
+    Real vol = Umu.Grid()->gSites();
    Real faces = 1.0 * (ndim - 1) * (ndim - 2);
    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
  }
@ -831,7 +831,7 @@ typedef WilsonLoops<PeriodicGimplR> ColourWilsonLoops;
 typedef WilsonLoops<PeriodicGimplR> U1WilsonLoops;
 typedef WilsonLoops<PeriodicGimplR> SU2WilsonLoops;
 typedef WilsonLoops<PeriodicGimplR> SU3WilsonLoops;
-}
-}
+
+NAMESPACE_END(Grid);

 #endif