Merge GPU support (upstream/develop) into distillation branch.

This compiles and looks right ... but may need some testing * develop: (762 commits) Tensor ambiguous fix Fix for GCC preprocessor/pragma handling bug Trips up NVCC for reasons I don't understand on Summit Fix GCC complaint Zero() change Force a couple of things to compile on NVCC Remove debug code nvcc error suppress Merge develop Reduction finished and hopefully fixes CI regression fail on single precision and force Double precision variants for summation accuracy Update todo list Freeze the seed Fix compiling of MSource::Gauss for single precision Think the reduction is now sorted and cleaned up Fix force term Printing improvement GPU reduction fix and also exit backtrace option GPU friendly Simplify the comms benchmark ... # Conflicts: # Grid/communicator/SharedMemoryMPI.cc # Grid/qcd/action/fermion/WilsonKernelsAsm.cc # Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h # Grid/qcd/smearing/StoutSmearing.h # Hadrons/Modules.hpp # Hadrons/Utilities/Contractor.cc # Hadrons/modules.inc # tests/forces/Test_dwf_force_eofa.cc # tests/forces/Test_dwf_gpforce_eofa.cc
2025-06-19 16:27:05 +01:00 · 2019-09-13 13:30:00 +01:00
parent 04a661cafe b473405652
commit 61d017d0a5
796 changed files with 41536 additions and 52391 deletions
--- a/Grid/qcd/utils/A2Autils.h
+++ b/Grid/qcd/utils/A2Autils.h
@ -1,9 +1,8 @@
 #pragma once
 //#include <Grid/Hadrons/Global.hpp>
-#include <Grid/Eigen/unsupported/CXX11/Tensor>
+#include <Grid/Grid_Eigen_Tensor.h>

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 #undef DELTA_F_EQ_2

@ -134,7 +133,7 @@ void A2Autils<FImpl>::NucleonFieldMom(Eigen::Tensor<ComplexD,6> &mat,
  int twoBlock = mat.dimension(3);
  int threeBlock = mat.dimension(4);

-  GridBase *grid = one[0]._grid;
+  GridBase *grid = one[0].Grid();

  const int    nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
@ -153,22 +152,21 @@ void A2Autils<FImpl>::NucleonFieldMom(Eigen::Tensor<ComplexD,6> &mat,
  int MFlvol = ld*oneBlock*twoBlock*threeBlock*Nmom;

  Vector<SpinVector_v > lvSum(MFrvol);
-  parallel_for (int r = 0; r < MFrvol; r++){
-    lvSum[r] = zero;
-  }
+  accelerator_for (r, MFrvol, Nsimd, {
+    lvSum[r] = 0;
+  } );

  Vector<SpinVector_s > lsSum(MFlvol);             
-  parallel_for (int r = 0; r < MFlvol; r++){
-    lsSum[r]=scalar_type(0.0);
-  }
+  accelerator_for (r, MFlvol, Nsimd, {
+    lsSum[r] = scalar_type(0.0);
+  } );

  int e1=    grid->_slice_nblock[orthogdim];
  int e2=    grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];

-  parallel_for(int r=0;r<rd;r++){
-
-    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
+  accelerator_for(r, rd, Nsimd, {
+    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane

    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@ -196,7 +194,7 @@ void A2Autils<FImpl>::NucleonFieldMom(Eigen::Tensor<ComplexD,6> &mat,
 	      SpinVector_v vv;

              for(int s1=0;s1<Ns;s1++){
-              vv()(s1)() = zero;
+              vv()(s1)() = 0;
 	      for(int s2=0;s2<Ns;s2++){            
 	     /*   vv()(s1)() =  pv1()(s1)(0) * v2g()(s2)(1) * v3()(s2)(2)   //Cross product
                  -           pv1()(s1)(0) * v2g()(s2)(2) * v3()(s2)(1)    
@ -233,13 +231,13 @@ void A2Autils<FImpl>::NucleonFieldMom(Eigen::Tensor<ComplexD,6> &mat,
 	}
      }
    }
-  }
+  } );


  // Sum across simd lanes in the plane, breaking out orthog dir.
-  parallel_for(int rt=0;rt<rd;rt++){
+  accelerator_for(rt, rd, Nsimd, {

-    std::vector<int> icoor(nd);
+    Coordinate icoor(nd);
    std::vector<SpinVector_s>  extracted(Nsimd);               

    for(int i=0;i<oneBlock;i++){
@ -263,14 +261,14 @@ void A2Autils<FImpl>::NucleonFieldMom(Eigen::Tensor<ComplexD,6> &mat,

      }
    }}}}
-  }
+  } );

  assert(mat.dimension(0) == Nmom);
  assert(mat.dimension(1) == Nt);

  int pd = grid->_processors[orthogdim];
  int pc = grid->_processor_coor[orthogdim];
-  parallel_for_nest2(int lt=0;lt<ld;lt++)
+  accelerator_for(lt, ld, Nsimd,
  {
    for(int pt=0;pt<pd;pt++){
      int t = lt + pt*ld;
@ -302,7 +300,7 @@ void A2Autils<FImpl>::NucleonFieldMom(Eigen::Tensor<ComplexD,6> &mat,
 	}
      }
    }
-  }
+  } );

  grid->GlobalSumVector(&mat(0,0,0,0,0,0),Nmom*Nt*oneBlock*twoBlock*threeBlock*4);
 }
@ -338,7 +336,7 @@ void A2Autils<FImpl>::BaryonField(TensorType &mat,
  int twoBlock = mat.dimension(4);
  int threeBlock = mat.dimension(5);

-  GridBase *grid = one[0]._grid;
+  GridBase *grid = one[0].Grid();
  
  const int    Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
@ -530,7 +528,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
  int Lblock = mat.dimension(3); 
  int Rblock = mat.dimension(4);

-  GridBase *grid = lhs_wi[0]._grid;
+  GridBase *grid = lhs_wi[0].Grid();
  
  const int    Nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
@ -550,14 +548,14 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
  int MFlvol = ld*Lblock*Rblock*Nmom;

  Vector<SpinMatrix_v > lvSum(MFrvol);
-  parallel_for (int r = 0; r < MFrvol; r++){
-    lvSum[r] = zero;
-  }
+  thread_for( r, MFrvol,{
+    lvSum[r] = Zero();
+  });

  Vector<SpinMatrix_s > lsSum(MFlvol);             
-  parallel_for (int r = 0; r < MFlvol; r++){
+  thread_for(r,MFlvol,{
    lsSum[r]=scalar_type(0.0);
-  }
+  });

  int e1=    grid->_slice_nblock[orthogdim];
  int e2=    grid->_slice_block [orthogdim];
@ -565,7 +563,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,

  // potentially wasting cores here if local time extent too small
  if (t_kernel) *t_kernel = -usecond();
-  parallel_for(int r=0;r<rd;r++){
+  thread_for(r,rd,{

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 

@ -576,12 +574,14 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,

 	for(int i=0;i<Lblock;i++){

-	  auto left = conjugate(lhs_wi[i]._odata[ss]);
+	  auto lhs_v = lhs_wi[i].View();
+	  auto left = conjugate(lhs_v[ss]);

 	  for(int j=0;j<Rblock;j++){

 	    SpinMatrix_v vv;
-	    auto right = rhs_vj[j]._odata[ss];
+	    auto rhs_v = rhs_vj[j].View();
+	    auto right = rhs_v[ss];
 	    for(int s1=0;s1<Ns;s1++){
 	    for(int s2=0;s2<Ns;s2++){
 	      vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
@ -593,7 +593,8 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
 	    int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
 	    for ( int m=0;m<Nmom;m++){
 	      int idx = m+base;
-	      auto phase = mom[m]._odata[ss];
+	      auto mom_v = mom[m].View();
+	      auto phase = mom_v[ss];
 	      mac(&lvSum[idx],&vv,&phase);
 	    }
 	  
@ -601,14 +602,13 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
 	}
      }
    }
-  }
-
+  });

  // Sum across simd lanes in the plane, breaking out orthog dir.
-  parallel_for(int rt=0;rt<rd;rt++){
+  thread_for(rt,rd,{

-    std::vector<int> icoor(Nd);
-    std::vector<SpinMatrix_s> extracted(Nsimd);               
+    Coordinate icoor(Nd);
+    ExtractBuffer<SpinMatrix_s> extracted(Nsimd);               

    for(int i=0;i<Lblock;i++){
    for(int j=0;j<Rblock;j++){
@ -630,7 +630,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,

      }
    }}}
-  }
+  });
  if (t_kernel) *t_kernel += usecond();
  assert(mat.dimension(0) == Nmom);
  assert(mat.dimension(1) == Ngamma);
@ -639,8 +639,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
  // ld loop and local only??
  int pd = grid->_processors[orthogdim];
  int pc = grid->_processor_coor[orthogdim];
-  parallel_for_nest2(int lt=0;lt<ld;lt++)
-  {
+  thread_for_collapse(2,lt,ld,{
    for(int pt=0;pt<pd;pt++){
      int t = lt + pt*ld;
      if (pt == pc){
@ -668,7 +667,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
 	}
      }
    }
-  }
+  });

  ////////////////////////////////////////////////////////////////////
  // This global sum is taking as much as 50% of time on 16 nodes
@ -719,7 +718,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
  int Lblock = mat.dimension(1); 
  int Rblock = mat.dimension(2);

-  GridBase *grid = wi[0]._grid;
+  GridBase *grid = wi[0].Grid();
  
  const int    nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
@ -737,20 +736,20 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
  int MFlvol = ld*Lblock*Rblock;

  Vector<vector_type > lvSum(MFrvol);
-  parallel_for (int r = 0; r < MFrvol; r++){
-    lvSum[r] = zero;
-  }
+  thread_for(r,MFrvol,{
+    lvSum[r] = Zero();
+  });

  Vector<scalar_type > lsSum(MFlvol);             
-  parallel_for (int r = 0; r < MFlvol; r++){
+  thread_for(r,MFlvol,{
    lsSum[r]=scalar_type(0.0);
-  }
+  });

  int e1=    grid->_slice_nblock[orthogdim];
  int e2=    grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];

-  parallel_for(int r=0;r<rd;r++){
+  thread_for(r,rd,{

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 

@ -761,7 +760,8 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,

 	for(int i=0;i<Lblock;i++){

-	  auto w = conjugate(wi[i]._odata[ss]);
+	  auto wi_v = wi[i].View();
+	  auto w = conjugate(wi_v[ss]);
 	  if (g5) {
 	    w()(2)(0) = - w()(2)(0);
 	    w()(2)(1) = - w()(2)(1);
@ -771,8 +771,9 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
 	    w()(3)(2) = - w()(3)(2);
 	  }
 	  for(int j=0;j<Rblock;j++){
-
-	    auto v = vj[j]._odata[ss];
+	    
+	    auto vj_v=vj[j].View();
+	    auto v  = vj_v[ss];
 	    auto vv = v()(0)(0);

 	    vv =      w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out
@ -794,14 +795,14 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
 	}
      }
    }
-  }
+  });

  // Sum across simd lanes in the plane, breaking out orthog dir.
-  parallel_for(int rt=0;rt<rd;rt++){
+  thread_for(rt,rd,{

-    std::vector<int> icoor(nd);
+      Coordinate icoor(nd);
    iScalar<vector_type> temp; 
-    std::vector<iScalar<scalar_type> > extracted(Nsimd);               
+    ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);               

    for(int i=0;i<Lblock;i++){
    for(int j=0;j<Rblock;j++){
@ -823,14 +824,13 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,

      }
    }}
-  }
+  });

  assert(mat.dimension(0) == Nt);
  // ld loop and local only??
  int pd = grid->_processors[orthogdim];
  int pc = grid->_processor_coor[orthogdim];
-  parallel_for_nest2(int lt=0;lt<ld;lt++)
-  {
+  thread_for_collapse(2,lt,ld,{
    for(int pt=0;pt<pd;pt++){
      int t = lt + pt*ld;
      if (pt == pc){
@ -849,7 +849,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
 	}
      }
    }
-  }
+  });

  grid->GlobalSumVector(&mat(0,0,0),Nt*Lblock*Rblock);
 }
@ -864,7 +864,7 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
  int Lblock = mat.dimension(2); 
  int Rblock = mat.dimension(3);

-  GridBase *grid = wi[0]._grid;
+  GridBase *grid = wi[0].Grid();
  
  const int    nd = grid->_ndimension;
  const int Nsimd = grid->Nsimd();
@ -883,20 +883,20 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
  int MFlvol = ld*Lblock*Rblock*Nmom;

  Vector<vector_type > lvSum(MFrvol);
-  parallel_for (int r = 0; r < MFrvol; r++){
-    lvSum[r] = zero;
-  }
+  thread_for(r,MFrvol,{
+    lvSum[r] = Zero();
+  });

  Vector<scalar_type > lsSum(MFlvol);             
-  parallel_for (int r = 0; r < MFlvol; r++){
+  thread_for(r,MFlvol,{
    lsSum[r]=scalar_type(0.0);
-  }
+  });

  int e1=    grid->_slice_nblock[orthogdim];
  int e2=    grid->_slice_block [orthogdim];
  int stride=grid->_slice_stride[orthogdim];

-  parallel_for(int r=0;r<rd;r++){
+  thread_for(r,rd,{

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 

@ -907,11 +907,13 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,

 	for(int i=0;i<Lblock;i++){

-	  auto w = conjugate(wi[i]._odata[ss]);
+	  auto wi_v = wi[i].View();
+	  auto w = conjugate(wi_v[ss]);

 	  for(int j=0;j<Rblock;j++){
-
-	    auto v = vj[j]._odata[ss];
+	    
+	    auto vj_v = vj[j].View();
+	    auto v = vj_v[ss];

 	    auto vv = w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out
 	      +       w()(0)(1) * v()(0)(1)
@ -931,22 +933,23 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
 	    int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
 	    for ( int m=0;m<Nmom;m++){
 	      int idx = m+base;
-	      auto phase = mom[m]._odata[ss];
+	      auto mom_v = mom[m].View();
+	      auto phase = mom_v[ss];
 	      mac(&lvSum[idx],&vv,&phase()()());
 	    }
 	  }
 	}
      }
    }
-  }
+  });


  // Sum across simd lanes in the plane, breaking out orthog dir.
-  parallel_for(int rt=0;rt<rd;rt++){
+  thread_for(rt,rd,{

-    std::vector<int> icoor(nd);
+    Coordinate icoor(nd);
    iScalar<vector_type> temp; 
-    std::vector<iScalar<scalar_type> > extracted(Nsimd);               
+    ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);               

    for(int i=0;i<Lblock;i++){
    for(int j=0;j<Rblock;j++){
@ -969,15 +972,14 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,

      }
    }}}
-  }
+  });

  assert(mat.dimension(0) == Nmom);
  assert(mat.dimension(1) == Nt);
-
+ 
  int pd = grid->_processors[orthogdim];
  int pc = grid->_processor_coor[orthogdim];
-  parallel_for_nest2(int lt=0;lt<ld;lt++)
-  {
+  thread_for_collapse(2,lt,ld,{
    for(int pt=0;pt<pd;pt++){
      int t = lt + pt*ld;
      if (pt == pc){
@ -1000,7 +1002,7 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
 	}
      }
    }
-  }
+  });

  grid->GlobalSumVector(&mat(0,0,0,0),Nmom*Nt*Lblock*Rblock);
 }
@ -1068,7 +1070,7 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
    int Lblock = mat.dimension(3); 
    int Rblock = mat.dimension(4);

-    GridBase *grid = lhs_wi[0]._grid;
+    GridBase *grid = lhs_wi[0].Grid();
    
    const int    Nd = grid->_ndimension;
    const int Nsimd = grid->Nsimd();
@ -1088,16 +1090,16 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
    int MFlvol = ld*Lblock*Rblock*Nem;

    Vector<vector_type> lvSum(MFrvol);
-    parallel_for (int r = 0; r < MFrvol; r++)
+    thread_for(r,MFrvol,
    {
-        lvSum[r] = zero;
-    }
+      lvSum[r] = Zero();
+    });

    Vector<scalar_type> lsSum(MFlvol);             
-    parallel_for (int r = 0; r < MFlvol; r++)
+    thread_for(r,MFlvol,
    {
        lsSum[r] = scalar_type(0.0);
-    }
+    });

    int e1=    grid->_slice_nblock[orthogdim];
    int e2=    grid->_slice_block [orthogdim];
@ -1106,7 +1108,7 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
    // Nested parallelism would be ok
    // Wasting cores here. Test case r
    if (t_kernel) *t_kernel = -usecond();
-    parallel_for(int r=0;r<rd;r++)
+    thread_for(r,rd,
    {
        int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 

@ -1117,17 +1119,19 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,

            for(int i=0;i<Lblock;i++)
            {
-                auto left = conjugate(lhs_wi[i]._odata[ss]);
+  	        auto wi_v = lhs_wi[i].View();
+                auto left = conjugate(wi_v[ss]);

                for(int j=0;j<Rblock;j++)
                {
                    SpinMatrix_v vv;
-                    auto right = rhs_vj[j]._odata[ss];
+		    auto vj_v  = rhs_vj[j].View();
+                    auto right = vj_v[ss];

                    for(int s1=0;s1<Ns;s1++)
                    for(int s2=0;s2<Ns;s2++)
                    {
-                        vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
+		          vv()(s1,s2)() = left()(s2)(0) * right()(s1)(0)
                                        + left()(s2)(1) * right()(s1)(1)
                                        + left()(s2)(2) * right()(s1)(2);
                    }
@ -1137,9 +1141,11 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,

                    for ( int m=0;m<Nem;m++)
                    {
+  		        auto emB0_v = emB0[m].View();
+  		        auto emB1_v = emB1[m].View();
                        int idx  = m+base;
-                        auto b0  = emB0[m]._odata[ss];
-                        auto b1  = emB1[m]._odata[ss];
+                        auto b0  = emB0_v[ss];
+                        auto b1  = emB1_v[ss];
                        auto cb0 = conjugate(b0);
                        auto cb1 = conjugate(b1);

@ -1151,13 +1157,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
                }
            }
        }
-    }
+    });

    // Sum across simd lanes in the plane, breaking out orthog dir.
-    parallel_for(int rt=0;rt<rd;rt++)
+    thread_for(rt,rd,
    {
-        std::vector<int> icoor(Nd);
-        std::vector<scalar_type> extracted(Nsimd);               
+        Coordinate icoor(Nd);
+        ExtractBuffer<scalar_type> extracted(Nsimd);               

        for(int i=0;i<Lblock;i++)
        for(int j=0;j<Rblock;j++)
@ -1177,13 +1183,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
                lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
            }
        }
-    }
+    });
    if (t_kernel) *t_kernel += usecond();

    // ld loop and local only??
    int pd = grid->_processors[orthogdim];
    int pc = grid->_processor_coor[orthogdim];
-    parallel_for_nest2(int lt=0;lt<ld;lt++)
+    thread_for_collapse(2,lt,ld,
    {
        for(int pt=0;pt<pd;pt++)
        {
@ -1211,7 +1217,7 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
                }
            }
        }
-    }
+    });
    if (t_gsum) *t_gsum = -usecond();
    grid->GlobalSumVector(&mat(0,0,0,0,0),Nem*Nt*Lblock*Rblock);
    if (t_gsum) *t_gsum += usecond();
@ -1375,9 +1381,9 @@ void A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
 				   const FermionField *vs,
 				   const FermionField *vd)
 {
-  GridBase *grid = vs[0]._grid;
+  GridBase *grid = vs[0].Grid();

-  int nd    = grid->_ndimension;
+  //  int nd    = grid->_ndimension;
  int Nsimd = grid->Nsimd();
  int N_t   = WW_sd.dimension(0);
  int N_s = WW_sd.dimension(1); 
@ -1386,42 +1392,44 @@ void A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
  int d_unroll = 32;// Empirical optimisation

  for(int t=0;t<N_t;t++){
-    WWVV[t] = zero;
+    WWVV[t] = Zero();
  }

-  parallel_for(int ss=0;ss<grid->oSites();ss++){
+  thread_for(ss,grid->oSites(),{
    for(int d_o=0;d_o<N_d;d_o+=d_unroll){
      for(int t=0;t<N_t;t++){
      for(int s=0;s<N_s;s++){
-	auto tmp1 = vs[s]._odata[ss];
-  vobj tmp2 = zero;
-  vobj tmp3 = zero;
-
+	auto vs_v = vs[s].View();
+	auto tmp1 = vs_v[ss];
+	vobj tmp2 = Zero();
+	vobj tmp3 = Zero();
 	for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
+	  auto vd_v = vd[d].View();
 	  Scalar_v coeff = WW_sd(t,s,d);
-	  tmp3 = conjugate(vd[d]._odata[ss]);
+	  tmp3 = conjugate(vd_v[ss]);
 	  mac(&tmp2, &coeff, &tmp3);
-  }
+	}

 	//////////////////////////
 	// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
 	//////////////////////////
+	auto WWVV_v = WWVV[t].View();
 	for(int s1=0;s1<Ns;s1++){
 	for(int s2=0;s2<Ns;s2++){
-	  WWVV[t]._odata[ss]()(s1,s2)(0,0) += tmp1()(s1)(0)*tmp2()(s2)(0);
-	  WWVV[t]._odata[ss]()(s1,s2)(0,1) += tmp1()(s1)(0)*tmp2()(s2)(1);
-	  WWVV[t]._odata[ss]()(s1,s2)(0,2) += tmp1()(s1)(0)*tmp2()(s2)(2);
-	  WWVV[t]._odata[ss]()(s1,s2)(1,0) += tmp1()(s1)(1)*tmp2()(s2)(0);
-	  WWVV[t]._odata[ss]()(s1,s2)(1,1) += tmp1()(s1)(1)*tmp2()(s2)(1);
-	  WWVV[t]._odata[ss]()(s1,s2)(1,2) += tmp1()(s1)(1)*tmp2()(s2)(2);
-	  WWVV[t]._odata[ss]()(s1,s2)(2,0) += tmp1()(s1)(2)*tmp2()(s2)(0);
-	  WWVV[t]._odata[ss]()(s1,s2)(2,1) += tmp1()(s1)(2)*tmp2()(s2)(1);
-	  WWVV[t]._odata[ss]()(s1,s2)(2,2) += tmp1()(s1)(2)*tmp2()(s2)(2);
+	  WWVV_v[ss]()(s1,s2)(0,0) += tmp1()(s1)(0)*tmp2()(s2)(0);
+	  WWVV_v[ss]()(s1,s2)(0,1) += tmp1()(s1)(0)*tmp2()(s2)(1);
+	  WWVV_v[ss]()(s1,s2)(0,2) += tmp1()(s1)(0)*tmp2()(s2)(2);
+	  WWVV_v[ss]()(s1,s2)(1,0) += tmp1()(s1)(1)*tmp2()(s2)(0);
+	  WWVV_v[ss]()(s1,s2)(1,1) += tmp1()(s1)(1)*tmp2()(s2)(1);
+	  WWVV_v[ss]()(s1,s2)(1,2) += tmp1()(s1)(1)*tmp2()(s2)(2);
+	  WWVV_v[ss]()(s1,s2)(2,0) += tmp1()(s1)(2)*tmp2()(s2)(0);
+	  WWVV_v[ss]()(s1,s2)(2,1) += tmp1()(s1)(2)*tmp2()(s2)(1);
+	  WWVV_v[ss]()(s1,s2)(2,2) += tmp1()(s1)(2)*tmp2()(s2)(2);
 	}}

      }}
    }
-  }
+  });
 }


@ -1436,17 +1444,21 @@ void A2Autils<FImpl>::ContractFourQuarkColourDiagonal(const PropagatorField &WWV
  assert(gamma0.size()==gamma1.size());
  int Ng = gamma0.size();

-  GridBase *grid = WWVV0._grid;
+  GridBase *grid = WWVV0.Grid();

-  parallel_for(int ss=0;ss<grid->oSites();ss++){
+  auto WWVV0_v = WWVV0.View();
+  auto WWVV1_v = WWVV1.View();
+  auto O_trtr_v= O_trtr.View();
+  auto O_fig8_v= O_fig8.View();
+  thread_for(ss,grid->oSites(),{

    typedef typename ComplexField::vector_object vobj;

    vobj v_trtr;
    vobj v_fig8;

-    auto VV0 = WWVV0._odata[ss];
-    auto VV1 = WWVV1._odata[ss];
+    auto VV0 = WWVV0_v[ss];
+    auto VV1 = WWVV1_v[ss];
    
    for(int g=0;g<Ng;g++){

@ -1454,15 +1466,15 @@ void A2Autils<FImpl>::ContractFourQuarkColourDiagonal(const PropagatorField &WWV
      v_fig8 = trace(VV0 * gamma0[g] * VV1 * gamma1[g]);

      if ( g==0 ) {
-	O_trtr._odata[ss] = v_trtr; 
-	O_fig8._odata[ss] = v_fig8;
+	O_trtr_v[ss] = v_trtr; 
+	O_fig8_v[ss] = v_fig8;
      } else { 
-	O_trtr._odata[ss]+= v_trtr; 
-	O_fig8._odata[ss]+= v_fig8;
+	O_trtr_v[ss]+= v_trtr; 
+	O_fig8_v[ss]+= v_fig8;
      }
      
    }
-  }
+  });
 }

 template<class FImpl>
@ -1476,22 +1488,27 @@ void A2Autils<FImpl>::ContractFourQuarkColourMix(const PropagatorField &WWVV0,
  assert(gamma0.size()==gamma1.size());
  int Ng = gamma0.size();

-  GridBase *grid = WWVV0._grid;
+  GridBase *grid = WWVV0.Grid();

-  parallel_for(int ss=0;ss<grid->oSites();ss++){
+  auto WWVV0_v = WWVV0.View();
+  auto WWVV1_v = WWVV1.View();
+  auto O_trtr_v= O_trtr.View();
+  auto O_fig8_v= O_fig8.View();
+
+  thread_for(ss,grid->oSites(),{

    typedef typename ComplexField::vector_object vobj;

-    auto VV0 = WWVV0._odata[ss];
-    auto VV1 = WWVV1._odata[ss];
+    auto VV0 = WWVV0_v[ss];
+    auto VV1 = WWVV1_v[ss];
    
    for(int g=0;g<Ng;g++){

      auto VV0G = VV0 * gamma0[g];  // Spin multiply
      auto VV1G = VV1 * gamma1[g];

-      vobj v_trtr=zero;
-      vobj v_fig8=zero;
+      vobj v_trtr=Zero();
+      vobj v_fig8=Zero();

      /////////////////////////////////////////
      // Colour mixed
@ -1542,15 +1559,15 @@ Bag [8,4]  fig8 (-227.58,3.58808e-17) trtr (-32.5776,1.83286e-17)     //  - 1602
      }}}}

      if ( g==0 ) {
-	O_trtr._odata[ss] = v_trtr; 
-	O_fig8._odata[ss] = v_fig8;
+	O_trtr_v[ss] = v_trtr; 
+	O_fig8_v[ss] = v_fig8;
      } else { 
-	O_trtr._odata[ss]+= v_trtr; 
-	O_fig8._odata[ss]+= v_fig8;
+	O_trtr_v[ss]+= v_trtr; 
+	O_fig8_v[ss]+= v_fig8;
      }
      
    }
-  }
+  });
 }

 #ifdef DELTA_F_EQ_2
@ -1572,7 +1589,7 @@ void A2Autils<FImpl>::DeltaFeq2(int dt_min,int dt_max,
 				const FermionField *vd,
 				int orthogdim)
 {
-  GridBase *grid = vs[0]._grid;
+  GridBase *grid = vs[0].Grid();

  LOG(Message) << "Computing A2A DeltaF=2 graph" << std::endl;

@ -1624,32 +1641,32 @@ void A2Autils<FImpl>::DeltaFeq2(int dt_min,int dt_max,
    denom_P(t) =ComplexD(0.0);
  }

-  ComplexField D0(grid);   D0 = zero; // <P|A0> correlator from each wall
-  ComplexField D1(grid);   D1 = zero;
+  ComplexField D0(grid);   D0 = Zero(); // <P|A0> correlator from each wall
+  ComplexField D1(grid);   D1 = Zero();

-  ComplexField O1_trtr(grid);  O1_trtr = zero;
-  ComplexField O2_trtr(grid);  O2_trtr = zero;
-  ComplexField O3_trtr(grid);  O3_trtr = zero;
-  ComplexField O4_trtr(grid);  O4_trtr = zero;
-  ComplexField O5_trtr(grid);  O5_trtr = zero;
+  ComplexField O1_trtr(grid);  O1_trtr = Zero();
+  ComplexField O2_trtr(grid);  O2_trtr = Zero();
+  ComplexField O3_trtr(grid);  O3_trtr = Zero();
+  ComplexField O4_trtr(grid);  O4_trtr = Zero();
+  ComplexField O5_trtr(grid);  O5_trtr = Zero();

-  ComplexField O1_fig8(grid);  O1_fig8 = zero;
-  ComplexField O2_fig8(grid);  O2_fig8 = zero;
-  ComplexField O3_fig8(grid);  O3_fig8 = zero;
-  ComplexField O4_fig8(grid);  O4_fig8 = zero;
-  ComplexField O5_fig8(grid);  O5_fig8 = zero;
+  ComplexField O1_fig8(grid);  O1_fig8 = Zero();
+  ComplexField O2_fig8(grid);  O2_fig8 = Zero();
+  ComplexField O3_fig8(grid);  O3_fig8 = Zero();
+  ComplexField O4_fig8(grid);  O4_fig8 = Zero();
+  ComplexField O5_fig8(grid);  O5_fig8 = Zero();

-  ComplexField VV_trtr(grid);  VV_trtr = zero;
-  ComplexField AA_trtr(grid);  AA_trtr = zero;
-  ComplexField SS_trtr(grid);  SS_trtr = zero;
-  ComplexField PP_trtr(grid);  PP_trtr = zero;
-  ComplexField TT_trtr(grid);  TT_trtr = zero;
+  ComplexField VV_trtr(grid);  VV_trtr = Zero();
+  ComplexField AA_trtr(grid);  AA_trtr = Zero();
+  ComplexField SS_trtr(grid);  SS_trtr = Zero();
+  ComplexField PP_trtr(grid);  PP_trtr = Zero();
+  ComplexField TT_trtr(grid);  TT_trtr = Zero();

-  ComplexField VV_fig8(grid);  VV_fig8 = zero;
-  ComplexField AA_fig8(grid);  AA_fig8 = zero;
-  ComplexField SS_fig8(grid);  SS_fig8 = zero;
-  ComplexField PP_fig8(grid);  PP_fig8 = zero;
-  ComplexField TT_fig8(grid);  TT_fig8 = zero;
+  ComplexField VV_fig8(grid);  VV_fig8 = Zero();
+  ComplexField AA_fig8(grid);  AA_fig8 = Zero();
+  ComplexField SS_fig8(grid);  SS_fig8 = Zero();
+  ComplexField PP_fig8(grid);  PP_fig8 = Zero();
+  ComplexField TT_fig8(grid);  TT_fig8 = Zero();

  //////////////////////////////////////////////////
  // Used to store appropriate correlation funcs
@ -1784,5 +1801,5 @@ void A2Autils<FImpl>::DeltaFeq2(int dt_min,int dt_max,
 }
 #endif 

-}}
+NAMESPACE_END(Grid);

--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@ -1,9 +1,36 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: ./lib/qcd/utils/BaryonUtils.h
+ 
+ Copyright (C) 2019
+ 
+ Author: Felix Erben <felix.erben@ed.ac.uk>
+ Author: Michael Marshall <Michael.Marshall@ed.ac.uk>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
 #pragma once
 //#include <Grid/Hadrons/Global.hpp>
 #include <Grid/Eigen/unsupported/CXX11/Tensor>

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 #undef DELTA_F_EQ_2

@ -45,7 +72,6 @@ public:
 			     const PropagatorField &q2);
 };

-
 template<class FImpl>
 void BaryonUtils<FImpl>::ContractBaryons_debug(const PropagatorField &q1,
 						 const PropagatorField &q2,
@ -60,7 +86,7 @@ void BaryonUtils<FImpl>::ContractBaryons_debug(const PropagatorField &q1,
 						 ComplexField &bc6,
 						 ComplexField &baryon_corr)
 {
-  GridBase *grid = q1._grid;
+  GridBase *grid = q1.Grid();

  // C = i gamma_2 gamma_4 => C gamma_5 = - i gamma_1 gamma_3 
  //Gamma GammaA(Gamma::Algebra::Identity); //Still hardcoded 1
@ -78,30 +104,31 @@ void BaryonUtils<FImpl>::ContractBaryons_debug(const PropagatorField &q1,
    if (left[0] == right[epsilon[ie][0]] && left[1] == right[epsilon[ie][1]] && left[2] == right[epsilon[ie][2]])
      wick_contraction[ie]=1;

+  const int parity{ 1 };

-  int parity = 1;
+  LatticeView<pobj> v1(q1);
+  LatticeView<pobj> v2(q2);
+  LatticeView<pobj> v3(q3);

+  accelerator_for(ss, grid->oSites(), grid->Nsimd(), {

-  parallel_for(int ss=0;ss<grid->oSites();ss++){
-
-    typedef typename ComplexField::vector_object vobj;
-
-    auto D1 = q1._odata[ss];
-    auto D2 = q2._odata[ss];
-    auto D3 = q3._odata[ss];
+    using CF_vobj = typename ComplexField::vector_object;
+    const auto &D1{ v1[ss] };
+    const auto &D2{ v2[ss] };
+    const auto &D3{ v3[ss] };

    auto gD1a = GammaA * GammaA * D1;
    auto gD1b = GammaA * g4 * GammaA * D1;
    auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
    auto gD3 = GammaB * D3;

-    vobj result=zero;
-    vobj result1=zero;
-    vobj result2=zero;
-    vobj result3=zero;
-    vobj result4=zero;
-    vobj result5=zero;
-    vobj result6=zero;
+    CF_vobj result { 0 };
+    CF_vobj result1{ 0 };
+    CF_vobj result2{ 0 };
+    CF_vobj result3{ 0 };
+    CF_vobj result4{ 0 };
+    CF_vobj result5{ 0 };
+    CF_vobj result6{ 0 };

    for (int ie_src=0; ie_src < 6 ; ie_src++){
      int a_src = epsilon[ie_src][0]; //a
@ -180,16 +207,22 @@ void BaryonUtils<FImpl>::ContractBaryons_debug(const PropagatorField &q1,
      
      }
    }
-    baryon_corr._odata[ss] = result; 
-
-    bc1._odata[ss] = result1; 
-    bc2._odata[ss] = result2; 
-    bc3._odata[ss] = result3; 
-    bc4._odata[ss] = result4; 
-    bc5._odata[ss] = result5; 
-    bc6._odata[ss] = result6; 
-  } //end loop over lattice sites
+    LatticeView<CF_vobj> vbaryon_corr(baryon_corr);
+    vbaryon_corr[ss] = result;

+    LatticeView<CF_vobj> vbc1(bc1);
+    LatticeView<CF_vobj> vbc2(bc2);
+    LatticeView<CF_vobj> vbc3(bc3);
+    LatticeView<CF_vobj> vbc4(bc4);
+    LatticeView<CF_vobj> vbc5(bc5);
+    LatticeView<CF_vobj> vbc6(bc6);
+    vbc1[ss] = result1;
+    vbc2[ss] = result2;
+    vbc3[ss] = result3;
+    vbc4[ss] = result4;
+    vbc5[ss] = result5;
+    vbc6[ss] = result6;
+  } );//end loop over lattice sites
 }

 template<class FImpl>
@ -200,7 +233,7 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1,
 						 const Gamma GammaB,
 						 ComplexField &baryon_corr)
 {
-  GridBase *grid = q1._grid;
+  GridBase *grid = q1.Grid();

  // C = i gamma_2 gamma_4 => C gamma_5 = - i gamma_1 gamma_3 
  //Gamma GammaA(Gamma::Algebra::Identity); //Still hardcoded 1
@ -218,24 +251,25 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1,
    if (left[0] == right[epsilon[ie][0]] && left[1] == right[epsilon[ie][1]] && left[2] == right[epsilon[ie][2]])
      wick_contraction[ie]=1;

+  const int parity{ 1 };
+  
+  LatticeView<pobj> v1(q1);
+  LatticeView<pobj> v2(q2);
+  LatticeView<pobj> v3(q3);
+  
+  accelerator_for(ss, grid->oSites(), grid->Nsimd(), {

-  int parity = 1;
-
-
-  parallel_for(int ss=0;ss<grid->oSites();ss++){
-
-    typedef typename ComplexField::vector_object vobj;
-
-    auto D1 = q1._odata[ss];
-    auto D2 = q2._odata[ss];
-    auto D3 = q3._odata[ss];
+    const auto &D1{ v1[ss] };
+    const auto &D2{ v2[ss] };
+    const auto &D3{ v3[ss] };

    auto gD1a = GammaA * GammaA * D1;
    auto gD1b = GammaA * g4 * GammaA * D1;
    auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
    auto gD3 = GammaB * D3;

-    vobj result=zero;
+    using CF_vobj = typename ComplexField::vector_object;
+    CF_vobj result{ 0 };
    
    for (int ie_src=0; ie_src < 6 ; ie_src++){
      int a_src = epsilon[ie_src][0]; //a
@ -308,9 +342,9 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1,
      
      }
    }
-    baryon_corr._odata[ss] = result; 
-
-  } //end loop over lattice sites
+    LatticeView<CF_vobj> vbaryon_corr(baryon_corr);
+    vbaryon_corr[ss] = result;
+  } ); //end loop over lattice sites
 }

 //QDP / CHROMA - style diquark construction
@ -319,20 +353,22 @@ template<class FImpl>
 LatticeSpinColourMatrix BaryonUtils<FImpl>::quarkContract13(const PropagatorField &q1,
 					 const PropagatorField &q2)
 {
-  GridBase *grid = q1._grid;
+  GridBase *grid = q1.Grid();

  std::vector<std::vector<int>> epsilon = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
  std::vector<int> epsilon_sgn = {1,1,1,-1,-1,-1};

-  // TODO: Felix, made a few changes to fix this as there were compiler errors. Please validate!
+  // TODO: Felix, made a few changes to fix this. Please validate!
  LatticeSpinColourMatrix q_out(grid);
-  // q_out = zero; TODO: Don't think you need this, as you'll set each site explicitly anyway
-
-  parallel_for(int ss=0;ss<grid->oSites();ss++){
-    const auto & D1    = q1._odata[ss];
-    const auto & D2    = q2._odata[ss];
-          auto & D_out = q_out._odata[ss];
-    D_out=zero;
+  // q_out = 0; TODO: Don't think you need this, as you'll set each site explicitly anyway
+  LatticeView<pobj> v1(q1); // views of the two input propagators, built once before the loop
+  LatticeView<pobj> v2(q2);
+  LatticeView<vSpinColourMatrix> vw( q_out ); // writable view of the returned field
+  accelerator_for(ss, grid->oSites(), grid->Nsimd(), { // NOTE(review): the body indexes the host std::vector objects epsilon / epsilon_sgn declared above -- confirm this is valid for whatever accelerator_for expands to on a device build
+    const auto & D1{ v1[ss] };
+    const auto & D2{ v2[ss] };
+          auto & D_out { vw[ss] }; // reference into the output view, accumulated into below
+    D_out = 0; // clear the site before the += accumulation (replaces the former D_out=zero)
    for (int ie_src=0; ie_src < 6 ; ie_src++){
      int a_src = epsilon[ie_src][0]; //a
      int b_src = epsilon[ie_src][1]; //b
@ -341,17 +377,17 @@ LatticeSpinColourMatrix BaryonUtils<FImpl>::quarkContract13(const PropagatorFiel
        int a_snk = epsilon[ie_snk][0]; //a'
        int b_snk = epsilon[ie_snk][1]; //b'
        int c_snk = epsilon[ie_snk][2]; //c'
-        for (int alpha=0; alpha<Ns; alpha++){
-        for (int beta=0; beta<Ns; beta++){
-        for (int rho=0; rho<Ns; rho++){
-          D_out()(alpha,beta)(c_snk,c_src) += epsilon_sgn[ie_src] * epsilon_sgn[ie_snk] * D1()(rho,alpha)(a_src,a_snk)*D2()(rho,beta)(b_src,b_snk); //D1 conjugate??
-        }}}
+        for (int alpha=0; alpha<Ns; alpha++)
+          for (int beta=0; beta<Ns; beta++)
+            for (int rho=0; rho<Ns; rho++) {
+              D_out()(alpha,beta)(c_snk,c_src) += epsilon_sgn[ie_src] * epsilon_sgn[ie_snk] * D1()(rho,alpha)(a_src,a_snk)*D2()(rho,beta)(b_src,b_snk); //D1 conjugate??
+            }
      }
    }
- } //end loop over lattice sites
+  } ); //end loop over lattice sites


  return q_out;
 }

-}}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/utils/CovariantCshift.h
+++ b/Grid/qcd/utils/CovariantCshift.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -24,13 +24,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef QCD_UTILS_COVARIANT_CSHIFT_H
 #define QCD_UTILS_COVARIANT_CSHIFT_H

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);
+
 ////////////////////////////////////////////////////////////////////////
 // Low performance implementation of CovariantCshift API
 ////////////////////////////////////////////////////////////////////////
@ -39,8 +39,8 @@ namespace QCD {
 namespace PeriodicBC { 

  template<class covariant,class gauge> Lattice<covariant> CovShiftForward(const Lattice<gauge> &Link, 
-									    int mu,
-									    const Lattice<covariant> &field)
+									   int mu,
+									   const Lattice<covariant> &field)
  {
    return Link*Cshift(field,mu,1);// moves towards negative mu
  }
@ -48,7 +48,7 @@ namespace PeriodicBC {
 									    int mu,
 									    const Lattice<covariant> &field)
  {
-    Lattice<covariant> tmp(field._grid);
+    Lattice<covariant> tmp(field.Grid());
    tmp = adj(Link)*field;
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }
@ -73,21 +73,21 @@ namespace ConjugateBC {
  //                      U  U^* U^* U^T U^adj =  U  (U U U^dag U^T )^*
  //                                           =  U  (U U U^dag)^* ( U^T )^*
  //
-  // So covariant shift rule: conjugate inward shifted plane when crossing boundary applies.
+  // So covariant shift rule: conjugate inward shifted plane when crossing boundary applies.
  //
-  // This conjugate should be applied to BOTH the link and the covariant field on backward shift
+  // This conjugate should be applied to BOTH the link and the covariant field on backward shift
  // boundary wrap.
  // 
  //      |  |              
  // xxxxxxxxxxxxxxxxx         
-  //      |  | <---- this link is conjugated, and the path leading into it. Segment crossing in and out is double conjugated.
+  //      |  | <---- this link is conjugated, and the path leading into it. Segment crossing in and out is double conjugated.
  //       -- 
  //    ------->
  template<class covariant,class gauge> Lattice<covariant> CovShiftForward(const Lattice<gauge> &Link, 
-									    int mu,
-									    const Lattice<covariant> &field)
+									   int mu,
+									   const Lattice<covariant> &field)
  {
-    GridBase * grid = Link._grid;
+    GridBase * grid = Link.Grid();

    int Lmu = grid->GlobalDimensions()[mu]-1;

@ -106,7 +106,7 @@ namespace ConjugateBC {
 									    int mu,
 									    const Lattice<covariant> &field)
  {
-    GridBase * grid = field._grid;
+    GridBase * grid = field.Grid();

    int Lmu = grid->GlobalDimensions()[mu]-1;

@ -122,9 +122,8 @@ namespace ConjugateBC {
    return Cshift(tmp,mu,-1);// moves towards positive mu
  }

-
 }


-}}
+NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/utils/CovariantLaplacian.h
+++ b/Grid/qcd/utils/CovariantLaplacian.h
@ -25,13 +25,10 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
+/*  END LEGAL */
+#pragma once 

-#ifndef COVARIANT_LAPLACIAN_H
-#define COVARIANT_LAPLACIAN_H
-
-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 struct LaplacianParams : Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(LaplacianParams, 
@ -80,19 +77,19 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> {
  MultiShiftFunction PowerHalf;    
  MultiShiftFunction PowerInvHalf;    

- public:
+public:
  INHERIT_GIMPL_TYPES(Impl);

   LaplacianAdjointField(GridBase* grid, OperatorFunction<GaugeField>& S, LaplacianParams& p, const RealD k = 1.0)
-      : U(Nd, grid), Solver(S), param(p), kappa(k){
-        AlgRemez remez(param.lo,param.hi,param.precision);
-        std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
-        remez.generateApprox(param.degree,1,2);
-        PowerHalf.Init(remez,param.tolerance,false);
-        PowerInvHalf.Init(remez,param.tolerance,true);
+    : U(Nd, grid), Solver(S), param(p), kappa(k){ // U holds Nd link fields on grid; solver, parameters and kappa are stored as given
+    AlgRemez remez(param.lo,param.hi,param.precision); // approximation object configured from param.lo / param.hi / param.precision
+    std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl;
+    remez.generateApprox(param.degree,1,2); // presumably (1,2) is the power 1/2 named in the log line above -- confirm against AlgRemez
+    PowerHalf.Init(remez,param.tolerance,false); // the same remez object feeds both shift functions ...
+    PowerInvHalf.Init(remez,param.tolerance,true); // ... differing only in the last flag (presumably selects the inverse power -- confirm)
         

-      };
+  };

  void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);}
  void Mdiag(const GaugeField&, GaugeField&){ assert(0);}
@ -109,14 +106,14 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> {
    //GaugeField herm = in + adj(in);
    //std::cout << "AHermiticity: " << norm2(herm) << std::endl;

-    GaugeLinkField tmp(in._grid);
-    GaugeLinkField tmp2(in._grid);
-    GaugeLinkField sum(in._grid);
+    GaugeLinkField tmp(in.Grid());
+    GaugeLinkField tmp2(in.Grid());
+    GaugeLinkField sum(in.Grid());

    for (int nu = 0; nu < Nd; nu++) {
-      sum = zero;
+      sum = Zero();
      GaugeLinkField in_nu = PeekIndex<LorentzIndex>(in, nu);
-      GaugeLinkField out_nu(out._grid);
+      GaugeLinkField out_nu(out.Grid());
      for (int mu = 0; mu < Nd; mu++) {
        tmp = U[mu] * Cshift(in_nu, mu, +1) * adj(U[mu]);
        tmp2 = adj(U[mu]) * in_nu * U[mu];
@ -132,8 +129,8 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> {
    RealD factor = -kappa / (double(4 * Nd));
    
    for (int mu = 0; mu < Nd; mu++){
-      GaugeLinkField der_mu(der._grid);
-      der_mu = zero;
+      GaugeLinkField der_mu(der.Grid());
+      der_mu = Zero();
      for (int nu = 0; nu < Nd; nu++){
        GaugeLinkField in_nu = PeekIndex<LorentzIndex>(in, nu);
        der_mu += U[mu] * Cshift(in_nu, mu, 1) * adj(U[mu]) * in_nu;
@ -151,8 +148,8 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> {
    RealD factor = -kappa / (double(4 * Nd));

    for (int mu = 0; mu < Nd; mu++) {
-      GaugeLinkField der_mu(der._grid);
-      der_mu = zero;
+      GaugeLinkField der_mu(der.Grid());
+      der_mu = Zero();
      for (int nu = 0; nu < Nd; nu++) {
        GaugeLinkField left_nu = PeekIndex<LorentzIndex>(left, nu);
        GaugeLinkField right_nu = PeekIndex<LorentzIndex>(right, nu);
@ -169,7 +166,7 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> {
  }

  void MSquareRoot(GaugeField& P){
-    GaugeField Gp(P._grid);
+    GaugeField Gp(P.Grid());
    HermitianLinearOperator<LaplacianAdjointField<Impl>,GaugeField> HermOp(*this);
    ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter,PowerHalf);
    msCG(HermOp,P,Gp);
@ -177,7 +174,7 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> {
  }

  void MInvSquareRoot(GaugeField& P){
-    GaugeField Gp(P._grid);
+    GaugeField Gp(P.Grid());
    HermitianLinearOperator<LaplacianAdjointField<Impl>,GaugeField> HermOp(*this);
    ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter,PowerInvHalf);
    msCG(HermOp,P,Gp);
@ -186,12 +183,9 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> {



- private:
+private:
  RealD kappa;
  std::vector<GaugeLinkField> U;
 };

-}
-}
-
-#endif
+NAMESPACE_END(Grid);
--- a/Grid/qcd/utils/CovariantSmearing.h
+++ b/Grid/qcd/utils/CovariantSmearing.h
@ -43,7 +43,7 @@ public:
 			    T& chi, 
 			    const Real& width, int Iterations, int orthog)
  {
-    GridBase *grid = chi._grid;
+    GridBase *grid = chi.Grid();
    T psi(grid);

    ////////////////////////////////////////////////////////////////////////////////////
--- a/Grid/qcd/utils/GaugeFix.h
+++ b/Grid/qcd/utils/GaugeFix.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

-    grid` physics library, www.github.com/paboyle/Grid 
+    Grid physics library, www.github.com/paboyle/Grid 

@ -22,19 +22,19 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 //#include <Grid/Grid.h>

 #ifndef GRID_QCD_GAUGE_FIX_H
 #define GRID_QCD_GAUGE_FIX_H
-namespace Grid {
-namespace QCD {
+
+NAMESPACE_BEGIN(Grid);


 template <class Gimpl> 
 class FourierAcceleratedGaugeFixer  : public Gimpl {
- public:
+public:
  INHERIT_GIMPL_TYPES(Gimpl);

  typedef typename Gimpl::GaugeLinkField GaugeMat;
@ -47,7 +47,7 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
    }
  }
  static void DmuAmu(const std::vector<GaugeMat> &A,GaugeMat &dmuAmu,int orthog) {
-    dmuAmu=zero;
+    dmuAmu=Zero();
    for(int mu=0;mu<Nd;mu++){
      if ( mu != orthog ) {
 	dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
@ -56,13 +56,13 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
  }  

   static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {
-    GridBase *grid = Umu._grid;
+    GridBase *grid = Umu.Grid(); // grid of Umu, used only to allocate the local xform; this overload forwards to the xform-taking overload and does not hand xform back to the caller
     GaugeMat xform(grid);
     SteepestDescentGaugeFix(Umu,xform,alpha,maxiter,Omega_tol,Phi_tol,Fourier,orthog);
   }
  static void SteepestDescentGaugeFix(GaugeLorentz &Umu,GaugeMat &xform,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false,int orthog=-1) {

-    GridBase *grid = Umu._grid;
+    GridBase *grid = Umu.Grid();

    Real org_plaq      =WilsonLoops<Gimpl>::avgPlaquette(Umu);
    Real org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu); 
@ -72,7 +72,6 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
    xform=1.0;

    std::vector<GaugeMat> U(Nd,grid);
-
    GaugeMat dmuAmu(grid);

    {
@ -125,7 +124,7 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
    }
  };
  static Real SteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {
-    GridBase *grid = U[0]._grid;
+    GridBase *grid = U[0].Grid();

    std::vector<GaugeMat> A(Nd,grid);
    GaugeMat g(grid);
@ -145,14 +144,14 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {

  static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,GaugeMat &xform,Real & alpha, GaugeMat & dmuAmu,int orthog) {

-    GridBase *grid = U[0]._grid;
+    GridBase *grid = U[0].Grid();

    Real vol = grid->gSites();

    FFT theFFT((GridCartesian *)grid);

    LatticeComplex  Fp(grid);
-    LatticeComplex  psq(grid); psq=zero;
+    LatticeComplex  psq(grid); psq=Zero();
    LatticeComplex  pmu(grid); 
    LatticeComplex   one(grid); one = Complex(1.0,0.0);

@ -172,8 +171,8 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
    // Work out Fp = psq_max/ psq...
    // Avoid singularities in Fp
    //////////////////////////////////
-    std::vector<int> latt_size = grid->GlobalDimensions();
-    std::vector<int> coor(grid->_ndimension,0);
+    Coordinate latt_size = grid->GlobalDimensions();
+    Coordinate coor(grid->_ndimension,0);
    for(int mu=0;mu<Nd;mu++) {
      if ( mu != orthog ) { 
 	Real TwoPiL =  M_PI * 2.0/ latt_size[mu];
@ -212,7 +211,7 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
  }

  static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu,int orthog) {
-    GridBase *grid = g._grid;
+    GridBase *grid = g.Grid();
    Complex cialpha(0.0,-alpha);
    GaugeMat ciadmam(grid);
    DmuAmu(A,dmuAmu,orthog);
@ -221,6 +220,6 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
  }  
 };

-}
-}
+NAMESPACE_END(Grid);
+
 #endif
--- a/Grid/qcd/utils/LinalgUtils.h
+++ b/Grid/qcd/utils/LinalgUtils.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -25,13 +25,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#ifndef GRID_QCD_LINALG_UTILS_H
-#define GRID_QCD_LINALG_UTILS_H
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);

-namespace Grid{
-namespace QCD{
 ////////////////////////////////////////////////////////////////////////
 //This file brings additional linear combination assist that is helpful
 //to QCD such as chiral projectors and spin matrices applied to one of the inputs.
@ -42,170 +41,200 @@ namespace QCD{
 template<class vobj,class Coeff>
 void axpibg5x(Lattice<vobj> &z,const Lattice<vobj> &x,Coeff a,Coeff b)
 {
-  z.checkerboard = x.checkerboard;
+  z.Checkerboard() = x.Checkerboard(); // z takes x's checkerboard before the conformable check
   conformable(x,z);

-  GridBase *grid=x._grid;
+  GridBase *grid=x.Grid(); // NOTE(review): no longer read by the new body (the loop bound is x_v.size()); candidate for removal

   Gamma G5(Gamma::Algebra::Gamma5);
-  parallel_for(int ss=0;ss<grid->oSites();ss++){
-    vobj tmp;
-    tmp = a*x._odata[ss];
-    tmp = tmp + G5*(b*timesI(x._odata[ss]));
-    vstream(z._odata[ss],tmp);
-  }
+  auto x_v = x.View(); // views replace the former direct _odata access
+  auto z_v = z.View();
+  accelerator_for( ss, x_v.size(),vobj::Nsimd(), { // ss runs over the entries of the x view
+    auto tmp = a*x_v(ss) + G5*(b*timesI(x_v(ss))); // per site: a*x + G5*(b*timesI(x)), the two former statements fused into one
+    coalescedWrite(z_v[ss],tmp); // store into z at the same site (replaces vstream)
+  });
 }

 template<class vobj,class Coeff> 
 void axpby_ssp(Lattice<vobj> &z, Coeff a,const Lattice<vobj> &x,Coeff b,const Lattice<vobj> &y,int s,int sp)
 {
-  z.checkerboard = x.checkerboard;
+  z.Checkerboard() = x.Checkerboard(); // z takes x's checkerboard
   conformable(x,y);
   conformable(x,z);
-  GridBase *grid=x._grid;
+  GridBase *grid=x.Grid(); // supplies Ls (_rdimensions[0]) and the site count below
   int Ls = grid->_rdimensions[0];
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
-    vobj tmp = a*x._odata[ss+s]+b*y._odata[ss+sp];
-    vstream(z._odata[ss+s],tmp);
-  }
+  auto x_v = x.View();
+  auto y_v = y.View();
+  auto z_v = z.View();
+  // FIXME -- need a new class of accelerator_loop to implement this
+  //
+  uint64_t nloop = grid->oSites()/Ls; // number of blocks of Ls consecutive sites; replaces the old ss+=Ls stride
+  accelerator_for(sss,nloop,vobj::Nsimd(),{
+    uint64_t ss = sss*Ls; // first site of block sss
+    auto tmp = a*x_v(ss+s)+b*y_v(ss+sp); // a*x at offset s plus b*y at offset sp within the block
+    coalescedWrite(z_v[ss+s],tmp); // result lands at offset s of z
+  });
 }

 template<class vobj,class Coeff> 
 void ag5xpby_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const Lattice<vobj> &y,int s,int sp)
 {
-  z.checkerboard = x.checkerboard;
+  z.Checkerboard() = x.Checkerboard(); // z takes x's checkerboard
   conformable(x,y);
   conformable(x,z);
-  GridBase *grid=x._grid;
+  GridBase *grid=x.Grid(); // supplies Ls (_rdimensions[0]) and the site count below
   int Ls = grid->_rdimensions[0];
-
   Gamma G5(Gamma::Algebra::Gamma5);
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
-    vobj tmp;
-    tmp = G5*x._odata[ss+s]*a;
-    tmp = tmp + b*y._odata[ss+sp];
-    vstream(z._odata[ss+s],tmp);
-  }
+  auto x_v = x.View();
+  auto y_v = y.View();
+  auto z_v = z.View();
+  uint64_t nloop = grid->oSites()/Ls; // number of blocks of Ls consecutive sites; replaces the old ss+=Ls stride
+  accelerator_for(sss,nloop,vobj::Nsimd(),{
+    uint64_t ss = sss*Ls; // first site of block sss
+    auto tmp = G5*x_v(ss+s)*a + b*y_v(ss+sp); // G5 acts on the x term only; the two former statements fused into one
+    coalescedWrite(z_v[ss+s],tmp); // result lands at offset s of z
+  });
 }

 template<class vobj,class Coeff> 
 void axpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const Lattice<vobj> &y,int s,int sp)
 {
-  z.checkerboard = x.checkerboard;
+  z.Checkerboard() = x.Checkerboard(); // z takes x's checkerboard
   conformable(x,y);
   conformable(x,z);
-  GridBase *grid=x._grid;
+  GridBase *grid=x.Grid(); // supplies Ls (_rdimensions[0]) and the site count below
   int Ls = grid->_rdimensions[0];
+  auto x_v = x.View();
+  auto y_v = y.View();
+  auto z_v = z.View();
   Gamma G5(Gamma::Algebra::Gamma5);
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
-    vobj tmp;
-    tmp = G5*y._odata[ss+sp]*b;
-    tmp = tmp + a*x._odata[ss+s];
-    vstream(z._odata[ss+s],tmp);
-  }
+  uint64_t nloop = grid->oSites()/Ls; // number of blocks of Ls consecutive sites; replaces the old ss+=Ls stride
+  accelerator_for(sss,nloop,vobj::Nsimd(),{
+    uint64_t ss = sss*Ls; // first site of block sss
+    auto tmp = G5*y_v(ss+sp)*b + a*x_v(ss+s); // G5 acts on the y term only
+    coalescedWrite(z_v[ss+s],tmp); // result lands at offset s of z
+  });
 }

 template<class vobj,class Coeff> 
 void ag5xpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const Lattice<vobj> &y,int s,int sp)
 {
-  z.checkerboard = x.checkerboard;
+  z.Checkerboard() = x.Checkerboard(); // z takes x's checkerboard
   conformable(x,y);
   conformable(x,z);
-  GridBase *grid=x._grid;
+  GridBase *grid=x.Grid(); // supplies Ls (_rdimensions[0]) and the site count below
   int Ls = grid->_rdimensions[0];

+  auto x_v = x.View();
+  auto y_v = y.View();
+  auto z_v = z.View();
   Gamma G5(Gamma::Algebra::Gamma5);
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
-    vobj tmp1;
-    vobj tmp2;
-    tmp1 = a*x._odata[ss+s]+b*y._odata[ss+sp];
-    tmp2 = G5*tmp1;
-    vstream(z._odata[ss+s],tmp2);
-  }
+  uint64_t nloop = grid->oSites()/Ls; // number of blocks of Ls consecutive sites; replaces the old ss+=Ls stride
+  accelerator_for(sss,nloop,vobj::Nsimd(),{
+    uint64_t ss = sss*Ls; // first site of block sss
+    auto tmp1 = a*x_v(ss+s)+b*y_v(ss+sp); // linear combination first ...
+    auto tmp2 = G5*tmp1; // ... then G5 applied once to the sum
+    coalescedWrite(z_v[ss+s],tmp2); // result lands at offset s of z
+  });
 }

 template<class vobj,class Coeff> 
 void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const Lattice<vobj> &y,int s,int sp)
 {
-  z.checkerboard = x.checkerboard;
+  z.Checkerboard() = x.Checkerboard(); // z takes x's checkerboard
   conformable(x,y);
   conformable(x,z);
-  GridBase *grid=x._grid;
+  GridBase *grid=x.Grid(); // supplies Ls (_rdimensions[0]) and the site count below
   int Ls = grid->_rdimensions[0];
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
-    vobj tmp;
-    spProj5m(tmp,y._odata[ss+sp]);
-    tmp = a*x._odata[ss+s]+b*tmp;
-    vstream(z._odata[ss+s],tmp);
-  }
+
+  auto x_v = x.View();
+  auto y_v = y.View();
+  auto z_v = z.View();
+  uint64_t nloop = grid->oSites()/Ls; // number of blocks of Ls consecutive sites; replaces the old ss+=Ls stride
+  accelerator_for(sss,nloop,vobj::Nsimd(),{
+    uint64_t ss = sss*Ls; // first site of block sss
+    decltype(coalescedRead(y_v[ss+sp])) tmp; // temporary typed as whatever coalescedRead yields for a y site (was plain vobj)
+    spProj5m(tmp,y_v(ss+sp)); // tmp = spProj5m of y at offset sp (presumably the "pminus" projection of the function name; defined elsewhere)
+    tmp = a*x_v(ss+s)+b*tmp; // a*x plus b times the projected y
+    coalescedWrite(z_v[ss+s],tmp); // result lands at offset s of z
+  });
 }

 template<class vobj,class Coeff> 
 void axpby_ssp_pplus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const Lattice<vobj> &y,int s,int sp)
 {
-  z.checkerboard = x.checkerboard;
+  z.Checkerboard() = x.Checkerboard(); // z takes x's checkerboard
   conformable(x,y);
   conformable(x,z);
-  GridBase *grid=x._grid;
+  GridBase *grid=x.Grid(); // supplies Ls (_rdimensions[0]) and the site count below
   int Ls = grid->_rdimensions[0];
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
-    vobj tmp;
-    spProj5p(tmp,y._odata[ss+sp]);
-    tmp = a*x._odata[ss+s]+b*tmp;
-    vstream(z._odata[ss+s],tmp);
-  }
+  auto x_v = x.View();
+  auto y_v = y.View();
+  auto z_v = z.View();
+  uint64_t nloop = grid->oSites()/Ls; // number of blocks of Ls consecutive sites; replaces the old ss+=Ls stride
+  accelerator_for(sss,nloop,vobj::Nsimd(),{
+    uint64_t ss = sss*Ls; // first site of block sss
+    decltype(coalescedRead(y_v[ss+sp])) tmp; // temporary typed as whatever coalescedRead yields for a y site (was plain vobj)
+    spProj5p(tmp,y_v(ss+sp)); // tmp = spProj5p of y at offset sp (presumably the "pplus" projection of the function name; defined elsewhere)
+    tmp = a*x_v(ss+s)+b*tmp; // a*x plus b times the projected y
+    coalescedWrite(z_v[ss+s],tmp); // result lands at offset s of z
+  });
 }

 template<class vobj> 
 void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
 {
-  GridBase *grid=x._grid;
-  z.checkerboard = x.checkerboard;
+  GridBase *grid=x.Grid(); // supplies Ls (_rdimensions[0]) and the site count below
+  z.Checkerboard() = x.Checkerboard(); // z takes x's checkerboard
   conformable(x,z);
   int Ls = grid->_rdimensions[0];
   Gamma G5(Gamma::Algebra::Gamma5);
-  parallel_for(int ss=0;ss<grid->oSites();ss+=Ls) {
-    vobj tmp;
+  auto x_v = x.View();
+  auto z_v = z.View();
+  uint64_t nloop = grid->oSites()/Ls; // number of blocks of Ls consecutive sites; replaces the old ss+=Ls stride
+  accelerator_for(sss,nloop,vobj::Nsimd(),{
+    uint64_t ss = sss*Ls; // first site of block sss
     for(int s=0;s<Ls;s++){
       int sp = Ls-1-s;
-      tmp = G5*x._odata[ss+s];
-      vstream(z._odata[ss+sp],tmp);
+      coalescedWrite(z_v[ss+sp],G5*x_v(ss+s)); // z at the mirrored offset sp = Ls-1-s receives G5*x from offset s
     }
-  }
-}
+  }); // closes accelerator_for; the function's own closing brace is the unchanged line that follows
 }

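-// I explicitly need these outside the QCD namespace
+// These G5C helpers used to be kept outside the QCD namespace; that namespace is gone, so they now live in Grid like the rest of this file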
 template<typename vobj>
 void G5C(Lattice<vobj> &z, const Lattice<vobj> &x)
 {
-  GridBase *grid = x._grid;
-  z.checkerboard = x.checkerboard;
+  GridBase *grid = x.Grid(); // NOTE(review): grid is not read anywhere in this overload; candidate for removal
+  z.Checkerboard() = x.Checkerboard(); // z takes x's checkerboard
   conformable(x, z);

-  QCD::Gamma G5(QCD::Gamma::Algebra::Gamma5);
+  Gamma G5(Gamma::Algebra::Gamma5); // QCD:: qualification dropped along with the QCD namespace
   z = G5 * x;
 }

 template<class CComplex, int nbasis>
 void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex, nbasis>> &x)
 {
-  GridBase *grid = x._grid;
-  z.checkerboard = x.checkerboard;
+  GridBase *grid = x.Grid(); // supplies the site count for the loop below
+  z.Checkerboard() = x.Checkerboard(); // z takes x's checkerboard
   conformable(x, z);

   static_assert(nbasis % 2 == 0, "");
   int nb = nbasis / 2;

-  parallel_for(int ss = 0; ss < grid->oSites(); ss++) {
+  auto z_v = z.View();
+  auto x_v = x.View();
+  accelerator_for(ss,grid->oSites(),CComplex::Nsimd(),
+  {
     for(int n = 0; n < nb; ++n) {
-      z._odata[ss](n) = x._odata[ss](n);
+      coalescedWrite(z_v[ss](n), x_v(ss)(n)); // first nbasis/2 components are copied unchanged
     }
     for(int n = nb; n < nbasis; ++n) {
-      z._odata[ss](n) = -x._odata[ss](n);
+      coalescedWrite(z_v[ss](n), -x_v(ss)(n)); // remaining components are copied with the sign flipped
     }
-  }
+  });
 }

-}
-#endif 
+NAMESPACE_END(Grid);
+
--- a/Grid/qcd/utils/Metric.h
+++ b/Grid/qcd/utils/Metric.h
@ -25,13 +25,11 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
-//--------------------------------------------------------------------
-#ifndef METRIC_H
-#define METRIC_H
+/*  END LEGAL */
+//--------------------------------------------------------------------
+#pragma once

-namespace Grid{
-namespace QCD{
+NAMESPACE_BEGIN(Grid);

 template <typename Field> 
 class Metric{
@ -64,10 +62,10 @@ public:
    // do nothing
  }
   virtual void MDeriv(const Field& in, Field& out){
-    out = zero;
+    out = Zero(); // default implementation: the derivative is set to zero and in is not read
   }
   virtual void MDeriv(const Field& left, const Field& right, Field& out){
-    out = zero;
+    out = Zero(); // default implementation: the derivative is set to zero; left and right are not read
   }

 };
@ -108,7 +106,7 @@ public:
    if (1) {
      // Auxiliary momenta
      // do nothing if trivial, so hide in the metric
-      MomentaField AuxMomTemp(Mom._grid);
+      MomentaField AuxMomTemp(Mom.Grid());
      Implementation::generate_momenta(AuxMom, pRNG);
      Implementation::generate_momenta(AuxField, pRNG);
      // Modify the distribution with the metric
@ -119,11 +117,11 @@ public:

  // Correct
  RealD MomentaAction(){
-    MomentaField inv(Mom._grid);
-    inv = zero;
+    MomentaField inv(Mom.Grid());
+    inv = Zero();
    M.Minv(Mom, inv);
-    LatticeComplex Hloc(Mom._grid);
-    Hloc = zero;
+    LatticeComplex Hloc(Mom.Grid());
+    Hloc = Zero();
    for (int mu = 0; mu < Nd; mu++) {
      // This is not very general
      // hide in the metric
@ -147,7 +145,7 @@ public:
      }
    }

-    Complex Hsum = sum(Hloc);
+    auto Hsum = TensorRemove(sum(Hloc));
    return Hsum.real();
  }

@ -156,9 +154,9 @@ public:

    // Compute the derivative of the kinetic term
    // with respect to the gauge field
-    MomentaField MDer(in._grid);
-    MomentaField X(in._grid);
-    X = zero;
+    MomentaField MDer(in.Grid());
+    MomentaField X(in.Grid());
+    X = Zero();
    M.Minv(in, X);  // X = G in
    M.MDeriv(X, MDer);  // MDer = U * dS/dU
    der = Implementation::projectForce(MDer);  // Ta if gauge fields
@ -166,27 +164,27 @@ public:
  }

   void AuxiliaryFieldsDerivative(MomentaField& der){
-    der = zero;
+    der = Zero(); // start from zero; the branch below is unconditional (if (1))
     if (1){
-    // Auxiliary fields
-    MomentaField der_temp(der._grid);
-    MomentaField X(der._grid);
-    X=zero;
-    //M.M(AuxMom, X); // X = M Aux
-    // Two derivative terms
-    // the Mderiv need separation of left and right terms
-    M.MDeriv(AuxMom, der); 
+      // Auxiliary fields
+      MomentaField der_temp(der.Grid()); // NOTE(review): only referenced by the commented-out MDeriv call further down
+      MomentaField X(der.Grid()); // NOTE(review): zeroed here but otherwise only referenced by commented-out code
+      X=Zero();
+      //M.M(AuxMom, X); // X = M Aux
+      // Two derivative terms
+      // the Mderiv need separation of left and right terms
+      M.MDeriv(AuxMom, der); // der is overwritten by the metric derivative evaluated on AuxMom


-    // this one should not be necessary (identical to the previous one)
-    //M.MDeriv(X, AuxMom, der_temp); der += der_temp;
+      // this one should not be necessary (identical to the previous one)
+      //M.MDeriv(X, AuxMom, der_temp); der += der_temp;

-    der = -1.0*Implementation::projectForce(der);
+      der = -1.0*Implementation::projectForce(der); // project, then flip the sign
     }
   }

  void DerivativeP(MomentaField& der){
-    der = zero;
+    der = Zero();
    M.Minv(Mom, der);
    // is the projection necessary here?
    // no for fields in the algebra
@ -201,8 +199,8 @@ public:

  void update_auxiliary_fields(RealD ep){
    if (1) {
-      MomentaField tmp(AuxMom._grid);
-      MomentaField tmp2(AuxMom._grid);
+      MomentaField tmp(AuxMom.Grid());
+      MomentaField tmp2(AuxMom.Grid());
      M.M(AuxMom, tmp);
      // M.M(tmp, tmp2);
      AuxField += ep * tmp;  // M^2 AuxMom
@ -212,15 +210,5 @@ public:

 };

+NAMESPACE_END(Grid);

-
-
-
-
-
-
-}
-}
-
-
-#endif //METRIC_H
--- a/Grid/qcd/utils/SUn.h
+++ b/Grid/qcd/utils/SUn.h
@ -28,16 +28,15 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-/*  END LEGAL */
+/*  END LEGAL */
 #ifndef QCD_UTIL_SUN_H
 #define QCD_UTIL_SUN_H

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template <int ncolour>
 class SU {
- public:
+public:
  static const int Dimension = ncolour;
  static const int AdjointDimension = ncolour * ncolour - 1;
  static int su2subgroups(void) { return (ncolour * (ncolour - 1)) / 2; }
@ -48,7 +47,7 @@ class SU {
  using iSU2Matrix = iScalar<iScalar<iMatrix<vtype, 2> > >;
  template <typename vtype>
  using iSUnAlgebraVector =
-      iScalar<iScalar<iVector<vtype, AdjointDimension> > >;
+    iScalar<iScalar<iVector<vtype, AdjointDimension> > >;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  // Types can be accessed as SU<2>::Matrix , SU<2>::vSUnMatrix,
@ -163,7 +162,7 @@ class SU {
  
  template <class cplx>
  static void generatorSigmaY(int su2Index, iSUnMatrix<cplx> &ta) {
-    ta = zero;
+    ta = Zero();
    int i1, i2;
    su2SubGroupIndex(i1, i2, su2Index);
    ta()()(i1, i2) = 1.0;
@ -173,7 +172,7 @@ class SU {
  
  template <class cplx>
  static void generatorSigmaX(int su2Index, iSUnMatrix<cplx> &ta) {
-    ta = zero;
+    ta = Zero();
    cplx i(0.0, 1.0);
    int i1, i2;
    su2SubGroupIndex(i1, i2, su2Index);
@ -185,7 +184,7 @@ class SU {
  template <class cplx>
  static void generatorDiagonal(int diagIndex, iSUnMatrix<cplx> &ta) {
    // diag ({1, 1, ..., 1}(k-times), -k, 0, 0, ...)
-    ta = zero;
+    ta = Zero();
    int k = diagIndex + 1;                  // diagIndex starts from 0
    for (int i = 0; i <= diagIndex; i++) {  // k iterations
      ta()()(i, i) = 1.0;
@ -218,28 +217,32 @@ class SU {
                         Lattice<iSU2Matrix<vcplx> > &subgroup,
                         const Lattice<iSUnMatrix<vcplx> > &source,
                         int su2_index) {
-    GridBase *grid(source._grid);
+    GridBase *grid(source.Grid());
    conformable(subgroup, source);
    conformable(subgroup, Determinant);
    int i0, i1;
    su2SubGroupIndex(i0, i1, su2_index);
+    auto subgroup_v = subgroup.View();
+    auto source_v   = source.View();
+    auto Determinant_v = Determinant.View();

-    parallel_for (int ss = 0; ss < grid->oSites(); ss++) {
-      subgroup._odata[ss]()()(0, 0) = source._odata[ss]()()(i0, i0);
-      subgroup._odata[ss]()()(0, 1) = source._odata[ss]()()(i0, i1);
-      subgroup._odata[ss]()()(1, 0) = source._odata[ss]()()(i1, i0);
-      subgroup._odata[ss]()()(1, 1) = source._odata[ss]()()(i1, i1);
+    thread_for(ss, grid->oSites(), {

-      iSU2Matrix<vcplx> Sigma = subgroup._odata[ss];
+      subgroup_v[ss]()()(0, 0) = source_v[ss]()()(i0, i0);
+      subgroup_v[ss]()()(0, 1) = source_v[ss]()()(i0, i1);
+      subgroup_v[ss]()()(1, 0) = source_v[ss]()()(i1, i0);
+      subgroup_v[ss]()()(1, 1) = source_v[ss]()()(i1, i1);
+
+      iSU2Matrix<vcplx> Sigma = subgroup_v[ss];

      Sigma = Sigma - adj(Sigma) + trace(adj(Sigma));

-      subgroup._odata[ss] = Sigma;
+      subgroup_v[ss] = Sigma;

      // this should be purely real
-      Determinant._odata[ss] =
-          Sigma()()(0, 0) * Sigma()()(1, 1) - Sigma()()(0, 1) * Sigma()()(1, 0);
-    }
+      Determinant_v[ss] =
+	Sigma()()(0, 0) * Sigma()()(1, 1) - Sigma()()(0, 1) * Sigma()()(1, 0);
+    });
  }

  //////////////////////////////////////////////////////////////////////////////////////////
@ -248,18 +251,21 @@ class SU {
  template <class vcplx>
  static void su2Insert(const Lattice<iSU2Matrix<vcplx> > &subgroup,
                        Lattice<iSUnMatrix<vcplx> > &dest, int su2_index) {
-    GridBase *grid(dest._grid);
+    GridBase *grid(dest.Grid());
    conformable(subgroup, dest);
    int i0, i1;
    su2SubGroupIndex(i0, i1, su2_index);

    dest = 1.0;  // start out with identity
-    parallel_for (int ss = 0; ss < grid->oSites(); ss++) {
-      dest._odata[ss]()()(i0, i0) = subgroup._odata[ss]()()(0, 0);
-      dest._odata[ss]()()(i0, i1) = subgroup._odata[ss]()()(0, 1);
-      dest._odata[ss]()()(i1, i0) = subgroup._odata[ss]()()(1, 0);
-      dest._odata[ss]()()(i1, i1) = subgroup._odata[ss]()()(1, 1);
-    }
+    auto dest_v = dest.View();
+    auto subgroup_v = subgroup.View();
+    thread_for(ss, grid->oSites(),
+    {
+      dest_v[ss]()()(i0, i0) = subgroup_v[ss]()()(0, 0);
+      dest_v[ss]()()(i0, i1) = subgroup_v[ss]()()(0, 1);
+      dest_v[ss]()()(i1, i0) = subgroup_v[ss]()()(1, 0);
+      dest_v[ss]()()(i1, i1) = subgroup_v[ss]()()(1, 1);
+    });
  }

  ///////////////////////////////////////////////
@ -272,16 +278,14 @@ class SU {
  // in action.
  //
  ///////////////////////////////////////////////
-  static void SubGroupHeatBath(
-      GridSerialRNG &sRNG, GridParallelRNG &pRNG,
-      RealD beta,  // coeff multiplying staple in action (with no 1/Nc)
-      LatticeMatrix &link,
-      const LatticeMatrix &barestaple,  // multiplied by action coeffs so th
-      int su2_subgroup, int nheatbath, LatticeInteger &wheremask) {
-    GridBase *grid = link._grid;
+  static void SubGroupHeatBath(GridSerialRNG &sRNG, GridParallelRNG &pRNG,
+			       RealD beta,  // coeff multiplying staple in action (with no 1/Nc)
+			       LatticeMatrix &link,
+			       const LatticeMatrix &barestaple,  // multiplied by action coeffs so th
+			       int su2_subgroup, int nheatbath, LatticeInteger &wheremask) 
+  {
+    GridBase *grid = link.Grid();

-    int ntrials = 0;
-    int nfails = 0;
    const RealD twopi = 2.0 * M_PI;

    LatticeMatrix staple(grid);
@ -292,8 +296,7 @@ class SU {
    V = link * staple;

    // Subgroup manipulation in the lie algebra space
-    LatticeSU2Matrix u(
-        grid);  // Kennedy pendleton "u" real projected normalised Sigma
+    LatticeSU2Matrix u(grid);  // Kennedy pendleton "u" real projected normalised Sigma
    LatticeSU2Matrix uinv(grid);
    LatticeSU2Matrix ua(grid);  // a in pauli form
    LatticeSU2Matrix b(grid);   // rotated matrix after hb
@ -302,11 +305,11 @@ class SU {
    LatticeComplex ones(grid);
    ones = 1.0;
    LatticeComplex zeros(grid);
-    zeros = zero;
+    zeros = Zero();
    LatticeReal rones(grid);
    rones = 1.0;
    LatticeReal rzeros(grid);
-    rzeros = zero;
+    rzeros = Zero();
    LatticeComplex udet(grid);  // determinant of real(staple)
    LatticeInteger mask_true(grid);
    mask_true = 1;
@ -314,41 +317,41 @@ class SU {
    mask_false = 0;

    /*
-  PLB 156 P393 (1985) (Kennedy and Pendleton)
+      PLB 156 P393 (1985) (Kennedy and Pendleton)

-  Note: absorb "beta" into the def of sigma compared to KP paper; staple
-  passed to this routine has "beta" already multiplied in
+      Note: absorb "beta" into the def of sigma compared to KP paper; staple
+      passed to this routine has "beta" already multiplied in

-  Action linear in links h and of form:
+      Action linear in links h and of form:

      beta S = beta  Sum_p (1 - 1/Nc Re Tr Plaq )

-  Writing Sigma = 1/Nc (beta Sigma') where sum over staples is "Sigma' "
+      Writing Sigma = 1/Nc (beta Sigma') where sum over staples is "Sigma' "

-       beta S = const - beta/Nc Re Tr h Sigma'
-              = const - Re Tr h Sigma
+      beta S = const - beta/Nc Re Tr h Sigma'
+      = const - Re Tr h Sigma

-  Decompose h and Sigma into (1, sigma_j) ; h_i real, h^2=1, Sigma_i complex
-  arbitrary.
+      Decompose h and Sigma into (1, sigma_j) ; h_i real, h^2=1, Sigma_i complex
+      arbitrary.

      Tr h Sigma = h_i Sigma_j Tr (sigma_i sigma_j)  = h_i Sigma_j 2 delta_ij
-   Re Tr h Sigma = 2 h_j Re Sigma_j
+      Re Tr h Sigma = 2 h_j Re Sigma_j

-  Normalised re Sigma_j = xi u_j
+      Normalised re Sigma_j = xi u_j

-  With u_j a unit vector and U can be in SU(2);
+      With u_j a unit vector and U can be in SU(2);

-  Re Tr h Sigma = 2 h_j Re Sigma_j = 2 xi (h.u)
+      Re Tr h Sigma = 2 h_j Re Sigma_j = 2 xi (h.u)

-  4xi^2 = Det [ Sig - Sig^dag  + 1 Tr Sigdag]
-   u   = 1/2xi [ Sig - Sig^dag  + 1 Tr Sigdag]
+      4xi^2 = Det [ Sig - Sig^dag  + 1 Tr Sigdag]
+      u   = 1/2xi [ Sig - Sig^dag  + 1 Tr Sigdag]

-   xi = sqrt(Det)/2;
+      xi = sqrt(Det)/2;

-  Write a= u h in SU(2); a has pauli decomp a_j;
+      Write a= u h in SU(2); a has pauli decomp a_j;

-  Note: Product b' xi is unvariant because scaling Sigma leaves
-        normalised vector "u" fixed; Can rescale Sigma so b' = 1.
+      Note: Product b' xi is invariant because scaling Sigma leaves
+      normalised vector "u" fixed; Can rescale Sigma so b' = 1.
    */

    ////////////////////////////////////////////////////////
@ -386,7 +389,7 @@ class SU {

    xi = 0.5 * sqrt(udet);  // 4xi^2 = Det [ Sig - Sig^dag  + 1 Tr Sigdag]
    u = 0.5 * u *
-        pow(xi, -1.0);  //  u   = 1/2xi [ Sig - Sig^dag  + 1 Tr Sigdag]
+      pow(xi, -1.0);  //  u   = 1/2xi [ Sig - Sig^dag  + 1 Tr Sigdag]

    // Debug test for sanity
    uinv = adj(u);
@ -394,36 +397,36 @@ class SU {
    assert(norm2(b) < 1.0e-4);

    /*
-  Measure: Haar measure dh has d^4a delta(1-|a^2|)
-  In polars:
-    da = da0 r^2 sin theta dr dtheta dphi delta( 1 - r^2 -a0^2)
-       = da0 r^2 sin theta dr dtheta dphi delta( (sqrt(1-a0^) - r)(sqrt(1-a0^) +
-  r) )
-       = da0 r/2 sin theta dr dtheta dphi delta( (sqrt(1-a0^) - r) )
+      Measure: Haar measure dh has d^4a delta(1-|a^2|)
+      In polars:
+      da = da0 r^2 sin theta dr dtheta dphi delta( 1 - r^2 -a0^2)
+      = da0 r^2 sin theta dr dtheta dphi delta( (sqrt(1-a0^2) - r)(sqrt(1-a0^2) +
+      r) )
+      = da0 r/2 sin theta dr dtheta dphi delta( (sqrt(1-a0^2) - r) )

-  Action factor Q(h) dh  = e^-S[h]  dh =  e^{  xi Tr uh} dh    // beta enters
-  through xi
-                                       =  e^{2 xi (h.u)} dh
-                                       =  e^{2 xi h0u0}.e^{2 xi h1u1}.e^{2 xi
-  h2u2}.e^{2 xi h3u3} dh
+      Action factor Q(h) dh  = e^-S[h]  dh =  e^{  xi Tr uh} dh    // beta enters
+      through xi
+      =  e^{2 xi (h.u)} dh
+      =  e^{2 xi h0u0}.e^{2 xi h1u1}.e^{2 xi
+      h2u2}.e^{2 xi h3u3} dh

-  Therefore for each site, take xi for that site
-  i) generate  |a0|<1 with dist
-     (1-a0^2)^0.5 e^{2 xi a0 } da0
+      Therefore for each site, take xi for that site
+      i) generate  |a0|<1 with dist
+      (1-a0^2)^0.5 e^{2 xi a0 } da0

-  Take alpha = 2 xi  = 2 xi [ recall 2 beta/Nc unmod staple norm]; hence 2.0/Nc
-  factor in Chroma ]
-  A. Generate two uniformly distributed pseudo-random numbers R and R', R'',
-  R''' in the unit interval;
-  B. Set X = -(ln R)/alpha, X' =-(ln R')/alpha;
-  C. Set C = cos^2(2pi R"), with R" another uniform random number in [0,1] ;
-  D. Set A = XC;
-  E. Let d  = X'+A;
-  F. If R'''^2 :> 1 - 0.5 d,  go back to A;
-  G. Set a0 = 1 - d;
+      Take alpha = 2 xi  = 2 xi [ recall 2 beta/Nc unmod staple norm]; hence 2.0/Nc
+      factor in Chroma ]
+      A. Generate four uniformly distributed pseudo-random numbers R, R', R'',
+      R''' in the unit interval;
+      B. Set X = -(ln R)/alpha, X' =-(ln R')/alpha;
+      C. Set C = cos^2(2pi R"), with R" another uniform random number in [0,1] ;
+      D. Set A = XC;
+      E. Let d  = X'+A;
+      F. If R'''^2 > 1 - 0.5 d,  go back to A;
+      G. Set a0 = 1 - d;

-  Note that in step D setting B ~ X - A and using B in place of A in step E will
-  generate a second independent a 0 value.
+      Note that in step D setting B ~ X - A and using B in place of A in step E will
+      generate a second independent a0 value.
    */

    /////////////////////////////////////////////////////////
@ -435,13 +438,13 @@ class SU {
    RealD numSites = sum(rtmp);
    RealD numAccepted;
    LatticeInteger Accepted(grid);
-    Accepted = zero;
+    Accepted = Zero();
    LatticeInteger newlyAccepted(grid);

    std::vector<LatticeReal> xr(4, grid);
    std::vector<LatticeReal> a(4, grid);
    LatticeReal d(grid);
-    d = zero;
+    d = Zero();
    LatticeReal alpha(grid);

    //    std::cout<<GridLogMessage<<"xi "<<xi <<std::endl;
@ -478,7 +481,7 @@ class SU {
      LatticeInteger ione(grid);
      ione = 1;
      LatticeInteger izero(grid);
-      izero = zero;
+      izero = Zero();

      newlyAccepted = where(xrsq < thresh, ione, izero);
      Accepted = where(newlyAccepted, newlyAccepted, Accepted);
@ -493,7 +496,7 @@ class SU {
    } while ((numAccepted < numSites) && (hit < nheatbath));

    // G. Set a0 = 1 - d;
-    a[0] = zero;
+    a[0] = Zero();
    a[0] = where(wheremask, 1.0 - d, a[0]);

    //////////////////////////////////////////
@ -517,7 +520,7 @@ class SU {
    a[2] = a123mag * sin_theta * sin(phi);
    a[3] = a123mag * cos_theta;

-    ua = toComplex(a[0]) * ident + toComplex(a[1]) * pauli1 +
+    ua = toComplex(a[0]) * ident  + toComplex(a[1]) * pauli1 +
         toComplex(a[2]) * pauli2 + toComplex(a[3]) * pauli3;

    b = 1.0;
@ -531,7 +534,7 @@ class SU {
    // Debug Checks
    // SU2 check
    LatticeSU2Matrix check(grid);  // rotated matrix after hb
-    u = zero;
+    u = Zero();
    check = ua * adj(ua) - 1.0;
    check = where(Accepted, check, u);
    assert(norm2(check) < 1.0e-4);
@ -541,7 +544,7 @@ class SU {
    assert(norm2(check) < 1.0e-4);

    LatticeMatrix Vcheck(grid);
-    Vcheck = zero;
+    Vcheck = Zero();
    Vcheck = where(Accepted, V * adj(V) - 1.0, Vcheck);
    //    std::cout<<GridLogMessage << "SU3 check " <<norm2(Vcheck)<<std::endl;
    assert(norm2(Vcheck) < 1.0e-4);
@ -607,7 +610,7 @@ class SU {
  template <typename LatticeMatrixType>
  static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out,
                           double scale = 1.0) {
-    GridBase *grid = out._grid;
+    GridBase *grid = out.Grid();

    typedef typename LatticeMatrixType::vector_type vector_type;
    typedef typename LatticeMatrixType::scalar_type scalar_type;
@ -616,16 +619,16 @@ class SU {

    typedef Lattice<vTComplexType> LatticeComplexType;
    typedef typename GridTypeMapper<
-        typename LatticeMatrixType::vector_object>::scalar_object MatrixType;
+      typename LatticeMatrixType::vector_object>::scalar_object MatrixType;

    LatticeComplexType ca(grid);
    LatticeMatrixType lie(grid);
    LatticeMatrixType la(grid);
    ComplexD ci(0.0, scale);
-    ComplexD cone(1.0, 0.0);
+    //    ComplexD cone(1.0, 0.0);
    MatrixType ta;

-    lie = zero;
+    lie = Zero();
    for (int a = 0; a < AdjointDimension; a++) {
      random(pRNG, ca);

@ -644,13 +647,13 @@ class SU {
  static void GaussianFundamentalLieAlgebraMatrix(GridParallelRNG &pRNG,
                                                  LatticeMatrix &out,
                                                  Real scale = 1.0) {
-    GridBase *grid = out._grid;
+    GridBase *grid = out.Grid();
    LatticeReal ca(grid);
    LatticeMatrix la(grid);
    Complex ci(0.0, scale);
    Matrix ta;

-    out = zero;
+    out = Zero();
    for (int a = 0; a < AdjointDimension; a++) {
      gaussian(pRNG, ca);
      generator(a, ta);
@ -664,11 +667,11 @@ class SU {
                                          LatticeMatrix &out,
                                          Real scale = 1.0) {
    conformable(h, out);
-    GridBase *grid = out._grid;
+    GridBase *grid = out.Grid();
    LatticeMatrix la(grid);
    Matrix ta;

-    out = zero;
+    out = Zero();
    for (int a = 0; a < AdjointDimension; a++) {
      generator(a, ta);
      la = peekColour(h, a) * timesI(ta) * scale;
@ -687,10 +690,11 @@ class SU {
 /*
 * Adjoint rep gauge xform
 */
+
  template<typename GaugeField,typename GaugeMat>
  static void GaugeTransform( GaugeField &Umu, GaugeMat &g){
-    GridBase *grid = Umu._grid;
-    conformable(grid,g._grid);
+    GridBase *grid = Umu.Grid();
+    conformable(grid,g.Grid());

    GaugeMat U(grid);
    GaugeMat ag(grid); ag = adj(g);
@ -702,8 +706,8 @@ class SU {
    }
  }
  template<typename GaugeMat>
-    static void GaugeTransform( std::vector<GaugeMat> &U, GaugeMat &g){
-    GridBase *grid = g._grid;
+  static void GaugeTransform( std::vector<GaugeMat> &U, GaugeMat &g){
+    GridBase *grid = g.Grid();
    GaugeMat ag(grid); ag = adj(g);
    for(int mu=0;mu<Nd;mu++){
      U[mu] = g*U[mu]*Cshift(ag, mu, 1);
@ -719,7 +723,7 @@ class SU {
  // inverse operation: FundamentalLieAlgebraMatrix
  static void projectOnAlgebra(LatticeAlgebraVector &h_out, const LatticeMatrix &in, Real scale = 1.0) {
    conformable(h_out, in);
-    h_out = zero;
+    h_out = Zero();
    Matrix Ta;

    for (int a = 0; a < AdjointDimension; a++) {
@ -735,7 +739,7 @@ class SU {
    typedef iSUnMatrix<vector_type> vMatrixType;
    typedef Lattice<vMatrixType> LatticeMatrixType;

-    LatticeMatrixType Umu(out._grid);
+    LatticeMatrixType Umu(out.Grid());
    for (int mu = 0; mu < Nd; mu++) {
      LieRandomize(pRNG, Umu, 1.0);
      PokeIndex<LorentzIndex>(out, Umu, mu);
@ -747,7 +751,7 @@ class SU {
    typedef iSUnMatrix<vector_type> vMatrixType;
    typedef Lattice<vMatrixType> LatticeMatrixType;

-    LatticeMatrixType Umu(out._grid);
+    LatticeMatrixType Umu(out.Grid());
    for(int mu=0;mu<Nd;mu++){
      LieRandomize(pRNG,Umu,0.01);
      PokeIndex<LorentzIndex>(out,Umu,mu);
@ -759,7 +763,7 @@ class SU {
    typedef iSUnMatrix<vector_type> vMatrixType;
    typedef Lattice<vMatrixType> LatticeMatrixType;

-    LatticeMatrixType Umu(out._grid);
+    LatticeMatrixType Umu(out.Grid());
    Umu=1.0;
    for(int mu=0;mu<Nd;mu++){
      PokeIndex<LorentzIndex>(out,Umu,mu);
@ -778,7 +782,7 @@ class SU {
  static void taExp(const LatticeMatrixType &x, LatticeMatrixType &ex) {
    typedef typename LatticeMatrixType::scalar_type ComplexType;

-    LatticeMatrixType xn(x._grid);
+    LatticeMatrixType xn(x.Grid());
    RealD nfac = 1.0;

    xn = x;
@ -801,6 +805,5 @@ typedef SU<5> SU5;

 typedef SU<Nc> FundamentalMatrices;

-}
-}
+NAMESPACE_END(Grid);
 #endif
--- a/Grid/qcd/utils/SUnAdjoint.h
+++ b/Grid/qcd/utils/SUnAdjoint.h
@ -22,17 +22,16 @@
 //
 ////////////////////////////////////////////////////////////////////////

-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 template <int ncolour>
 class SU_Adjoint : public SU<ncolour> {
- public:
+public:
  static const int Dimension = ncolour * ncolour - 1;

  template <typename vtype>
  using iSUnAdjointMatrix =
-      iScalar<iScalar<iMatrix<vtype, Dimension > > >;
+    iScalar<iScalar<iMatrix<vtype, Dimension > > >;

  // Actually the adjoint matrices are real...
  // Consider this overhead... FIXME
@ -49,11 +48,11 @@ class SU_Adjoint : public SU<ncolour> {
  typedef Lattice<vAMatrixD> LatticeAdjMatrixD;

  typedef Lattice<iVector<iScalar<iMatrix<vComplex, Dimension> >, Nd> >
-      LatticeAdjField;
+  LatticeAdjField;
  typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> >
-      LatticeAdjFieldF;
+  LatticeAdjFieldF;
  typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> >
-      LatticeAdjFieldD;
+  LatticeAdjFieldD;



@ -62,7 +61,7 @@ class SU_Adjoint : public SU<ncolour> {
  static void generator(int Index, iSUnAdjointMatrix<cplx> &iAdjTa) {
    // returns i(T_Adj)^index necessary for the projectors
    // see definitions above
-    iAdjTa = zero;
+    iAdjTa = Zero();
    Vector<typename SU<ncolour>::template iSUnMatrix<cplx> > ta(ncolour * ncolour - 1);
    typename SU<ncolour>::template iSUnMatrix<cplx> tmp;

@ -73,7 +72,7 @@ class SU_Adjoint : public SU<ncolour> {
      tmp = ta[a] * ta[Index] - ta[Index] * ta[a];
      for (int b = 0; b < (ncolour * ncolour - 1); b++) {
        typename SU<ncolour>::template iSUnMatrix<cplx> tmp1 =
-            2.0 * tmp * ta[b];  // 2.0 from the normalization
+	  2.0 * tmp * ta[b];  // 2.0 from the normalization
        Complex iTr = TensorRemove(timesI(trace(tmp1)));
        //iAdjTa()()(b, a) = iTr;
        iAdjTa()()(a, b) = iTr;
@ -112,14 +111,14 @@ class SU_Adjoint : public SU<ncolour> {
  }

  static void AdjointLieAlgebraMatrix(
-      const typename SU<ncolour>::LatticeAlgebraVector &h,
-      LatticeAdjMatrix &out, Real scale = 1.0) {
+				      const typename SU<ncolour>::LatticeAlgebraVector &h,
+				      LatticeAdjMatrix &out, Real scale = 1.0) {
    conformable(h, out);
-    GridBase *grid = out._grid;
+    GridBase *grid = out.Grid();
    LatticeAdjMatrix la(grid);
    AMatrix iTa;

-    out = zero;
+    out = Zero();
    for (int a = 0; a < Dimension; a++) {
      generator(a, iTa);
      la = peekColour(h, a) * iTa;
@ -131,7 +130,7 @@ class SU_Adjoint : public SU<ncolour> {
  // Projects the algebra components a lattice matrix (of dimension ncol*ncol -1 )
  static void projectOnAlgebra(typename SU<ncolour>::LatticeAlgebraVector &h_out, const LatticeAdjMatrix &in, Real scale = 1.0) {
    conformable(h_out, in);
-    h_out = zero;
+    h_out = Zero();
    AMatrix iTa;
    Real coefficient = - 1.0/(ncolour) * scale;// 1/Nc for the normalization of the trace in the adj rep

@ -146,15 +145,15 @@ class SU_Adjoint : public SU<ncolour> {
  static void projector(typename SU<ncolour>::LatticeAlgebraVector &h_out, const LatticeAdjMatrix &in, Real scale = 1.0) {
    conformable(h_out, in);
    static std::vector<AMatrix> iTa(Dimension);  // to store the generators
-    h_out = zero;
+    h_out = Zero();
    static bool precalculated = false; 
    if (!precalculated){
      precalculated = true;
-        for (int a = 0; a < Dimension; a++) generator(a, iTa[a]);
+      for (int a = 0; a < Dimension; a++) generator(a, iTa[a]);
    }

    Real coefficient = -1.0 / (ncolour) * scale;  // 1/Nc for the normalization of
-                                                // the trace in the adj rep
+    // the trace in the adj rep

    for (int a = 0; a < Dimension; a++) {
      auto tmp = real(trace(iTa[a] * in)) * coefficient; 
@ -176,7 +175,7 @@ typedef SU_Adjoint<4> SU4Adjoint;
 typedef SU_Adjoint<5> SU5Adjoint;

 typedef SU_Adjoint<Nc> AdjointMatrices;
-}
-}
+
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/utils/SUnTwoIndex.h
+++ b/Grid/qcd/utils/SUnTwoIndex.h
@ -26,8 +26,7 @@
 #define QCD_UTIL_SUN2INDEX_H


-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);

 enum TwoIndexSymmetry { Symmetric = 1, AntiSymmetric = -1 };

@ -35,7 +34,7 @@ inline Real delta(int a, int b) { return (a == b) ? 1.0 : 0.0; }

 template <int ncolour, TwoIndexSymmetry S>
 class SU_TwoIndex : public SU<ncolour> {
- public:
+public:
  static const int Dimension = ncolour * (ncolour + S) / 2;
  static const int NumGenerators = SU<ncolour>::AdjointDimension;

@ -55,11 +54,11 @@ class SU_TwoIndex : public SU<ncolour> {
  typedef Lattice<vTIMatrixD> LatticeTwoIndexMatrixD;

  typedef Lattice<iVector<iScalar<iMatrix<vComplex, Dimension> >, Nd> >
-      LatticeTwoIndexField;
+  LatticeTwoIndexField;
  typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> >
-      LatticeTwoIndexFieldF;
+  LatticeTwoIndexFieldF;
  typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> >
-      LatticeTwoIndexFieldD;
+  LatticeTwoIndexFieldD;

  template <typename vtype>
  using iSUnMatrix = iScalar<iScalar<iMatrix<vtype, ncolour> > >;
@ -72,7 +71,7 @@ class SU_TwoIndex : public SU<ncolour> {
  static void base(int Index, iSUnMatrix<cplx> &eij) {
    // returns (e)^(ij)_{kl} necessary for change of base U_F -> U_R
    assert(Index < NumGenerators);
-    eij = zero;
+    eij = Zero();

    // for the linearisation of the 2 indexes 
    static int a[ncolour * (ncolour - 1) / 2][2]; // store the a <-> i,j
@ -98,18 +97,18 @@ class SU_TwoIndex : public SU<ncolour> {

  template <class cplx>
  static void baseDiagonal(int Index, iSUnMatrix<cplx> &eij) {
-    eij = zero;
+    eij = Zero();
    eij()()(Index - ncolour * (ncolour - 1) / 2,
            Index - ncolour * (ncolour - 1) / 2) = 1.0;
  }

  template <class cplx>
  static void baseOffDiagonal(int i, int j, iSUnMatrix<cplx> &eij) {
-    eij = zero;
+    eij = Zero();
    for (int k = 0; k < ncolour; k++)
      for (int l = 0; l < ncolour; l++)
        eij()()(l, k) = delta(i, k) * delta(j, l) +
-                        S * delta(j, k) * delta(i, l);
+	  S * delta(j, k) * delta(i, l);

    RealD nrm = 1. / std::sqrt(2.0);
    eij = eij * nrm;
@ -128,10 +127,10 @@ class SU_TwoIndex : public SU<ncolour> {
  template <class cplx>
  static void generator(int Index, iSUnTwoIndexMatrix<cplx> &i2indTa) {
    Vector<typename SU<ncolour>::template iSUnMatrix<cplx> > ta(
-        ncolour * ncolour - 1);
+								ncolour * ncolour - 1);
    Vector<typename SU<ncolour>::template iSUnMatrix<cplx> > eij(Dimension);
    typename SU<ncolour>::template iSUnMatrix<cplx> tmp;
-    i2indTa = zero;
+    i2indTa = Zero();
    
    for (int a = 0; a < ncolour * ncolour - 1; a++)
      SU<ncolour>::generator(a, ta[a]);
@ -142,7 +141,7 @@ class SU_TwoIndex : public SU<ncolour> {
      tmp = transpose(ta[Index]) * adj(eij[a]) + adj(eij[a]) * ta[Index];
      for (int b = 0; b < Dimension; b++) {
        typename SU<ncolour>::template iSUnMatrix<cplx> tmp1 =
-            tmp * eij[b]; 
+	  tmp * eij[b]; 
        Complex iTr = TensorRemove(timesI(trace(tmp1)));
        i2indTa()()(a, b) = iTr;
      }
@ -197,14 +196,14 @@ class SU_TwoIndex : public SU<ncolour> {
  }

  static void TwoIndexLieAlgebraMatrix(
-      const typename SU<ncolour>::LatticeAlgebraVector &h,
-      LatticeTwoIndexMatrix &out, Real scale = 1.0) {
+				       const typename SU<ncolour>::LatticeAlgebraVector &h,
+				       LatticeTwoIndexMatrix &out, Real scale = 1.0) {
    conformable(h, out);
-    GridBase *grid = out._grid;
+    GridBase *grid = out.Grid();
    LatticeTwoIndexMatrix la(grid);
    TIMatrix i2indTa;

-    out = zero;
+    out = Zero();
    for (int a = 0; a < ncolour * ncolour - 1; a++) {
      generator(a, i2indTa);
      la = peekColour(h, a) * i2indTa;
@ -216,10 +215,10 @@ class SU_TwoIndex : public SU<ncolour> {
  // Projects the algebra components 
  // of a lattice matrix ( of dimension ncol*ncol -1 )
  static void projectOnAlgebra(
-      typename SU<ncolour>::LatticeAlgebraVector &h_out,
-      const LatticeTwoIndexMatrix &in, Real scale = 1.0) {
+			       typename SU<ncolour>::LatticeAlgebraVector &h_out,
+			       const LatticeTwoIndexMatrix &in, Real scale = 1.0) {
    conformable(h_out, in);
-    h_out = zero;
+    h_out = Zero();
    TIMatrix i2indTa;
    Real coefficient = -2.0 / (ncolour + 2 * S) * scale;
    // 2/(Nc +/- 2) for the normalization of the trace in the two index rep
@ -237,7 +236,7 @@ class SU_TwoIndex : public SU<ncolour> {
    conformable(h_out, in);
    // to store the generators
    static std::vector<TIMatrix> i2indTa(ncolour * ncolour -1); 
-    h_out = zero;
+    h_out = Zero();
    static bool precalculated = false;
    if (!precalculated) {
      precalculated = true;
@ -245,8 +244,8 @@ class SU_TwoIndex : public SU<ncolour> {
    }

    Real coefficient =
-        -2.0 / (ncolour + 2 * S) * scale;  // 2/(Nc +/- 2) for the normalization
-                                           // of the trace in the two index rep
+      -2.0 / (ncolour + 2 * S) * scale;  // 2/(Nc +/- 2) for the normalization
+    // of the trace in the two index rep

    for (int a = 0; a < ncolour * ncolour - 1; a++) {
      auto tmp = real(trace(i2indTa[a] * in)) * coefficient;
@ -269,8 +268,6 @@ typedef SU_TwoIndex<3, AntiSymmetric> SU3TwoIndexAntiSymm;
 typedef SU_TwoIndex<4, AntiSymmetric> SU4TwoIndexAntiSymm;
 typedef SU_TwoIndex<5, AntiSymmetric> SU5TwoIndexAntiSymm;

-
-}
-}
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/utils/ScalarObjs.h
+++ b/Grid/qcd/utils/ScalarObjs.h
@ -28,15 +28,13 @@ directory
 /*  END LEGAL */
 #ifndef SCALAR_OBJS_H
 #define SCALAR_OBJS_H
-namespace Grid {

-  // FIXME drop the QCD namespace in Nd
-  
+NAMESPACE_BEGIN(Grid);

 // Scalar field obs
 template <class Impl>
 class ScalarObs {
- public:
+public:
  //////////////////////////////////////////////////
  // squared field
  //////////////////////////////////////////////////
@ -61,7 +59,7 @@ class ScalarObs {
  static void phider(typename Impl::Field &fsq,
                     const typename Impl::Field &f) {
    fsq = Cshift(f, 0, -1) * f;
-    for (int mu = 1; mu < QCD::Nd; mu++) fsq += Cshift(f, mu, -1) * f;
+    for (int mu = 1; mu < Nd; mu++) fsq += Cshift(f, mu, -1) * f;
  }

  //////////////////////////////////////////////////
@ -69,28 +67,27 @@ class ScalarObs {
  //////////////////////////////////////////////////

  static RealD sumphider(const typename Impl::Field &f) {
-    typename Impl::Field tmp(f._grid);
+    typename Impl::Field tmp(f.Grid());
    tmp = Cshift(f, 0, -1) * f;
-    for (int mu = 1; mu < QCD::Nd; mu++) {
+    for (int mu = 1; mu < Nd; mu++) {
      tmp += Cshift(f, mu, -1) * f;
    }
    return -sum(trace(tmp));
  }

  static RealD sumphisquared(const typename Impl::Field &f) {
-    typename Impl::Field tmp(f._grid);
+    typename Impl::Field tmp(f.Grid());
    tmp = f * f;
    return sum(trace(tmp));
  }

  static RealD sumphifourth(const typename Impl::Field &f) {
-    typename Impl::Field tmp(f._grid);
+    typename Impl::Field tmp(f.Grid());
    phifourth(tmp, f);
    return sum(trace(tmp));
  }
 };

-
-}
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/utils/SpaceTimeGrid.cc
+++ b/Grid/qcd/utils/SpaceTimeGrid.cc
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -23,18 +23,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <Grid/GridQCDcore.h>

-namespace Grid { 
-  namespace QCD {
+NAMESPACE_BEGIN(Grid); 

 /////////////////////////////////////////////////////////////////
 // Public interface
 /////////////////////////////////////////////////////////////////
-GridCartesian *SpaceTimeGrid::makeFourDimGrid(const std::vector<int> & latt,const std::vector<int> &simd,const std::vector<int> &mpi)
+GridCartesian *SpaceTimeGrid::makeFourDimGrid(const Coordinate & latt,const Coordinate &simd,const Coordinate &mpi)
 {
  return new GridCartesian(latt,simd,mpi); 
 }
@ -42,23 +41,23 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFourDimRedBlackGrid(const GridCartesia
 {
  return new GridRedBlackCartesian(FourDimGrid); 
 }
-GridCartesian *SpaceTimeGrid::makeFourDimDWFGrid(const std::vector<int> & latt,const std::vector<int> &mpi)
+GridCartesian *SpaceTimeGrid::makeFourDimDWFGrid(const Coordinate & latt,const Coordinate &mpi)
 {
-  std::vector<int> simd(4,1);
+  Coordinate simd(4,1);
  return makeFourDimGrid(latt,simd,mpi);
 }
 GridCartesian         *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian *FourDimGrid)
 {
  int N4=FourDimGrid->_ndimension;

-  std::vector<int> latt5(1,Ls);
-  std::vector<int> simd5(1,1);
-  std::vector<int>  mpi5(1,1);
+  Coordinate latt5(1,Ls);
+  Coordinate simd5(1,1);
+  Coordinate  mpi5(1,1);
  
  for(int d=0;d<N4;d++){
    latt5.push_back(FourDimGrid->_fdimensions[d]);
    simd5.push_back(FourDimGrid->_simd_layout[d]);
-     mpi5.push_back(FourDimGrid->_processors[d]);
+    mpi5.push_back(FourDimGrid->_processors[d]);
  }
  return new GridCartesian(latt5,simd5,mpi5,*FourDimGrid); 
 }
@ -68,9 +67,9 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC
 {
  int N4=FourDimGrid->_ndimension;
  int cbd=1;
-  std::vector<int>   cb5(1,0);
+  Coordinate   cb5(1,0);
  for(int d=0;d<N4;d++){
-      cb5.push_back(  1);
+    cb5.push_back(  1);
  }
  GridCartesian *tmp = makeFiveDimGrid(Ls,FourDimGrid);
  GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,cb5,cbd); 
@ -84,14 +83,14 @@ GridCartesian         *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartes
  int N4    = FourDimGrid->_ndimension;
  int nsimd = FourDimGrid->Nsimd();

-  std::vector<int> latt5(1,Ls);
-  std::vector<int> simd5(1,nsimd);
-  std::vector<int>  mpi5(1,1);
+  Coordinate latt5(1,Ls);
+  Coordinate simd5(1,nsimd);
+  Coordinate  mpi5(1,1);
  
  for(int d=0;d<N4;d++){
    latt5.push_back(FourDimGrid->_fdimensions[d]);
    simd5.push_back(1);
-     mpi5.push_back(FourDimGrid->_processors[d]);
+    mpi5.push_back(FourDimGrid->_processors[d]);
  }
  return new GridCartesian(latt5,simd5,mpi5,*FourDimGrid); 
 }
@ -103,9 +102,9 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const Gr
 {
  int N4=FourDimGrid->_ndimension;
  int cbd=1;
-  std::vector<int>   cb5(1,0);
+  Coordinate   cb5(1,0);
  for(int d=0;d<N4;d++){
-      cb5.push_back(1);
+    cb5.push_back(1);
  }
  GridCartesian *tmp         = makeFiveDimDWFGrid(Ls,FourDimGrid);
  GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,cb5,cbd); 
@ -113,5 +112,4 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const Gr
  return ret;
 }

-
-}}
+NAMESPACE_END(Grid);
--- a/Grid/qcd/utils/SpaceTimeGrid.h
+++ b/Grid/qcd/utils/SpaceTimeGrid.h
@ -1,4 +1,4 @@
-    /*************************************************************************************
+/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

@ -23,17 +23,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
+*************************************************************************************/
+/*  END LEGAL */
 #ifndef GRID_QCD_SPACE_TIME_GRID_H
 #define GRID_QCD_SPACE_TIME_GRID_H
-namespace Grid {
-namespace QCD {
+
+NAMESPACE_BEGIN(Grid);

 class SpaceTimeGrid {
- public:
+public:

-  static GridCartesian         *makeFourDimGrid(const std::vector<int> & latt,const std::vector<int> &simd,const std::vector<int> &mpi);
+  static GridCartesian         *makeFourDimGrid(const Coordinate & latt,const Coordinate &simd,const Coordinate &mpi);
  static GridRedBlackCartesian *makeFourDimRedBlackGrid       (const GridCartesian *FourDimGrid);

  static GridCartesian         *makeFiveDimGrid        (int Ls,const GridCartesian *FourDimGrid);
@ -41,10 +41,10 @@ class SpaceTimeGrid {

  static GridCartesian         *makeFiveDimDWFGrid        (int Ls,const GridCartesian *FourDimGrid);
  static GridRedBlackCartesian *makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
-  static GridCartesian         *makeFourDimDWFGrid        (const std::vector<int> & latt,const std::vector<int> &mpi);
+  static GridCartesian         *makeFourDimDWFGrid        (const Coordinate & latt,const Coordinate &mpi);

 };

-}}
+NAMESPACE_END(Grid);

 #endif
--- a/Grid/qcd/utils/WilsonLoops.h
+++ b/Grid/qcd/utils/WilsonLoops.h
@ -33,8 +33,8 @@ directory
 /*  END LEGAL */
 #ifndef QCD_UTILS_WILSON_LOOPS_H
 #define QCD_UTILS_WILSON_LOOPS_H
-namespace Grid {
-namespace QCD {
+
+NAMESPACE_BEGIN(Grid);

 // Common wilson loop observables
 template <class Gimpl> class WilsonLoops : public Gimpl {
@ -57,16 +57,16 @@ public:
    // purpose of deriving
    // from Gimpl.
    /*
-    plaq = Gimpl::CovShiftBackward(
-        U[mu], mu, Gimpl::CovShiftBackward(
-                       U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[nu])));
-                       */
+      plaq = Gimpl::CovShiftBackward(
+      U[mu], mu, Gimpl::CovShiftBackward(
+      U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[nu])));
+    */
    // _
    //|< _|
    plaq = Gimpl::CovShiftForward(U[mu],mu,
-           Gimpl::CovShiftForward(U[nu],nu,
-           Gimpl::CovShiftBackward(U[mu],mu,
-           Gimpl::CovShiftIdentityBackward(U[nu], nu))));
+				  Gimpl::CovShiftForward(U[nu],nu,
+							 Gimpl::CovShiftBackward(U[mu],mu,
+										 Gimpl::CovShiftIdentityBackward(U[nu], nu))));



@ -78,7 +78,7 @@ public:
  static void traceDirPlaquette(ComplexField &plaq,
                                const std::vector<GaugeMat> &U, const int mu,
                                const int nu) {
-    GaugeMat sp(U[0]._grid);
+    GaugeMat sp(U[0].Grid());
    dirPlaquette(sp, U, mu, nu);
    plaq = trace(sp);
  }
@ -87,8 +87,8 @@ public:
  //////////////////////////////////////////////////
  static void sitePlaquette(ComplexField &Plaq,
                            const std::vector<GaugeMat> &U) {
-    ComplexField sitePlaq(U[0]._grid);
-    Plaq = zero;
+    ComplexField sitePlaq(U[0].Grid());
+    Plaq = Zero();
    for (int mu = 1; mu < Nd; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceDirPlaquette(sitePlaq, U, mu, nu);
@ -100,13 +100,13 @@ public:
  // sum over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
  static RealD sumPlaquette(const GaugeLorentz &Umu) {
-    std::vector<GaugeMat> U(Nd, Umu._grid);
+    std::vector<GaugeMat> U(Nd, Umu.Grid());
    // inefficient here
    for (int mu = 0; mu < Nd; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

-    ComplexField Plaq(Umu._grid);
+    ComplexField Plaq(Umu.Grid());

    sitePlaquette(Plaq, U);
    auto Tp = sum(Plaq);
@ -120,7 +120,7 @@ public:
  //////////////////////////////////////////////////
  static RealD avgPlaquette(const GaugeLorentz &Umu) {
    RealD sumplaq = sumPlaquette(Umu);
-    double vol = Umu._grid->gSites();
+    double vol = Umu.Grid()->gSites();
    double faces = (1.0 * Nd * (Nd - 1)) / 2.0;
    return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
  }
@ -130,12 +130,12 @@ public:
  // average over all x,y,z the temporal loop
  //////////////////////////////////////////////////
  static ComplexD avgPolyakovLoop(const GaugeField &Umu) {  //assume Nd=4
-    GaugeMat Ut(Umu._grid), P(Umu._grid);
+    GaugeMat Ut(Umu.Grid()), P(Umu.Grid());
    ComplexD out;
-    int T = Umu._grid->GlobalDimensions()[3];
-    int X = Umu._grid->GlobalDimensions()[0];
-    int Y = Umu._grid->GlobalDimensions()[1];
-    int Z = Umu._grid->GlobalDimensions()[2];
+    int T = Umu.Grid()->GlobalDimensions()[3];
+    int X = Umu.Grid()->GlobalDimensions()[0];
+    int Y = Umu.Grid()->GlobalDimensions()[1];
+    int Z = Umu.Grid()->GlobalDimensions()[2];

    Ut = peekLorentz(Umu,3); //Select temporal direction
    P = Ut;
@ -151,10 +151,10 @@ public:
  // average over traced single links
  //////////////////////////////////////////////////
  static RealD linkTrace(const GaugeLorentz &Umu) {
-    std::vector<GaugeMat> U(Nd, Umu._grid);
+    std::vector<GaugeMat> U(Nd, Umu.Grid());

-    ComplexField Tr(Umu._grid);
-    Tr = zero;
+    ComplexField Tr(Umu.Grid());
+    Tr = Zero();
    for (int mu = 0; mu < Nd; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
      Tr = Tr + trace(U[mu]);
@ -163,7 +163,7 @@ public:
    auto Tp = sum(Tr);
    auto p = TensorRemove(Tp);

-    double vol = Umu._grid->gSites();
+    double vol = Umu.Grid()->gSites();

    return p.real() / vol / 4.0 / 3.0;
  };
@ -174,13 +174,13 @@ public:
  static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
                     int nu) {

-    GridBase *grid = Umu._grid;
+    GridBase *grid = Umu.Grid();

    std::vector<GaugeMat> U(Nd, grid);
    for (int d = 0; d < Nd; d++) {
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
-    staple = zero;
+    staple = Zero();

    if (nu != mu) {

@ -194,11 +194,11 @@ public:
      //

      staple += Gimpl::ShiftStaple(
-          Gimpl::CovShiftForward(
-              U[nu], nu,
-              Gimpl::CovShiftBackward(
-                  U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
-          mu);
+				   Gimpl::CovShiftForward(
+							  U[nu], nu,
+							  Gimpl::CovShiftBackward(
+										  U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
+				   mu);

      //  __
      // |
@ -206,23 +206,23 @@ public:
      //
      //
      staple += Gimpl::ShiftStaple(
-          Gimpl::CovShiftBackward(U[nu], nu,
-                                  Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
-          mu);
+				   Gimpl::CovShiftBackward(U[nu], nu,
+							   Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
+				   mu);
    }
  }


-// For the force term
+  // For the force term
 /*
-static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
-    GridBase *grid = Umu._grid;
+  static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
+    GridBase *grid = Umu.Grid();
    std::vector<GaugeMat> U(Nd, grid);
    for (int d = 0; d < Nd; d++) {
      // this operation is taking too much time
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
-    staple = zero;
+    staple = Zero();
    GaugeMat tmp1(grid);
    GaugeMat tmp2(grid);

@ -237,20 +237,20 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
      }
    }
    staple = U[mu]*staple;
-}
+  }
 */
  //////////////////////////////////////////////////
  // the sum over all staples on each site
  //////////////////////////////////////////////////
  static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {

-    GridBase *grid = Umu._grid;
+    GridBase *grid = Umu.Grid();

    std::vector<GaugeMat> U(Nd, grid);
    for (int d = 0; d < Nd; d++) {
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
-    staple = zero;
+    staple = Zero();

    for (int nu = 0; nu < Nd; nu++) {

@ -266,11 +266,11 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
        //
     
        staple += Gimpl::ShiftStaple(
-            Gimpl::CovShiftForward(
-                U[nu], nu,
-                Gimpl::CovShiftBackward(
-                    U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
-            mu);
+				     Gimpl::CovShiftForward(
+							    U[nu], nu,
+							    Gimpl::CovShiftBackward(
+										    U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
+				     mu);

        //  __
        // |
@ -279,8 +279,8 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
        //

        staple += Gimpl::ShiftStaple(
-            Gimpl::CovShiftBackward(U[nu], nu,
-                                    Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
+				     Gimpl::CovShiftBackward(U[nu], nu,
+							     Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
      }
    }
  }
@ -291,7 +291,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  static void StapleUpper(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
                          int nu) {
    if (nu != mu) {
-      GridBase *grid = Umu._grid;
+      GridBase *grid = Umu.Grid();

      std::vector<GaugeMat> U(Nd, grid);
      for (int d = 0; d < Nd; d++) {
@ -308,11 +308,11 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
      //

      staple = Gimpl::ShiftStaple(
-          Gimpl::CovShiftForward(
-              U[nu], nu,
-              Gimpl::CovShiftBackward(
-                  U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
-          mu);
+				  Gimpl::CovShiftForward(
+							 U[nu], nu,
+							 Gimpl::CovShiftBackward(
+										 U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
+				  mu);
    }
  }

@ -322,7 +322,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  static void StapleLower(GaugeMat &staple, const GaugeLorentz &Umu, int mu,
                          int nu) {
    if (nu != mu) {
-      GridBase *grid = Umu._grid;
+      GridBase *grid = Umu.Grid();

      std::vector<GaugeMat> U(Nd, grid);
      for (int d = 0; d < Nd; d++) {
@ -339,7 +339,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
      //
      //
      staple = Gimpl::ShiftStaple(
-          Gimpl::CovShiftBackward(U[nu], nu,
+				  Gimpl::CovShiftBackward(U[nu], nu,
                                  Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
          mu);

@ -350,18 +350,18 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  //  Field Strength
  //////////////////////////////////////////////////////
  static void FieldStrength(GaugeMat &FS, const GaugeLorentz &Umu, int mu, int nu){
-      // Fmn +--<--+  Ut +--<--+
-      //     |     |     |     |
+    // Fmn +--<--+  Ut +--<--+
+    //     |     |     |     |
      //  (x)+-->--+     +-->--+(x)  - h.c.
-      //     |     |     |     |
-      //     +--<--+     +--<--+
+    //     |     |     |     |
+    //     +--<--+     +--<--+

-      GaugeMat Vup(Umu._grid), Vdn(Umu._grid);
-      StapleUpper(Vup, Umu, mu, nu);
-      StapleLower(Vdn, Umu, mu, nu);
-      GaugeMat v = Vup - Vdn;
-      GaugeMat u = PeekIndex<LorentzIndex>(Umu, mu);  // some redundant copies
-      GaugeMat vu = v*u;
+    GaugeMat Vup(Umu.Grid()), Vdn(Umu.Grid());
+    StapleUpper(Vup, Umu, mu, nu);
+    StapleLower(Vdn, Umu, mu, nu);
+    GaugeMat v = Vup - Vdn;
+    GaugeMat u = PeekIndex<LorentzIndex>(Umu, mu);  // some redundant copies
+    GaugeMat vu = v*u;
      //FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
      FS = (u*v + Cshift(vu, mu, -1));
      FS = 0.125*(FS - adj(FS));
@ -371,13 +371,13 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
    // 4d topological charge
    assert(Nd==4);
     // Bx = -iF(y,z), By = -iF(z,x), Bz = -iF(x,y)
-    GaugeMat Bx(U._grid), By(U._grid), Bz(U._grid);
+    GaugeMat Bx(U.Grid()), By(U.Grid()), Bz(U.Grid());
    FieldStrength(Bx, U, Ydir, Zdir);
    FieldStrength(By, U, Zdir, Xdir);
    FieldStrength(Bz, U, Xdir, Ydir);

    // Ex = -iF(t,x), Ey = -iF(t,y), Ez = -iF(t,z)
-    GaugeMat Ex(U._grid), Ey(U._grid), Ez(U._grid);
+    GaugeMat Ex(U.Grid()), Ey(U.Grid()), Ez(U.Grid());
    FieldStrength(Ex, U, Tdir, Xdir);
    FieldStrength(Ey, U, Tdir, Ydir);
    FieldStrength(Ez, U, Tdir, Zdir);
@ -396,26 +396,26 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  static void dirRectangle(GaugeMat &rect, const std::vector<GaugeMat> &U,
                           const int mu, const int nu) {
    rect = Gimpl::CovShiftForward(
-               U[mu], mu, Gimpl::CovShiftForward(U[mu], mu, U[nu])) * // ->->|
-           adj(Gimpl::CovShiftForward(
-               U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[mu])));
+				  U[mu], mu, Gimpl::CovShiftForward(U[mu], mu, U[nu])) * // ->->|
+      adj(Gimpl::CovShiftForward(
+				 U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[mu])));
    rect = rect +
-           Gimpl::CovShiftForward(
-               U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])) * // ->||
-               adj(Gimpl::CovShiftForward(
-                   U[nu], nu, Gimpl::CovShiftForward(U[nu], nu, U[mu])));
+      Gimpl::CovShiftForward(
+			     U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])) * // ->||
+      adj(Gimpl::CovShiftForward(
+				 U[nu], nu, Gimpl::CovShiftForward(U[nu], nu, U[mu])));
  }
  static void traceDirRectangle(ComplexField &rect,
                                const std::vector<GaugeMat> &U, const int mu,
                                const int nu) {
-    GaugeMat sp(U[0]._grid);
+    GaugeMat sp(U[0].Grid());
    dirRectangle(sp, U, mu, nu);
    rect = trace(sp);
  }
  static void siteRectangle(ComplexField &Rect,
                            const std::vector<GaugeMat> &U) {
-    ComplexField siteRect(U[0]._grid);
-    Rect = zero;
+    ComplexField siteRect(U[0].Grid());
+    Rect = Zero();
    for (int mu = 1; mu < Nd; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceDirRectangle(siteRect, U, mu, nu);
@ -428,13 +428,13 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  // sum over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
  static RealD sumRectangle(const GaugeLorentz &Umu) {
-    std::vector<GaugeMat> U(Nd, Umu._grid);
+    std::vector<GaugeMat> U(Nd, Umu.Grid());

    for (int mu = 0; mu < Nd; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

-    ComplexField Rect(Umu._grid);
+    ComplexField Rect(Umu.Grid());

    siteRectangle(Rect, U);

@ -449,7 +449,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {

    RealD sumrect = sumRectangle(Umu);

-    double vol = Umu._grid->gSites();
+    double vol = Umu.Grid()->gSites();

    double faces = (1.0 * Nd * (Nd - 1)); // 2 distinct orientations summed

@ -473,9 +473,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  static void RectStapleOptimised(GaugeMat &Stap, std::vector<GaugeMat> &U2,
                                  std::vector<GaugeMat> &U, int mu) {

-    Stap = zero;
+    Stap = Zero();

-    GridBase *grid = U[0]._grid;
+    GridBase *grid = U[0].Grid();

    GaugeMat Staple2x1(grid);
    GaugeMat tmp(grid);
@ -552,14 +552,14 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {

  static void RectStapleUnoptimised(GaugeMat &Stap, const GaugeLorentz &Umu,
                                    int mu) {
-    GridBase *grid = Umu._grid;
+    GridBase *grid = Umu.Grid();

    std::vector<GaugeMat> U(Nd, grid);
    for (int d = 0; d < Nd; d++) {
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }

-    Stap = zero;
+    Stap = Zero();

    for (int nu = 0; nu < Nd; nu++) {
      if (nu != mu) {
@ -567,52 +567,52 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
        //          |    __ |
        //
        Stap += Gimpl::ShiftStaple(
-            Gimpl::CovShiftForward(
-                U[mu], mu,
-                Gimpl::CovShiftForward(
-                    U[nu], nu,
-                    Gimpl::CovShiftBackward(
-                        U[mu], mu,
-                        Gimpl::CovShiftBackward(
-                            U[mu], mu,
-                            Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
-            mu);
+				   Gimpl::CovShiftForward(
+							  U[mu], mu,
+							  Gimpl::CovShiftForward(
+										 U[nu], nu,
+										 Gimpl::CovShiftBackward(
+													 U[mu], mu,
+													 Gimpl::CovShiftBackward(
+																 U[mu], mu,
+																 Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
+				   mu);

        //              __
        //          |__ __ |

        Stap += Gimpl::ShiftStaple(
-            Gimpl::CovShiftForward(
-                U[mu], mu,
-                Gimpl::CovShiftBackward(
-                    U[nu], nu,
-                    Gimpl::CovShiftBackward(
-                        U[mu], mu, Gimpl::CovShiftBackward(U[mu], mu, U[nu])))),
-            mu);
+				   Gimpl::CovShiftForward(
+							  U[mu], mu,
+							  Gimpl::CovShiftBackward(
+										  U[nu], nu,
+										  Gimpl::CovShiftBackward(
+													  U[mu], mu, Gimpl::CovShiftBackward(U[mu], mu, U[nu])))),
+				   mu);

        //           __
        //          |__ __ |

        Stap += Gimpl::ShiftStaple(
-            Gimpl::CovShiftBackward(
-                U[nu], nu,
-                Gimpl::CovShiftBackward(
-                    U[mu], mu,
-                    Gimpl::CovShiftBackward(
-                        U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[mu])))),
-            mu);
+				   Gimpl::CovShiftBackward(
+							   U[nu], nu,
+							   Gimpl::CovShiftBackward(
+										   U[mu], mu,
+										   Gimpl::CovShiftBackward(
+													   U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[mu])))),
+				   mu);

        //           __ ___
        //          |__    |

        Stap += Gimpl::ShiftStaple(
-            Gimpl::CovShiftForward(
-                U[nu], nu,
-                Gimpl::CovShiftBackward(
-                    U[mu], mu,
-                    Gimpl::CovShiftBackward(
-                        U[mu], mu, Gimpl::CovShiftBackward(U[nu], nu, U[mu])))),
-            mu);
+				   Gimpl::CovShiftForward(
+							  U[nu], nu,
+							  Gimpl::CovShiftBackward(
+										  U[mu], mu,
+										  Gimpl::CovShiftBackward(
+													  U[mu], mu, Gimpl::CovShiftBackward(U[nu], nu, U[mu])))),
+				   mu);

        //       --
        //      |  |
@ -620,16 +620,16 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
        //      |  |

        Stap += Gimpl::ShiftStaple(
-            Gimpl::CovShiftForward(
-                U[nu], nu,
-                Gimpl::CovShiftForward(
-                    U[nu], nu,
-                    Gimpl::CovShiftBackward(
-                        U[mu], mu,
-                        Gimpl::CovShiftBackward(
-                            U[nu], nu,
-                            Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
-            mu);
+				   Gimpl::CovShiftForward(
+							  U[nu], nu,
+							  Gimpl::CovShiftForward(
+										 U[nu], nu,
+										 Gimpl::CovShiftBackward(
+													 U[mu], mu,
+													 Gimpl::CovShiftBackward(
+																 U[nu], nu,
+																 Gimpl::CovShiftIdentityBackward(U[nu], nu))))),
+				   mu);

        //      |  |
        //
@ -637,13 +637,13 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
        //       --

        Stap += Gimpl::ShiftStaple(
-            Gimpl::CovShiftBackward(
-                U[nu], nu,
-                Gimpl::CovShiftBackward(
-                    U[nu], nu,
-                    Gimpl::CovShiftBackward(
-                        U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])))),
-            mu);
+				   Gimpl::CovShiftBackward(
+							   U[nu], nu,
+							   Gimpl::CovShiftBackward(
+										   U[nu], nu,
+										   Gimpl::CovShiftBackward(
+													   U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])))),
+				   mu);
      }
    }
  }
@ -679,7 +679,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
                                const std::vector<GaugeMat> &U,
                                const int Rmu, const int Rnu,
                                const int mu, const int nu) {
-    GaugeMat sp(U[0]._grid);
+    GaugeMat sp(U[0].Grid());
    wilsonLoop(sp, U, Rmu, Rnu, mu, nu);
    wl = trace(sp);
  }
@ -689,9 +689,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  static void siteWilsonLoop(LatticeComplex &Wl,
                            const std::vector<GaugeMat> &U,
                            const int R1, const int R2) {
-    LatticeComplex siteWl(U[0]._grid);
-    Wl = zero;
-    for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
+    LatticeComplex siteWl(U[0].Grid());
+    Wl = Zero();
+    for (int mu = 1; mu < U[0].Grid()->_ndimension; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
        Wl = Wl + siteWl;
@ -707,11 +707,11 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  static void siteTimelikeWilsonLoop(LatticeComplex &Wl,
                            const std::vector<GaugeMat> &U,
                            const int R1, const int R2) {
-    LatticeComplex siteWl(U[0]._grid);
+    LatticeComplex siteWl(U[0].Grid());

-    int ndim = U[0]._grid->_ndimension;
+    int ndim = U[0].Grid()->_ndimension;

-    Wl = zero;
+    Wl = Zero();
    for (int nu = 0; nu < ndim - 1; nu++) {
      traceWilsonLoop(siteWl, U, R1, R2, ndim-1, nu);
      Wl = Wl + siteWl;
@ -723,10 +723,10 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  static void siteSpatialWilsonLoop(LatticeComplex &Wl,
                            const std::vector<GaugeMat> &U,
                            const int R1, const int R2) {
-    LatticeComplex siteWl(U[0]._grid);
+    LatticeComplex siteWl(U[0].Grid());

-    Wl = zero;
-    for (int mu = 1; mu < U[0]._grid->_ndimension - 1; mu++) {
+    Wl = Zero();
+    for (int mu = 1; mu < U[0].Grid()->_ndimension - 1; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
        Wl = Wl + siteWl;
@ -740,13 +740,13 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  //////////////////////////////////////////////////
  static Real sumWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
-    std::vector<GaugeMat> U(4, Umu._grid);
+    std::vector<GaugeMat> U(4, Umu.Grid());

-    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
+    for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

-    LatticeComplex Wl(Umu._grid);
+    LatticeComplex Wl(Umu.Grid());

    siteWilsonLoop(Wl, U, R1, R2);

@ -759,13 +759,13 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  //////////////////////////////////////////////////
  static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
-    std::vector<GaugeMat> U(4, Umu._grid);
+    std::vector<GaugeMat> U(4, Umu.Grid());

-    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
+    for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

-    LatticeComplex Wl(Umu._grid);
+    LatticeComplex Wl(Umu.Grid());

    siteTimelikeWilsonLoop(Wl, U, R1, R2);

@ -778,13 +778,13 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  //////////////////////////////////////////////////
  static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
-    std::vector<GaugeMat> U(4, Umu._grid);
+    std::vector<GaugeMat> U(4, Umu.Grid());

-    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
+    for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

-    LatticeComplex Wl(Umu._grid);
+    LatticeComplex Wl(Umu.Grid());

    siteSpatialWilsonLoop(Wl, U, R1, R2);

@ -797,9 +797,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  //////////////////////////////////////////////////
  static Real avgWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
-    int ndim = Umu._grid->_ndimension;
+    int ndim = Umu.Grid()->_ndimension;
    Real sumWl = sumWilsonLoop(Umu, R1, R2);
-    Real vol = Umu._grid->gSites();
+    Real vol = Umu.Grid()->gSites();
    Real faces = 1.0 * ndim * (ndim - 1);
    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
  }
@ -808,9 +808,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  //////////////////////////////////////////////////
  static Real avgTimelikeWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
-    int ndim = Umu._grid->_ndimension;
+    int ndim = Umu.Grid()->_ndimension;
    Real sumWl = sumTimelikeWilsonLoop(Umu, R1, R2);
-    Real vol = Umu._grid->gSites();
+    Real vol = Umu.Grid()->gSites();
    Real faces = 1.0 * (ndim - 1);
    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
  }
@ -819,9 +819,9 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
  //////////////////////////////////////////////////
  static Real avgSpatialWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
-    int ndim = Umu._grid->_ndimension;
+    int ndim = Umu.Grid()->_ndimension;
    Real sumWl = sumSpatialWilsonLoop(Umu, R1, R2);
-    Real vol = Umu._grid->gSites();
+    Real vol = Umu.Grid()->gSites();
    Real faces = 1.0 * (ndim - 1) * (ndim - 2);
    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
  }
@ -831,7 +831,7 @@ typedef WilsonLoops<PeriodicGimplR> ColourWilsonLoops;
 typedef WilsonLoops<PeriodicGimplR> U1WilsonLoops;
 typedef WilsonLoops<PeriodicGimplR> SU2WilsonLoops;
 typedef WilsonLoops<PeriodicGimplR> SU3WilsonLoops;
-}
-}
+
+NAMESPACE_END(Grid);

 #endif