diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h
index 6aa4e888..c7c7d329 100644
--- a/Grid/qcd/utils/A2Autils.h
+++ b/Grid/qcd/utils/A2Autils.h
@@ -40,21 +40,12 @@ public:
 			     const std::vector<ComplexField > &mom,
 			     int orthogdim);
 
-
-  static void NucleonFieldMom(Eigen::Tensor<ComplexD,6> &mat, 
-                              const std::vector<FermionField> &one,
-                              const std::vector<FermionField> &two,
-                              const std::vector<FermionField> &three,
-				     const std::vector<ComplexField > &mom,
-				     int parity,
-				     int orthogdim); 
- 
   static void PionFieldXX(Eigen::Tensor<ComplexD,3> &mat, 
 			  const FermionField *wi,
 			  const FermionField *vj,
 			  int orthogdim,
 			  int g5);
- 
+  
   static void PionFieldWV(Eigen::Tensor<ComplexD,3> &mat, 
 			  const FermionField *wi,
 			  const FermionField *vj,
@@ -127,404 +118,6 @@ private:
                                const int Ns, const int ss);
 };
 
-template<class FImpl>
-void A2Autils<FImpl>::NucleonFieldMom(Eigen::Tensor<ComplexD,6> &mat, 
-				     const std::vector<FermionField> &one,
-				     const std::vector<FermionField> &two,
-				     const std::vector<FermionField> &three,
-				     const std::vector<ComplexField > &mom,
-				     int parity,
-				     int orthogdim) 
-{
-  assert(parity == 1 || parity == -1);
-
-  typedef typename FImpl::SiteSpinor vobj;
-
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
-  typedef iSpinVector<vector_type> SpinVector_v;
-  typedef iSpinVector<scalar_type> SpinVector_s;
-  
-  int oneBlock = mat.dimension(2); 
-  int twoBlock = mat.dimension(3);
-  int threeBlock = mat.dimension(4);
-
-  GridBase *grid = one[0].Grid();
-
-  const int    nd = grid->_ndimension;
-  const int Nsimd = grid->Nsimd();
-
-  int Nt     = grid->GlobalDimensions()[orthogdim];
-  int Nmom   = mom.size();
-
-  int fd=grid->_fdimensions[orthogdim];
-  int ld=grid->_ldimensions[orthogdim];
-  int rd=grid->_rdimensions[orthogdim];
-
-  // will locally sum vectors first
-  // sum across these down to scalars
-  // splitting the SIMD
-  int MFrvol = rd*oneBlock*twoBlock*threeBlock*Nmom;
-  int MFlvol = ld*oneBlock*twoBlock*threeBlock*Nmom;
-
-  Vector<SpinVector_v > lvSum(MFrvol);
-  accelerator_for (r, MFrvol, Nsimd, {
-    lvSum[r] = 0;
-  } );
-
-  Vector<SpinVector_s > lsSum(MFlvol);             
-  accelerator_for (r, MFlvol, Nsimd, {
-    lsSum[r] = scalar_type(0.0);
-  } );
-
-  int e1=    grid->_slice_nblock[orthogdim];
-  int e2=    grid->_slice_block [orthogdim];
-  int stride=grid->_slice_stride[orthogdim];
-
-  accelerator_for(r, rd, Nsimd, {
-    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
-
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-
-	int ss= so+n*stride+b;
-
-	for(int i=0;i<oneBlock;i++){
-
-	  auto v1 = one[i]._odata[ss];
-	  auto gv1 = Gamma(Gamma::Algebra::GammaT)*v1;
-	  auto pv1 = 0.5*(v1 + (double)parity*gv1);
-
-       	  for(int j=0;j<twoBlock;j++){
-
-	    //auto v2 = conjugate(two[j]._odata[ss]);
-	    auto v2 = two[j]._odata[ss];
-            // C = i gamma_2 gamma_4 => C gamma_5 = - i gamma_1 gamma_3
-	    //auto v2g = v2*Gamma(Gamma::Algebra::SigmaXZ);
-            //auto v2g=v2;  
-
-	    for(int k=0;k<threeBlock;k++){
-
-	      auto v3 = three[k]._odata[ss];
-	      auto gv3 = Gamma(Gamma::Algebra::SigmaXZ)*v3;
-	      SpinVector_v vv;
-
-              for(int s1=0;s1<Ns;s1++){
-              vv()(s1)() = 0;
-	      for(int s2=0;s2<Ns;s2++){            
-	     /*   vv()(s1)() =  pv1()(s1)(0) * v2g()(s2)(1) * v3()(s2)(2)   //Cross product
-                  -           pv1()(s1)(0) * v2g()(s2)(2) * v3()(s2)(1)    
-                  +           pv1()(s1)(1) * v2g()(s2)(2) * v3()(s2)(0)    
-                  -           pv1()(s1)(1) * v2g()(s2)(0) * v3()(s2)(2)    
-                  +           pv1()(s1)(2) * v2g()(s2)(0) * v3()(s2)(1)    
-                  -           pv1()(s1)(2) * v2g()(s2)(1) * v3()(s2)(0);    */
-	        vv()(s1)() +=  pv1()(s1)(0) * v2()(s2)(1) * gv3()(s2)(2)   //Cross product
-                  -           pv1()(s1)(0) * v2()(s2)(2) * gv3()(s2)(1)    
-                  +           pv1()(s1)(1) * v2()(s2)(2) * gv3()(s2)(0)    
-                  -           pv1()(s1)(1) * v2()(s2)(0) * gv3()(s2)(2)    
-                  +           pv1()(s1)(2) * v2()(s2)(0) * gv3()(s2)(1)    
-                  -           pv1()(s1)(2) * v2()(s2)(1) * gv3()(s2)(0);    
-              }}
-
-              /*if (i+j+k == 0) {
-                Serializable::WriteMember(std::cout, pv1);
-                Serializable::WriteMember(std::cout, v2);
-                Serializable::WriteMember(std::cout, gv3);
-                Serializable::WriteMember(std::cout, vv);
-              }*/
-	    
-	      // After getting the sitewise product do the mom phase loop
-              int base = Nmom*i+Nmom*oneBlock*j+Nmom*oneBlock*twoBlock*k+Nmom*oneBlock*twoBlock*threeBlock*r;
-	      for ( int m=0;m<Nmom;m++){
-	        int idx = m+base;
-	        auto phase = mom[m]._odata[ss];
-                for(int is=0;is<Ns;is++){
-	          mac(&lvSum[idx]()(is)(),&vv()(is)(),&phase()()());
-		}
-              }
-	    }
-	  }
-	}
-      }
-    }
-  } );
-
-
-  // Sum across simd lanes in the plane, breaking out orthog dir.
-  accelerator_for(rt, rd, Nsimd, {
-
-    Coordinate icoor(nd);
-    std::vector<SpinVector_s>  extracted(Nsimd);               
-
-    for(int i=0;i<oneBlock;i++){
-    for(int j=0;j<twoBlock;j++){
-    for(int k=0;k<threeBlock;k++){
-    for(int m=0;m<Nmom;m++){
-
-      int ij_rdx = m+Nmom*i + Nmom*oneBlock * j + Nmom*oneBlock * twoBlock * k + Nmom*oneBlock * twoBlock *threeBlock * rt;
-
-      extract(lvSum[ij_rdx],extracted);
-
-      for(int idx=0;idx<Nsimd;idx++){
-
-	grid->iCoorFromIindex(icoor,idx);
-
-	int ldx    = rt+icoor[orthogdim]*rd;
-
-	int ij_ldx = m+Nmom*i + Nmom*oneBlock * j + Nmom*oneBlock * twoBlock * k + Nmom*oneBlock * twoBlock *threeBlock * ldx;
-
-	lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx];
-
-      }
-    }}}}
-  } );
-
-  assert(mat.dimension(0) == Nmom);
-  assert(mat.dimension(1) == Nt);
-
-  int pd = grid->_processors[orthogdim];
-  int pc = grid->_processor_coor[orthogdim];
-  accelerator_for(lt, ld, Nsimd,
-  {
-    for(int pt=0;pt<pd;pt++){
-      int t = lt + pt*ld;
-      if (pt == pc){
-        for(int i=0;i<oneBlock;i++){
-          for(int j=0;j<twoBlock;j++){
-            for(int k=0;k<threeBlock;k++){
-	      for(int m=0;m<Nmom;m++){
-	        int ij_dx = m+Nmom*i + Nmom*oneBlock * j + Nmom*oneBlock * twoBlock * k + Nmom*oneBlock * twoBlock *threeBlock * lt;
-	        for(int is=0;is<4;is++){
-	          mat(m,t,i,j,k,is) = lsSum[ij_dx]()(is)();
-                }
-              }
-	    }
-	  }
-	}
-      } else { 
-	const scalar_type zz(0.0);
-        for(int i=0;i<oneBlock;i++){
-          for(int j=0;j<twoBlock;j++){
-            for(int k=0;k<threeBlock;k++){
-	      for(int m=0;m<Nmom;m++){
-	        for(int is=0;is<4;is++){
-	          mat(m,t,i,j,k,is) =zz;
-                }
-              }
-	    }
-	  }
-	}
-      }
-    }
-  } );
-
-  grid->GlobalSumVector(&mat(0,0,0,0,0,0),Nmom*Nt*oneBlock*twoBlock*threeBlock*4);
-}
-
-
-
-
-
-/*
-template <class FImpl>
-template <typename TensorType>
-void A2Autils<FImpl>::BaryonField(TensorType &mat, 
-				 const FermionField *one,
-				 const FermionField *two,
-				 const FermionField *three,
-				 std::vector<Gamma::Algebra> gammaA,
-				 std::vector<Gamma::Algebra> gammaB,
-				 const std::vector<ComplexField > &mom,
-				 int orthogdim, double *t_kernel, double *t_gsum) 
-{
-  typedef typename FImpl::SiteSpinor vobj;
-
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
-  typedef iSpinMatrix<vector_type> SpinMatrix_v;
-  typedef iSpinMatrix<scalar_type> SpinMatrix_s;
-  
-  typedef iSpinColourMatrix<vector_type> SpinColourMatrix_v;
-
-  int oneBlock = mat.dimension(3); 
-  int twoBlock = mat.dimension(4);
-  int threeBlock = mat.dimension(5);
-
-  GridBase *grid = one[0].Grid();
-  
-  const int    Nd = grid->_ndimension;
-  const int Nsimd = grid->Nsimd();
-
-  int Nt     = grid->GlobalDimensions()[orthogdim];
-  int Ngamma = gammaA.size();
-  assert (Ngamma = gammaB.size()); // Only combinatin of two gammas gives correct operator!
-  int Nmom   = mom.size();
-
-  int fd=grid->_fdimensions[orthogdim];
-  int ld=grid->_ldimensions[orthogdim];
-  int rd=grid->_rdimensions[orthogdim];
-
-  // will locally sum vectors first
-  // sum across these down to scalars
-  // splitting the SIMD
-  int MFrvol = rd*twoBlock*threeBlock*Nmom;
-  int MFlvol = ld*twoBlock*threeBlock*Nmom;
-
-  Vector<Vector<SpinMatrix_v >> lvSum(3);
-  for (int ic=0;ic<3;ic++){
-    lvSum[ic].resize(MFrvol);
-    parallel_for (int r = 0; r < MFrvol; r++){
-      lvSum[ic][r] = zero;
-    }
-  }
-
-  Vector<Vector<SpinMatrix_s >> lsSum(3);
-  for (int ic=0;ic<3;ic++){
-    lsSum[ic].resize(MFlvol);
-    parallel_for (int r = 0; r < MFlvol; r++){
-      lsSum[ic][r] = scalar_type(0.0);
-    }
-  }
-
-  int e1=    grid->_slice_nblock[orthogdim];
-  int e2=    grid->_slice_block [orthogdim];
-  int stride=grid->_slice_stride[orthogdim];
-
-  // potentially wasting cores here if local time extent too small
-  if (t_kernel) *t_kernel = -usecond();
-  parallel_for(int r=0;r<rd;r++){
-
-    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
-
-    // first, the diquark two*gammaB*three
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-
-	int ss= so+n*stride+b;
-
-	  for(int j=0;j<twoBlock;j++){
-
-	    auto two_j = two[j]._odata[ss];
-
-	  for(int k=0;k<threeBlock;k++){
-
-	    auto three_k = three[j]._odata[ss];
-
-	    Vector<SpinMatrix_v > vv(3);
-
-	    for(int s1=0;s1<Ns;s1++){
-	    for(int s2=0;s2<Ns;s2++){
-	      vv[0]()(s1,s2)() = two_j()(s2)(1) * three_k()(s1)(2)           //ideal would be SpinMatrix but ColourVector...
-                -                two_j()(s2)(2) * three_k()(s1)(1);          //this is the cross product (two x three)^i
-	      vv[1]()(s1,s2)() = two_j()(s2)(2) * three_k()(s1)(0)           
-                -                two_j()(s2)(0) * three_k()(s1)(2);         
-	      vv[2]()(s1,s2)() = two_j()(s2)(0) * three_k()(s1)(1)           
-                -                two_j()(s2)(1) * three_k()(s1)(0);         
-	    }}
-	    
-	    // After getting the sitewise product do the mom phase loop
-	    int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
-	    for ( int m=0;m<Nmom;m++){
-	      for ( int ic=0;ic<3;ic++){
-	        int idx = m+base;
-	        auto phase = mom[m]._odata[ss];
-	        mac(&lvSum[ic][idx],&vv,&phase);
-	      }
-	    }
-	 
-	  }
-	}
-      }
-    }
-  }
-
-
-  for ( int ic=0;ic<3;ic++){
-  // Sum across simd lanes in the plane, breaking out orthog dir.
-  parallel_for(int rt=0;rt<rd;rt++){
-
-    std::vector<int> icoor(Nd);
-    std::vector<SpinMatrix_s> extracted(Nsimd);               
-
-    for(int i=0;i<twoBlock;i++){
-    for(int j=0;j<threeBlock;j++){
-    for(int m=0;m<Nmom;m++){
-
-      int ij_rdx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*rt;
-
-      extract(lvSum[ic][ij_rdx],extracted);
-
-      for(int idx=0;idx<Nsimd;idx++){
-
-	grid->iCoorFromIindex(icoor,idx);
-
-	int ldx    = rt+icoor[orthogdim]*rd;
-
-	int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx;
-
-	lsSum[ic][ij_ldx]=lsSum[ic][ij_ldx]+extracted[idx];
-
-      }
-    }}}
-  }
- 
-
-  if (t_kernel) *t_kernel += usecond();
-  assert(mat.dimension(0) == Nmom);
-  assert(mat.dimension(1) == Ngamma);
-  assert(mat.dimension(2) == Nt);
-
-  TensorType diquark; // Need this instead of mat!!!
-
-  // ld loop and local only??
-  int pd = grid->_processors[orthogdim];
-  int pc = grid->_processor_coor[orthogdim];
-  parallel_for_nest2(int lt=0;lt<ld;lt++)
-  {
-    for(int pt=0;pt<pd;pt++){
-      int t = lt + pt*ld;
-      if (pt == pc){
-	for(int i=0;i<twoBlock;i++){
-	  for(int j=0;j<threeBlock;j++){
-	    for(int m=0;m<Nmom;m++){
-	      int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
-	      for(int mu=0;mu<Ngamma;mu++){
-		// this is a bit slow
-		mat(m,mu,t,i,j) = trace(lsSum[ic][ij_dx]*Gamma(gammaB[mu]));
-	      }
-	    }
-	  }
-	}
-      } else { 
-	const scalar_type zz(0.0);
-	for(int i=0;i<twoBlock;i++){
-	  for(int j=0;j<threeBlock;j++){
-	    for(int mu=0;mu<Ngamma;mu++){
-	      for(int m=0;m<Nmom;m++){
-		mat(m,mu,t,i,j) =zz;
-	      }
-	    }
-	  }
-	}
-      }
-    }
-  }
-  }
-  ////////////////////////////////////////////////////////////////////
-  // This global sum is taking as much as 50% of time on 16 nodes
-  // Vector size is 7 x 16 x 32 x 16 x 16 x sizeof(complex) = 2MB - 60MB depending on volume
-  // Healthy size that should suffice
-  ////////////////////////////////////////////////////////////////////
-  if (t_gsum) *t_gsum = -usecond();
-  grid->GlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock);
-  if (t_gsum) *t_gsum += usecond();
-}
-*/
-
 template <class FImpl>
 template <typename TensorType>
 void A2Autils<FImpl>::MesonField(TensorType &mat, 
@@ -667,7 +260,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
 	      int ij_dx = m+Nmom*i + Nmom*Lblock * j + Nmom*Lblock * Rblock * lt;
 	      for(int mu=0;mu<Ngamma;mu++){
 		// this is a bit slow
-		mat(m,mu,t,i,j) = (trace(lsSum[ij_dx]*Gamma(gammas[mu])))()()();
+		mat(m,mu,t,i,j) = trace(lsSum[ij_dx]*Gamma(gammas[mu]))()()();
 	      }
 	    }
 	  }