diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index c610fb9c..e5cc0c42 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -1,7 +1,6 @@ #include #ifndef GRID_UVM -#warning "Using explicit device memory copies" NAMESPACE_BEGIN(Grid); #define MAXLINE 512 diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 837e3bea..53a592d1 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -376,9 +376,9 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt coalescedWrite(z_v[ss],tmp); }); bool ok; +#ifdef GRID_SYCL uint64_t csum=0; uint64_t csum2=0; -#ifdef GRID_SYCL if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) { // z_v @@ -522,14 +522,11 @@ template inline void sliceSum(const Lattice &Data, int ostride=grid->_ostride[orthogdim]; //Reduce Data down to lvSum - RealD t_sum =-usecond(); sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd); - t_sum +=usecond(); // Sum across simd lanes in the plane, breaking out orthog dir. Coordinate icoor(Nd); - RealD t_rest =-usecond(); for(int rt=0;rt inline void sliceSum(const Lattice &Data, scalar_type * ptr = (scalar_type *) &result[0]; int words = fd*sizeof(sobj)/sizeof(scalar_type); grid->GlobalSumVector(ptr, words); - t_rest +=usecond(); - std::cout << GridLogMessage << " sliceSum local"< inline diff --git a/Grid/qcd/action/ActionBase.h b/Grid/qcd/action/ActionBase.h index 8acae81b..c3a46729 100644 --- a/Grid/qcd/action/ActionBase.h +++ b/Grid/qcd/action/ActionBase.h @@ -98,7 +98,7 @@ public: virtual RealD S(const GaugeField& U) = 0; // evaluate the action virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ? 
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative - + ///////////////////////////////////////////////////////////// // virtual smeared interface through configuration container ///////////////////////////////////////////////////////////// @@ -132,6 +132,10 @@ public: template class EmptyAction : public Action { + using Action::refresh; + using Action::Sinitial; + using Action::deriv; + virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative diff --git a/Grid/qcd/action/gauge/WilsonGaugeAction.h b/Grid/qcd/action/gauge/WilsonGaugeAction.h index f535b54f..22c792cc 100644 --- a/Grid/qcd/action/gauge/WilsonGaugeAction.h +++ b/Grid/qcd/action/gauge/WilsonGaugeAction.h @@ -43,6 +43,11 @@ class WilsonGaugeAction : public Action { public: INHERIT_GIMPL_TYPES(Gimpl); + using Action::S; + using Action::Sinitial; + using Action::deriv; + using Action::refresh; + /////////////////////////// constructors explicit WilsonGaugeAction(RealD beta_):beta(beta_){}; diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h index 1aeacbf2..7089fd1b 100644 --- a/Grid/qcd/utils/A2Autils.h +++ b/Grid/qcd/utils/A2Autils.h @@ -64,40 +64,6 @@ public: const std::vector &mom, int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr); - template - static void MesonFieldGPU(TensorType &mat, - const FermionField *lhs_wi, - const FermionField *rhs_vj, - std::vector gammas, - const std::vector &mom, - int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr); - /* - static void PionFieldWVmom(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - const std::vector &mom, - int orthogdim); - - static void PionFieldXX(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - int orthogdim, - int g5); - - static void PionFieldWV(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - int orthogdim); - static void PionFieldWW(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *wj, - int orthogdim); - static void PionFieldVV(Eigen::Tensor &mat, - const FermionField *vi, - const FermionField *vj, - int orthogdim); - */ - template // output: rank 5 tensor, e.g. 
Eigen::Tensor static void AslashField(TensorType &mat, const FermionField *lhs_wi, @@ -157,6 +123,211 @@ private: const int Ns, const int ss); }; +const int A2Ablocking=8; + +template using iVecSpinMatrix = iVector, Ns>, A2Ablocking>; +typedef iVecSpinMatrix VecSpinMatrix; +typedef iVecSpinMatrix vVecSpinMatrix; +typedef Lattice LatticeVecSpinMatrix; + +template using iVecComplex = iVector >, A2Ablocking>; +typedef iVecComplex VecComplex; +typedef iVecComplex vVecComplex; +typedef Lattice LatticeVecComplex; + +#define A2A_GPU_KERNELS +#ifdef A2A_GPU_KERNELS +template +template +void A2Autils::MesonField(TensorType &mat, + const FermionField *lhs_wi, + const FermionField *rhs_vj, + std::vector gammas, + const std::vector &mom, + int orthogdim, double *t_kernel, double *t_gsum) +{ + const int block=A2Ablocking; + typedef typename FImpl::SiteSpinor vobj; + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Lblock = mat.dimension(3); + int Rblock = mat.dimension(4); + + // assert(Lblock % block==0); + // assert(Rblock % block==0); + + GridBase *grid = lhs_wi[0].Grid(); + + // const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + int Ngamma = gammas.size(); + int Nmom = mom.size(); + + LatticeVecSpinMatrix SpinMat(grid); + LatticeVecSpinMatrix MomSpinMat(grid); + + std::vector sliced; + for(int i=0;ioSites(),(size_t)Nsimd,{ + auto left = conjugate(lhs_v(ss)); + auto right = rhs_v(ss); + auto vv = SpinMat_v(ss); + for(int s1=0;s1(sliced[t],jj); + auto trSG = trace(tmp*Gamma(gammas[mu])); + mat(m,mu,t,i,j) = trSG()(); + } + } + } + } + }//jo + } +} + +// "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x) +// +// With: +// +// B_0 = A_0 + i A_1 +// B_1 = A_2 + i A_3 +// +// then in spin space +// +// ( 0 0 -conj(B_1) -B_0 ) +// i * A_mu g_mu = ( 0 0 -conj(B_0) B_1 ) +// ( B_1 B_0 0 0 ) +// ( conj(B_0) -conj(B_1) 0 0 ) + +template +template +void A2Autils::AslashField(TensorType &mat, + const FermionField *lhs_wi, + const FermionField *rhs_vj, + const std::vector &emB0, + const std::vector &emB1, + int orthogdim, double *t_kernel, double *t_gsum) +{ + const int block=A2Ablocking; + typedef typename FImpl::SiteSpinor vobj; + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + int Lblock = mat.dimension(3); + int Rblock = mat.dimension(4); + + int Nem = emB0.size(); + assert(emB1.size() == Nem); + + // assert(Lblock % block==0); + // assert(Rblock % block==0); + + GridBase *grid = lhs_wi[0].Grid(); + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + + LatticeVecSpinMatrix SpinMat(grid); + LatticeVecComplex Aslash(grid); + std::vector sliced; + + for(int i=0;ioSites(),(size_t)Nsimd,{ + auto left = conjugate(lhs_v(ss)); + auto right = rhs_v(ss); + auto vv = SpinMat_v(ss); + for(int s1=0;s1oSites(),(size_t)Nsimd,{ + auto vv = SpinMat_v(ss); + auto b0 = emB0_v(ss); + auto b1 = emB1_v(ss); + auto cb0 = conjugate(b0); + auto cb1 = conjugate(b1); + auto asl = Aslash_v(ss); + for(int j=jo;j template void A2Autils::MesonField(TensorType &mat, @@ -329,488 +500,41 @@ void A2Autils::MesonField(TensorType &mat, if (t_gsum) *t_gsum += usecond(); } -const int A2Ablocking=8; -template using iVecSpinMatrix = iVector, Ns>, A2Ablocking>; -typedef iVecSpinMatrix VecSpinMatrix; 
-typedef iVecSpinMatrix vVecSpinMatrix; -typedef Lattice LatticeVecSpinMatrix; - template template -void A2Autils::MesonFieldGPU(TensorType &mat, - const FermionField *lhs_wi, - const FermionField *rhs_vj, - std::vector gammas, - const std::vector &mom, - int orthogdim, double *t_kernel, double *t_gsum) +void A2Autils::AslashField(TensorType &mat, + const FermionField *lhs_wi, + const FermionField *rhs_vj, + const std::vector &emB0, + const std::vector &emB1, + int orthogdim, double *t_kernel, double *t_gsum) { - const int block=A2Ablocking; - typedef typename FImpl::SiteSpinor vobj; - - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; + typedef typename FermionField::vector_object vobj; + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + typedef iSpinMatrix SpinMatrix_v; + typedef iSpinMatrix SpinMatrix_s; + typedef iSinglet Singlet_v; + typedef iSinglet Singlet_s; + int Lblock = mat.dimension(3); int Rblock = mat.dimension(4); - - // assert(Lblock % block==0); - // assert(Rblock % block==0); GridBase *grid = lhs_wi[0].Grid(); const int Nd = grid->_ndimension; const int Nsimd = grid->Nsimd(); - int Nt = grid->GlobalDimensions()[orthogdim]; - int Ngamma = gammas.size(); - int Nmom = mom.size(); - - - LatticeVecSpinMatrix SpinMat(grid); - LatticeVecSpinMatrix MomSpinMat(grid); - - RealD t_afor = 0.0; - RealD t_sum = 0.0; - RealD t_pha = 0.0; - RealD t_trace= 0.0; - uint64_t ncall=0; - - std::vector sliced; - for(int i=0;ioSites(),(size_t)Nsimd,{ - auto left = conjugate(lhs_v(ss)); - auto right = rhs_v(ss); - auto vv = SpinMat_v(ss); - for(int s1=0;s1 -void A2Autils::PionFieldXX(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - int orthogdim, - int g5) -{ - int Lblock = mat.dimension(1); - int Rblock = mat.dimension(2); - - GridBase *grid = wi[0].Grid(); - - const int nd = grid->_ndimension; - const int Nsimd = grid->Nsimd(); - - int Nt = grid->GlobalDimensions()[orthogdim]; + int Nt = grid->GlobalDimensions()[orthogdim]; + int Nem = emB0.size(); + assert(emB1.size() == Nem); int fd=grid->_fdimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim]; - - // will locally sum vectors first - // sum across these down to scalars - // splitting the SIMD - int MFrvol = rd*Lblock*Rblock; - int MFlvol = ld*Lblock*Rblock; - - std::vector lvSum(MFrvol); - thread_for(r,MFrvol,{ - lvSum[r] = Zero(); - }); - - std::vector lsSum(MFlvol); - thread_for(r,MFlvol,{ - lsSum[r]=scalar_type(0.0); - }); - - int e1= grid->_slice_nblock[orthogdim]; - int e2= grid->_slice_block [orthogdim]; - int stride=grid->_slice_stride[orthogdim]; - - thread_for(r,rd,{ - - int so=r*grid->_ostride[orthogdim]; // base offset for start of plane - - for(int n=0;n temp; - ExtractBuffer > extracted(Nsimd); - - for(int i=0;iiCoorFromIindex(icoor,idx); - - int ldx = rt+icoor[orthogdim]*rd; - - int ij_ldx =i+Lblock*j+Lblock*Rblock*ldx; - - lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal; - - } - }} - }); - - assert(mat.dimension(0) == Nt); - // ld loop and local only?? 
- int pd = grid->_processors[orthogdim]; - int pc = grid->_processor_coor[orthogdim]; - thread_for_collapse(2,lt,ld,{ - for(int pt=0;ptGlobalSumVector(&mat(0,0,0),Nt*Lblock*Rblock); -} - -template -void A2Autils::PionFieldWVmom(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - const std::vector &mom, - int orthogdim) -{ - int Lblock = mat.dimension(2); - int Rblock = mat.dimension(3); - - GridBase *grid = wi[0].Grid(); - const int nd = grid->_ndimension; - const int Nsimd = grid->Nsimd(); - - int Nt = grid->GlobalDimensions()[orthogdim]; - int Nmom = mom.size(); - - int fd=grid->_fdimensions[orthogdim]; - int ld=grid->_ldimensions[orthogdim]; - int rd=grid->_rdimensions[orthogdim]; - - // will locally sum vectors first - // sum across these down to scalars - // splitting the SIMD - int MFrvol = rd*Lblock*Rblock*Nmom; - int MFlvol = ld*Lblock*Rblock*Nmom; - - std::vector lvSum(MFrvol); - thread_for(r,MFrvol,{ - lvSum[r] = Zero(); - }); - - std::vector lsSum(MFlvol); - thread_for(r,MFlvol,{ - lsSum[r]=scalar_type(0.0); - }); - - int e1= grid->_slice_nblock[orthogdim]; - int e2= grid->_slice_block [orthogdim]; - int stride=grid->_slice_stride[orthogdim]; - - thread_for(r,rd,{ - - int so=r*grid->_ostride[orthogdim]; // base offset for start of plane - - for(int n=0;n temp; - ExtractBuffer > extracted(Nsimd); - - for(int i=0;iiCoorFromIindex(icoor,idx); - - int ldx = rt+icoor[orthogdim]*rd; - - int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx; - - lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal; - - } - }}} - }); - - assert(mat.dimension(0) == Nmom); - assert(mat.dimension(1) == Nt); - - int pd = grid->_processors[orthogdim]; - int pc = grid->_processor_coor[orthogdim]; - thread_for_collapse(2,lt,ld,{ - for(int pt=0;ptGlobalSumVector(&mat(0,0,0,0),Nmom*Nt*Lblock*Rblock); -} - -template -void A2Autils::PionFieldWV(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *vj, - int orthogdim) -{ - const int g5=1; - PionFieldXX(mat,wi,vj,orthogdim,g5); -} -template -void A2Autils::PionFieldWW(Eigen::Tensor &mat, - const FermionField *wi, - const FermionField *wj, - int orthogdim) -{ - const int nog5=0; - PionFieldXX(mat,wi,wj,orthogdim,nog5); -} -template -void A2Autils::PionFieldVV(Eigen::Tensor &mat, - const FermionField *vi, - const FermionField *vj, - int orthogdim) -{ - const int nog5=0; - PionFieldXX(mat,vi,vj,orthogdim,nog5); -} -*/ - -// "A-slash" field w_i(x)^dag * i * A_mu * gamma_mu * v_j(x) -// -// With: -// -// B_0 = A_0 + i A_1 -// B_1 = A_2 + i A_3 -// -// then in spin space -// -// ( 0 0 -conj(B_1) -B_0 ) -// i * A_mu g_mu = ( 0 0 -conj(B_0) B_1 ) -// ( B_1 B_0 0 0 ) -// ( conj(B_0) -conj(B_1) 0 0 ) -template -template -void A2Autils::AslashField(TensorType &mat, - const FermionField *lhs_wi, - const FermionField *rhs_vj, - const std::vector &emB0, - const std::vector &emB1, - int orthogdim, double *t_kernel, double *t_gsum) -{ - typedef typename FermionField::vector_object vobj; - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_type scalar_type; - typedef typename vobj::vector_type vector_type; - - typedef iSpinMatrix SpinMatrix_v; - typedef iSpinMatrix SpinMatrix_s; - typedef iSinglet Singlet_v; - typedef iSinglet Singlet_s; - - int Lblock = mat.dimension(3); - int Rblock = mat.dimension(4); - - GridBase *grid = lhs_wi[0].Grid(); - - const int Nd = grid->_ndimension; - const int Nsimd = grid->Nsimd(); - - int Nt = grid->GlobalDimensions()[orthogdim]; - int Nem = emB0.size(); - assert(emB1.size() == Nem); 
- - int fd=grid->_fdimensions[orthogdim]; - int ld=grid->_ldimensions[orthogdim]; - int rd=grid->_rdimensions[orthogdim]; - // will locally sum vectors first // sum across these down to scalars // splitting the SIMD @@ -836,7 +560,7 @@ void A2Autils::AslashField(TensorType &mat, // Nested parallelism would be ok // Wasting cores here. Test case r if (t_kernel) *t_kernel = -usecond(); - thread_for(r,rd, + for(int r=0;r_ostride[orthogdim]; // base offset for start of plane @@ -863,8 +587,8 @@ void A2Autils::AslashField(TensorType &mat, + left()(s2)(1) * right()(s1)(1) + left()(s2)(2) * right()(s1)(2); } - - // After getting the sitewise product do the mom phase loop + + // After getting the sitewise product do the mom phase loop int base = Nem*i+Nem*Lblock*j+Nem*Lblock*Rblock*r; for ( int m=0;m::AslashField(TensorType &mat, } } } - }); + } // Sum across simd lanes in the plane, breaking out orthog dir. thread_for(rt,rd, @@ -950,7 +674,7 @@ void A2Autils::AslashField(TensorType &mat, grid->GlobalSumVector(&mat(0,0,0,0,0),Nem*Nt*Lblock*Rblock); if (t_gsum) *t_gsum += usecond(); } - +#endif //////////////////////////////////////////// // Schematic thoughts about more generalised four quark insertion // @@ -1361,6 +1085,8 @@ Bag [8,4] fig8 (-227.58,3.58808e-17) trtr (-32.5776,1.83286e-17) // - 1602 }); } + + #ifdef DELTA_F_EQ_2 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Perhaps this should move out of the utils and into Hadrons module @@ -1592,5 +1318,534 @@ void A2Autils::DeltaFeq2(int dt_min,int dt_max, } #endif + /* + static void PionFieldWVmom(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + const std::vector &mom, + int orthogdim); + + static void PionFieldXX(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + int orthogdim, + int g5); + + static void PionFieldWV(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + int orthogdim); + static void PionFieldWW(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *wj, + int orthogdim); + static void PionFieldVV(Eigen::Tensor &mat, + const FermionField *vi, + const FermionField *vj, + int orthogdim); + */ + +/* + +template +template +void A2Autils::MesonField(TensorType &mat, + const FermionField *lhs_wi, + const FermionField *rhs_vj, + std::vector gammas, + const std::vector &mom, + int orthogdim, double *t_kernel, double *t_gsum) +{ + typedef typename FImpl::SiteSpinor vobj; + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_type vector_type; + + typedef iSpinMatrix SpinMatrix_v; + typedef iSpinMatrix SpinMatrix_s; + + int Lblock = mat.dimension(3); + int Rblock = mat.dimension(4); + + GridBase *grid = lhs_wi[0].Grid(); + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + int Ngamma = gammas.size(); + int Nmom = mom.size(); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + // will locally sum vectors first + // sum across these down to scalars + // splitting the SIMD + int MFrvol = rd*Lblock*Rblock*Nmom; + int MFlvol = ld*Lblock*Rblock*Nmom; + + std::vector lvSum(MFrvol); + for(int r=0;r lsSum(MFlvol); + for(int r=0;r_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + // potentially wasting cores here 
if local time extent too small + if (t_kernel) *t_kernel = -usecond(); + for(int r=0;r_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n extracted(Nsimd); + + for(int i=0;iiCoorFromIindex(icoor,idx); + + int ldx = rt+icoor[orthogdim]*rd; + + int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx; + + lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]; + + } + }}} + } + if (t_kernel) *t_kernel += usecond(); + assert(mat.dimension(0) == Nmom); + assert(mat.dimension(1) == Ngamma); + assert(mat.dimension(2) == Nt); + + // ld loop and local only?? + int pd = grid->_processors[orthogdim]; + int pc = grid->_processor_coor[orthogdim]; + thread_for_collapse(2,lt,ld,{ + for(int pt=0;ptGlobalSumVector(&mat(0,0,0,0,0),Nmom*Ngamma*Nt*Lblock*Rblock); + if (t_gsum) *t_gsum += usecond(); +} + +template +void A2Autils::PionFieldXX(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + int orthogdim, + int g5) +{ + int Lblock = mat.dimension(1); + int Rblock = mat.dimension(2); + + GridBase *grid = wi[0].Grid(); + + const int nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + // will locally sum vectors first + // sum across these down to scalars + // splitting the SIMD + int MFrvol = rd*Lblock*Rblock; + int MFlvol = ld*Lblock*Rblock; + + std::vector lvSum(MFrvol); + thread_for(r,MFrvol,{ + lvSum[r] = Zero(); + }); + + std::vector lsSum(MFlvol); + thread_for(r,MFlvol,{ + lsSum[r]=scalar_type(0.0); + }); + + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + thread_for(r,rd,{ + + int so=r*grid->_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n temp; + ExtractBuffer > extracted(Nsimd); + + for(int i=0;iiCoorFromIindex(icoor,idx); + + int ldx = rt+icoor[orthogdim]*rd; + + int ij_ldx =i+Lblock*j+Lblock*Rblock*ldx; + + lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal; + + } + }} + }); + + assert(mat.dimension(0) == Nt); + // ld loop and local only?? 
+ int pd = grid->_processors[orthogdim]; + int pc = grid->_processor_coor[orthogdim]; + thread_for_collapse(2,lt,ld,{ + for(int pt=0;ptGlobalSumVector(&mat(0,0,0),Nt*Lblock*Rblock); +} + +template +void A2Autils::PionFieldWVmom(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + const std::vector &mom, + int orthogdim) +{ + int Lblock = mat.dimension(2); + int Rblock = mat.dimension(3); + + GridBase *grid = wi[0].Grid(); + + const int nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + int Nt = grid->GlobalDimensions()[orthogdim]; + int Nmom = mom.size(); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + // will locally sum vectors first + // sum across these down to scalars + // splitting the SIMD + int MFrvol = rd*Lblock*Rblock*Nmom; + int MFlvol = ld*Lblock*Rblock*Nmom; + + std::vector lvSum(MFrvol); + thread_for(r,MFrvol,{ + lvSum[r] = Zero(); + }); + + std::vector lsSum(MFlvol); + thread_for(r,MFlvol,{ + lsSum[r]=scalar_type(0.0); + }); + + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + thread_for(r,rd,{ + + int so=r*grid->_ostride[orthogdim]; // base offset for start of plane + + for(int n=0;n temp; + ExtractBuffer > extracted(Nsimd); + + for(int i=0;iiCoorFromIindex(icoor,idx); + + int ldx = rt+icoor[orthogdim]*rd; + + int ij_ldx = m+Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*ldx; + + lsSum[ij_ldx]=lsSum[ij_ldx]+extracted[idx]._internal; + + } + }}} + }); + + assert(mat.dimension(0) == Nmom); + assert(mat.dimension(1) == Nt); + + int pd = grid->_processors[orthogdim]; + int pc = grid->_processor_coor[orthogdim]; + thread_for_collapse(2,lt,ld,{ + for(int pt=0;ptGlobalSumVector(&mat(0,0,0,0),Nmom*Nt*Lblock*Rblock); +} + +template +void A2Autils::PionFieldWV(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *vj, + int orthogdim) +{ + const int g5=1; + PionFieldXX(mat,wi,vj,orthogdim,g5); +} +template +void A2Autils::PionFieldWW(Eigen::Tensor &mat, + const FermionField *wi, + const FermionField *wj, + int orthogdim) +{ + const int nog5=0; + PionFieldXX(mat,wi,wj,orthogdim,nog5); +} +template +void A2Autils::PionFieldVV(Eigen::Tensor &mat, + const FermionField *vi, + const FermionField *vj, + int orthogdim) +{ + const int nog5=0; + PionFieldXX(mat,vi,vj,orthogdim,nog5); +} +*/ + NAMESPACE_END(Grid); diff --git a/Grid/util/FlightRecorder.cc b/Grid/util/FlightRecorder.cc index c19d3dbb..139e7957 100644 --- a/Grid/util/FlightRecorder.cc +++ b/Grid/util/FlightRecorder.cc @@ -280,10 +280,11 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes) if(LoggingMode == LoggingModeNone) return; if ( ChecksumCommsSend ){ - uint64_t *ubuf = (uint64_t *)buf; - if(LoggingMode == LoggingModeNone) return; + + if(LoggingMode == LoggingModeNone) return; #ifdef GRID_SYCL + uint64_t *ubuf = (uint64_t *)buf; uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); if(LoggingMode == LoggingModePrint) { std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor < L_list({8,12,16,24,32}); + std::vector L_list({8,12,16,24}); int selm1=sel-1; std::vector clover; diff --git a/examples/Example_taku.cc b/examples/Example_taku.cc deleted file mode 100644 index b9ad272e..00000000 --- a/examples/Example_taku.cc +++ /dev/null @@ -1,383 +0,0 @@ -/* - * Warning: This code illustrative only: not well tested, and not meant for production use - * without regression / tests being applied - */ - 
-#include - -using namespace std; -using namespace Grid; - -RealD LLscale =1.0; -RealD LCscale =1.0; - -template class CovariantLaplacianCshift : public SparseMatrixBase -{ -public: - INHERIT_GIMPL_TYPES(Gimpl); - - GridBase *grid; - GaugeField U; - - CovariantLaplacianCshift(GaugeField &_U) : - grid(_U.Grid()), - U(_U) { }; - - virtual GridBase *Grid(void) { return grid; }; - - virtual void M (const Field &in, Field &out) - { - out=Zero(); - for(int mu=0;mu(U, mu); // NB: Inefficent - out = out - Gimpl::CovShiftForward(Umu,mu,in); - out = out - Gimpl::CovShiftBackward(Umu,mu,in); - out = out + 2.0*in; - } - }; - virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian - virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid - virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid - virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid -}; - -void MakePhase(Coordinate mom,LatticeComplex &phase) -{ - GridBase *grid = phase.Grid(); - auto latt_size = grid->GlobalDimensions(); - ComplexD ci(0.0,1.0); - phase=Zero(); - - LatticeComplex coor(phase.Grid()); - for(int mu=0;mu -void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) -{ - typedef CovariantLaplacianCshift Laplacian_t; - Laplacian_t Laplacian(U); - - Integer Iterations = 40; - Real width = 2.0; - Real coeff = (width*width) / Real(4*Iterations); - - Field tmp(U.Grid()); - smeared=unsmeared; - // chi = (1-p^2/2N)^N kronecker - for(int n = 0; n < Iterations; ++n) { - Laplacian.M(smeared,tmp); - smeared = smeared - coeff*tmp; - std::cout << " smear iter " << n<<" " < -void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - GridBase *UGrid = D.GaugeGrid(); - GridBase *FGrid = D.FermionGrid(); - - LatticeFermion src4 (UGrid); - LatticeFermion src5 (FGrid); - LatticeFermion result5(FGrid); - LatticeFermion result4(UGrid); - LatticePropagator prop5(FGrid); - - ConjugateGradient CG(1.0e-8,100000); - SchurRedBlackDiagMooeeSolve schur(CG); - ZeroGuesser ZG; // Could be a DeflatedGuesser if have eigenvectors - for(int s=0;s(src4,source,s,c); - - D.ImportPhysicalFermionSource(src4,src5); - - result5=Zero(); - schur(D,src5,result5,ZG); - std::cout<(prop5,result5,s,c); - FermToProp(propagator,result4,s,c); - } - } - LatticePropagator Axial_mu(UGrid); - LatticePropagator Vector_mu(UGrid); - - LatticeComplex PA (UGrid); - LatticeComplex VV (UGrid); - LatticeComplex PJ5q(UGrid); - LatticeComplex PP (UGrid); - - std::vector sumPA; - std::vector sumVV; - std::vector sumPP; - std::vector sumPJ5q; - - Gamma g5(Gamma::Algebra::Gamma5); - D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir); - PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current - sliceSum(PA,sumPA,Tdir); - - int Nt{static_cast(sumPA.size())}; - - for(int t=0;t >, data); -}; - -void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) -{ - const int nchannel=3; - Gamma::Algebra Gammas[nchannel][2] = { - {Gamma::Algebra::GammaX,Gamma::Algebra::GammaX}, - {Gamma::Algebra::GammaY,Gamma::Algebra::GammaY}, - {Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ} - }; - - Gamma G5(Gamma::Algebra::Gamma5); - - LatticeComplex meson_CF(q1.Grid()); - MesonFile MF; - - for(int ch=0;ch meson_T; - sliceSum(meson_CF,meson_T, Tdir); - - int nt=meson_T.size(); - - std::vector corr(nt); - 
for(int t=0;t seeds4({1,2,3,4}); - GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - - LatticeGaugeField Umu(UGrid); - std::string config; - RealD M5=1.8; - if( argc > 1 && argv[1][0] != '-' ) - { - std::cout<::ColdConfiguration(Umu); - config="ColdConfig"; - // RealD P=1.0; // Don't scale - RealD P=0.5871119; // 48I - // RealD P=0.6153342; // 64I - // RealD P=0.6388238 // 32Ifine - RealD u0 = sqrt(sqrt(P)); - RealD M5mf = M5 - 4.0*(1.0-u0); - RealD w0 = 1.0 - M5mf; -#if 0 - // M5=1.8 with U=u0 - Umu = Umu * u0; - LLscale = 1.0; - LCscale = 1.0; - std::cout< PointProps(nmass,UGrid); - // std::vector GaussProps(nmass,UGrid); - // std::vector Z2Props (nmass,UGrid); - - for(int m=0;m - -using namespace std; -using namespace Grid; - -RealD LLscale =1.0; -RealD LCscale =1.0; - -template class CovariantLaplacianCshift : public SparseMatrixBase -{ -public: - INHERIT_GIMPL_TYPES(Gimpl); - - GridBase *grid; - GaugeField U; - - CovariantLaplacianCshift(GaugeField &_U) : - grid(_U.Grid()), - U(_U) { }; - - virtual GridBase *Grid(void) { return grid; }; - - virtual void M (const Field &in, Field &out) - { - out=Zero(); - for(int mu=0;mu(U, mu); // NB: Inefficent - out = out - Gimpl::CovShiftForward(Umu,mu,in); - out = out - Gimpl::CovShiftBackward(Umu,mu,in); - out = out + 2.0*in; - } - }; - virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian - virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid - virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid - virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid -}; - -void MakePhase(Coordinate mom,LatticeComplex &phase) -{ - GridBase *grid = phase.Grid(); - auto latt_size = grid->GlobalDimensions(); - ComplexD ci(0.0,1.0); - phase=Zero(); - - LatticeComplex coor(phase.Grid()); - for(int mu=0;mu -void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) -{ - typedef CovariantLaplacianCshift Laplacian_t; - Laplacian_t Laplacian(U); - - Integer Iterations = 40; - Real width = 2.0; - Real coeff = (width*width) / Real(4*Iterations); - - Field tmp(U.Grid()); - smeared=unsmeared; - // chi = (1-p^2/2N)^N kronecker - for(int n = 0; n < Iterations; ++n) { - Laplacian.M(smeared,tmp); - smeared = smeared - coeff*tmp; - std::cout << " smear iter " << n<<" " < -void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - GridBase *UGrid = source.Grid(); - GridBase *FGrid = D.FermionGrid(); - bool fiveD = true; //calculate 5d free propagator - RealD mass = D.Mass(); - LatticeFermion src4 (UGrid); - LatticeFermion result4 (UGrid); - LatticeFermion result5(FGrid); - LatticeFermion src5(FGrid); - LatticePropagator prop5(FGrid); - for(int s=0;s(src4,source,s,c); - - D.ImportPhysicalFermionSource(src4,src5); - D.FreePropagator(src5,result5,mass,true); - std::cout<(prop5,result5,s,c); - FermToProp(propagator,result4,s,c); - } - } - - LatticePropagator Vector_mu(UGrid); - LatticeComplex VV (UGrid); - std::vector sumVV; - Gamma::Algebra GammaV[3] = { - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZ - }; - for( int mu=0;mu<3;mu++ ) { - Gamma gV(GammaV[mu]); - D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu); - VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current - sliceSum(VV,sumVV,Tdir); - int Nt = sumVV.size(); - for(int t=0;t -void 
MasslessFreePropagator1(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - bool fiveD = false; //calculate 4d free propagator - RealD mass = D.Mass(); - GridBase *UGrid = source.Grid(); - LatticeFermion src4 (UGrid); - LatticeFermion result4 (UGrid); - for(int s=0;s(src4,source,s,c); - D.FreePropagator(src4,result4,mass,false); - FermToProp(propagator,result4,s,c); - } - } -} - -template -void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - GridBase *UGrid = D.GaugeGrid(); - GridBase *FGrid = D.FermionGrid(); - - LatticeFermion src4 (UGrid); - LatticeFermion src5 (FGrid); - LatticeFermion result5(FGrid); - LatticeFermion result4(UGrid); - LatticePropagator prop5(FGrid); - - ConjugateGradient CG(1.0e-10,100000); - SchurRedBlackDiagMooeeSolve schur(CG); - ZeroGuesser ZG; // Could be a DeflatedGuesser if have eigenvectors - for(int s=0;s(src4,source,s,c); - - D.ImportPhysicalFermionSource(src4,src5); - - result5=Zero(); - schur(D,src5,result5,ZG); - std::cout<(prop5,result5,s,c); - FermToProp(propagator,result4,s,c); - } - } - LatticePropagator Axial_mu(UGrid); - LatticePropagator Vector_mu(UGrid); - - LatticeComplex PA (UGrid); - LatticeComplex VV (UGrid); - LatticeComplex PJ5q(UGrid); - LatticeComplex PP (UGrid); - - std::vector sumPA; - std::vector sumVV; - std::vector sumPP; - std::vector sumPJ5q; - - Gamma g5(Gamma::Algebra::Gamma5); - D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir); - PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current - sliceSum(PA,sumPA,Tdir); - - int Nt{static_cast(sumPA.size())}; - - for(int t=0;t >, data); -}; - -void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) -{ - const int nchannel=4; - Gamma::Algebra Gammas[nchannel][2] = { - {Gamma::Algebra::GammaXGamma5,Gamma::Algebra::GammaXGamma5}, - {Gamma::Algebra::GammaYGamma5,Gamma::Algebra::GammaYGamma5}, - {Gamma::Algebra::GammaZGamma5,Gamma::Algebra::GammaZGamma5}, - {Gamma::Algebra::Identity,Gamma::Algebra::Identity} - }; - - LatticeComplex meson_CF(q1.Grid()); - MesonFile MF; - - for(int ch=0;ch meson_T; - sliceSum(meson_CF,meson_T, Tdir); - - int nt=meson_T.size(); - - std::vector corr(nt); - for(int t=0;t seeds4({1,2,3,4}); - // GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - - LatticeGaugeField Umu(UGrid); - std::string config; - RealD M5=atof(getenv("M5")); - RealD mq = atof(getenv("mass")); - int tadpole = atof(getenv("tadpole")); - std::vector masses({ mq} ); // u/d, s, c ?? 
- if( argc > 1 && argv[1][0] != '-' ) - { - std::cout<::ColdConfiguration(Umu); - config="ColdConfig"; - // RealD P=1.0; // Don't scale - // RealD P=0.6388238 // 32Ifine - // RealD P=0.6153342; // 64I - RealD P=0.5871119; // 48I - RealD u0 = sqrt(sqrt(P)); - RealD w0 = 1 - M5; - std::cout< boundary = {1,1,1,-1}; - FermionActionD::ImplParams Params(boundary); - RealD b=1.5; - RealD c=0.5; - std::cout< PointProps(nmass,UGrid); - // std::vector FreeProps(nmass,UGrid); - // LatticePropagator delta(UGrid); - - for(int m=0;m - -using namespace std; -using namespace Grid; - -RealD LLscale =1.0; -RealD LCscale =1.0; - -template class CovariantLaplacianCshift : public SparseMatrixBase -{ -public: - INHERIT_GIMPL_TYPES(Gimpl); - - GridBase *grid; - GaugeField U; - - CovariantLaplacianCshift(GaugeField &_U) : - grid(_U.Grid()), - U(_U) { }; - - virtual GridBase *Grid(void) { return grid; }; - - virtual void M (const Field &in, Field &out) - { - out=Zero(); - for(int mu=0;mu(U, mu); // NB: Inefficent - out = out - Gimpl::CovShiftForward(Umu,mu,in); - out = out - Gimpl::CovShiftBackward(Umu,mu,in); - out = out + 2.0*in; - } - }; - virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian - virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid - virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid - virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid -}; - -void MakePhase(Coordinate mom,LatticeComplex &phase) -{ - GridBase *grid = phase.Grid(); - auto latt_size = grid->GlobalDimensions(); - ComplexD ci(0.0,1.0); - phase=Zero(); - - LatticeComplex coor(phase.Grid()); - for(int mu=0;mu -void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) -{ - typedef CovariantLaplacianCshift Laplacian_t; - Laplacian_t Laplacian(U); - - Integer Iterations = 40; - Real width = 2.0; - Real coeff = (width*width) / Real(4*Iterations); - - Field tmp(U.Grid()); - smeared=unsmeared; - // chi = (1-p^2/2N)^N kronecker - for(int n = 0; n < Iterations; ++n) { - Laplacian.M(smeared,tmp); - smeared = smeared - coeff*tmp; - std::cout << " smear iter " << n<<" " < -void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - GridBase *UGrid = source.Grid(); - GridBase *FGrid = D.FermionGrid(); - bool fiveD = true; //calculate 4d free propagator - RealD mass = D.Mass(); - LatticeFermion src4 (UGrid); - LatticeFermion result4 (UGrid); - LatticeFermion result5(FGrid); - LatticeFermion src5(FGrid); - LatticePropagator prop5(FGrid); - for(int s=0;s(src4,source,s,c); - - D.ImportPhysicalFermionSource(src4,src5); - D.FreePropagator(src5,result5,mass,true); - std::cout<(prop5,result5,s,c); - FermToProp(propagator,result4,s,c); - } - } - - LatticePropagator Vector_mu(UGrid); - LatticeComplex VV (UGrid); - std::vector sumVV; - Gamma::Algebra GammaV[3] = { - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZ - }; - for( int mu=0;mu<3;mu++ ) { - Gamma gV(GammaV[mu]); - D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu); - VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current - sliceSum(VV,sumVV,Tdir); - int Nt = sumVV.size(); - for(int t=0;t -void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) -{ - GridBase *UGrid = D.GaugeGrid(); - GridBase *FGrid = D.FermionGrid(); - - LatticeFermion src4 
(UGrid); - LatticeFermion src5 (FGrid); - LatticeFermion result5(FGrid); - LatticeFermion result4(UGrid); - LatticePropagator prop5(FGrid); - - ConjugateGradient CG(1.0e-6,100000); - SchurRedBlackDiagMooeeSolve schur(CG); - ZeroGuesser ZG; // Could be a DeflatedGuesser if have eigenvectors - for(int s=0;s(src4,source,s,c); - - D.ImportPhysicalFermionSource(src4,src5); - - result5=Zero(); - schur(D,src5,result5,ZG); - std::cout<(prop5,result5,s,c); - FermToProp(propagator,result4,s,c); - } - } - LatticePropagator Axial_mu(UGrid); - LatticePropagator Vector_mu(UGrid); - - LatticeComplex PA (UGrid); - LatticeComplex VV (UGrid); - LatticeComplex PJ5q(UGrid); - LatticeComplex PP (UGrid); - - std::vector sumPA; - std::vector sumVV; - std::vector sumPP; - std::vector sumPJ5q; - - Gamma g5(Gamma::Algebra::Gamma5); - D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir); - PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current - sliceSum(PA,sumPA,Tdir); - - int Nt{static_cast(sumPA.size())}; - - for(int t=0;t >, data); -}; - -void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) -{ - const int nchannel=3; - Gamma::Algebra Gammas[nchannel][2] = { - {Gamma::Algebra::GammaX,Gamma::Algebra::GammaX}, - {Gamma::Algebra::GammaY,Gamma::Algebra::GammaY}, - // {Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ} - {Gamma::Algebra::Gamma5,Gamma::Algebra::Gamma5} - }; - - Gamma G5(Gamma::Algebra::Gamma5); - - LatticeComplex meson_CF(q1.Grid()); - MesonFile MF; - - for(int ch=0;ch meson_T; - sliceSum(meson_CF,meson_T, Tdir); - - int nt=meson_T.size(); - - std::vector corr(nt); - for(int t=0;t seeds4({1,2,3,4}); - // GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - - LatticeGaugeField Umu(UGrid); - std::string config; - RealD M5=atof(getenv("M5")); - RealD mq = atof(getenv("mass")); - std::vector masses({ mq} ); // u/d, s, c ?? - if( argc > 1 && argv[1][0] != '-' ) - { - std::cout<::ColdConfiguration(Umu); - config="ColdConfig"; - // RealD P=1.0; // Don't scale - // RealD P=0.6153342; // 64I - // RealD P=0.6388238 // 32Ifine - // RealD P=0.5871119; // 48I - // RealD u0 = sqrt(sqrt(P)); - // Umu = Umu * u0; - RealD w0 = 1 - M5; - LLscale = 1.0/(1-w0*w0)/(1-w0*w0); - LCscale = 1.0/(1-w0*w0)/(1-w0*w0); - std::cout< PointProps(nmass,UGrid); - std::vector FreeProps(nmass,UGrid); - LatticePropagator delta(UGrid); - - for(int m=0;m phi(VDIM,&grid); + std::vector B0(Nem,&grid); + std::vector B1(Nem,&grid); std::cout << GridLogMessage << "Initialising random meson fields" << std::endl; for (unsigned int i = 0; i < VDIM; ++i){ random(pRNG,phi[i]); } + for (unsigned int i = 0; i < Nem; ++i){ + random(pRNG,B0[i]); + random(pRNG,B1[i]); + } std::cout << GridLogMessage << "Meson fields initialised, rho non-zero only for t = " << TSRC << std::endl; // Gamma matrices used in the contraction std::vector Gmu = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT, Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, Gamma::Algebra::GammaZ, @@ -74,11 +85,15 @@ int main(int argc, char *argv[]) std::vector> momenta = { {0.,0.,0.}, {1.,0.,0.}, + {-1.,0.,0.}, + {0,1.,0.}, + {0,-1.,0.}, + {0,0,1.}, + {0,0,-1.}, {1.,1.,0.}, {1.,1.,1.}, {2.,0.,0.} }; - // 5 momenta x VDIMxVDIM = 125 calls (x 16 spins) 1.4s => 1400/125 ~10ms per call std::cout << GridLogMessage << "Meson fields will be created for " << Gmu.size() << " Gamma matrices and " << momenta.size() << " momenta." 
<< std::endl; std::cout << GridLogMessage << "Computing complex phases" << std::endl; @@ -98,46 +113,28 @@ int main(int argc, char *argv[]) std::cout << GridLogMessage << "Computing complex phases done." << std::endl; Eigen::Tensor Mpp(momenta.size(),Gmu.size(),Nt,VDIM,VDIM); - Eigen::Tensor Mpp_gpu(momenta.size(),Gmu.size(),Nt,VDIM,VDIM); + Eigen::Tensor App(B0.size(),1,Nt,VDIM,VDIM); // timer double start,stop; + ///////////////////////////////////////////////////////////////////////// //execute meson field routine - std::cout << GridLogMessage << "Meson Field Warmup Begin" << std::endl; + ///////////////////////////////////////////////////////////////////////// A2Autils::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp); - std::cout << GridLogMessage << "Meson Field Timing Begin" << std::endl; start = usecond(); A2Autils::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp); stop = usecond(); std::cout << GridLogMessage << "M(phi,phi) created, execution time " << stop-start << " us" << std::endl; - std::cout << GridLogMessage << "Meson Field GPU Warmup Begin" << std::endl; - A2Autils::MesonFieldGPU(Mpp_gpu,&phi[0],&phi[0],Gmu,phases,Tp); - std::cout << GridLogMessage << "Meson Field GPU Timing Begin" << std::endl; + ///////////////////////////////////////////////////////////////////////// + //execute aslash field routine + ///////////////////////////////////////////////////////////////////////// + A2Autils::AslashField(App,&phi[0],&phi[0],B0,B1,Tp); start = usecond(); - A2Autils::MesonFieldGPU(Mpp_gpu,&phi[0],&phi[0],Gmu,phases,Tp); + A2Autils::AslashField(App,&phi[0],&phi[0],B0,B1,Tp); stop = usecond(); - std::cout << GridLogMessage << "M_gpu(phi,phi) created, execution time " << stop-start << " us" << std::endl; - - for(int mom=0;mom
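
Note on the `using Action<GaugeField>::S;` / `::Sinitial;` / `::deriv;` / `::refresh;` declarations added to EmptyAction and WilsonGaugeAction: these address C++ name hiding. When a derived class overrides one virtual overload, every other base-class overload with the same name is hidden unless re-exposed with a using-declaration. A minimal standalone sketch of the pattern — the class names here are illustrative, not Grid types:

    #include <iostream>

    struct Base {
      virtual ~Base() = default;
      virtual double S(int u)            { return 1.0; } // overload the derived class overrides
      virtual double S(int u, int extra) { return 2.0; } // overload that would otherwise be hidden
    };

    struct Derived : Base {
      using Base::S;                     // re-expose Base::S overloads hidden by the override below
      double S(int u) override { return 3.0; }
    };

    int main() {
      Derived d;
      std::cout << d.S(0) << " " << d.S(0, 1) << "\n";   // prints "3 2"; without the
                                                         // using-declaration, d.S(0,1) fails to compile
    }

This also silences -Woverloaded-virtual warnings, which is the usual motivation for adding the declarations wholesale as the patch does.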
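
The rewritten A2A_GPU_KERNELS MesonField walks the right-hand index j in chunks of A2Ablocking=8: for each (i, jo) pair it fills a lattice-wide iVecSpinMatrix buffer on the device, reduces it once with sliceSum, then scatters the traced results into mat. The control flow, stripped down to a scalar stand-in (the names and the flat buffers here are illustrative, not Grid's accelerator types):

    #include <algorithm>
    #include <vector>

    constexpr int block = 8; // mirrors A2Ablocking in the patch

    // Blocked contraction pattern: accumulate per-site products into a
    // block-sized buffer, reduce once per block, then scatter to the output.
    void blockedContraction(int Lblock, int Rblock, int sites,
                            const std::vector<double>& lhs,  // [i * sites + ss]
                            const std::vector<double>& rhs,  // [j * sites + ss]
                            std::vector<double>& mat)        // [i * Rblock + j]
    {
      std::vector<double> buf(block);
      for (int i = 0; i < Lblock; i++) {
        for (int jo = 0; jo < Rblock; jo += block) {         // outer block loop ("jo" in the patch)
          std::fill(buf.begin(), buf.end(), 0.0);
          for (int ss = 0; ss < sites; ss++)                 // site loop (accelerator_for in the patch)
            for (int j = jo; j < jo + block && j < Rblock; j++)
              buf[j - jo] += lhs[i * sites + ss] * rhs[j * sites + ss];
          for (int j = jo; j < jo + block && j < Rblock; j++)
            mat[i * Rblock + j] = buf[j - jo];               // sliceSum + trace scatter in the patch
        }
      }
    }

The design choice is to cap the working set of the per-block reduction at a fixed size rather than reducing one enormous (Lblock x Rblock) object, at the cost of re-reading lhs once per jo chunk.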
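
The spin-space block structure documented above the new AslashField kernel (i * A_mu gamma_mu written in terms of B_0 = A_0 + i A_1 and B_1 = A_2 + i A_3) can be written out directly. A minimal sketch using std::complex instead of Grid's tensor types, assuming real photon-field components and the same chiral gamma basis as the comment, filling the 4x4 spin matrix exactly as stated there:

    #include <array>
    #include <complex>

    using Cplx    = std::complex<double>;
    using SpinMat = std::array<std::array<Cplx, 4>, 4>;

    // Build i * A_mu gamma_mu at one site from the four photon components,
    // via the packing B_0 = A_0 + i A_1, B_1 = A_2 + i A_3.
    SpinMat iAslash(const std::array<double, 4>& A)
    {
      const Cplx I(0.0, 1.0);
      const Cplx B0 = A[0] + I * A[1];
      const Cplx B1 = A[2] + I * A[3];

      SpinMat M{};  // zero-initialised; only the off-diagonal 2x2 blocks are filled
      M[0][2] = -std::conj(B1);  M[0][3] = -B0;
      M[1][2] = -std::conj(B0);  M[1][3] =  B1;
      M[2][0] =  B1;             M[2][1] =  B0;
      M[3][0] =  std::conj(B0);  M[3][1] = -std::conj(B1);
      return M;
    }

This is the same matrix the device loop over (s1, s2) assembles per site before contracting with w_i^dag ... v_j, and it explains why the kernel only needs the two complex fields emB0 and emB1 rather than all four A_mu.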