diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
index 4977ea68..84ac25c1 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h
@@ -397,6 +397,7 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
 template <class Impl>
 void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls+=2;
   conformable(in.Grid(), _grid);  // verifies full grid
   conformable(in.Grid(), out.Grid());
 
@@ -408,6 +409,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
 template <class Impl>
 void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls++;
   conformable(in.Grid(), _cbgrid);    // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check
 
@@ -420,6 +422,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
 template <class Impl>
 void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls++;
   conformable(in.Grid(), _cbgrid);    // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check
 
diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h
index 70055754..77b7de52 100644
--- a/Grid/qcd/hmc/integrators/Integrator.h
+++ b/Grid/qcd/hmc/integrators/Integrator.h
@@ -33,6 +33,7 @@ directory
 #define INTEGRATOR_INCLUDED
 
 #include <memory>
+#include "MomentumFilter.h"
 
 NAMESPACE_BEGIN(Grid);
 
@@ -78,8 +79,19 @@ protected:
   RepresentationPolicy Representations;
   IntegratorParameters Params;
 
+  //Filters allow the user to manipulate the conjugate momentum, for example to freeze links in DDHMC
+  //It is applied whenever the momentum is updated / refreshed
+  //The default filter does nothing
+  MomentumFilterBase<MomentaField> const* MomFilter;
+
   const ActionSet<Field, RepresentationPolicy> as;
 
+  //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default
+  static MomentumFilterBase<MomentaField> const* getDefaultMomFilter(){ 
+    static MomentumFilterNone<MomentaField> filter;
+    return &filter;
+  }
+
   void update_P(Field& U, int level, double ep) 
   {
     t_P[level] += ep;
@@ -135,6 +147,8 @@ protected:
 
     // Force from the other representations
     as[level].apply(update_P_hireps, Representations, Mom, U, ep);
+
+    MomFilter->applyFilter(Mom);
   }
 
   void update_U(Field& U, double ep) 
@@ -174,11 +188,23 @@ public:
     t_P.resize(levels, 0.0);
     t_U = 0.0;
     // initialization of smearer delegated outside of Integrator
+
+    //Default the momentum filter to "do-nothing"
+    MomFilter = getDefaultMomFilter();
   };
 
   virtual ~Integrator() {}
 
   virtual std::string integrator_name() = 0;
+  
+  //Set the momentum filter allowing for manipulation of the conjugate momentum
+  void setMomentumFilter(const MomentumFilterBase<MomentaField> &filter){
+    MomFilter = &filter;
+  }
+
+  //Access the conjugate momentum
+  const MomentaField & getMomentum() const{ return P; }
+  
 
   void print_parameters()
   {
@@ -249,6 +275,8 @@ public:
       // Refresh the higher representation actions
       as[level].apply(refresh_hireps, Representations, pRNG);
     }
+
+    MomFilter->applyFilter(P);
   }
 
   // to be used by the actionlevel class to iterate
diff --git a/Grid/qcd/hmc/integrators/MomentumFilter.h b/Grid/qcd/hmc/integrators/MomentumFilter.h
new file mode 100644
index 00000000..2a15d80c
--- /dev/null
+++ b/Grid/qcd/hmc/integrators/MomentumFilter.h
@@ -0,0 +1,94 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/hmc/integrators/MomentumFilter.h
+
+Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/*  END LEGAL */
+//--------------------------------------------------------------------
+#ifndef MOMENTUM_FILTER
+#define MOMENTUM_FILTER
+
+NAMESPACE_BEGIN(Grid);
+
+//These filter objects allow the user to manipulate the conjugate momentum as part of the update / refresh
+
+template<typename MomentaField>
+struct MomentumFilterBase{
+  virtual void applyFilter(MomentaField &P) const;
+};
+
+//Do nothing
+template<typename MomentaField>
+struct MomentumFilterNone: public MomentumFilterBase<MomentaField>{
+  void applyFilter(MomentaField &P) const override{}
+};
+
+//Multiply each site/direction by a Lorentz vector complex number field
+//Can be used to implement a mask, zeroing out sites
+template<typename MomentaField>
+struct MomentumFilterApplyPhase: public MomentumFilterBase<MomentaField>{
+  typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type
+  typedef typename MomentaField::scalar_type scalar_type; //scalar complex type
+  typedef iVector<iScalar<iScalar<vector_type> >, Nd > LorentzScalarType; //complex phase for each site/direction
+  typedef Lattice<LorentzScalarType> LatticeLorentzScalarType;
+  
+  LatticeLorentzScalarType phase;
+ 
+  MomentumFilterApplyPhase(const LatticeLorentzScalarType _phase): phase(_phase){}
+
+  //Default to uniform field of (1,0)
+  MomentumFilterApplyPhase(GridBase* _grid): phase(_grid){
+    LorentzScalarType one;
+    for(int mu=0;mu<Nd;mu++)
+      one(mu)()() = scalar_type(1.);
+    
+    phase = one;
+  }
+
+  void applyFilter(MomentaField &P) const override{
+    conformable(P,phase);
+    autoView( P_v , P, AcceleratorWrite);
+    autoView( phase_v , phase, AcceleratorRead);
+
+    accelerator_for(ss,P_v.size(),MomentaField::vector_type::Nsimd(),{
+    	auto site_mom = P_v(ss);
+    	auto site_phase = phase_v(ss);
+	for(int mu=0;mu<Nd;mu++)
+	  site_mom(mu) = site_mom(mu) * site_phase(mu);
+    	coalescedWrite(P_v[ss], site_mom);
+      });
+    
+  }
+
+
+};
+
+
+
+
+NAMESPACE_END(Grid);
+
+#endif
diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h
index 15516b56..4ac5f685 100644
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -40,28 +40,21 @@ public:
   typedef typename FImpl::FermionField FermionField;
   typedef typename FImpl::PropagatorField PropagatorField;
 
-  typedef typename FImpl::SitePropagator pobj;
-  typedef typename ComplexField::vector_object vobj;
-
   typedef Lattice<iSpinMatrix<typename FImpl::Simd>> SpinMatrixField;
-  typedef typename SpinMatrixField::vector_object sobj;
-
-  static const int epsilon[6][3] ;
-  static const Real epsilon_sgn[6];
 
   private: 
-  template <class mobj, class robj>
+  template <class mobj, class robj> accelerator_inline
   static void BaryonSite(const mobj &D1,
-				 const mobj &D2,
-				 const mobj &D3,
-				 const Gamma GammaA_left,
-				 const Gamma GammaB_left,
-				 const Gamma GammaA_right,
-				 const Gamma GammaB_right,
-				 const int parity,
-				 const bool * wick_contractions,
-  				 robj &result);
-  template <class mobj, class robj>
+         const mobj &D2,
+         const mobj &D3,
+         const Gamma GammaA_left,
+         const Gamma GammaB_left,
+         const Gamma GammaA_right,
+         const Gamma GammaB_right,
+         const int parity,
+         const int wick_contractions,
+         robj &result);
+  template <class mobj, class robj> accelerator_inline
   static void BaryonSiteMatrix(const mobj &D1,
          const mobj &D2,
          const mobj &D3,
@@ -69,22 +62,22 @@ public:
          const Gamma GammaB_left,
          const Gamma GammaA_right,
          const Gamma GammaB_right,
-         const bool * wick_contractions,
-           robj &result);
+         const int wick_contractions,
+         robj &result);
   public:
   static void WickContractions(std::string qi, 
                  std::string qf, 
-                 bool* wick_contractions);
+                 int &wick_contractions);
   static void ContractBaryons(const PropagatorField &q1_left,
-				 const PropagatorField &q2_left,
-				 const PropagatorField &q3_left,
-				 const Gamma GammaA_left,
-				 const Gamma GammaB_left,
-				 const Gamma GammaA_right,
-				 const Gamma GammaB_right,
-				 const bool* wick_contractions,
-				 const int parity,
-				 ComplexField &baryon_corr);
+         const PropagatorField &q2_left,
+         const PropagatorField &q3_left,
+         const Gamma GammaA_left,
+         const Gamma GammaB_left,
+         const Gamma GammaA_right,
+         const Gamma GammaB_right,
+         const int wick_contractions,
+         const int parity,
+         ComplexField &baryon_corr);
   static void ContractBaryonsMatrix(const PropagatorField &q1_left,
          const PropagatorField &q2_left,
          const PropagatorField &q3_left,
@@ -92,20 +85,20 @@ public:
          const Gamma GammaB_left,
          const Gamma GammaA_right,
          const Gamma GammaB_right,
-         const bool* wick_contractions,
+         const int wick_contractions,
          SpinMatrixField &baryon_corr);
   template <class mobj, class robj>
   static void ContractBaryonsSliced(const mobj &D1,
-				 const mobj &D2,
-				 const mobj &D3,
-				 const Gamma GammaA_left,
-				 const Gamma GammaB_left,
-				 const Gamma GammaA_right,
-				 const Gamma GammaB_right,
-				 const bool* wick_contractions,
-				 const int parity,
-				 const int nt,
-				 robj &result);
+         const mobj &D2,
+         const mobj &D3,
+         const Gamma GammaA_left,
+         const Gamma GammaB_left,
+         const Gamma GammaA_right,
+         const Gamma GammaB_right,
+         const int wick_contractions,
+         const int parity,
+         const int nt,
+         robj &result);
   template <class mobj, class robj>
   static void ContractBaryonsSlicedMatrix(const mobj &D1,
          const mobj &D2,
@@ -114,43 +107,43 @@ public:
          const Gamma GammaB_left,
          const Gamma GammaA_right,
          const Gamma GammaB_right,
-         const bool* wick_contractions,
+         const int wick_contractions,
          const int nt,
          robj &result);
   private:
-  template <class mobj, class mobj2, class robj>
+  template <class mobj, class mobj2, class robj> accelerator_inline
   static void BaryonGamma3ptGroup1Site(
            const mobj &Dq1_ti,
            const mobj2 &Dq2_spec,
            const mobj2 &Dq3_spec,
            const mobj &Dq4_tf,
-                   const Gamma GammaJ,
-                   const Gamma GammaBi,
-                   const Gamma GammaBf,
+           const Gamma GammaJ,
+           const Gamma GammaBi,
+           const Gamma GammaBf,
            int wick_contraction,
            robj &result);
 
-  template <class mobj, class mobj2, class robj>
+  template <class mobj, class mobj2, class robj> accelerator_inline
   static void BaryonGamma3ptGroup2Site(
            const mobj2 &Dq1_spec,
            const mobj &Dq2_ti,
            const mobj2 &Dq3_spec,
            const mobj &Dq4_tf,
-                   const Gamma GammaJ,
-                   const Gamma GammaBi,
-                   const Gamma GammaBf,
+           const Gamma GammaJ,
+           const Gamma GammaBi,
+           const Gamma GammaBf,
            int wick_contraction,
            robj &result);
 
-  template <class mobj, class mobj2, class robj>
+  template <class mobj, class mobj2, class robj> accelerator_inline
   static void BaryonGamma3ptGroup3Site(
            const mobj2 &Dq1_spec,
            const mobj2 &Dq2_spec,
            const mobj &Dq3_ti,
            const mobj &Dq4_tf,
-                   const Gamma GammaJ,
-                   const Gamma GammaBi,
-                   const Gamma GammaBf,
+           const Gamma GammaJ,
+           const Gamma GammaBi,
+           const Gamma GammaBf,
            int wick_contraction,
            robj &result);
   public:
@@ -162,321 +155,321 @@ public:
            const PropagatorField &q_tf,
            int group,
            int wick_contraction,
-                   const Gamma GammaJ,
-                   const Gamma GammaBi,
-                   const Gamma GammaBf,
+           const Gamma GammaJ,
+           const Gamma GammaBi,
+           const Gamma GammaBf,
            SpinMatrixField &stn_corr);
   private: 
-  template <class mobj, class mobj2, class robj>
+  template <class mobj, class mobj2, class robj> accelerator_inline
   static void SigmaToNucleonQ1EyeSite(const mobj &Dq_loop,
-						 const mobj2 &Du_spec,
-						 const mobj &Dd_tf,
-						 const mobj &Ds_ti,
-				                 const Gamma Gamma_H,
-				                 const Gamma GammaB_sigma,
-		                 		 const Gamma GammaB_nucl,
-						 robj &result);
-  template <class mobj, class mobj2, class robj>
+             const mobj2 &Du_spec,
+             const mobj &Dd_tf,
+             const mobj &Ds_ti,
+             const Gamma Gamma_H,
+             const Gamma GammaB_sigma,
+             const Gamma GammaB_nucl,
+             robj &result);
+  template <class mobj, class mobj2, class robj> accelerator_inline
   static void SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti,
-						 const mobj &Du_tf,
-						 const mobj2 &Du_spec,
-						 const mobj &Dd_tf,
-						 const mobj &Ds_ti,
-				                 const Gamma Gamma_H,
-				                 const Gamma GammaB_sigma,
-		                 		 const Gamma GammaB_nucl,
-						 robj &result);
+             const mobj &Du_tf,
+             const mobj2 &Du_spec,
+             const mobj &Dd_tf,
+             const mobj &Ds_ti,
+             const Gamma Gamma_H,
+             const Gamma GammaB_sigma,
+             const Gamma GammaB_nucl,
+             robj &result);
 
 
-  template <class mobj, class mobj2, class robj>
+  template <class mobj, class mobj2, class robj> accelerator_inline
   static void SigmaToNucleonQ2EyeSite(const mobj &Dq_loop,
-						 const mobj2 &Du_spec,
-						 const mobj &Dd_tf,
-						 const mobj &Ds_ti,
-				                 const Gamma Gamma_H,
-				                 const Gamma GammaB_sigma,
-		                 		 const Gamma GammaB_nucl,
-						 robj &result);
-  template <class mobj, class mobj2, class robj>
+             const mobj2 &Du_spec,
+             const mobj &Dd_tf,
+             const mobj &Ds_ti,
+             const Gamma Gamma_H,
+             const Gamma GammaB_sigma,
+             const Gamma GammaB_nucl,
+             robj &result);
+  template <class mobj, class mobj2, class robj> accelerator_inline
   static void SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti,
-						 const mobj &Du_tf,
-						 const mobj2 &Du_spec,
-						 const mobj &Dd_tf,
-						 const mobj &Ds_ti,
-				                 const Gamma Gamma_H,
-				                 const Gamma GammaB_sigma,
-		                 		 const Gamma GammaB_nucl,
-						 robj &result);
+             const mobj &Du_tf,
+             const mobj2 &Du_spec,
+             const mobj &Dd_tf,
+             const mobj &Ds_ti,
+             const Gamma Gamma_H,
+             const Gamma GammaB_sigma,
+             const Gamma GammaB_nucl,
+             robj &result);
   public:
   template <class mobj>
   static void SigmaToNucleonEye(const PropagatorField &qq_loop,
-				 const mobj &Du_spec,
-				 const PropagatorField &qd_tf,
-				 const PropagatorField &qs_ti,
-				 const Gamma Gamma_H,
-				 const Gamma GammaB_sigma,
-				 const Gamma GammaB_nucl,
-		                 const std::string op,
-				 SpinMatrixField &stn_corr);
+         const mobj &Du_spec,
+         const PropagatorField &qd_tf,
+         const PropagatorField &qs_ti,
+         const Gamma Gamma_H,
+         const Gamma GammaB_sigma,
+         const Gamma GammaB_nucl,
+         const std::string op,
+         SpinMatrixField &stn_corr);
   template <class mobj>
   static void SigmaToNucleonNonEye(const PropagatorField &qq_ti,
-				 const PropagatorField &qq_tf,
-				 const mobj &Du_spec,
-				 const PropagatorField &qd_tf,
-				 const PropagatorField &qs_ti,
-				 const Gamma Gamma_H,
-				 const Gamma GammaB_sigma,
-				 const Gamma GammaB_nucl,
-		                 const std::string op,
-				 SpinMatrixField &stn_corr);
+         const PropagatorField &qq_tf,
+         const mobj &Du_spec,
+         const PropagatorField &qd_tf,
+         const PropagatorField &qs_ti,
+         const Gamma Gamma_H,
+         const Gamma GammaB_sigma,
+         const Gamma GammaB_nucl,
+         const std::string op,
+         SpinMatrixField &stn_corr);
 };
-
-template <class FImpl> 
-const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
-/*template <class FImpl> 
-const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
-						    Complex(1),
-						    Complex(1),
-						    Complex(-1),
-						    Complex(-1),
-						    Complex(-1)};
-*/
-template <class FImpl> 
-const Real BaryonUtils<FImpl>::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.};
-
-//This is the old version
+//This computes a baryon contraction on a lattice site, including the spin-trace of the correlation matrix
 template <class FImpl>
-template <class mobj, class robj>
+template <class mobj, class robj> accelerator_inline
 void BaryonUtils<FImpl>::BaryonSite(const mobj &D1,
                 const mobj &D2,
                 const mobj &D3,
-                         const Gamma GammaA_i,
-                         const Gamma GammaB_i,
-                         const Gamma GammaA_f,
-                         const Gamma GammaB_f,
+                const Gamma GammaA_i,
+                const Gamma GammaB_i,
+                const Gamma GammaA_f,
+                const Gamma GammaB_f,
                 const int parity,
-                const bool * wick_contraction,
+                const int wick_contraction,
                 robj &result)
 {
 
-    Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
+  Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
     
-    auto D1_GAi =  D1 * GammaA_i;
-    auto D1_GAi_g4 = D1_GAi * g4;
-    auto D1_GAi_P = 0.5*(D1_GAi + (Real)parity * D1_GAi_g4);
-    auto GAf_D1_GAi_P = GammaA_f * D1_GAi_P;
-    auto GBf_D1_GAi_P = GammaB_f * D1_GAi_P;
+  auto D1_GAi =  D1 * GammaA_i;
+  auto D1_GAi_g4 = D1_GAi * g4;
+  auto D1_GAi_P = 0.5*(D1_GAi + (Real)parity * D1_GAi_g4);
+  auto GAf_D1_GAi_P = GammaA_f * D1_GAi_P;
+  auto GBf_D1_GAi_P = GammaB_f * D1_GAi_P;
 
-    auto D2_GBi = D2 * GammaB_i;
-    auto GBf_D2_GBi = GammaB_f * D2_GBi;
-    auto GAf_D2_GBi = GammaA_f * D2_GBi;
+  auto D2_GBi = D2 * GammaB_i;
+  auto GBf_D2_GBi = GammaB_f * D2_GBi;
+  auto GAf_D2_GBi = GammaA_f * D2_GBi;
 
-    auto GBf_D3 = GammaB_f * D3;
-    auto GAf_D3 = GammaA_f * D3;
+  auto GBf_D3 = GammaB_f * D3;
+  auto GAf_D3 = GammaA_f * D3;
 
-    for (int ie_f=0; ie_f < 6 ; ie_f++){
-        int a_f = epsilon[ie_f][0]; //a
-        int b_f = epsilon[ie_f][1]; //b
-        int c_f = epsilon[ie_f][2]; //c
+  Real ee;
+
+  for (int ie_f=0; ie_f < 6 ; ie_f++){
+    int a_f    = (ie_f < 3 ? ie_f       : (6-ie_f)%3 ); //epsilon[ie_n][0]; //a
+    int b_f    = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b
+    int c_f    = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c
+    int eSgn_f = (ie_f < 3 ? 1 : -1);
     for (int ie_i=0; ie_i < 6 ; ie_i++){
-        int a_i = epsilon[ie_i][0]; //a'
-        int b_i = epsilon[ie_i][1]; //b'
-        int c_i = epsilon[ie_i][2]; //c'
+      int a_i = (ie_i < 3 ? ie_i       : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a'
+      int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b'
+      int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c'
+      int eSgn_i = (ie_i < 3 ? 1 : -1);
 
-        Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i];
-        //This is the \delta_{456}^{123} part
-        if (wick_contraction[0]){
-            for (int rho=0; rho<Ns; rho++){
-                auto GAf_D1_GAi_P_rr_cc = GAf_D1_GAi_P()(rho,rho)(c_f,c_i);
-                for (int alpha_f=0; alpha_f<Ns; alpha_f++){
-                for (int beta_i=0; beta_i<Ns; beta_i++){
-                    result()()() += ee  * GAf_D1_GAi_P_rr_cc
+      ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s];
+      //This is the \delta_{456}^{123} part
+      if (wick_contraction & 1){
+        for (int rho=0; rho<Ns; rho++){
+          auto GAf_D1_GAi_P_rr_cc = GAf_D1_GAi_P()(rho,rho)(c_f,c_i);
+          for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+          for (int beta_i=0; beta_i<Ns; beta_i++){
+            result()()() += ee  * GAf_D1_GAi_P_rr_cc
                                         * D2_GBi    ()(alpha_f,beta_i)(a_f,a_i)
                                         * GBf_D3    ()(alpha_f,beta_i)(b_f,b_i);
-                }}
-            }
-        }   
-        //This is the \delta_{456}^{231} part
-        if (wick_contraction[1]){
-            for (int rho=0; rho<Ns; rho++){
-            for (int alpha_f=0; alpha_f<Ns; alpha_f++){
-                auto D1_GAi_P_ar_ac = D1_GAi_P()(alpha_f,rho)(a_f,c_i);
-                for (int beta_i=0; beta_i<Ns; beta_i++){
-                    result()()() += ee  * D1_GAi_P_ar_ac
+          }}
+        }
+      }   
+      //This is the \delta_{456}^{231} part
+      if (wick_contraction & 2){
+        for (int rho=0; rho<Ns; rho++){
+        for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+          auto D1_GAi_P_ar_ac = D1_GAi_P()(alpha_f,rho)(a_f,c_i);
+          for (int beta_i=0; beta_i<Ns; beta_i++){
+            result()()() += ee  * D1_GAi_P_ar_ac
                                         * GBf_D2_GBi    ()(alpha_f,beta_i)(b_f,a_i)
                                         * GAf_D3        ()(rho,beta_i)(c_f,b_i);
-                }
-            }}
-        }   
-        //This is the \delta_{456}^{312} part
-        if (wick_contraction[2]){
-            for (int rho=0; rho<Ns; rho++){
-                for (int alpha_f=0; alpha_f<Ns; alpha_f++){
-                auto GBf_D1_GAi_P_ar_bc = GBf_D1_GAi_P()(alpha_f,rho)(b_f,c_i);
-                for (int beta_i=0; beta_i<Ns; beta_i++){
-                    result()()() += ee  * GBf_D1_GAi_P_ar_bc
+          }
+        }}
+      }   
+      //This is the \delta_{456}^{312} part
+      if (wick_contraction & 4){
+        for (int rho=0; rho<Ns; rho++){
+        for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+          auto GBf_D1_GAi_P_ar_bc = GBf_D1_GAi_P()(alpha_f,rho)(b_f,c_i);
+          for (int beta_i=0; beta_i<Ns; beta_i++){
+            result()()() += ee  * GBf_D1_GAi_P_ar_bc
                                         * GAf_D2_GBi    ()(rho,beta_i)(c_f,a_i)
                                         * D3            ()(alpha_f,beta_i)(a_f,b_i);
-                }
-            }}
-        }   
-        //This is the \delta_{456}^{132} part
-        if (wick_contraction[3]){
-            for (int rho=0; rho<Ns; rho++){
-                auto GAf_D1_GAi_P_rr_cc = GAf_D1_GAi_P()(rho,rho)(c_f,c_i);
-                for (int alpha_f=0; alpha_f<Ns; alpha_f++){
-                for (int beta_i=0; beta_i<Ns; beta_i++){
-                    result()()() -= ee  * GAf_D1_GAi_P_rr_cc
+          }
+        }}
+      }   
+      //This is the \delta_{456}^{132} part
+      if (wick_contraction & 8){
+        for (int rho=0; rho<Ns; rho++){
+          auto GAf_D1_GAi_P_rr_cc = GAf_D1_GAi_P()(rho,rho)(c_f,c_i);
+          for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+          for (int beta_i=0; beta_i<Ns; beta_i++){
+            result()()() -= ee  * GAf_D1_GAi_P_rr_cc
                                         * GBf_D2_GBi    ()(alpha_f,beta_i)(b_f,a_i)
                                         * D3            ()(alpha_f,beta_i)(a_f,b_i);
-              }
-        }}
-        }   
-        //This is the \delta_{456}^{321} part
-        if (wick_contraction[4]){
-            for (int rho=0; rho<Ns; rho++){
-            for (int alpha_f=0; alpha_f<Ns; alpha_f++){
-                auto GBf_D1_GAi_P_ar_bc = GBf_D1_GAi_P()(alpha_f,rho)(b_f,c_i);
-                for (int beta_i=0; beta_i<Ns; beta_i++){
-                    result()()() -= ee  * GBf_D1_GAi_P_ar_bc
+          }}
+        }
+      }   
+      //This is the \delta_{456}^{321} part
+      if (wick_contraction & 16){
+        for (int rho=0; rho<Ns; rho++){
+        for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+          auto GBf_D1_GAi_P_ar_bc = GBf_D1_GAi_P()(alpha_f,rho)(b_f,c_i);
+          for (int beta_i=0; beta_i<Ns; beta_i++){
+            result()()() -= ee  * GBf_D1_GAi_P_ar_bc
                                         * D2_GBi    ()(alpha_f,beta_i)(a_f,a_i)
                                         * GAf_D3    ()(rho,beta_i)(c_f,b_i);
-                }
-            }}
-        }   
-        //This is the \delta_{456}^{213} part
-        if (wick_contraction[5]){
-            for (int rho=0; rho<Ns; rho++){
-            for (int alpha_f=0; alpha_f<Ns; alpha_f++){
-                auto D1_GAi_P_ar_ac = D1_GAi_P()(alpha_f,rho)(a_f,c_i);
-                for (int beta_i=0; beta_i<Ns; beta_i++){
-                    result()()() -= ee  * D1_GAi_P_ar_ac
+          }
+        }}
+      }   
+      //This is the \delta_{456}^{213} part
+      if (wick_contraction & 32){
+        for (int rho=0; rho<Ns; rho++){
+        for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+          auto D1_GAi_P_ar_ac = D1_GAi_P()(alpha_f,rho)(a_f,c_i);
+          for (int beta_i=0; beta_i<Ns; beta_i++){
+            result()()() -= ee  * D1_GAi_P_ar_ac
                                         * GAf_D2_GBi    ()(rho,beta_i)(c_f,a_i)
                                         * GBf_D3        ()(alpha_f,beta_i)(b_f,b_i);
-                }
-            }}
-        }
-    }}
+          }
+        }}
+      }
+    }
+  }
 }
 
 //New version without parity projection or trace
 template <class FImpl>
-template <class mobj, class robj>
+template <class mobj, class robj> accelerator_inline
 void BaryonUtils<FImpl>::BaryonSiteMatrix(const mobj &D1,
                 const mobj &D2,
                 const mobj &D3,
-                         const Gamma GammaA_i,
-                         const Gamma GammaB_i,
-                         const Gamma GammaA_f,
-                         const Gamma GammaB_f,
-                const bool * wick_contraction,
+                const Gamma GammaA_i,
+                const Gamma GammaB_i,
+                const Gamma GammaA_f,
+                const Gamma GammaB_f,
+                const int wick_contraction,
                 robj &result)
 {
 
-    auto D1_GAi =  D1 * GammaA_i;
-    auto GAf_D1_GAi = GammaA_f * D1_GAi;
-    auto GBf_D1_GAi = GammaB_f * D1_GAi;
+  auto D1_GAi =  D1 * GammaA_i;
+  auto GAf_D1_GAi = GammaA_f * D1_GAi;
+  auto GBf_D1_GAi = GammaB_f * D1_GAi;
 
-    auto D2_GBi = D2 * GammaB_i;
-    auto GBf_D2_GBi = GammaB_f * D2_GBi;
-    auto GAf_D2_GBi = GammaA_f * D2_GBi;
+  auto D2_GBi = D2 * GammaB_i;
+  auto GBf_D2_GBi = GammaB_f * D2_GBi;
+  auto GAf_D2_GBi = GammaA_f * D2_GBi;
 
-    auto GBf_D3 = GammaB_f * D3;
-    auto GAf_D3 = GammaA_f * D3;
+  auto GBf_D3 = GammaB_f * D3;
+  auto GAf_D3 = GammaA_f * D3;
 
-    for (int ie_f=0; ie_f < 6 ; ie_f++){
-        int a_f = epsilon[ie_f][0]; //a
-        int b_f = epsilon[ie_f][1]; //b
-        int c_f = epsilon[ie_f][2]; //c
+  Real ee;
+
+  for (int ie_f=0; ie_f < 6 ; ie_f++){
+    int a_f    = (ie_f < 3 ? ie_f       : (6-ie_f)%3 ); //epsilon[ie_n][0]; //a
+    int b_f    = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b
+    int c_f    = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c
+    int eSgn_f = (ie_f < 3 ? 1 : -1);
     for (int ie_i=0; ie_i < 6 ; ie_i++){
-        int a_i = epsilon[ie_i][0]; //a'
-        int b_i = epsilon[ie_i][1]; //b'
-        int c_i = epsilon[ie_i][2]; //c'
+      int a_i = (ie_i < 3 ? ie_i       : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a'
+      int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b'
+      int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c'
+      int eSgn_i = (ie_i < 3 ? 1 : -1);
 
-        Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i];
-        //This is the \delta_{456}^{123} part
-        if (wick_contraction[0]){
-            for (int rho_i=0; rho_i<Ns; rho_i++){
-            for (int rho_f=0; rho_f<Ns; rho_f++){
-                auto GAf_D1_GAi_rr_cc = GAf_D1_GAi()(rho_f,rho_i)(c_f,c_i);
-                for (int alpha_f=0; alpha_f<Ns; alpha_f++){
-                for (int beta_i=0; beta_i<Ns; beta_i++){
-                    result()(rho_f,rho_i)() += ee  * GAf_D1_GAi_rr_cc
+      ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s];
+      //This is the \delta_{456}^{123} part
+      if (wick_contraction & 1){
+        for (int rho_i=0; rho_i<Ns; rho_i++){
+        for (int rho_f=0; rho_f<Ns; rho_f++){
+          auto GAf_D1_GAi_rr_cc = GAf_D1_GAi()(rho_f,rho_i)(c_f,c_i);
+          for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+          for (int beta_i=0; beta_i<Ns; beta_i++){
+            result()(rho_f,rho_i)() += ee  * GAf_D1_GAi_rr_cc
                                         * D2_GBi    ()(alpha_f,beta_i)(a_f,a_i)
                                         * GBf_D3    ()(alpha_f,beta_i)(b_f,b_i);
-                }}
-            }}
-        }   
-        //This is the \delta_{456}^{231} part
-        if (wick_contraction[1]){
-            for (int rho_i=0; rho_i<Ns; rho_i++){
-            for (int alpha_f=0; alpha_f<Ns; alpha_f++){
-                auto D1_GAi_ar_ac = D1_GAi()(alpha_f,rho_i)(a_f,c_i);
-                for (int beta_i=0; beta_i<Ns; beta_i++){
-                  auto GBf_D2_GBi_ab_ba = GBf_D2_GBi ()(alpha_f,beta_i)(b_f,a_i);
-                for (int rho_f=0; rho_f<Ns; rho_f++){
-                    result()(rho_f,rho_i)() += ee  * D1_GAi_ar_ac
+          }}
+        }}
+      }   
+      //This is the \delta_{456}^{231} part
+      if (wick_contraction & 2){
+        for (int rho_i=0; rho_i<Ns; rho_i++){
+        for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+          auto D1_GAi_ar_ac = D1_GAi()(alpha_f,rho_i)(a_f,c_i);
+          for (int beta_i=0; beta_i<Ns; beta_i++){
+            auto GBf_D2_GBi_ab_ba = GBf_D2_GBi ()(alpha_f,beta_i)(b_f,a_i);
+            for (int rho_f=0; rho_f<Ns; rho_f++){
+              result()(rho_f,rho_i)() += ee  * D1_GAi_ar_ac
                                         * GBf_D2_GBi_ab_ba
                                         * GAf_D3        ()(rho_f,beta_i)(c_f,b_i);
-                }}
-            }}
-        }   
-        //This is the \delta_{456}^{312} part
-        if (wick_contraction[2]){
-            for (int rho_i=0; rho_i<Ns; rho_i++){
-            for (int alpha_f=0; alpha_f<Ns; alpha_f++){
-                auto GBf_D1_GAi_ar_bc = GBf_D1_GAi()(alpha_f,rho_i)(b_f,c_i);
-                for (int beta_i=0; beta_i<Ns; beta_i++){
-                  auto D3_ab_ab = D3 ()(alpha_f,beta_i)(a_f,b_i);
-                for (int rho_f=0; rho_f<Ns; rho_f++){
-                    result()(rho_f,rho_i)() += ee  * GBf_D1_GAi_ar_bc
+            }
+          }
+        }}
+      }   
+      //This is the \delta_{456}^{312} part
+      if (wick_contraction & 4){
+        for (int rho_i=0; rho_i<Ns; rho_i++){
+        for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+          auto GBf_D1_GAi_ar_bc = GBf_D1_GAi()(alpha_f,rho_i)(b_f,c_i);
+          for (int beta_i=0; beta_i<Ns; beta_i++){
+            auto D3_ab_ab = D3 ()(alpha_f,beta_i)(a_f,b_i);
+            for (int rho_f=0; rho_f<Ns; rho_f++){
+              result()(rho_f,rho_i)() += ee  * GBf_D1_GAi_ar_bc
                                         * GAf_D2_GBi    ()(rho_f,beta_i)(c_f,a_i)
                                         * D3_ab_ab;
-                }}
-            }}
-        }   
-        //This is the \delta_{456}^{132} part
-        if (wick_contraction[3]){
-            for (int rho_i=0; rho_i<Ns; rho_i++){
-            for (int rho_f=0; rho_f<Ns; rho_f++){
-                auto GAf_D1_GAi_rr_cc = GAf_D1_GAi()(rho_f,rho_i)(c_f,c_i);
-                for (int alpha_f=0; alpha_f<Ns; alpha_f++){
-                for (int beta_i=0; beta_i<Ns; beta_i++){
-                    result()(rho_f,rho_i)() -= ee  * GAf_D1_GAi_rr_cc
+            }
+          }
+        }}
+      }   
+      //This is the \delta_{456}^{132} part
+      if (wick_contraction & 8){
+        for (int rho_i=0; rho_i<Ns; rho_i++){
+        for (int rho_f=0; rho_f<Ns; rho_f++){
+          auto GAf_D1_GAi_rr_cc = GAf_D1_GAi()(rho_f,rho_i)(c_f,c_i);
+          for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+          for (int beta_i=0; beta_i<Ns; beta_i++){
+            result()(rho_f,rho_i)() -= ee  * GAf_D1_GAi_rr_cc
                                         * GBf_D2_GBi    ()(alpha_f,beta_i)(b_f,a_i)
                                         * D3            ()(alpha_f,beta_i)(a_f,b_i);
-                }}
-            }}
-        }   
-        //This is the \delta_{456}^{321} part
-        if (wick_contraction[4]){
-            for (int rho_i=0; rho_i<Ns; rho_i++){
-            for (int alpha_f=0; alpha_f<Ns; alpha_f++){
-                auto GBf_D1_GAi_ar_bc = GBf_D1_GAi()(alpha_f,rho_i)(b_f,c_i);
-                for (int beta_i=0; beta_i<Ns; beta_i++){
-                  auto D2_GBi_ab_aa = D2_GBi()(alpha_f,beta_i)(a_f,a_i);
-                for (int rho_f=0; rho_f<Ns; rho_f++){
-                    result()(rho_f,rho_i)() -= ee  * GBf_D1_GAi_ar_bc
+          }}
+        }}
+      }   
+      //This is the \delta_{456}^{321} part
+      if (wick_contraction & 16){
+        for (int rho_i=0; rho_i<Ns; rho_i++){
+        for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+          auto GBf_D1_GAi_ar_bc = GBf_D1_GAi()(alpha_f,rho_i)(b_f,c_i);
+          for (int beta_i=0; beta_i<Ns; beta_i++){
+            auto D2_GBi_ab_aa = D2_GBi()(alpha_f,beta_i)(a_f,a_i);
+            for (int rho_f=0; rho_f<Ns; rho_f++){
+              result()(rho_f,rho_i)() -= ee  * GBf_D1_GAi_ar_bc
                                         * D2_GBi_ab_aa
                                         * GAf_D3    ()(rho_f,beta_i)(c_f,b_i);
-                }}
-            }}
-        }   
-        //This is the \delta_{456}^{213} part
-        if (wick_contraction[5]){
-            for (int rho_i=0; rho_i<Ns; rho_i++){
-            for (int alpha_f=0; alpha_f<Ns; alpha_f++){
-                auto D1_GAi_ar_ac = D1_GAi()(alpha_f,rho_i)(a_f,c_i);
-                for (int beta_i=0; beta_i<Ns; beta_i++){
-                  auto GBf_D3_ab_bb = GBf_D3()(alpha_f,beta_i)(b_f,b_i);
-                for (int rho_f=0; rho_f<Ns; rho_f++){
-                    result()(rho_f,rho_i)() -= ee  * D1_GAi_ar_ac
+            }
+          }
+        }}
+      }   
+      //This is the \delta_{456}^{213} part
+      if (wick_contraction & 32){
+        for (int rho_i=0; rho_i<Ns; rho_i++){
+        for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+          auto D1_GAi_ar_ac = D1_GAi()(alpha_f,rho_i)(a_f,c_i);
+          for (int beta_i=0; beta_i<Ns; beta_i++){
+            auto GBf_D3_ab_bb = GBf_D3()(alpha_f,beta_i)(b_f,b_i);
+            for (int rho_f=0; rho_f<Ns; rho_f++){
+              result()(rho_f,rho_i)() -= ee  * D1_GAi_ar_ac
                                         * GAf_D2_GBi    ()(rho_f,beta_i)(c_f,a_i)
                                         * GBf_D3_ab_bb;
-                }}
-            }}
-        }
-    }}
+            }
+          }
+        }}
+      }
+    }
+  }
 }
 
 /* Computes which wick contractions should be performed for a    *
@@ -484,13 +477,14 @@ void BaryonUtils<FImpl>::BaryonSiteMatrix(const mobj &D1,
  * flavours.                                                     *
  * The array wick_contractions must be of length 6               */
 template<class FImpl>
-void BaryonUtils<FImpl>::WickContractions(std::string qi, std::string qf, bool* wick_contractions) {
+void BaryonUtils<FImpl>::WickContractions(std::string qi, std::string qf, int &wick_contractions) {
+    assert(qi.size() == 3 && qf.size() == 3 && "Only sets of 3 quarks accepted.");
     const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
+    wick_contractions=0;
     for (int ie=0; ie < 6 ; ie++) {
-        wick_contractions[ie] = (qi.size() == 3 && qf.size() == 3
-                                    && qi[0] == qf[epsilon[ie][0]] 
+        wick_contractions += ( ( qi[0] == qf[epsilon[ie][0]] 
                                     && qi[1] == qf[epsilon[ie][1]] 
-                                    && qi[2] == qf[epsilon[ie][2]]);
+                                    && qi[2] == qf[epsilon[ie][2]]) ? 1 : 0) << ie;
     }
 }
 
@@ -501,15 +495,15 @@ void BaryonUtils<FImpl>::WickContractions(std::string qi, std::string qf, bool*
  * Wick_Contractions function above                               */
 template<class FImpl>
 void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
-						 const PropagatorField &q2_left,
-						 const PropagatorField &q3_left,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const bool* wick_contractions,
-						 const int parity,
-						 ComplexField &baryon_corr)
+             const PropagatorField &q2_left,
+             const PropagatorField &q3_left,
+             const Gamma GammaA_left,
+             const Gamma GammaB_left,
+             const Gamma GammaA_right,
+             const Gamma GammaB_right,
+             const int wick_contractions,
+             const int parity,
+             ComplexField &baryon_corr)
 {
 
   assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
@@ -519,31 +513,31 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 
   GridBase *grid = q1_left.Grid();
   
-  autoView(vbaryon_corr, baryon_corr,CpuWrite);
-  autoView( v1 , q1_left, CpuRead);
-  autoView( v2 , q2_left, CpuRead);
-  autoView( v3 , q3_left, CpuRead);
+  autoView(vbaryon_corr , baryon_corr , AcceleratorWrite);
+  autoView( v1          , q1_left     , AcceleratorRead);
+  autoView( v2          , q2_left     , AcceleratorRead);
+  autoView( v3          , q3_left     , AcceleratorRead);
 
   Real bytes =0.;
   bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real));
   for (int ie=0; ie < 6 ; ie++){
     if(ie==0 or ie==3){
-       bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie];
-    }
-    else{
-       bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie];
+       bytes += ( wick_contractions & (1 << ie) ) ? grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) : 0.;
+    } else{
+       bytes += ( wick_contractions & (1 << ie) ) ? grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) : 0.;
     }
   }
   Real t=0.;
   t =-usecond();
 
   accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
-    auto D1 = v1[ss];
-    auto D2 = v2[ss];
-    auto D3 = v3[ss];
-    vobj result=Zero();
+    auto D1 = v1(ss);
+    auto D2 = v2(ss);
+    auto D3 = v3(ss);
+    typedef decltype(coalescedRead(vbaryon_corr[0])) cVec;
+    cVec result=Zero();
     BaryonSite(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result);
-    vbaryon_corr[ss] = result; 
+    coalescedWrite(vbaryon_corr[ss],result);
   }  );//end loop over lattice sites
 
   t += usecond();
@@ -555,50 +549,33 @@ template<class FImpl>
 void BaryonUtils<FImpl>::ContractBaryonsMatrix(const PropagatorField &q1_left,
              const PropagatorField &q2_left,
              const PropagatorField &q3_left,
-                         const Gamma GammaA_left,
-                         const Gamma GammaB_left,
-                         const Gamma GammaA_right,
-                         const Gamma GammaB_right,
-             const bool* wick_contractions,
+             const Gamma GammaA_left,
+             const Gamma GammaB_left,
+             const Gamma GammaA_right,
+             const Gamma GammaB_right,
+             const int wick_contractions,
              SpinMatrixField &baryon_corr)
 {
 
   assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
   assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
- 
-  GridBase *grid = q1_left.Grid();
-  
-  autoView(vbaryon_corr, baryon_corr,CpuWrite);
-  autoView( v1 , q1_left, CpuRead);
-  autoView( v2 , q2_left, CpuRead);
-  autoView( v3 , q3_left, CpuRead);
 
-  // Real bytes =0.;
-  // bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real));
-  // for (int ie=0; ie < 6 ; ie++){
-  //   if(ie==0 or ie==3){
-  //      bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie];
-  //   }
-  //   else{
-  //      bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie];
-  //   }
-  // }
-  // Real t=0.;
-  // t =-usecond();
+  GridBase *grid = q1_left.Grid();
+
+  autoView(vbaryon_corr , baryon_corr , AcceleratorWrite);
+  autoView( v1          , q1_left     , AcceleratorRead);
+  autoView( v2          , q2_left     , AcceleratorRead);
+  autoView( v3          , q3_left     , AcceleratorRead);
 
   accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
-    auto D1 = v1[ss];
-    auto D2 = v2[ss];
-    auto D3 = v3[ss];
-    sobj result=Zero();
+    auto D1 = v1(ss);
+    auto D2 = v2(ss);
+    auto D3 = v3(ss);
+    typedef decltype(coalescedRead(vbaryon_corr[0])) spinor;
+    spinor result=Zero();
     BaryonSiteMatrix(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result);
-    vbaryon_corr[ss] = result; 
+    coalescedWrite(vbaryon_corr[ss],result);
   }  );//end loop over lattice sites
-
-  // t += usecond();
-
-  // std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl;
-
 }
 
 /* The array wick_contractions must be of length 6. The order     * 
@@ -609,16 +586,16 @@ void BaryonUtils<FImpl>::ContractBaryonsMatrix(const PropagatorField &q1_left,
 template <class FImpl>
 template <class mobj, class robj>
 void BaryonUtils<FImpl>::ContractBaryonsSliced(const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const bool* wick_contractions,
-						 const int parity,
-						 const int nt,
-						 robj &result)
+             const mobj &D2,
+             const mobj &D3,
+                         const Gamma GammaA_left,
+                         const Gamma GammaB_left,
+                         const Gamma GammaA_right,
+                         const Gamma GammaB_right,
+             const int wick_contractions,
+             const int parity,
+             const int nt,
+             robj &result)
 {
 
   assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
@@ -636,11 +613,11 @@ template <class mobj, class robj>
 void BaryonUtils<FImpl>::ContractBaryonsSlicedMatrix(const mobj &D1,
              const mobj &D2,
              const mobj &D3,
-                         const Gamma GammaA_left,
-                         const Gamma GammaB_left,
-                         const Gamma GammaA_right,
-                         const Gamma GammaB_right,
-             const bool* wick_contractions,
+             const Gamma GammaA_left,
+             const Gamma GammaB_left,
+             const Gamma GammaA_right,
+             const Gamma GammaB_right,
+             const int wick_contractions,
              const int nt,
              robj &result)
 {
@@ -664,91 +641,93 @@ void BaryonUtils<FImpl>::ContractBaryonsSlicedMatrix(const mobj &D1,
  * Dq3_spec is a quark line from t_i to t_f
  * Dq4_tf is a quark line from t_f to t_J */
 template<class FImpl>
-template <class mobj, class mobj2, class robj>
+template <class mobj, class mobj2, class robj> accelerator_inline
 void BaryonUtils<FImpl>::BaryonGamma3ptGroup1Site(
                         const mobj &Dq1_ti,
                         const mobj2 &Dq2_spec,
                         const mobj2 &Dq3_spec,
                         const mobj &Dq4_tf,
-                                const Gamma GammaJ,
-                                const Gamma GammaBi,
-                                const Gamma GammaBf,
+                        const Gamma GammaJ,
+                        const Gamma GammaBi,
+                        const Gamma GammaBf,
                         int wick_contraction,
                         robj &result)
 {
-    Gamma g5(Gamma::Algebra::Gamma5); 
+  Gamma g5(Gamma::Algebra::Gamma5); 
 
-    auto adjD4_g_D1     = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq1_ti;
-    auto Gf_adjD4_g_D1  = GammaBf * adjD4_g_D1;
-    auto D2_Gi          = Dq2_spec * GammaBi;
-    auto Gf_D2_Gi       = GammaBf * D2_Gi;
-    auto Gf_D3          = GammaBf * Dq3_spec;
+  auto adjD4          = g5 * adj(Dq4_tf) * g5 ;
+  auto adjD4_g_D1     = adjD4 * GammaJ * Dq1_ti;
+  auto Gf_adjD4_g_D1  = GammaBf * adjD4_g_D1;
+  auto D2_Gi          = Dq2_spec * GammaBi;
+  auto Gf_D2_Gi       = GammaBf * D2_Gi;
+  auto Gf_D3          = GammaBf * Dq3_spec;  
 
-    int a_f, b_f, c_f;
-    int a_i, b_i, c_i;
+  Real ee;
 
-    Real ee;
-
-    for (int ie_f=0; ie_f < 6 ; ie_f++){
-        a_f = epsilon[ie_f][0]; //a
-        b_f = epsilon[ie_f][1]; //b
-        c_f = epsilon[ie_f][2]; //c
+  for (int ie_f=0; ie_f < 6 ; ie_f++){
+    int a_f    = (ie_f < 3 ? ie_f       : (6-ie_f)%3 ); //epsilon[ie_n][0]; //a
+    int b_f    = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b
+    int c_f    = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c
+    int eSgn_f = (ie_f < 3 ? 1 : -1);
     for (int ie_i=0; ie_i < 6 ; ie_i++){
-        a_i = epsilon[ie_i][0]; //a'
-        b_i = epsilon[ie_i][1]; //b'
-        c_i = epsilon[ie_i][2]; //c'
+      int a_i = (ie_i < 3 ? ie_i       : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a'
+      int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b'
+      int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c'
+      int eSgn_i = (ie_i < 3 ? 1 : -1);
 
-        ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i];
+      ee = Real(eSgn_f * eSgn_i);
 
-        for (int alpha_f=0; alpha_f<Ns; alpha_f++){
-        for (int beta_i=0; beta_i<Ns; beta_i++){
-            auto D2_Gi_ab_aa        = D2_Gi     ()(alpha_f,beta_i)(a_f,a_i);
-            auto Gf_D3_ab_bb        = Gf_D3     ()(alpha_f,beta_i)(b_f,b_i);
-            auto Gf_D2_Gi_ab_ba     = Gf_D2_Gi  ()(alpha_f,beta_i)(b_f,a_i);
-            auto Dq3_spec_ab_ab     = Dq3_spec  ()(alpha_f,beta_i)(a_f,b_i);
+      for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+      for (int beta_i=0; beta_i<Ns; beta_i++){
+        auto D2_Gi_ab_aa        = D2_Gi     ()(alpha_f,beta_i)(a_f,a_i);
+        auto Gf_D3_ab_bb        = Gf_D3     ()(alpha_f,beta_i)(b_f,b_i);
+        auto Gf_D2_Gi_ab_ba     = Gf_D2_Gi  ()(alpha_f,beta_i)(b_f,a_i);
+        auto Dq3_spec_ab_ab     = Dq3_spec  ()(alpha_f,beta_i)(a_f,b_i);
 
-            for (int gamma_i=0; gamma_i<Ns; gamma_i++){
-                auto ee_adjD4_g_D1_ag_ac        = ee * adjD4_g_D1   ()(alpha_f,gamma_i)(a_f,c_i);
-                auto ee_Gf_adjD4_g_D1_ag_bc     = ee * Gf_adjD4_g_D1()(alpha_f,gamma_i)(b_f,c_i);
-            for (int gamma_f=0; gamma_f<Ns; gamma_f++){
-                auto ee_adjD4_g_D1_gg_cc        = ee * adjD4_g_D1   ()(gamma_f,gamma_i)(c_f,c_i);
-                auto Dq3_spec_gb_cb             = Dq3_spec          ()(gamma_f,beta_i)(c_f,b_i);
-                auto D2_Gi_gb_ca                = D2_Gi             ()(gamma_f,beta_i)(c_f,a_i);
+        for (int gamma_i=0; gamma_i<Ns; gamma_i++){
+          auto ee_adjD4_g_D1_ag_ac        = ee * adjD4_g_D1   ()(alpha_f,gamma_i)(a_f,c_i);
+          auto ee_Gf_adjD4_g_D1_ag_bc     = ee * Gf_adjD4_g_D1()(alpha_f,gamma_i)(b_f,c_i);
+          for (int gamma_f=0; gamma_f<Ns; gamma_f++){
+            auto ee_adjD4_g_D1_gg_cc        = ee * adjD4_g_D1   ()(gamma_f,gamma_i)(c_f,c_i);
+            auto Dq3_spec_gb_cb             = Dq3_spec          ()(gamma_f,beta_i)(c_f,b_i);
+            auto D2_Gi_gb_ca                = D2_Gi             ()(gamma_f,beta_i)(c_f,a_i);
 
 
-                if(wick_contraction == 1) { // Do contraction I1
-                    result()(gamma_f,gamma_i)() -= ee_adjD4_g_D1_gg_cc
+            if(wick_contraction == 1) { // Do contraction I1
+              result()(gamma_f,gamma_i)() -= ee_adjD4_g_D1_gg_cc
                                                         * D2_Gi_ab_aa
                                                         * Gf_D3_ab_bb;
-                }
-                if(wick_contraction == 2) { // Do contraction I2
-                    result()(gamma_f,gamma_i)() -= ee_adjD4_g_D1_ag_ac
+            }
+            if(wick_contraction == 2) { // Do contraction I2
+              result()(gamma_f,gamma_i)() -= ee_adjD4_g_D1_ag_ac
                                                         * Gf_D2_Gi_ab_ba
                                                         * Dq3_spec_gb_cb;
-                }
-                if(wick_contraction == 3) { // Do contraction I3
-                    result()(gamma_f,gamma_i)() -= ee_Gf_adjD4_g_D1_ag_bc
+            }
+            if(wick_contraction == 3) { // Do contraction I3
+              result()(gamma_f,gamma_i)() -= ee_Gf_adjD4_g_D1_ag_bc
                                                         * D2_Gi_gb_ca
                                                         * Dq3_spec_ab_ab;
-                }
-                if(wick_contraction == 4) { // Do contraction I4
-                    result()(gamma_f,gamma_i)() += ee_adjD4_g_D1_gg_cc
+            }
+            if(wick_contraction == 4) { // Do contraction I4
+              result()(gamma_f,gamma_i)() += ee_adjD4_g_D1_gg_cc
                                                         * Gf_D2_Gi_ab_ba
                                                         * Dq3_spec_ab_ab;
-                }
-                if(wick_contraction == 5) { // Do contraction I5
-                    result()(gamma_f,gamma_i)() += ee_Gf_adjD4_g_D1_ag_bc
+            }
+            if(wick_contraction == 5) { // Do contraction I5
+              result()(gamma_f,gamma_i)() += ee_Gf_adjD4_g_D1_ag_bc
                                                         * D2_Gi_ab_aa
                                                         * Dq3_spec_gb_cb;
-                }
-                if(wick_contraction == 6) { // Do contraction I6
-                    result()(gamma_f,gamma_i)() += ee_adjD4_g_D1_ag_ac
+            }
+            if(wick_contraction == 6) { // Do contraction I6
+              result()(gamma_f,gamma_i)() += ee_adjD4_g_D1_ag_ac
                                                         * D2_Gi_gb_ca
                                                         * Gf_D3_ab_bb;
-                }
-            }}
-        }}
-    }}
+            }
+          }
+        }
+      }}
+    }
+  }
 }
 
 /* Dq1_spec is a quark line from t_i to t_f
@@ -756,89 +735,90 @@ void BaryonUtils<FImpl>::BaryonGamma3ptGroup1Site(
  * Dq3_spec is a quark line from t_i to t_f
  * Dq4_tf is a quark line from t_f to t_J */
 template<class FImpl>
-template <class mobj, class mobj2, class robj>
+template <class mobj, class mobj2, class robj> accelerator_inline
 void BaryonUtils<FImpl>::BaryonGamma3ptGroup2Site(
                         const mobj2 &Dq1_spec,
                         const mobj &Dq2_ti,
                         const mobj2 &Dq3_spec,
                         const mobj &Dq4_tf,
-                                const Gamma GammaJ,
-                                const Gamma GammaBi,
-                                const Gamma GammaBf,
+                        const Gamma GammaJ,
+                        const Gamma GammaBi,
+                        const Gamma GammaBf,
                         int wick_contraction,
                         robj &result)
 {
-    Gamma g5(Gamma::Algebra::Gamma5); 
+  Gamma g5(Gamma::Algebra::Gamma5); 
 
-    auto adjD4_g_D2_Gi      = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq2_ti * GammaBi;
-    auto Gf_adjD4_g_D2_Gi   = GammaBf * adjD4_g_D2_Gi;
-    auto Gf_D1              = GammaBf * Dq1_spec;
-    auto Gf_D3              = GammaBf * Dq3_spec;
+  auto adjD4_g_D2_Gi      = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq2_ti * GammaBi;
+  auto Gf_adjD4_g_D2_Gi   = GammaBf * adjD4_g_D2_Gi;
+  auto Gf_D1              = GammaBf * Dq1_spec;
+  auto Gf_D3              = GammaBf * Dq3_spec;
 
-    int a_f, b_f, c_f;
-    int a_i, b_i, c_i;
+  Real ee;
 
-    Real ee;
-
-    for (int ie_f=0; ie_f < 6 ; ie_f++){
-        a_f = epsilon[ie_f][0]; //a
-        b_f = epsilon[ie_f][1]; //b
-        c_f = epsilon[ie_f][2]; //c
+  for (int ie_f=0; ie_f < 6 ; ie_f++){
+    int a_f    = (ie_f < 3 ? ie_f       : (6-ie_f)%3 ); //epsilon[ie_n][0]; //a
+    int b_f    = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b
+    int c_f    = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c
+    int eSgn_f = (ie_f < 3 ? 1 : -1);
     for (int ie_i=0; ie_i < 6 ; ie_i++){
-        a_i = epsilon[ie_i][0]; //a'
-        b_i = epsilon[ie_i][1]; //b'
-        c_i = epsilon[ie_i][2]; //c'
+      int a_i = (ie_i < 3 ? ie_i       : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a'
+      int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b'
+      int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c'
+      int eSgn_i = (ie_i < 3 ? 1 : -1);
 
-        ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i];
+      ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s];
 
-        for (int alpha_f=0; alpha_f<Ns; alpha_f++){
-        for (int beta_i=0; beta_i<Ns; beta_i++){
-            auto adjD4_g_D2_Gi_ab_aa        = adjD4_g_D2_Gi     ()(alpha_f,beta_i)(a_f,a_i);
-            auto Gf_D3_ab_bb                = Gf_D3             ()(alpha_f,beta_i)(b_f,b_i);
-            auto Gf_adjD4_g_D2_Gi_ab_ba     = Gf_adjD4_g_D2_Gi  ()(alpha_f,beta_i)(b_f,a_i);
-            auto Dq3_spec_ab_ab             = Dq3_spec          ()(alpha_f,beta_i)(a_f,b_i);
+      for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+      for (int beta_i=0; beta_i<Ns; beta_i++){
+        auto adjD4_g_D2_Gi_ab_aa        = adjD4_g_D2_Gi     ()(alpha_f,beta_i)(a_f,a_i);
+        auto Gf_D3_ab_bb                = Gf_D3             ()(alpha_f,beta_i)(b_f,b_i);
+        auto Gf_adjD4_g_D2_Gi_ab_ba     = Gf_adjD4_g_D2_Gi  ()(alpha_f,beta_i)(b_f,a_i);
+        auto Dq3_spec_ab_ab             = Dq3_spec          ()(alpha_f,beta_i)(a_f,b_i);
 
-            for (int gamma_i=0; gamma_i<Ns; gamma_i++){ 
-                auto ee_Dq1_spec_ag_ac      = ee * Dq1_spec     ()(alpha_f,gamma_i)(a_f,c_i);
-                auto ee_Gf_D1_ag_bc         = ee * Gf_D1        ()(alpha_f,gamma_i)(b_f,c_i);
-            for (int gamma_f=0; gamma_f<Ns; gamma_f++){
-                auto ee_Dq1_spec_gg_cc      = ee * Dq1_spec     ()(gamma_f,gamma_i)(c_f,c_i);
-                auto Dq3_spec_gb_cb         = Dq3_spec          ()(gamma_f,beta_i)(c_f,b_i);
-                auto adjD4_g_D2_Gi_gb_ca    = adjD4_g_D2_Gi     ()(gamma_f,beta_i)(c_f,a_i);
+        for (int gamma_i=0; gamma_i<Ns; gamma_i++){ 
+          auto ee_Dq1_spec_ag_ac      = ee * Dq1_spec     ()(alpha_f,gamma_i)(a_f,c_i);
+          auto ee_Gf_D1_ag_bc         = ee * Gf_D1        ()(alpha_f,gamma_i)(b_f,c_i);
+          for (int gamma_f=0; gamma_f<Ns; gamma_f++){
+            auto ee_Dq1_spec_gg_cc      = ee * Dq1_spec     ()(gamma_f,gamma_i)(c_f,c_i);
+            auto Dq3_spec_gb_cb         = Dq3_spec          ()(gamma_f,beta_i)(c_f,b_i);
+            auto adjD4_g_D2_Gi_gb_ca    = adjD4_g_D2_Gi     ()(gamma_f,beta_i)(c_f,a_i);
 
-                if(wick_contraction == 1) { // Do contraction II1
-                    result()(gamma_f,gamma_i)() -= ee_Dq1_spec_gg_cc
+            if(wick_contraction == 1) { // Do contraction II1
+              result()(gamma_f,gamma_i)() -= ee_Dq1_spec_gg_cc
                                                         * adjD4_g_D2_Gi_ab_aa
                                                         * Gf_D3_ab_bb;
-                }
-                if(wick_contraction == 2) { // Do contraction II2
-                    result()(gamma_f,gamma_i)() -= ee_Dq1_spec_ag_ac
+            }
+            if(wick_contraction == 2) { // Do contraction II2
+              result()(gamma_f,gamma_i)() -= ee_Dq1_spec_ag_ac
                                                         * Gf_adjD4_g_D2_Gi_ab_ba
                                                         * Dq3_spec_gb_cb;
-                }
-                if(wick_contraction == 3) { // Do contraction II3
-                    result()(gamma_f,gamma_i)() -= ee_Gf_D1_ag_bc
+            }
+            if(wick_contraction == 3) { // Do contraction II3
+              result()(gamma_f,gamma_i)() -= ee_Gf_D1_ag_bc
                                                         * adjD4_g_D2_Gi_gb_ca
                                                         * Dq3_spec_ab_ab;
-                }
-                if(wick_contraction == 4) { // Do contraction II4
-                    result()(gamma_f,gamma_i)() += ee_Dq1_spec_gg_cc
+            }
+            if(wick_contraction == 4) { // Do contraction II4
+              result()(gamma_f,gamma_i)() += ee_Dq1_spec_gg_cc
                                                         * Gf_adjD4_g_D2_Gi_ab_ba
                                                         * Dq3_spec_ab_ab;
-                }
-                if(wick_contraction == 5) { // Do contraction II5
-                    result()(gamma_f,gamma_i)() += ee_Gf_D1_ag_bc
+            }
+            if(wick_contraction == 5) { // Do contraction II5
+              result()(gamma_f,gamma_i)() += ee_Gf_D1_ag_bc
                                                         * adjD4_g_D2_Gi_ab_aa
                                                         * Dq3_spec_gb_cb;
-                }
-                if(wick_contraction == 6) { // Do contraction II6
-                    result()(gamma_f,gamma_i)() += ee_Dq1_spec_ag_ac
+            }
+            if(wick_contraction == 6) { // Do contraction II6
+              result()(gamma_f,gamma_i)() += ee_Dq1_spec_ag_ac
                                                         * adjD4_g_D2_Gi_gb_ca
                                                         * Gf_D3_ab_bb;
-                }
-            }}
-        }}
-    }}
+            }
+          }
+        }
+      }}
+    }
+  }
 }
 
 /* Dq1_spec is a quark line from t_i to t_f
@@ -846,90 +826,91 @@ void BaryonUtils<FImpl>::BaryonGamma3ptGroup2Site(
  * Dq3_ti is a quark line from t_i to t_J
  * Dq4_tf is a quark line from t_f to t_J */
 template<class FImpl>
-template <class mobj, class mobj2, class robj>
+template <class mobj, class mobj2, class robj> accelerator_inline
 void BaryonUtils<FImpl>::BaryonGamma3ptGroup3Site(
                         const mobj2 &Dq1_spec,
                         const mobj2 &Dq2_spec,
                         const mobj &Dq3_ti,
                         const mobj &Dq4_tf,
-                                const Gamma GammaJ,
-                                const Gamma GammaBi,
-                                const Gamma GammaBf,
+                        const Gamma GammaJ,
+                        const Gamma GammaBi,
+                        const Gamma GammaBf,
                         int wick_contraction,
                         robj &result)
 {
-    Gamma g5(Gamma::Algebra::Gamma5);
+  Gamma g5(Gamma::Algebra::Gamma5);
 
-    auto adjD4_g_D3     = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq3_ti;
-    auto Gf_adjD4_g_D3  = GammaBf * adjD4_g_D3;
-    auto Gf_D1          = GammaBf * Dq1_spec;
-    auto D2_Gi          = Dq2_spec * GammaBi;
-    auto Gf_D2_Gi       = GammaBf * D2_Gi;
+  auto adjD4_g_D3     = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq3_ti;
+  auto Gf_adjD4_g_D3  = GammaBf * adjD4_g_D3;
+  auto Gf_D1          = GammaBf * Dq1_spec;
+  auto D2_Gi          = Dq2_spec * GammaBi;
+  auto Gf_D2_Gi       = GammaBf * D2_Gi;
 
-    int a_f, b_f, c_f;
-    int a_i, b_i, c_i;
+  Real ee;
 
-    Real ee;
-
-    for (int ie_f=0; ie_f < 6 ; ie_f++){
-        a_f = epsilon[ie_f][0]; //a
-        b_f = epsilon[ie_f][1]; //b
-        c_f = epsilon[ie_f][2]; //c
+  for (int ie_f=0; ie_f < 6 ; ie_f++){
+    int a_f    = (ie_f < 3 ? ie_f       : (6-ie_f)%3 ); //epsilon[ie_n][0]; //a
+    int b_f    = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b
+    int c_f    = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c
+    int eSgn_f = (ie_f < 3 ? 1 : -1);
     for (int ie_i=0; ie_i < 6 ; ie_i++){
-        a_i = epsilon[ie_i][0]; //a'
-        b_i = epsilon[ie_i][1]; //b'
-        c_i = epsilon[ie_i][2]; //c'
+      int a_i = (ie_i < 3 ? ie_i       : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a'
+      int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b'
+      int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c'
+      int eSgn_i = (ie_i < 3 ? 1 : -1);
 
-        ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i];
+      ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s];
 
-        for (int alpha_f=0; alpha_f<Ns; alpha_f++){
-        for (int beta_i=0; beta_i<Ns; beta_i++){
-            auto D2_Gi_ab_aa            = D2_Gi         ()(alpha_f,beta_i)(a_f,a_i);
-            auto Gf_adjD4_g_D3_ab_bb    = Gf_adjD4_g_D3 ()(alpha_f,beta_i)(b_f,b_i);
-            auto Gf_D2_Gi_ab_ba         = Gf_D2_Gi      ()(alpha_f,beta_i)(b_f,a_i);
-            auto adjD4_g_D3_ab_ab       = adjD4_g_D3    ()(alpha_f,beta_i)(a_f,b_i);
+      for (int alpha_f=0; alpha_f<Ns; alpha_f++){
+      for (int beta_i=0; beta_i<Ns; beta_i++){
+        auto D2_Gi_ab_aa            = D2_Gi         ()(alpha_f,beta_i)(a_f,a_i);
+        auto Gf_adjD4_g_D3_ab_bb    = Gf_adjD4_g_D3 ()(alpha_f,beta_i)(b_f,b_i);
+        auto Gf_D2_Gi_ab_ba         = Gf_D2_Gi      ()(alpha_f,beta_i)(b_f,a_i);
+        auto adjD4_g_D3_ab_ab       = adjD4_g_D3    ()(alpha_f,beta_i)(a_f,b_i);
 
-            for (int gamma_i=0; gamma_i<Ns; gamma_i++) {
-                auto ee_Dq1_spec_ag_ac  = ee * Dq1_spec ()(alpha_f,gamma_i)(a_f,c_i);
-                auto ee_Gf_D1_ag_bc     = ee * Gf_D1    ()(alpha_f,gamma_i)(b_f,c_i);
-            for (int gamma_f=0; gamma_f<Ns; gamma_f++) {
-                auto ee_Dq1_spec_gg_cc  = ee * Dq1_spec ()(gamma_f,gamma_i)(c_f,c_i);
-                auto adjD4_g_D3_gb_cb   = adjD4_g_D3    ()(gamma_f,beta_i)(c_f,b_i);
-                auto D2_Gi_gb_ca        = D2_Gi         ()(gamma_f,beta_i)(c_f,a_i);
+        for (int gamma_i=0; gamma_i<Ns; gamma_i++) {
+          auto ee_Dq1_spec_ag_ac  = ee * Dq1_spec ()(alpha_f,gamma_i)(a_f,c_i);
+          auto ee_Gf_D1_ag_bc     = ee * Gf_D1    ()(alpha_f,gamma_i)(b_f,c_i);
+          for (int gamma_f=0; gamma_f<Ns; gamma_f++) {
+            auto ee_Dq1_spec_gg_cc  = ee * Dq1_spec ()(gamma_f,gamma_i)(c_f,c_i);
+            auto adjD4_g_D3_gb_cb   = adjD4_g_D3    ()(gamma_f,beta_i)(c_f,b_i);
+            auto D2_Gi_gb_ca        = D2_Gi         ()(gamma_f,beta_i)(c_f,a_i);
 
-                if(wick_contraction == 1) { // Do contraction III1
-                    result()(gamma_f,gamma_i)() -= ee_Dq1_spec_gg_cc
+            if(wick_contraction == 1) { // Do contraction III1
+              result()(gamma_f,gamma_i)() -= ee_Dq1_spec_gg_cc
                                                         * D2_Gi_ab_aa
                                                         * Gf_adjD4_g_D3_ab_bb;
-                }
-                if(wick_contraction == 2) { // Do contraction III2
-                    result()(gamma_f,gamma_i)() -= ee_Dq1_spec_ag_ac
+            }
+            if(wick_contraction == 2) { // Do contraction III2
+              result()(gamma_f,gamma_i)() -= ee_Dq1_spec_ag_ac
                                                         * Gf_D2_Gi_ab_ba
                                                         * adjD4_g_D3_gb_cb;
-                }
-                if(wick_contraction == 3) { // Do contraction III3
-                    result()(gamma_f,gamma_i)() -= ee_Gf_D1_ag_bc
+            }
+            if(wick_contraction == 3) { // Do contraction III3
+              result()(gamma_f,gamma_i)() -= ee_Gf_D1_ag_bc
                                                         * D2_Gi_gb_ca
                                                         * adjD4_g_D3_ab_ab;
-                }
-                if(wick_contraction == 4) { // Do contraction III4
-                    result()(gamma_f,gamma_i)() += ee_Dq1_spec_gg_cc
+            }
+            if(wick_contraction == 4) { // Do contraction III4
+              result()(gamma_f,gamma_i)() += ee_Dq1_spec_gg_cc
                                                         * Gf_D2_Gi_ab_ba
                                                         * adjD4_g_D3_ab_ab;
-                }
-                if(wick_contraction == 5) { // Do contraction III5
-                    result()(gamma_f,gamma_i)() += ee_Gf_D1_ag_bc
+            }
+            if(wick_contraction == 5) { // Do contraction III5
+              result()(gamma_f,gamma_i)() += ee_Gf_D1_ag_bc
                                                         * D2_Gi_ab_aa
                                                         * adjD4_g_D3_gb_cb;
-                }
-                if(wick_contraction == 6) { // Do contraction III6
-                    result()(gamma_f,gamma_i)() += ee_Dq1_spec_ag_ac
+            }
+            if(wick_contraction == 6) { // Do contraction III6
+              result()(gamma_f,gamma_i)() += ee_Dq1_spec_ag_ac
                                                         * D2_Gi_gb_ca
                                                         * Gf_adjD4_g_D3_ab_bb;
-                }
-            }}
-        }}
-    }}
+            }
+          }
+        }
+      }}
+    }
+  }
 }
 
 /* The group indicates which inital state quarks the current is  * 
@@ -946,49 +927,58 @@ void BaryonUtils<FImpl>::BaryonGamma3pt(
                         const PropagatorField &q_tf,
                         int group,
                         int wick_contraction,
-                                const Gamma GammaJ,
-                                const Gamma GammaBi,
-                                const Gamma GammaBf,
+                        const Gamma GammaJ,
+                        const Gamma GammaBi,
+                        const Gamma GammaBf,
                         SpinMatrixField &stn_corr)
 {
-    GridBase *grid = q_tf.Grid();
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
 
-    autoView( vcorr, stn_corr, CpuWrite);
-    autoView( vq_ti , q_ti, CpuRead);
-    autoView( vq_tf , q_tf, CpuRead);
+  GridBase *grid = q_tf.Grid();
 
-    if (group == 1) {
-        accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
-            auto Dq_ti = vq_ti[ss];
-            auto Dq_tf = vq_tf[ss];
-            sobj result=Zero();
-            BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
-            vcorr[ss] += result; 
-        });//end loop over lattice sites
-    } else if (group == 2) {
-        accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
-            auto Dq_ti = vq_ti[ss];
-            auto Dq_tf = vq_tf[ss];
-            sobj result=Zero();
-            BaryonGamma3ptGroup2Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
-            vcorr[ss] += result; 
-        });//end loop over lattice sites
-    } else if (group == 3) {
-        accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
-            auto Dq_ti = vq_ti[ss];
-            auto Dq_tf = vq_tf[ss];
-            sobj result=Zero();
-            BaryonGamma3ptGroup3Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
+  autoView( vcorr , stn_corr , AcceleratorWrite);
+  autoView( vq_ti , q_ti     , AcceleratorRead);
+  autoView( vq_tf , q_tf     , AcceleratorRead);
+
+  Vector<mobj> my_Dq_spec{Dq_spec1,Dq_spec2};
+  mobj * Dq_spec_p = &my_Dq_spec[0];
+
+  if (group == 1) {
+    accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+      auto Dq_ti = vq_ti(ss);
+      auto Dq_tf = vq_tf(ss);
+      typedef decltype(coalescedRead(vcorr[0])) spinor;
+      spinor result=Zero();
+      BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec_p[0],Dq_spec_p[1],Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result);
+      coalescedWrite(vcorr[ss],coalescedRead(vcorr[ss])+result); 
+    });//end loop over lattice sites
+  } else if (group == 2) {
+    accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+      auto Dq_ti = vq_ti(ss);
+      auto Dq_tf = vq_tf(ss);
+      typedef decltype(coalescedRead(vcorr[0])) spinor;
+      spinor result=Zero();
+      BaryonGamma3ptGroup2Site(Dq_spec_p[0],Dq_ti,Dq_spec_p[1],Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); 
+      coalescedWrite(vcorr[ss],coalescedRead(vcorr[ss])+result); 
+    });//end loop over lattice sites
+  } else if (group == 3) {
+    accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+      auto Dq_ti = vq_ti(ss);
+      auto Dq_tf = vq_tf(ss);
+      typedef decltype(coalescedRead(vcorr[0])) spinor;
+      spinor result=Zero();
+      BaryonGamma3ptGroup3Site(Dq_spec_p[0],Dq_spec_p[1],Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); 
+      coalescedWrite(vcorr[ss],coalescedRead(vcorr[ss])+result); 
+    });//end loop over lattice sites
+  }
 
-            vcorr[ss] += result; 
-        });//end loop over lattice sites
-    }
 }
 
 
 /***********************************************************************
  * End of BaryonGamma3pt-function code.                                *
- *																	   *
+ *                                     *
  * The following code is for Sigma -> N rare hypeon decays             *
  **********************************************************************/
 
@@ -997,46 +987,56 @@ void BaryonUtils<FImpl>::BaryonGamma3pt(
  * Dd_tf is a quark line from t_f to t_H
  * Ds_ti is a quark line from t_i to t_H */
 template <class FImpl>
-template <class mobj, class mobj2, class robj>
+template <class mobj, class mobj2, class robj> accelerator_inline
 void BaryonUtils<FImpl>::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop,
-						 const mobj2 &Du_spec,
-						 const mobj &Dd_tf,
-						 const mobj &Ds_ti,
-				                 const Gamma Gamma_H,
-				                 const Gamma GammaB_sigma,
-		                 		 const Gamma GammaB_nucl,
-						 robj &result)
+             const mobj2 &Du_spec,
+             const mobj &Dd_tf,
+             const mobj &Ds_ti,
+             const Gamma Gamma_H,
+             const Gamma GammaB_sigma,
+             const Gamma GammaB_nucl,
+             robj &result)
 {
 
   Gamma g5(Gamma::Algebra::Gamma5); 
 
-  auto DuG = Du_spec * GammaB_nucl;
-  // Gamma^B * Ds * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5)
-  auto GDsGDd = GammaB_sigma * Ds_ti * Gamma_H * g5 * adj(Dd_tf) * g5;
-  // Dq_loop * \gamma_\mu^L
-  auto DqG = Dq_loop * Gamma_H;
+  auto adjDd_GH_Ds      = g5 * adj(Dd_tf) * g5 * Gamma_H * Ds_ti;
+  auto Gn_adjDd_GH_Ds   = GammaB_nucl * adjDd_GH_Ds;
+  auto Du_Gs            = Du_spec * GammaB_sigma;
+  auto Dq_GH            = Dq_loop * Gamma_H;
+  auto Tr_Dq_GH         = trace(Dq_GH)()()();
+
+  Real ee;
 
   for (int ie_n=0; ie_n < 6 ; ie_n++){
-    int a_n = epsilon[ie_n][0]; //a
-    int b_n = epsilon[ie_n][1]; //b
-    int c_n = epsilon[ie_n][2]; //c
+    int a_n    = (ie_n < 3 ? ie_n       : (6-ie_n)%3 ); //epsilon[ie_n][0]; //a
+    int b_n    = (ie_n < 3 ? (ie_n+1)%3 : (8-ie_n)%3 ); //epsilon[ie_n][1]; //b
+    int c_n    = (ie_n < 3 ? (ie_n+2)%3 : (7-ie_n)%3 ); //epsilon[ie_n][2]; //c
+    int eSgn_n = (ie_n < 3 ? 1 : -1);
     for (int ie_s=0; ie_s < 6 ; ie_s++){
-      int a_s = epsilon[ie_s][0]; //a'
-      int b_s = epsilon[ie_s][1]; //b'
-      int c_s = epsilon[ie_s][2]; //c'
-      for (int alpha_s=0; alpha_s<Ns; alpha_s++){
-      for (int beta_n=0; beta_n<Ns; beta_n++){
-        auto GDsGDd_ab_bb = GDsGDd()(alpha_s,beta_n)(b_s,b_n);
-        for (int tau2=0; tau2<Ns; tau2++){
-        for (int j=0; j<Nc; j++){
-          auto DqG_tt_jj = DqG()(tau2,tau2)(j,j);
-          auto ee_GDGDDG = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsGDd_ab_bb * DqG_tt_jj;
-          for (int gamma_s=0; gamma_s<Ns; gamma_s++){
-          for (int gamma_n=0; gamma_n<Ns; gamma_n++){
-            result()(gamma_s,gamma_n)() += ee_GDGDDG * DuG()(alpha_s, beta_n)(a_s,a_n) * Du_spec()(gamma_s,gamma_n)(c_s,c_n);
-            result()(gamma_s,gamma_n)() -= ee_GDGDDG * DuG()(gamma_s, beta_n)(c_s,a_n) * Du_spec()(alpha_s,gamma_n)(a_s,c_n);
-          }}
-	}}
+      int a_s = (ie_s < 3 ? ie_s       : (6-ie_s)%3 ); //epsilon[ie_s][0]; //a'
+      int b_s = (ie_s < 3 ? (ie_s+1)%3 : (8-ie_s)%3 ); //epsilon[ie_s][1]; //b'
+      int c_s = (ie_s < 3 ? (ie_s+2)%3 : (7-ie_s)%3 ); //epsilon[ie_s][2]; //c'
+      int eSgn_s = (ie_s < 3 ? 1 : -1);
+
+      ee = Real(eSgn_n * eSgn_s); 
+      for (int alpha_n=0; alpha_n<Ns; alpha_n++){
+      for (int beta_s=0;  beta_s<Ns;  beta_s++){
+
+        auto Gn_adjDd_GH_Ds_ab_bb = Gn_adjDd_GH_Ds ()(alpha_n, beta_s)(b_n,b_s);
+
+        for (int gamma_s=0; gamma_s<Ns; gamma_s++){
+        for (int gamma_n=0; gamma_n<Ns; gamma_n++){
+          result()(gamma_n,gamma_s)() += ee   * Gn_adjDd_GH_Ds_ab_bb
+                                            * Du_spec         ()(gamma_n,gamma_s)(c_n,c_s)
+                                            * Du_Gs           ()(alpha_n, beta_s)(a_n,a_s) 
+                                            * Tr_Dq_GH;
+
+          result()(gamma_n,gamma_s)() -= ee   * Gn_adjDd_GH_Ds_ab_bb
+                                            * Du_spec         ()(alpha_n,gamma_s)(a_n,c_s)
+                                            * Du_Gs           ()(gamma_n, beta_s)(c_n,a_s) 
+                                            * Tr_Dq_GH;
+        }}
       }}
     }
   }
@@ -1048,54 +1048,65 @@ void BaryonUtils<FImpl>::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop,
  * Dd_tf is a quark line from t_f to t_H
  * Ds_ti is a quark line from t_i to t_H */
 template <class FImpl>
-template <class mobj, class mobj2, class robj>
+template <class mobj, class mobj2, class robj> accelerator_inline
 void BaryonUtils<FImpl>::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti,
-						 const mobj &Du_tf,
-						 const mobj2 &Du_spec,
-						 const mobj &Dd_tf,
-						 const mobj &Ds_ti,
-				                 const Gamma Gamma_H,
-				                 const Gamma GammaB_sigma,
-		                 		 const Gamma GammaB_nucl,
-						 robj &result)
+             const mobj &Du_tf,
+             const mobj2 &Du_spec,
+             const mobj &Dd_tf,
+             const mobj &Ds_ti,
+             const Gamma Gamma_H,
+             const Gamma GammaB_sigma,
+             const Gamma GammaB_nucl,
+             robj &result)
 {
 
   Gamma g5(Gamma::Algebra::Gamma5); 
 
-  auto DuG = Du_spec * GammaB_nucl;
-  auto adjDu = g5 * adj(Du_tf) * g5;
-  auto adjDuG = adjDu * GammaB_nucl;
-  // Gamma^B * Ds * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5)
-  auto GDsGDd = GammaB_sigma * Ds_ti * Gamma_H * g5 * adj(Dd_tf) * g5;
-  // Dq_loop * \gamma_\mu^L
-  auto DuGH = Du_ti * Gamma_H;
+  auto Du_Gs          = Du_spec * GammaB_sigma;
+  auto adjDd_GH_Ds = g5 * adj(Dd_tf) * g5 * Gamma_H * Ds_ti;
+  auto Gn_adjDd_GH_Ds = GammaB_nucl * adjDd_GH_Ds;
+  auto adjDu_GH_Du    = g5 * adj(Du_tf) * g5 * Gamma_H * Du_ti;
+  auto adjDu_GH_Du_Gs = adjDu_GH_Du * GammaB_sigma;
+
+  Real ee;
 
   for (int ie_n=0; ie_n < 6 ; ie_n++){
-    int a_n = epsilon[ie_n][0]; //a
-    int b_n = epsilon[ie_n][1]; //b
-    int c_n = epsilon[ie_n][2]; //c
+    int a_n    = (ie_n < 3 ? ie_n       : (6-ie_n)%3 ); //epsilon[ie_n][0]; //a
+    int b_n    = (ie_n < 3 ? (ie_n+1)%3 : (8-ie_n)%3 ); //epsilon[ie_n][1]; //b
+    int c_n    = (ie_n < 3 ? (ie_n+2)%3 : (7-ie_n)%3 ); //epsilon[ie_n][2]; //c
+    int eSgn_n = (ie_n < 3 ? 1 : -1);
     for (int ie_s=0; ie_s < 6 ; ie_s++){
-      int a_s = epsilon[ie_s][0]; //a'
-      int b_s = epsilon[ie_s][1]; //b'
-      int c_s = epsilon[ie_s][2]; //c'
-      for (int alpha_s=0; alpha_s<Ns; alpha_s++){
-      for (int beta_n=0; beta_n<Ns; beta_n++){
-        auto GDsGDd_ab_bb = GDsGDd()(alpha_s,beta_n)(b_s,b_n);
-        for (int tau2=0; tau2<Ns; tau2++){
-        for (int j=0; j<Nc; j++){
-          auto DuGH_at_aj = DuGH()(alpha_s,tau2)(a_s,j);
-          auto ee_GDGDDG_a = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsGDd_ab_bb * DuGH_at_aj;
-          for (int gamma_s=0; gamma_s<Ns; gamma_s++){
-            auto DuGH_gt_cj = DuGH()(gamma_s,tau2)(c_s,j);
-            auto ee_GDGDDG_c = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsGDd_ab_bb * DuGH_gt_cj;
-            for (int gamma_n=0; gamma_n<Ns; gamma_n++){
-              result()(gamma_s,gamma_n)() += ee_GDGDDG_a * DuG()(gamma_s, beta_n)(c_s,a_n) * adjDu()(tau2,gamma_n)(j,c_n);
-              result()(gamma_s,gamma_n)() += ee_GDGDDG_c * adjDuG()(tau2, beta_n)(j,a_n) * Du_spec()(alpha_s,gamma_n)(a_s,c_n);
-              result()(gamma_s,gamma_n)() -= ee_GDGDDG_a * adjDuG()(tau2, beta_n)(j,a_n) * Du_spec()(gamma_s,gamma_n)(c_s,c_n);
-              result()(gamma_s,gamma_n)() -= ee_GDGDDG_c * DuG()(alpha_s, beta_n)(a_s,a_n) * adjDu()(tau2,gamma_n)(j,c_n);
-            }
-	  }
-	}}
+      int a_s = (ie_s < 3 ? ie_s       : (6-ie_s)%3 ); //epsilon[ie_s][0]; //a'
+      int b_s = (ie_s < 3 ? (ie_s+1)%3 : (8-ie_s)%3 ); //epsilon[ie_s][1]; //b'
+      int c_s = (ie_s < 3 ? (ie_s+2)%3 : (7-ie_s)%3 ); //epsilon[ie_s][2]; //c'
+      int eSgn_s = (ie_s < 3 ? 1 : -1);
+
+      ee = Real(eSgn_n * eSgn_s); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s];
+
+      for (int alpha_n=0; alpha_n<Ns; alpha_n++){
+      for (int beta_s=0;  beta_s<Ns;  beta_s++){
+
+        auto Gn_adjDd_GH_Ds_ab_bb = Gn_adjDd_GH_Ds  ()(alpha_n, beta_s)(b_n,b_s);
+
+        for (int gamma_s=0; gamma_s<Ns; gamma_s++){
+        for (int gamma_n=0; gamma_n<Ns; gamma_n++){
+
+          result()(gamma_n,gamma_s)() += ee * Gn_adjDd_GH_Ds_ab_bb
+                                          * adjDu_GH_Du     ()(alpha_n,gamma_s)(a_n,c_s)
+                                          * Du_Gs           ()(gamma_n, beta_s)(c_n,a_s);
+
+          result()(gamma_n,gamma_s)() += ee * Gn_adjDd_GH_Ds_ab_bb
+                                          * adjDu_GH_Du_Gs  ()(gamma_n, beta_s)(c_n,a_s)
+                                          * Du_spec         ()(alpha_n,gamma_s)(a_n,c_s);
+
+          result()(gamma_n,gamma_s)() -= ee * Gn_adjDd_GH_Ds_ab_bb
+                                          * adjDu_GH_Du_Gs  ()(alpha_n, beta_s)(a_n,a_s)
+                                          * Du_spec         ()(gamma_n,gamma_s)(c_n,c_s);
+
+          result()(gamma_n,gamma_s)() -= ee * Gn_adjDd_GH_Ds_ab_bb
+                                          * adjDu_GH_Du     ()(gamma_n,gamma_s)(c_n,c_s)
+                                          * Du_Gs           ()(alpha_n, beta_s)(a_n,a_s);
+        }}
       }}
     }
   }
@@ -1107,47 +1118,55 @@ void BaryonUtils<FImpl>::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti,
  * Dd_tf is a quark line from t_f to t_H
  * Ds_ti is a quark line from t_i to t_H */
 template <class FImpl>
-template <class mobj, class mobj2, class robj>
+template <class mobj, class mobj2, class robj> accelerator_inline
 void BaryonUtils<FImpl>::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop,
-						 const mobj2 &Du_spec,
-						 const mobj &Dd_tf,
-						 const mobj &Ds_ti,
-				                 const Gamma Gamma_H,
-				                 const Gamma GammaB_sigma,
-		                 		 const Gamma GammaB_nucl,
-						 robj &result)
+             const mobj2 &Du_spec,
+             const mobj &Dd_tf,
+             const mobj &Ds_ti,
+             const Gamma Gamma_H,
+             const Gamma GammaB_sigma,
+             const Gamma GammaB_nucl,
+             robj &result)
 {
 
   Gamma g5(Gamma::Algebra::Gamma5); 
 
-  auto DuG = Du_spec * GammaB_nucl;
-  // Gamma^B * Ds * \gamma_\mu^L
-  auto GDsG = GammaB_sigma * Ds_ti * Gamma_H;
-  // Dq_loop * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5)
-  auto DqGDd = Dq_loop * Gamma_H * g5 * adj(Dd_tf) * g5;
+  auto adjDd_GH_Duloop_GH_Ds = g5 * adj(Dd_tf) * g5 * Gamma_H * Dq_loop * Gamma_H * Ds_ti;
+  auto Gn_adjDd_GH_Duloop_GH_Ds = GammaB_nucl * adjDd_GH_Duloop_GH_Ds;
+  auto Du_Gs = Du_spec * GammaB_sigma;
+
+  Real ee;
 
   for (int ie_n=0; ie_n < 6 ; ie_n++){
-    int a_n = epsilon[ie_n][0]; //a
-    int b_n = epsilon[ie_n][1]; //b
-    int c_n = epsilon[ie_n][2]; //c
+    int a_n    = (ie_n < 3 ? ie_n       : (6-ie_n)%3 ); //epsilon[ie_n][0]; //a
+    int b_n    = (ie_n < 3 ? (ie_n+1)%3 : (8-ie_n)%3 ); //epsilon[ie_n][1]; //b
+    int c_n    = (ie_n < 3 ? (ie_n+2)%3 : (7-ie_n)%3 ); //epsilon[ie_n][2]; //c
+    int eSgn_n = (ie_n < 3 ? 1 : -1);
     for (int ie_s=0; ie_s < 6 ; ie_s++){
-      int a_s = epsilon[ie_s][0]; //a'
-      int b_s = epsilon[ie_s][1]; //b'
-      int c_s = epsilon[ie_s][2]; //c'
-      for (int alpha_s=0; alpha_s<Ns; alpha_s++){
-      for (int tau=0; tau<Ns; tau++){
-      for (int i=0; i<Nc; i++){
-	auto GDsG_at_bi = GDsG()(alpha_s,tau)(b_s,i);
-        for (int beta_n=0; beta_n<Ns; beta_n++){
-          auto DqGDd_tb_ib = DqGDd()(tau,beta_n)(i,b_n);
-	  auto ee_GDGDGD = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsG_at_bi * DqGDd_tb_ib;
-          for (int gamma_s=0; gamma_s<Ns; gamma_s++){
-          for (int gamma_n=0; gamma_n<Ns; gamma_n++){
-            result()(gamma_s,gamma_n)() -= ee_GDGDGD * DuG()(alpha_s, beta_n)(a_s,a_n) * Du_spec()(gamma_s,gamma_n)(c_s,c_n);
-            result()(gamma_s,gamma_n)() += ee_GDGDGD * DuG()(gamma_s, beta_n)(c_s,a_n) * Du_spec()(alpha_s,gamma_n)(a_s,c_n);
-          }}
-	}
-      }}}
+      int a_s = (ie_s < 3 ? ie_s       : (6-ie_s)%3 ); //epsilon[ie_s][0]; //a'
+      int b_s = (ie_s < 3 ? (ie_s+1)%3 : (8-ie_s)%3 ); //epsilon[ie_s][1]; //b'
+      int c_s = (ie_s < 3 ? (ie_s+2)%3 : (7-ie_s)%3 ); //epsilon[ie_s][2]; //c'
+      int eSgn_s = (ie_s < 3 ? 1 : -1);
+
+      ee = Real(eSgn_n * eSgn_s); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s];
+
+      for (int alpha_n=0; alpha_n<Ns; alpha_n++){
+      for (int beta_s=0; beta_s<Ns; beta_s++){
+
+        auto Gn_adjDd_GH_Duloop_GH_Ds_ab_bb = Gn_adjDd_GH_Duloop_GH_Ds ()(alpha_n,beta_s)(b_n,b_s);
+
+        for (int gamma_s=0; gamma_s<Ns; gamma_s++){
+        for (int gamma_n=0; gamma_n<Ns; gamma_n++){
+
+          result()(gamma_n,gamma_s)() -= ee   * Du_spec         ()(gamma_n,gamma_s)(c_n,c_s)
+                                            * Du_Gs           ()(alpha_n,beta_s)(a_n,a_s) 
+                                            * Gn_adjDd_GH_Duloop_GH_Ds_ab_bb;
+
+          result()(gamma_n,gamma_s)() += ee   * Du_Gs         ()(alpha_n,gamma_s)(a_n,c_s)
+                                            * Du_spec       ()(gamma_n,beta_s)(c_n,a_s) 
+                                            * Gn_adjDd_GH_Duloop_GH_Ds_ab_bb;
+        }}
+      }}
     }
   }
 }
@@ -1158,70 +1177,88 @@ void BaryonUtils<FImpl>::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop,
  * Dd_tf is a quark line from t_f to t_H
  * Ds_ti is a quark line from t_i to t_H */
 template <class FImpl>
-template <class mobj, class mobj2, class robj>
+template <class mobj, class mobj2, class robj> accelerator_inline
 void BaryonUtils<FImpl>::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti,
-						 const mobj &Du_tf,
-						 const mobj2 &Du_spec,
-						 const mobj &Dd_tf,
-						 const mobj &Ds_ti,
-				                 const Gamma Gamma_H,
-				                 const Gamma GammaB_sigma,
-		                 		 const Gamma GammaB_nucl,
-						 robj &result)
+             const mobj &Du_tf,
+             const mobj2 &Du_spec,
+             const mobj &Dd_tf,
+             const mobj &Ds_ti,
+             const Gamma Gamma_H,
+             const Gamma GammaB_sigma,
+             const Gamma GammaB_nucl,
+             robj &result)
 {
 
   Gamma g5(Gamma::Algebra::Gamma5); 
 
-  auto DuG = Du_spec * GammaB_nucl;
-  auto adjDu = g5 * adj(Du_tf) * g5;
-  auto adjDuG = adjDu * GammaB_nucl;
-  // Gamma^B * Ds * \gamma_\mu^L
-  auto GDsG = GammaB_sigma * Ds_ti * Gamma_H;
-  // Du * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5)
-  auto DuGDd = Du_ti * Gamma_H * g5 * adj(Dd_tf) * g5;
+  auto Du_Gs              = Du_spec * GammaB_sigma;
+  auto adjDu_GH_Ds        = g5 * adj(Du_tf) * g5 * Gamma_H * Ds_ti;
+  auto adjDd_GH_Du        = g5 * adj(Dd_tf) * g5 * Gamma_H * Du_ti;
+  auto Gn_adjDd_GH_Du     = GammaB_nucl * adjDd_GH_Du; // for some reason I needed to split this into two lines to avoid the compilation error 'error: identifier "Grid::Gamma::mul" is undefined in device code'
+
+  auto Gn_adjDd_GH_Du_Gs  = Gn_adjDd_GH_Du * GammaB_sigma;
+
+  Real ee;
+
 
   for (int ie_n=0; ie_n < 6 ; ie_n++){
-    int a_n = epsilon[ie_n][0]; //a
-    int b_n = epsilon[ie_n][1]; //b
-    int c_n = epsilon[ie_n][2]; //c
+    int a_n    = (ie_n < 3 ? ie_n       : (6-ie_n)%3 ); //epsilon[ie_n][0]; //a
+    int b_n    = (ie_n < 3 ? (ie_n+1)%3 : (8-ie_n)%3 ); //epsilon[ie_n][1]; //b
+    int c_n    = (ie_n < 3 ? (ie_n+2)%3 : (7-ie_n)%3 ); //epsilon[ie_n][2]; //c
+    int eSgn_n = (ie_n < 3 ? 1 : -1);
     for (int ie_s=0; ie_s < 6 ; ie_s++){
-      int a_s = epsilon[ie_s][0]; //a'
-      int b_s = epsilon[ie_s][1]; //b'
-      int c_s = epsilon[ie_s][2]; //c'
-      for (int alpha_s=0; alpha_s<Ns; alpha_s++){
-      for (int tau=0; tau<Ns; tau++){
-      for (int i=0; i<Nc; i++){
-	auto GDsG_at_bi = GDsG()(alpha_s,tau)(b_s,i);
-        for (int beta_n=0; beta_n<Ns; beta_n++){
-          auto DuGDd_ab_ab = DuGDd()(alpha_s,beta_n)(a_s,b_n);
-	  auto ee_GDGDGD_a = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsG_at_bi * DuGDd_ab_ab;
-          for (int gamma_s=0; gamma_s<Ns; gamma_s++){
-            auto DuGDd_gb_cb = DuGDd()(gamma_s,beta_n)(c_s,b_n);
-	    auto ee_GDGDGD_c = epsilon_sgn[ie_n] * epsilon_sgn[ie_s] * GDsG_at_bi * DuGDd_gb_cb;
-            for (int gamma_n=0; gamma_n<Ns; gamma_n++){
-              result()(gamma_s,gamma_n)() -= ee_GDGDGD_a * DuG()(gamma_s, beta_n)(c_s,a_n) * adjDu()(tau,gamma_n)(i,c_n);
-              result()(gamma_s,gamma_n)() -= ee_GDGDGD_c * adjDuG()(tau, beta_n)(i,a_n) * Du_spec()(alpha_s,gamma_n)(a_s,c_n);
-              result()(gamma_s,gamma_n)() += ee_GDGDGD_a * adjDuG()(tau, beta_n)(i,a_n) * Du_spec()(gamma_s,gamma_n)(c_s,c_n);
-              result()(gamma_s,gamma_n)() += ee_GDGDGD_c * DuG()(alpha_s, beta_n)(a_s,a_n) * adjDu()(tau,gamma_n)(i,c_n);
-            }
-	  }
-	}
-      }}}
+      int a_s = (ie_s < 3 ? ie_s       : (6-ie_s)%3 ); //epsilon[ie_s][0]; //a'
+      int b_s = (ie_s < 3 ? (ie_s+1)%3 : (8-ie_s)%3 ); //epsilon[ie_s][1]; //b'
+      int c_s = (ie_s < 3 ? (ie_s+2)%3 : (7-ie_s)%3 ); //epsilon[ie_s][2]; //c'
+      int eSgn_s = (ie_s < 3 ? 1 : -1);
+
+      ee = Real(eSgn_n * eSgn_s); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s];
+
+      for (int alpha_n=0; alpha_n<Ns; alpha_n++){
+      for (int beta_s=0;  beta_s<Ns;   beta_s++){
+
+        auto adjDu_GH_Ds_ab_ab = adjDu_GH_Ds()(alpha_n, beta_s)(a_n,b_s);
+        auto Gn_adjDd_GH_Du_Gs_ab_ba = Gn_adjDd_GH_Du_Gs()(alpha_n, beta_s)(b_n,a_s);
+
+        for (int gamma_s=0; gamma_s<Ns; gamma_s++){
+          auto Gn_adjDd_GH_Du_ag_bc = Gn_adjDd_GH_Du()(alpha_n,gamma_s)(b_n,c_s);
+          for (int gamma_n=0; gamma_n<Ns; gamma_n++){
+            auto adjDu_GH_Ds_gb_cb = adjDu_GH_Ds()(gamma_n, beta_s)(c_n,b_s);
+
+            result()(gamma_n,gamma_s)() += ee * adjDu_GH_Ds_ab_ab
+                                          * Gn_adjDd_GH_Du_Gs_ab_ba
+                                          * Du_spec()(gamma_n,gamma_s)(c_n,c_s);
+
+            result()(gamma_n,gamma_s)() -= ee * adjDu_GH_Ds_gb_cb
+                                          * Gn_adjDd_GH_Du_Gs_ab_ba
+                                          * Du_spec()(alpha_n,gamma_s)(a_n,c_s);
+
+            result()(gamma_n,gamma_s)() += ee * adjDu_GH_Ds_gb_cb
+                                          * Gn_adjDd_GH_Du_ag_bc
+                                          * Du_Gs()(alpha_n, beta_s)(a_n,a_s);
+
+            result()(gamma_n,gamma_s)() -= ee * adjDu_GH_Ds_ab_ab
+                                          * Gn_adjDd_GH_Du_ag_bc
+                                          * Du_Gs()(gamma_n, beta_s)(c_n,a_s);
+          }
+        }
+      }}
     }
   }
+
 }
 
 template<class FImpl>
 template <class mobj>
 void BaryonUtils<FImpl>::SigmaToNucleonEye(const PropagatorField &qq_loop,
-						 const mobj &Du_spec,
-						 const PropagatorField &qd_tf,
-						 const PropagatorField &qs_ti,
-				                 const Gamma Gamma_H,
-				                 const Gamma GammaB_sigma,
-		                 		 const Gamma GammaB_nucl,
-						 const std::string op,
-						 SpinMatrixField &stn_corr)
+             const mobj &Du_spec,
+             const PropagatorField &qd_tf,
+             const PropagatorField &qs_ti,
+             const Gamma Gamma_H,
+             const Gamma GammaB_sigma,
+             const Gamma GammaB_nucl,
+             const std::string op,
+             SpinMatrixField &stn_corr)
 {
 
   assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
@@ -1229,39 +1266,51 @@ void BaryonUtils<FImpl>::SigmaToNucleonEye(const PropagatorField &qq_loop,
 
   GridBase *grid = qs_ti.Grid();
 
-  autoView( vcorr, stn_corr, CpuWrite);
-  autoView( vq_loop , qq_loop, CpuRead);
-  autoView( vd_tf , qd_tf, CpuRead);
-  autoView( vs_ti , qs_ti, CpuRead);
+  autoView( vcorr   , stn_corr , AcceleratorWrite);
+  autoView( vq_loop , qq_loop  , AcceleratorRead);
+  autoView( vd_tf   , qd_tf    , AcceleratorRead);
+  autoView( vs_ti   , qs_ti    , AcceleratorRead);
 
-  accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
-    auto Dq_loop = vq_loop[ss];
-    auto Dd_tf = vd_tf[ss];
-    auto Ds_ti = vs_ti[ss];
-    sobj result=Zero();
-    if(op == "Q1"){
-      SigmaToNucleonQ1EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
-    } else if(op == "Q2"){
-      SigmaToNucleonQ2EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
-    } else {
-      assert(0 && "Weak Operator not correctly specified");
-    }
-      vcorr[ss] = result; 
-  }  );//end loop over lattice sites
+  Vector<mobj> my_Dq_spec{Du_spec};
+  mobj * Dq_spec_p = &my_Dq_spec[0];
+
+  if(op == "Q1"){
+    accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+      auto Dq_loop = vq_loop(ss);
+      auto Dd_tf = vd_tf(ss);
+      auto Ds_ti = vs_ti(ss);
+      typedef decltype(coalescedRead(vcorr[0])) spinor;
+      spinor result=Zero();
+      SigmaToNucleonQ1EyeSite(Dq_loop,Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
+      coalescedWrite(vcorr[ss],result);
+    });//end loop over lattice sites
+  } else if(op == "Q2"){
+    accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+      auto Dq_loop = vq_loop(ss);
+      auto Dd_tf = vd_tf(ss);
+      auto Ds_ti = vs_ti(ss);
+      typedef decltype(coalescedRead(vcorr[0])) spinor;
+      spinor result=Zero();
+      SigmaToNucleonQ2EyeSite(Dq_loop,Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
+      coalescedWrite(vcorr[ss],result);
+    });//end loop over lattice sites
+  } else {
+    assert(0 && "Weak Operator not correctly specified");
+  }
 }
 
 template<class FImpl>
 template <class mobj>
 void BaryonUtils<FImpl>::SigmaToNucleonNonEye(const PropagatorField &qq_ti,
-						 const PropagatorField &qq_tf,
-						 const mobj &Du_spec,
-						 const PropagatorField &qd_tf,
-						 const PropagatorField &qs_ti,
-				                 const Gamma Gamma_H,
-				                 const Gamma GammaB_sigma,
-		                 		 const Gamma GammaB_nucl,
-						 const std::string op,
-						 SpinMatrixField &stn_corr)
+             const PropagatorField &qq_tf,
+             const mobj &Du_spec,
+             const PropagatorField &qd_tf,
+             const PropagatorField &qs_ti,
+             const Gamma Gamma_H,
+             const Gamma GammaB_sigma,
+             const Gamma GammaB_nucl,
+             const std::string op,
+             SpinMatrixField &stn_corr)
 {
 
   assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
@@ -1269,27 +1318,40 @@ void BaryonUtils<FImpl>::SigmaToNucleonNonEye(const PropagatorField &qq_ti,
 
   GridBase *grid = qs_ti.Grid();
 
-  autoView( vcorr , stn_corr, CpuWrite);
-  autoView( vq_ti , qq_ti, CpuRead);
-  autoView( vq_tf , qq_tf, CpuRead);
-  autoView( vd_tf , qd_tf, CpuRead);
-  autoView( vs_ti , qs_ti, CpuRead);
- // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
-  thread_for(ss,grid->oSites(),{
-    auto Dq_ti = vq_ti[ss];
-    auto Dq_tf = vq_tf[ss];
-    auto Dd_tf = vd_tf[ss];
-    auto Ds_ti = vs_ti[ss];
-    sobj result=Zero();
-    if(op == "Q1"){
-      SigmaToNucleonQ1NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
-    } else if(op == "Q2"){
-      SigmaToNucleonQ2NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
-    } else {
-      assert(0 && "Weak Operator not correctly specified");
-    }
-      vcorr[ss] = result; 
-  }  );//end loop over lattice sites
+  autoView( vcorr , stn_corr , AcceleratorWrite );
+  autoView( vq_ti , qq_ti    , AcceleratorRead  );
+  autoView( vq_tf , qq_tf    , AcceleratorRead  );
+  autoView( vd_tf , qd_tf    , AcceleratorRead  );
+  autoView( vs_ti , qs_ti    , AcceleratorRead  );
+  
+  Vector<mobj> my_Dq_spec{Du_spec};
+  mobj * Dq_spec_p = &my_Dq_spec[0];
+
+  if(op == "Q1"){
+    accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+      auto Dq_ti = vq_ti(ss);
+      auto Dq_tf = vq_tf(ss);
+      auto Dd_tf = vd_tf(ss);
+      auto Ds_ti = vs_ti(ss);
+      typedef decltype(coalescedRead(vcorr[0])) spinor;
+      spinor result=Zero();
+      SigmaToNucleonQ1NonEyeSite(Dq_ti,Dq_tf,Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
+      coalescedWrite(vcorr[ss],result);
+    });//end loop over lattice sites
+  } else if(op == "Q2"){
+    accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+      auto Dq_ti = vq_ti(ss);
+      auto Dq_tf = vq_tf(ss);
+      auto Dd_tf = vd_tf(ss);
+      auto Ds_ti = vs_ti(ss);
+      typedef decltype(coalescedRead(vcorr[0])) spinor;
+      spinor result=Zero();
+      SigmaToNucleonQ2NonEyeSite(Dq_ti,Dq_tf,Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result);
+      coalescedWrite(vcorr[ss],result);
+    });//end loop over lattice sites
+  } else {
+    assert(0 && "Weak Operator not correctly specified");
+  }
 }
 
 NAMESPACE_END(Grid);
diff --git a/configure.ac b/configure.ac
index f077ca93..fb0c78fc 100644
--- a/configure.ac
+++ b/configure.ac
@@ -7,7 +7,12 @@ AM_INIT_AUTOMAKE([subdir-objects 1.13])
 AM_EXTRA_RECURSIVE_TARGETS([tests bench])
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_SRCDIR([Grid/Grid.h])
-AC_CONFIG_HEADERS([Grid/Config.h],[sed -i 's|PACKAGE_|GRID_|' Grid/Config.h])
+AC_CONFIG_HEADERS([Grid/Config.h],[[$SED_INPLACE -e 's|PACKAGE_|GRID_|' -e 's|[[:space:]]PACKAGE[[:space:]]| GRID_PACKAGE |' -e 's|[[:space:]]VERSION[[:space:]]| GRID_PACKAGE_VERSION |' Grid/Config.h]],
+    [if test x"$host_os" == x"${host_os#darwin}" ; then]
+        [SED_INPLACE="sed -i"]
+    [else]
+        [SED_INPLACE="sed -i .bak"]
+    [fi])
 m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 
 ################ Get git info
@@ -125,7 +130,7 @@ esac
 
 ############### fermions
 AC_ARG_ENABLE([fermion-reps],
-     [AC_HELP_STRING([--fermion-reps=yes|no], [enable extra fermion representation support])],
+     [AC_HELP_STRING([--enable-fermion-reps=yes|no], [enable extra fermion representation support])],
      [ac_FERMION_REPS=${enable_fermion_reps}], [ac_FERMION_REPS=yes])
 
 AM_CONDITIONAL(BUILD_FERMION_REPS, [ test "${ac_FERMION_REPS}X" == "yesX" ])
diff --git a/tests/forces/Test_momentum_filter.cc b/tests/forces/Test_momentum_filter.cc
new file mode 100644
index 00000000..856ea0f2
--- /dev/null
+++ b/tests/forces/Test_momentum_filter.cc
@@ -0,0 +1,154 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/Test_wilson_force.cc
+
+    Copyright (C) 2015
+
+Author: Christopher Kelly <ckelly@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+//Get the mu-directected links on the upper boundary and the bulk remainder
+template<typename Field>
+void getLinksBoundaryBulk(Field &bound, Field &bulk,  Field &from, const Coordinate &latt_size){
+  bound = Zero(); bulk = Zero();
+  for(int mu=0;mu<Nd;mu++){
+    LatticeInteger mucoor(bound.Grid());
+    LatticeCoordinate(mucoor, mu);
+
+    bound = where( mucoor == (Integer)(latt_size[mu] - 1), from, bound );
+    bulk = where( mucoor != (Integer)(latt_size[mu] - 1), from, bulk );
+  }
+}
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  Coordinate latt_size   = GridDefaultLatt();
+  Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  Coordinate mpi_layout  = GridDefaultMpi();
+
+  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
+  GridRedBlackCartesian     RBGrid(&Grid);
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  std::vector<int> seeds({1,2,3,4});
+
+  GridParallelRNG          pRNG(&Grid);
+  pRNG.SeedFixedIntegers(seeds);
+
+  typedef PeriodicGimplR Gimpl;
+  typedef WilsonGaugeAction<Gimpl> GaugeAction;
+  typedef NoHirep Representation; //fundamental
+  typedef NoSmearing<Gimpl> Smearing;
+  typedef MinimumNorm2<Gimpl, Smearing> Omelyan;
+  typedef Gimpl::Field Field;
+  typedef MomentumFilterApplyPhase<Field> Filter;
+  Filter filter(&Grid);
+  
+  //Setup a filter that disables link update on links passing through the global lattice boundary
+  typedef Filter::LatticeLorentzScalarType MaskType;
+  typedef Filter::LorentzScalarType MaskSiteType;
+
+  MaskSiteType zero, one;
+  for(int mu=0;mu<Nd;mu++){
+    zero(mu)()() = 0.;
+    one(mu)()() = 1.;
+  }
+  MaskType zeroField(&Grid), oneField(&Grid);
+  zeroField = zero;
+  oneField = one;
+
+  
+  filter.phase = oneField; //make every site 1.0
+
+  //Zero mu-directed links at upper boundary
+  for(int mu=0;mu<Nd;mu++){
+    LatticeInteger mucoor(&Grid);
+    LatticeCoordinate(mucoor, mu);
+    
+    filter.phase = where( mucoor == (Integer)(latt_size[mu] - 1) , zeroField, filter.phase );
+  }
+
+  //Start with a random gauge field
+  Field U(&Grid);
+  SU<Nc>::HotConfiguration(pRNG,U);
+
+  //Get the original links on the bulk and boundary for later use
+  Field Ubnd_orig(&Grid), Ubulk_orig(&Grid);
+  getLinksBoundaryBulk(Ubnd_orig, Ubulk_orig, U, latt_size);
+
+  ActionSet<Field,Representation> actions(1);  
+  double beta=6;
+  GaugeAction gauge_action(beta);
+  actions[0].push_back(&gauge_action);
+
+  Smearing smear;
+  IntegratorParameters params(1,1.); //1 MD step
+  Omelyan integrator(&Grid, params, actions, smear);
+  
+  integrator.setMomentumFilter(filter);
+
+  integrator.refresh(U, pRNG); //doesn't actually change the gauge field
+
+  //Check the momentum is zero on the boundary
+  const auto &P = integrator.getMomentum();
+  Field Pbnd(&Grid), Pbulk(&Grid);
+  getLinksBoundaryBulk(Pbnd, Pbulk, const_cast<Field&>(P), latt_size);
+
+  RealD Pbnd_nrm = norm2(Pbnd); //expect zero
+  std::cout << GridLogMessage << "After refresh, norm2 of mu-directed conjugate momentum on boundary is: " << Pbnd_nrm << " (expect 0)" << std::endl;
+  RealD Pbulk_nrm = norm2(Pbulk); //expect non-zero
+  std::cout << GridLogMessage << "After refresh, norm2 of bulk conjugate momentum is: " << Pbulk_nrm << " (expect non-zero)" << std::endl;
+
+  //Evolve the gauge field
+  integrator.integrate(U);
+
+  //Check momentum is still zero on boundary
+  getLinksBoundaryBulk(Pbnd, Pbulk, const_cast<Field&>(P), latt_size);
+  
+  Pbnd_nrm = norm2(Pbnd); //expect zero
+  std::cout << GridLogMessage << "After integrate, norm2 of mu-directed conjugate momentum on boundary is: " << Pbnd_nrm << " (expect 0)" << std::endl;
+  Pbulk_nrm = norm2(Pbulk); //expect non-zero
+  std::cout << GridLogMessage << "After integrate, norm2 of bulk conjugate momentum is: " << Pbulk_nrm << " (expect non-zero)" << std::endl;
+
+  //Get the new bulk and bound links
+  Field Ubnd_new(&Grid), Ubulk_new(&Grid);
+  getLinksBoundaryBulk(Ubnd_new, Ubulk_new, U, latt_size);
+
+  Field Ubnd_diff = Ubnd_new - Ubnd_orig;
+  Field Ubulk_diff = Ubulk_new - Ubulk_orig;
+
+  RealD Ubnd_change = norm2( Ubnd_diff );
+  RealD Ubulk_change = norm2( Ubulk_diff );
+  std::cout << GridLogMessage << "After integrate, norm2 of change in mu-directed boundary links is : " << Ubnd_change << " (expect 0)" << std::endl;
+  std::cout << GridLogMessage << "After integrate, norm2 of change in bulk links is : " << Ubulk_change << " (expect non-zero)" << std::endl;
+
+  Grid_finalize();
+}