Merge branch 'develop' into feature/distil

* develop: (27 commits) Update README.md result layout standardised, iterator size more elegant updated syntac in Test_hadrons_spectrum chroma-regression test now prints difference correctly baryon input strings are now pairs of pairs of gammas - still ugly!! second update to pull request Changing back interface for Gamma3pt Removing old debug code Changes to A2Autils suggested changes for 1st pull request implemented changed input parameters for easier use Should compile everywhere now changed baryon interface added author information ready for pull request code compiling now - still need to test Baryons module works in 1 of 3 cases - still need SlicedProp and Msource part!! thread_for caused the problems - slow for loop for now still bugfix weird bug... ... # Conflicts: # Hadrons/Modules.hpp # Hadrons/modules.inc
2026-07-17 15:43:27 +01:00 · 2019-10-30 14:13:00 +00:00
parent ca234325bc f31e3278a6
commit eb8848a071
16 changed files with 1886 additions and 483 deletions
@@ -317,116 +317,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  }
 }

-template<class vobj>
-static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
-{
-  // std::cout << GridLogMessage << "Start mySliceInnerProductVector" << std::endl;
-
-  typedef typename vobj::scalar_type scalar_type;
-  std::vector<scalar_type> lsSum;
-  localSliceInnerProductVector(result, lhs, rhs, lsSum, orthogdim);
-  globalSliceInnerProductVector(result, lhs, lsSum, orthogdim);
-  // std::cout << GridLogMessage << "End mySliceInnerProductVector" << std::endl;
-}
-
-template <class vobj>
-static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
-{
-  // std::cout << GridLogMessage << "Start prep" << std::endl;
-  typedef typename vobj::vector_type   vector_type;
-  typedef typename vobj::scalar_type   scalar_type;
-  GridBase  *grid = lhs.Grid();
-  assert(grid!=NULL);
-  conformable(grid,rhs.Grid());
-
-  const int    Nd = grid->_ndimension;
-  const int Nsimd = grid->Nsimd();
-
-  assert(orthogdim >= 0);
-  assert(orthogdim < Nd);
-
-  int fd=grid->_fdimensions[orthogdim];
-  int ld=grid->_ldimensions[orthogdim];
-  int rd=grid->_rdimensions[orthogdim];
-  // std::cout << GridLogMessage << "Start alloc" << std::endl;
-
-  Vector<vector_type> lvSum(rd); // will locally sum vectors first
-  lsSum.resize(ld,scalar_type(0.0));                    // sum across these down to scalars
-  ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);   // splitting the SIMD  
-  // std::cout << GridLogMessage << "End alloc" << std::endl;
-
-  result.resize(fd); // And then global sum to return the same vector to every node for IO to file
-  for(int r=0;r<rd;r++){
-    lvSum[r]=Zero();
-  }
-
-  int e1=    grid->_slice_nblock[orthogdim];
-  int e2=    grid->_slice_block [orthogdim];
-  int stride=grid->_slice_stride[orthogdim];
-  // std::cout << GridLogMessage << "End prep" << std::endl;
-  // std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl;
-  vector_type vv;
-  auto l_v=lhs.View();
-  auto r_v=rhs.View();
-  thread_for( r,rd,{
-
-    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane 
-
-    for(int n=0;n<e1;n++){
-      for(int b=0;b<e2;b++){
-        int ss = so + n * stride + b;
-        vv = TensorRemove(innerProduct(l_v[ss], r_v[ss]));
-        lvSum[r] = lvSum[r] + vv;
-      }
-    }
-  });
-  // std::cout << GridLogMessage << "End parallel inner product" << std::endl;
-
-  // Sum across simd lanes in the plane, breaking out orthog dir.
-  Coordinate icoor(Nd);
-  for(int rt=0;rt<rd;rt++){
-
-    iScalar<vector_type> temp; 
-    temp._internal = lvSum[rt];
-    extract(temp,extracted);
-
-    for(int idx=0;idx<Nsimd;idx++){
-
-      grid->iCoorFromIindex(icoor,idx);
-
-      int ldx =rt+icoor[orthogdim]*rd;
-
-      lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
-
-    }
-  }
-  // std::cout << GridLogMessage << "End sum over simd lanes" << std::endl;
-}
-template <class vobj>
-static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
-{
-  typedef typename vobj::scalar_type scalar_type;
-  GridBase *grid = lhs.Grid();
-  int fd = result.size();
-  int ld = lsSum.size();
-  // sum over nodes.
-  std::vector<scalar_type> gsum;
-  gsum.resize(fd, scalar_type(0.0));
-  // std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl;
-  for(int t=0;t<fd;t++){
-    int pt = t/ld; // processor plane
-    int lt = t%ld;
-    if ( pt == grid->_processor_coor[orthogdim] ) {
-      gsum[t]=lsSum[lt];
-    }
-  }
-  // std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl;
-  // std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl;
-  grid->GlobalSumVector(&gsum[0], fd);
-  // std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl;
-
-  result = gsum;
-}
 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) 
 {
@@ -76,8 +76,21 @@ public:
        const std::vector<ComplexField> &emB1,
        int orthogdim, double *t_kernel = nullptr, double *t_gsum = nullptr);

-  static void ContractWWVV(std::vector<PropagatorField> &WWVV,
-			   const Eigen::Tensor<ComplexD,3> &WW_sd,
+  template <typename TensorType>
+  typename std::enable_if<(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
+                           std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
+                           void>::type
+  static ContractWWVV(std::vector<PropagatorField> &WWVV,
+			   const TensorType &WW_sd,
+			   const FermionField *vs,
+			   const FermionField *vd);
+
+  template <typename TensorType>
+  typename std::enable_if<!(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
+                            std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
+                            void>::type
+  static ContractWWVV(std::vector<PropagatorField> &WWVV,
+			   const TensorType &WW_sd,
 			   const FermionField *vs,
 			   const FermionField *vd);

@@ -107,6 +120,11 @@ public:
 			const FermionField *vd,
 			int orthogdim);
 #endif
+private:
+  inline static void OuterProductWWVV(PropagatorField &WWVV,
+                               const vobj &lhs,
+                               const vobj &rhs,
+                               const int Ns, const int ss);
 };

 template<class FImpl>
@@ -1375,9 +1393,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
 // Take WW_sd v^dag_d (x) v_s
 // 

-template<class FImpl>
-void A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
-				   const Eigen::Tensor<ComplexD,3> &WW_sd,
+template <class FImpl>
+template <typename TensorType>
+typename std::enable_if<(std::is_same<Eigen::Tensor<ComplexD,3>, TensorType>::value ||
+                         std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
+                         void>::type
+A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
+				   const TensorType &WW_sd,
 				   const FermionField *vs,
 				   const FermionField *vd)
 {
@@ -1399,39 +1421,100 @@ void A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
    for(int d_o=0;d_o<N_d;d_o+=d_unroll){
      for(int t=0;t<N_t;t++){
      for(int s=0;s<N_s;s++){
-	auto vs_v = vs[s].View();
-	auto tmp1 = vs_v[ss];
-	vobj tmp2 = Zero();
-	vobj tmp3 = Zero();
-	for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
-	  auto vd_v = vd[d].View();
-	  Scalar_v coeff = WW_sd(t,s,d);
-	  tmp3 = conjugate(vd_v[ss]);
-	  mac(&tmp2, &coeff, &tmp3);
-	}
+  auto vs_v = vs[s].View();
+  auto tmp1 = vs_v[ss];
+  vobj tmp2 = Zero();
+  vobj tmp3 = Zero();
+  for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
+    auto vd_v = vd[d].View();
+    Scalar_v coeff = WW_sd(t,s,d);
+    tmp3 = conjugate(vd_v[ss]);
+    mac(&tmp2, &coeff, &tmp3);
+  }

-	//////////////////////////
-	// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
-	//////////////////////////
-	auto WWVV_v = WWVV[t].View();
-	for(int s1=0;s1<Ns;s1++){
-	for(int s2=0;s2<Ns;s2++){
-	  WWVV_v[ss]()(s1,s2)(0,0) += tmp1()(s1)(0)*tmp2()(s2)(0);
-	  WWVV_v[ss]()(s1,s2)(0,1) += tmp1()(s1)(0)*tmp2()(s2)(1);
-	  WWVV_v[ss]()(s1,s2)(0,2) += tmp1()(s1)(0)*tmp2()(s2)(2);
-	  WWVV_v[ss]()(s1,s2)(1,0) += tmp1()(s1)(1)*tmp2()(s2)(0);
-	  WWVV_v[ss]()(s1,s2)(1,1) += tmp1()(s1)(1)*tmp2()(s2)(1);
-	  WWVV_v[ss]()(s1,s2)(1,2) += tmp1()(s1)(1)*tmp2()(s2)(2);
-	  WWVV_v[ss]()(s1,s2)(2,0) += tmp1()(s1)(2)*tmp2()(s2)(0);
-	  WWVV_v[ss]()(s1,s2)(2,1) += tmp1()(s1)(2)*tmp2()(s2)(1);
-	  WWVV_v[ss]()(s1,s2)(2,2) += tmp1()(s1)(2)*tmp2()(s2)(2);
-	}}
+  //////////////////////////
+  // Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
+  //////////////////////////
+  OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);

      }}
    }
  });
 }

+template <class FImpl>
+template <typename TensorType>
+typename std::enable_if<!(std::is_same<Eigen::Tensor<ComplexD, 3>, TensorType>::value ||
+                          std::is_same<Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>>, TensorType>::value),
+                          void>::type
+A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
+                              const TensorType &WW_sd,
+                              const FermionField *vs,
+                              const FermionField *vd)
+{
+  GridBase *grid = vs[0].Grid();
+
+  int nd    = grid->_ndimension;
+  int Nsimd = grid->Nsimd();
+  int N_t = WW_sd.dimensions()[0];
+  int N_s = WW_sd.dimensions()[1];
+  int N_d = WW_sd.dimensions()[2];
+
+  int d_unroll = 32;// Empirical optimisation
+
+  Eigen::Matrix<Complex, -1, -1, Eigen::RowMajor> buf;
+
+  for(int t=0;t<N_t;t++){
+    WWVV[t] = Zero();
+  }
+
+  for (int t = 0; t < N_t; t++){
+    std::cout << GridLogMessage << "Contraction t = " << t << std::endl;
+    buf = WW_sd[t];
+    thread_for(ss,grid->oSites(),{
+      for(int d_o=0;d_o<N_d;d_o+=d_unroll){
+        for(int s=0;s<N_s;s++){
+    auto vs_v = vs[s].View();
+    auto tmp1 = vs_v[ss];
+    vobj tmp2 = Zero();
+    vobj tmp3 = Zero();
+    for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
+      auto vd_v = vd[d].View();
+      Scalar_v coeff = buf(s,d);
+      tmp3 = conjugate(vd_v[ss]);
+      mac(&tmp2, &coeff, &tmp3);
+    }
+
+    //////////////////////////
+    // Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
+    //////////////////////////
+    OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
+      }}
+    });
+  }
+}
+
+template <class FImpl>
+inline void A2Autils<FImpl>::OuterProductWWVV(PropagatorField &WWVV,
+                                             const vobj &lhs,
+                                             const vobj &rhs,
+                                             const int Ns, const int ss)
+{
+  auto WWVV_v = WWVV.View();
+  for (int s1 = 0; s1 < Ns; s1++){
+    for (int s2 = 0; s2 < Ns; s2++){
+      WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0);
+      WWVV_v[ss]()(s1,s2)(0, 1) += lhs()(s1)(0) * rhs()(s2)(1);
+      WWVV_v[ss]()(s1,s2)(0, 2) += lhs()(s1)(0) * rhs()(s2)(2);
+      WWVV_v[ss]()(s1,s2)(1, 0) += lhs()(s1)(1) * rhs()(s2)(0);
+      WWVV_v[ss]()(s1,s2)(1, 1) += lhs()(s1)(1) * rhs()(s2)(1);
+      WWVV_v[ss]()(s1,s2)(1, 2) += lhs()(s1)(1) * rhs()(s2)(2);
+      WWVV_v[ss]()(s1,s2)(2, 0) += lhs()(s1)(2) * rhs()(s2)(0);
+      WWVV_v[ss]()(s1,s2)(2, 1) += lhs()(s1)(2) * rhs()(s2)(1);
+      WWVV_v[ss]()(s1,s2)(2, 2) += lhs()(s1)(2) * rhs()(s2)(2);
+    }
+  }
+}

 template<class FImpl>
 void A2Autils<FImpl>::ContractFourQuarkColourDiagonal(const PropagatorField &WWVV0,
@@ -0,0 +1,252 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: ./lib/qcd/utils/BaryonUtils.h
+ 
+ Copyright (C) 2019
+ 
+ Author: Felix Erben <felix.erben@ed.ac.uk>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+#pragma once
+//#include <Grid/Hadrons/Global.hpp>
+#include <Grid/Eigen/unsupported/CXX11/Tensor>
+
+NAMESPACE_BEGIN(Grid);
+
+template <typename FImpl>
+class BaryonUtils 
+{
+public:
+  typedef typename FImpl::ComplexField ComplexField;
+  typedef typename FImpl::FermionField FermionField;
+  typedef typename FImpl::PropagatorField PropagatorField;
+
+  typedef typename FImpl::SitePropagator pobj;
+  typedef typename ComplexField::vector_object vobj;
+  
+  static constexpr int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
+  static constexpr Complex epsilon_sgn[6]= {1,1,1,-1,-1,-1};
+
+  private: 
+  template <class mobj, class robj>
+  static void baryon_site(const mobj &D1,
+				 const mobj &D2,
+				 const mobj &D3,
+				 const Gamma GammaA_left,
+				 const Gamma GammaB_left,
+				 const Gamma GammaA_right,
+				 const Gamma GammaB_right,
+				 const int parity,
+				 const int * wick_contractions,
+  				 robj &result);
+  public:
+  static void ContractBaryons(const PropagatorField &q1_left,
+				 const PropagatorField &q2_left,
+				 const PropagatorField &q3_left,
+				 const Gamma GammaA_left,
+				 const Gamma GammaB_left,
+				 const Gamma GammaA_right,
+				 const Gamma GammaB_right,
+				 const char * quarks_left,
+				 const char * quarks_right,
+				 const int parity,
+				 ComplexField &baryon_corr);
+  template <class mobj, class robj>
+  static void ContractBaryons_Sliced(const mobj &D1,
+				 const mobj &D2,
+				 const mobj &D3,
+				 const Gamma GammaA_left,
+				 const Gamma GammaB_left,
+				 const Gamma GammaA_right,
+				 const Gamma GammaB_right,
+				 const char * quarks_left,
+				 const char * quarks_right,
+				 const int parity,
+				 robj &result);
+};
+
+template <class FImpl>
+constexpr int BaryonUtils<FImpl>::epsilon[6][3];
+template <class FImpl>
+constexpr Complex BaryonUtils<FImpl>::epsilon_sgn[6];
+
+template <class FImpl>
+template <class mobj, class robj>
+void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 const int * wick_contraction,
+						 robj &result)
+{
+
+  Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
+
+    auto gD1a = GammaA_left * GammaA_right * D1;
+    auto gD1b = GammaA_left * g4 * GammaA_right * D1;
+    auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
+    auto gD3 = GammaB_right * D3;
+
+    for (int ie_left=0; ie_left < 6 ; ie_left++){
+      int a_left = epsilon[ie_left][0]; //a
+      int b_left = epsilon[ie_left][1]; //b
+      int c_left = epsilon[ie_left][2]; //c
+      for (int ie_right=0; ie_right < 6 ; ie_right++){
+        int a_right = epsilon[ie_right][0]; //a'
+        int b_right = epsilon[ie_right][1]; //b'
+        int c_right = epsilon[ie_right][2]; //c'
+        //This is the \delta_{456}^{123} part
+	if (wick_contraction[0]){
+          auto D2g = D2 * GammaB_left;
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
+          }}}
+  	}	  
+        //This is the \delta_{456}^{231} part
+	if (wick_contraction[1]){
+          auto pD1g = pD1 * GammaB_left;
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
+          }}}
+        }	  
+        //This is the \delta_{456}^{312} part
+	if (wick_contraction[2]){
+          auto gD3g = gD3 * GammaB_left;
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
+          }}}
+        }	  
+        //This is the \delta_{456}^{132} part
+	if (wick_contraction[3]){
+          auto gD3g = gD3 * GammaB_left;
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
+          }}}
+        }	  
+        //This is the \delta_{456}^{321} part
+	if (wick_contraction[4]){
+          auto D2g = D2 * GammaB_left;
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
+          }}}
+        }	  
+        //This is the \delta_{456}^{213} part
+	if (wick_contraction[5]){
+          auto pD1g = pD1 * GammaB_left;
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
+          }}}
+        }	  
+      }
+    }
+}
+
+template<class FImpl>
+void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
+						 const PropagatorField &q2_left,
+						 const PropagatorField &q3_left,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const char * quarks_left,
+						 const char * quarks_right,
+						 const int parity,
+						 ComplexField &baryon_corr)
+{
+  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
+    std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
+    std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
+    std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
+    std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
+ 
+  assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
+
+  GridBase *grid = q1_left.Grid();
+
+  int wick_contraction[6];
+  for (int ie=0; ie < 6 ; ie++)
+    wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
+
+  auto vbaryon_corr= baryon_corr.View();
+  auto v1 = q1_left.View();
+  auto v2 = q2_left.View();
+  auto v3 = q3_left.View();
+
+ // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+  thread_for(ss,grid->oSites(),{
+  //for(int ss=0; ss < grid->oSites(); ss++){
+
+    auto D1 = v1[ss];
+    auto D2 = v2[ss];
+    auto D3 = v3[ss];
+
+    vobj result=Zero();
+    baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
+    vbaryon_corr[ss] = result; 
+  }  );//end loop over lattice sites
+}
+template <class FImpl>
+template <class mobj, class robj>
+void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const char * quarks_left,
+						 const char * quarks_right,
+						 const int parity,
+						 robj &result)
+{
+  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
+    std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
+    std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
+    std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
+    std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
+ 
+  assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
+
+  int wick_contraction[6];
+  for (int ie=0; ie < 6 ; ie++)
+    wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
+
+     result=Zero();
+     baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
+}
+NAMESPACE_END(Grid);
@@ -108,7 +108,7 @@ public:
    void saveBlock(const A2AMatrixSet<T> &m, const unsigned int ext, const unsigned int str,
                   const unsigned int i, const unsigned int j);
    template <template <class> class Vec, typename VecT>
-    void load(Vec<VecT> &v, double *tRead = nullptr);
+    void load(Vec<VecT> &v, double *tRead = nullptr, GridBase *grid = nullptr);
 private:
    std::string  filename_{""}, dataname_{""};
    unsigned int nt_{0}, ni_{0}, nj_{0};
@@ -506,44 +506,53 @@ void A2AMatrixIo<T>::saveBlock(const A2AMatrixSet<T> &m,

 template <typename T>
 template <template <class> class Vec, typename VecT>
-void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead)
+void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead, GridBase *grid)
 {
 #ifdef HAVE_HDF5
-    Hdf5Reader           reader(filename_);
    std::vector<hsize_t> hdim;
    H5NS::DataSet        dataset;
    H5NS::DataSpace      dataspace;
    H5NS::CompType       datatype;
-    
-    push(reader, dataname_);
-    auto &group = reader.getGroup();
-    dataset     = group.openDataSet(HADRONS_A2AM_NAME);
-    datatype    = dataset.getCompType();
-    dataspace   = dataset.getSpace();
-    hdim.resize(dataspace.getSimpleExtentNdims());
-    dataspace.getSimpleExtentDims(hdim.data());
-    if ((nt_*ni_*nj_ != 0) and
-        ((hdim[0] != nt_) or (hdim[1] != ni_) or (hdim[2] != nj_)))
+
+    if (!(grid) || grid->IsBoss())
    {
-        HADRONS_ERROR(Size, "all-to-all matrix size mismatch (got "
-            + std::to_string(hdim[0]) + "x" + std::to_string(hdim[1]) + "x"
-            + std::to_string(hdim[2]) + ", expected "
-            + std::to_string(nt_) + "x" + std::to_string(ni_) + "x"
-            + std::to_string(nj_));
-    }
-    else if (ni_*nj_ == 0)
-    {
-        if (hdim[0] != nt_)
+        Hdf5Reader reader(filename_);
+        push(reader, dataname_);
+        auto &group = reader.getGroup();
+        dataset = group.openDataSet(HADRONS_A2AM_NAME);
+        datatype = dataset.getCompType();
+        dataspace = dataset.getSpace();
+        hdim.resize(dataspace.getSimpleExtentNdims());
+        dataspace.getSimpleExtentDims(hdim.data());
+        if ((nt_ * ni_ * nj_ != 0) and
+            ((hdim[0] != nt_) or (hdim[1] != ni_) or (hdim[2] != nj_)))
        {
-            HADRONS_ERROR(Size, "all-to-all time size mismatch (got "
-                + std::to_string(hdim[0]) + ", expected "
-                + std::to_string(nt_) + ")");
+            HADRONS_ERROR(Size, "all-to-all matrix size mismatch (got "
+                + std::to_string(hdim[0]) + "x" + std::to_string(hdim[1]) + "x"
+                + std::to_string(hdim[2]) + ", expected "
+                + std::to_string(nt_) + "x" + std::to_string(ni_) + "x"
+                + std::to_string(nj_));
        }
-        ni_ = hdim[1];
-        nj_ = hdim[2];
+        else if (ni_*nj_ == 0)
+        {
+            if (hdim[0] != nt_)
+            {
+                HADRONS_ERROR(Size, "all-to-all time size mismatch (got "
+                    + std::to_string(hdim[0]) + ", expected "
+                    + std::to_string(nt_) + ")");
+            }
+            ni_ = hdim[1];
+            nj_ = hdim[2];
+        }
+    }
+    if (grid)
+    {
+        grid->Broadcast(grid->BossRank(), &ni_, sizeof(unsigned int));
+        grid->Broadcast(grid->BossRank(), &nj_, sizeof(unsigned int));
    }

    A2AMatrix<T>         buf(ni_, nj_);
+    int broadcastSize =  sizeof(T) * buf.size();
    std::vector<hsize_t> count    = {1, static_cast<hsize_t>(ni_),
                                     static_cast<hsize_t>(nj_)},
                         stride   = {1, 1, 1},
@@ -565,10 +574,20 @@ void A2AMatrixIo<T>::load(Vec<VecT> &v, double *tRead)
            std::cout << " " << t;
            std::cout.flush();
        }
-        dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
-                                  stride.data(), block.data());
-        if (tRead) *tRead -= usecond();    
-        dataset.read(buf.data(), datatype, memspace, dataspace);
+        if (!(grid) || grid->IsBoss())
+        {
+            dataspace.selectHyperslab(H5S_SELECT_SET, count.data(), offset.data(),
+                                      stride.data(), block.data());
+        }
+        if (tRead) *tRead -= usecond();
+        if (!(grid) || grid->IsBoss())
+        {
+            dataset.read(buf.data(), datatype, memspace, dataspace);
+        }
+        if (grid)
+        {
+            grid->Broadcast(grid->BossRank(), buf.data(), broadcastSize);
+        }
        if (tRead) *tRead += usecond();
        v[t] = buf.template cast<VecT>();
    }
@@ -87,13 +87,20 @@ public:
    };
 public:
    DiskVectorBase(const std::string dirname, const unsigned int size = 0,
-                   const unsigned int cacheSize = 1, const bool clean = true);
+                   const unsigned int cacheSize = 1, const bool clean = true,
+                   GridBase *grid = nullptr);
    DiskVectorBase(DiskVectorBase<T> &&v) = default;
    virtual ~DiskVectorBase(void);
    const T & operator[](const unsigned int i) const;
    RwAccessHelper operator[](const unsigned int i);
    double hitRatio(void) const;
    void resetStat(void);
+    void setSize(unsigned int size_);
+    unsigned int getSize() const;
+    unsigned int dvSize;
+    void setGrid(GridBase *grid_);
+    GridBase *getGrid() const;
+    GridBase *dvGrid;
 private:
    virtual void load(T &obj, const std::string filename) const = 0;
    virtual void save(const std::string filename, const T &obj) const = 0;
@@ -107,6 +114,7 @@ private:
    unsigned int                                          size_, cacheSize_;
    double                                                access_{0.}, hit_{0.};
    bool                                                  clean_;
+    GridBase                                              *grid_;
    // using pointers to allow modifications when class is const
    // semantic: const means data unmodified, but cache modification allowed
    std::unique_ptr<std::vector<T>>                       cachePtr_;
@@ -158,66 +166,92 @@ public:
    {
        return (*this)[i](j, k);
    }
+    std::vector<int> dimensions() const
+    {
+        std::vector<int> dims(3);
+        dims[0] = (*this).getSize();
+        dims[1] = (*this)[0].rows();
+        dims[2] = (*this)[0].cols();
+        return dims;
+    }
 private:
    virtual void load(EigenDiskVectorMat<T> &obj, const std::string filename) const
    {
-        std::ifstream f(filename, std::ios::binary);
-        uint32_t      crc, check;
-        Eigen::Index  nRow, nCol;
-        size_t        matSize;
-        double        tRead, tHash;
-
-        f.read(reinterpret_cast<char *>(&crc), sizeof(crc));
-        f.read(reinterpret_cast<char *>(&nRow), sizeof(nRow));
-        f.read(reinterpret_cast<char *>(&nCol), sizeof(nCol));
-        obj.resize(nRow, nCol);
-        matSize = nRow*nCol*sizeof(T);
-        tRead  = -usecond();
-        f.read(reinterpret_cast<char *>(obj.data()), matSize);
-        tRead += usecond();
-        tHash  = -usecond();
-#ifdef USE_IPP
-        check  = GridChecksum::crc32c(obj.data(), matSize);
-#else
-        check  = GridChecksum::crc32(obj.data(), matSize);
-#endif
-        tHash += usecond();
-        DV_DEBUG_MSG(this, "Eigen read " << tRead/1.0e6 << " sec " << matSize/tRead*1.0e6/1024/1024 << " MB/s");
-        DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << check << std::dec 
-                     << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
-        if (crc != check)
+        GridBase *loadGrid;
+        loadGrid = (*this).getGrid();
+        if (!(loadGrid) || loadGrid->IsBoss())
        {
-            HADRONS_ERROR(Io, "checksum failed")
+            std::ifstream f(filename, std::ios::binary);
+            uint32_t      crc, check;
+            Eigen::Index  nRow, nCol;
+            size_t        matSize;
+            double        tRead, tHash;
+
+            f.read(reinterpret_cast<char *>(&crc), sizeof(crc));
+            f.read(reinterpret_cast<char *>(&nRow), sizeof(nRow));
+            f.read(reinterpret_cast<char *>(&nCol), sizeof(nCol));
+            obj.resize(nRow, nCol);
+            matSize = nRow*nCol*sizeof(T);
+            tRead  = -usecond();
+            f.read(reinterpret_cast<char *>(obj.data()), matSize);
+            tRead += usecond();
+            tHash  = -usecond();
+    #ifdef USE_IPP
+            check  = GridChecksum::crc32c(obj.data(), matSize);
+    #else
+            check  = GridChecksum::crc32(obj.data(), matSize);
+    #endif
+            tHash += usecond();
+            DV_DEBUG_MSG(this, "Eigen read " << tRead/1.0e6 << " sec " << matSize/tRead*1.0e6/1024/1024 << " MB/s");
+            DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << check << std::dec 
+                        << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
+            if (crc != check)
+            {
+                HADRONS_ERROR(Io, "checksum failed")
+            }
+        }
+        int broadcastSize;
+        broadcastSize = sizeof(T)*obj.size();
+        if (loadGrid)
+        {
+            loadGrid->Broadcast(loadGrid->BossRank(), obj.data(), broadcastSize);
+            loadGrid->Barrier();
        }
    }

    virtual void save(const std::string filename, const EigenDiskVectorMat<T> &obj) const
    {
-        std::ofstream f(filename, std::ios::binary);
-        uint32_t      crc;
-        Eigen::Index  nRow, nCol;
-        size_t        matSize;
-        double        tWrite, tHash;
-        
-        nRow    = obj.rows();
-        nCol    = obj.cols();
-        matSize = nRow*nCol*sizeof(T);
-        tHash   = -usecond();
-#ifdef USE_IPP
-        crc     = GridChecksum::crc32c(obj.data(), matSize);
-#else
-        crc     = GridChecksum::crc32(obj.data(), matSize);
-#endif
-        tHash  += usecond();
-        f.write(reinterpret_cast<char *>(&crc), sizeof(crc));
-        f.write(reinterpret_cast<char *>(&nRow), sizeof(nRow));
-        f.write(reinterpret_cast<char *>(&nCol), sizeof(nCol));
-        tWrite = -usecond();
-        f.write(reinterpret_cast<const char *>(obj.data()), matSize);
-        tWrite += usecond();
-        DV_DEBUG_MSG(this, "Eigen write " << tWrite/1.0e6 << " sec " << matSize/tWrite*1.0e6/1024/1024 << " MB/s");
-        DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << crc << std::dec
-                     << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
+        GridBase *saveGrid;
+        saveGrid = (*this).getGrid();
+        if (!(saveGrid) || saveGrid->IsBoss())
+        {
+            std::ofstream f(filename, std::ios::binary);
+            uint32_t      crc;
+            Eigen::Index  nRow, nCol;
+            size_t        matSize;
+            double        tWrite, tHash;
+            
+            nRow    = obj.rows();
+            nCol    = obj.cols();
+            matSize = nRow*nCol*sizeof(T);
+            tHash   = -usecond();
+    #ifdef USE_IPP
+            crc     = GridChecksum::crc32c(obj.data(), matSize);
+    #else
+            crc     = GridChecksum::crc32(obj.data(), matSize);
+    #endif
+            tHash  += usecond();
+            f.write(reinterpret_cast<char *>(&crc), sizeof(crc));
+            f.write(reinterpret_cast<char *>(&nRow), sizeof(nRow));
+            f.write(reinterpret_cast<char *>(&nCol), sizeof(nCol));
+            tWrite = -usecond();
+            f.write(reinterpret_cast<const char *>(obj.data()), matSize);
+            tWrite += usecond();
+            DV_DEBUG_MSG(this, "Eigen write " << tWrite/1.0e6 << " sec " << matSize/tWrite*1.0e6/1024/1024 << " MB/s");
+            DV_DEBUG_MSG(this, "Eigen crc32 " << std::hex << crc << std::dec
+                        << " " << tHash/1.0e6 << " sec " << matSize/tHash*1.0e6/1024/1024 << " MB/s");
+        }
+        if (saveGrid)   saveGrid->Barrier();
    }
 };

@@ -228,8 +262,9 @@ template <typename T>
 DiskVectorBase<T>::DiskVectorBase(const std::string dirname, 
                                  const unsigned int size,
                                  const unsigned int cacheSize,
-                                  const bool clean)
-: dirname_(dirname), size_(size), cacheSize_(cacheSize), clean_(clean)
+                                  const bool clean,
+                                  GridBase *grid)
+: dirname_(dirname), size_(size), cacheSize_(cacheSize), clean_(clean), grid_(grid)
 , cachePtr_(new std::vector<T>(size))
 , modifiedPtr_(new std::vector<bool>(size, false))
 , indexPtr_(new std::map<unsigned int, unsigned int>())
@@ -238,15 +273,21 @@ DiskVectorBase<T>::DiskVectorBase(const std::string dirname,
 {
    struct stat s;

-    if(stat(dirname.c_str(), &s) == 0)
+    if (!(grid_) || grid_->IsBoss())
    {
-        HADRONS_ERROR(Io, "directory '" + dirname + "' already exists")
+        if(stat(dirname.c_str(), &s) == 0)
+        {
+            HADRONS_ERROR(Io, "directory '" + dirname + "' already exists")
+        }
+        mkdir(dirname);
    }
-    mkdir(dirname);
+    if (grid_)  grid_->Barrier();
    for (unsigned int i = 0; i < cacheSize_; ++i)
    {
        freePtr_->push(i);
    }
+    setSize(size_);
+    setGrid(grid_);
 }

 template <typename T>
@@ -258,6 +299,30 @@ DiskVectorBase<T>::~DiskVectorBase(void)
    }
 }

+template <typename T>
+void DiskVectorBase<T>::setSize(unsigned int size_)
+{
+    dvSize = size_;
+}
+
+template <typename T>
+unsigned int DiskVectorBase<T>::getSize() const
+{
+    return dvSize;
+}
+
+template <typename T>
+void DiskVectorBase<T>::setGrid(GridBase *grid_)
+{
+    dvGrid = grid_;
+}
+
+template <typename T>
+GridBase *DiskVectorBase<T>::getGrid() const
+{
+    return dvGrid;
+}
+
 template <typename T>
 const T & DiskVectorBase<T>::operator[](const unsigned int i) const
 {
@@ -299,7 +364,7 @@ const T & DiskVectorBase<T>::operator[](const unsigned int i) const
    }
    DV_DEBUG_MSG(this, "in cache: " << msg);
 #endif
-
+    if (grid_)  grid_->Barrier();
    return cache[index.at(i)];
 }

@@ -358,6 +423,7 @@ void DiskVectorBase<T>::evict(void) const
        index.erase(i);
        loads.pop_front();
    }
+    if (grid_)  grid_->Barrier();
 }

 template <typename T>
@@ -395,27 +461,14 @@ void DiskVectorBase<T>::cacheInsert(const unsigned int i, const T &obj) const
    auto &freeInd  = *freePtr_;
    auto &loads    = *loadsPtr_;

-    // cache miss, evict and store
-    if (index.find(i) == index.end())
-    {
-        evict();
-        index[i] = freeInd.top();
-        freeInd.pop();
-        cache[index.at(i)] = obj;
-        loads.push_back(i);
-        modified[index.at(i)] = false;
-    }
-    // cache hit, modify current value
-    else
-    {
-        auto pos = std::find(loads.begin(), loads.end(), i);
-        
-        cache[index.at(i)]    = obj;
-        modified[index.at(i)] = true;
-        loads.erase(pos);
-        loads.push_back(i);
-    }
+    evict();
+    index[i] = freeInd.top();
+    freeInd.pop();
+    cache[index.at(i)] = obj;
+    loads.push_back(i);
+    modified[index.at(i)] = false;

+    if (grid_)  grid_->Barrier();
 #ifdef DV_DEBUG
    std::string msg;

@@ -434,21 +487,23 @@ void DiskVectorBase<T>::cacheInsert(const unsigned int i, const T &obj) const
 template <typename T>
 void DiskVectorBase<T>::clean(void)
 {
-    auto unlink = [](const char *fpath, const struct stat *sb, 
-                     int typeflag, struct FTW *ftwbuf)
+    if (!(grid_) || grid_->IsBoss())
    {
-        int rv = remove(fpath);
+        auto unlink = [](const char *fpath, const struct stat *sb,
+                         int typeflag, struct FTW *ftwbuf) {
+            int rv = remove(fpath);

-        if (rv)
-        {
-            HADRONS_ERROR(Io, "cannot remove '" + std::string(fpath) + "': "
-                          + std::string(std::strerror(errno)));
-        }
+            if (rv)
+            {
+                HADRONS_ERROR(Io, "cannot remove '" + std::string(fpath) + "': " + std::string(std::strerror(errno)));
+            }

-        return rv;
-    };
+            return rv;
+        };

-    nftw(dirname_.c_str(), unlink, 64, FTW_DEPTH | FTW_PHYS);
+        nftw(dirname_.c_str(), unlink, 64, FTW_DEPTH | FTW_PHYS);
+    }
+    if (grid_)  grid_->Barrier();
 }

 END_HADRONS_NAMESPACE
@@ -1,78 +1,79 @@
-#include <Hadrons/Modules/MSource/Gauss.hpp>
-#include <Hadrons/Modules/MSource/Momentum.hpp>
-#include <Hadrons/Modules/MSource/MomentumPhase.hpp>
-#include <Hadrons/Modules/MSource/SeqAslash.hpp>
-#include <Hadrons/Modules/MSource/Z2.hpp>
-#include <Hadrons/Modules/MSource/Point.hpp>
-#include <Hadrons/Modules/MSource/SeqGamma.hpp>
-#include <Hadrons/Modules/MSource/Convolution.hpp>
-#include <Hadrons/Modules/MSource/Wall.hpp>
-#include <Hadrons/Modules/MSource/SeqConserved.hpp>
-#include <Hadrons/Modules/MScalarSUN/Div.hpp>
-#include <Hadrons/Modules/MScalarSUN/TrKinetic.hpp>
-#include <Hadrons/Modules/MScalarSUN/TrPhi.hpp>
-#include <Hadrons/Modules/MScalarSUN/TwoPoint.hpp>
-#include <Hadrons/Modules/MScalarSUN/Grad.hpp>
-#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
-#include <Hadrons/Modules/MScalarSUN/StochFreeField.hpp>
-#include <Hadrons/Modules/MScalarSUN/EMT.hpp>
-#include <Hadrons/Modules/MScalarSUN/TrMag.hpp>
-#include <Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp>
-#include <Hadrons/Modules/MScalarSUN/TransProj.hpp>
-#include <Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp>
-#include <Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp>
-#include <Hadrons/Modules/MScalar/FreeProp.hpp>
-#include <Hadrons/Modules/MScalar/Scalar.hpp>
-#include <Hadrons/Modules/MScalar/ChargedProp.hpp>
-#include <Hadrons/Modules/MAction/Wilson.hpp>
-#include <Hadrons/Modules/MAction/ScaledDWF.hpp>
-#include <Hadrons/Modules/MAction/MobiusDWF.hpp>
-#include <Hadrons/Modules/MAction/WilsonClover.hpp>
-#include <Hadrons/Modules/MAction/ZMobiusDWF.hpp>
-#include <Hadrons/Modules/MAction/DWF.hpp>
-#include <Hadrons/Modules/MGauge/UnitEm.hpp>
-#include <Hadrons/Modules/MGauge/Electrify.hpp>
-#include <Hadrons/Modules/MGauge/StoutSmearing.hpp>
-#include <Hadrons/Modules/MGauge/Random.hpp>
-#include <Hadrons/Modules/MGauge/FundtoHirep.hpp>
-#include <Hadrons/Modules/MGauge/GaugeFix.hpp>
-#include <Hadrons/Modules/MGauge/Unit.hpp>
-#include <Hadrons/Modules/MGauge/StochEm.hpp>
-#include <Hadrons/Modules/MUtilities/RandomVectors.hpp>
-#include <Hadrons/Modules/MUtilities/PrecisionCast.hpp>
+#include <Hadrons/Modules/MFermion/GaugeProp.hpp>
+#include <Hadrons/Modules/MFermion/EMLepton.hpp>
+#include <Hadrons/Modules/MFermion/FreeProp.hpp>
 #include <Hadrons/Modules/MIO/LoadCosmHol.hpp>
-#include <Hadrons/Modules/MIO/LoadA2AVectors.hpp>
 #include <Hadrons/Modules/MIO/LoadEigenPack.hpp>
+#include <Hadrons/Modules/MIO/LoadA2AVectors.hpp>
+#include <Hadrons/Modules/MIO/LoadA2AMatrixDiskVector.hpp>
+#include <Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp>
 #include <Hadrons/Modules/MIO/LoadNersc.hpp>
 #include <Hadrons/Modules/MIO/LoadPerambulator.hpp>
 #include <Hadrons/Modules/MIO/LoadBinary.hpp>
-#include <Hadrons/Modules/MIO/LoadCoarseEigenPack.hpp>
-#include <Hadrons/Modules/MContraction/WeakEye3pt.hpp>
-#include <Hadrons/Modules/MContraction/WeakMesonDecayKl2.hpp>
-#include <Hadrons/Modules/MContraction/Gamma3pt.hpp>
-#include <Hadrons/Modules/MContraction/A2AMesonField.hpp>
-#include <Hadrons/Modules/MContraction/A2ALoop.hpp>
-#include <Hadrons/Modules/MContraction/WeakNonEye3pt.hpp>
+#include <Hadrons/Modules/MAction/ZMobiusDWF.hpp>
+#include <Hadrons/Modules/MAction/MobiusDWF.hpp>
+#include <Hadrons/Modules/MAction/Wilson.hpp>
+#include <Hadrons/Modules/MAction/DWF.hpp>
+#include <Hadrons/Modules/MAction/WilsonClover.hpp>
+#include <Hadrons/Modules/MAction/ScaledDWF.hpp>
+#include <Hadrons/Modules/MUtilities/RandomVectors.hpp>
+#include <Hadrons/Modules/MUtilities/PrecisionCast.hpp>
+#include <Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp>
+#include <Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp>
 #include <Hadrons/Modules/MContraction/DiscLoop.hpp>
-#include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
-#include <Hadrons/Modules/MContraction/Baryon.hpp>
 #include <Hadrons/Modules/MContraction/Meson.hpp>
+#include <Hadrons/Modules/MContraction/WeakMesonDecayKl2.hpp>
+#include <Hadrons/Modules/MContraction/A2AMesonField.hpp>
+#include <Hadrons/Modules/MContraction/WeakNonEye3pt.hpp>
+#include <Hadrons/Modules/MContraction/Gamma3pt.hpp>
+#include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
+#include <Hadrons/Modules/MContraction/A2AFourQuarkContraction.hpp>
+#include <Hadrons/Modules/MContraction/Baryon.hpp>
+#include <Hadrons/Modules/MContraction/WeakEye3pt.hpp>
+#include <Hadrons/Modules/MContraction/A2ALoop.hpp>
 #include <Hadrons/Modules/MDistil/DistilVectors.hpp>
 #include <Hadrons/Modules/MDistil/LapEvec.hpp>
 #include <Hadrons/Modules/MDistil/Noises.hpp>
 #include <Hadrons/Modules/MDistil/PerambFromSolve.hpp>
 #include <Hadrons/Modules/MDistil/Perambulator.hpp>
+#include <Hadrons/Modules/MSink/Point.hpp>
+#include <Hadrons/Modules/MSink/Smear.hpp>
+#include <Hadrons/Modules/MScalarSUN/StochFreeField.hpp>
+#include <Hadrons/Modules/MScalarSUN/EMT.hpp>
+#include <Hadrons/Modules/MScalarSUN/Utils.hpp>
+#include <Hadrons/Modules/MScalarSUN/TwoPoint.hpp>
+#include <Hadrons/Modules/MScalarSUN/TransProj.hpp>
+#include <Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp>
+#include <Hadrons/Modules/MScalarSUN/TrPhi.hpp>
+#include <Hadrons/Modules/MScalarSUN/Grad.hpp>
+#include <Hadrons/Modules/MScalarSUN/TrMag.hpp>
+#include <Hadrons/Modules/MScalarSUN/Div.hpp>
+#include <Hadrons/Modules/MScalarSUN/TrKinetic.hpp>
+#include <Hadrons/Modules/MNPR/Amputate.hpp>
 #include <Hadrons/Modules/MNPR/FourQuark.hpp>
 #include <Hadrons/Modules/MNPR/Bilinear.hpp>
-#include <Hadrons/Modules/MNPR/Amputate.hpp>
-#include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
-#include <Hadrons/Modules/MSolver/RBPrecCG.hpp>
-#include <Hadrons/Modules/MSolver/Guesser.hpp>
-#include <Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp>
+#include <Hadrons/Modules/MSource/SeqAslash.hpp>
+#include <Hadrons/Modules/MSource/Momentum.hpp>
+#include <Hadrons/Modules/MSource/Z2.hpp>
+#include <Hadrons/Modules/MSource/Point.hpp>
+#include <Hadrons/Modules/MSource/Gauss.hpp>
+#include <Hadrons/Modules/MSource/SeqConserved.hpp>
+#include <Hadrons/Modules/MSource/Wall.hpp>
+#include <Hadrons/Modules/MSource/SeqGamma.hpp>
+#include <Hadrons/Modules/MSource/Convolution.hpp>
+#include <Hadrons/Modules/MGauge/Random.hpp>
+#include <Hadrons/Modules/MGauge/FundtoHirep.hpp>
+#include <Hadrons/Modules/MGauge/StochEm.hpp>
+#include <Hadrons/Modules/MGauge/UnitEm.hpp>
+#include <Hadrons/Modules/MGauge/GaugeFix.hpp>
+#include <Hadrons/Modules/MGauge/StoutSmearing.hpp>
+#include <Hadrons/Modules/MGauge/Unit.hpp>
+#include <Hadrons/Modules/MGauge/Electrify.hpp>
 #include <Hadrons/Modules/MSolver/A2AVectors.hpp>
+#include <Hadrons/Modules/MSolver/RBPrecCG.hpp>
+#include <Hadrons/Modules/MSolver/LocalCoherenceLanczos.hpp>
+#include <Hadrons/Modules/MSolver/Guesser.hpp>
 #include <Hadrons/Modules/MSolver/MixedPrecisionRBPrecCG.hpp>
-#include <Hadrons/Modules/MFermion/FreeProp.hpp>
-#include <Hadrons/Modules/MFermion/GaugeProp.hpp>
-#include <Hadrons/Modules/MFermion/EMLepton.hpp>
-#include <Hadrons/Modules/MSink/Smear.hpp>
-#include <Hadrons/Modules/MSink/Point.hpp>
+#include <Hadrons/Modules/MSolver/A2AAslashVectors.hpp>
+#include <Hadrons/Modules/MScalar/ChargedProp.hpp>
+#include <Hadrons/Modules/MScalar/Scalar.hpp>
+#include <Hadrons/Modules/MScalar/FreeProp.hpp>
@@ -0,0 +1,7 @@
+#include <Hadrons/Modules/MContraction/A2AFourQuarkContraction.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MContraction;
+
+template class Grid::Hadrons::MContraction::TA2AFourQuarkContraction<FIMPL>;
@@ -0,0 +1,138 @@
+#ifndef Hadrons_MContraction_A2AFourQuarkContraction_hpp_
+#define Hadrons_MContraction_A2AFourQuarkContraction_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/DiskVector.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         A2AFourQuarkContraction                                 *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MContraction)
+
+class A2AFourQuarkContractionPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(A2AFourQuarkContractionPar,
+                                    std::string,  v1,
+                                    std::string,  v2,
+                                    std::string,  mf12,
+                                    bool,         allContr,
+                                    unsigned int, dt);
+};
+
+template <typename FImpl>
+class TA2AFourQuarkContraction: public Module<A2AFourQuarkContractionPar>
+{
+  public:
+    FERM_TYPE_ALIASES(FImpl, );
+    // constructor
+    TA2AFourQuarkContraction(const std::string name);
+    // destructor
+    virtual ~TA2AFourQuarkContraction(void) {};
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+  private:
+    unsigned int nt_;
+};
+
+MODULE_REGISTER_TMP(A2AFourQuarkContraction, TA2AFourQuarkContraction<FIMPL>, MContraction);
+
+/******************************************************************************
+ *                 TA2AFourQuarkContraction implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl>
+TA2AFourQuarkContraction<FImpl>::TA2AFourQuarkContraction(const std::string name)
+: Module<A2AFourQuarkContractionPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl>
+std::vector<std::string> TA2AFourQuarkContraction<FImpl>::getInput(void)
+{
+    std::vector<std::string> in = {par().v1, par().v2, par().mf12};
+    
+    return in;
+}
+
+template <typename FImpl>
+std::vector<std::string> TA2AFourQuarkContraction<FImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {getName()};
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TA2AFourQuarkContraction<FImpl>::setup(void)
+{
+    if (par().allContr)
+    {
+        nt_ = env().getDim(Tp);
+        envTmp(std::vector<PropagatorField>, "tmpWWVV", 1, nt_, envGetGrid(PropagatorField));
+        envCreate(std::vector<PropagatorField>, getName(), 1, nt_, envGetGrid(PropagatorField));
+    }
+    else
+    {
+        envTmp(std::vector<PropagatorField>, "tmpWWVV", 1, 1, envGetGrid(PropagatorField));
+        envCreate(PropagatorField, getName(), 1, envGetGrid(PropagatorField));
+    }
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TA2AFourQuarkContraction<FImpl>::execute(void)
+{
+    auto &v1   = envGet(std::vector<FermionField>, par().v1);
+    auto &v2   = envGet(std::vector<FermionField>, par().v2);
+    auto &mf12 = envGet(EigenDiskVector<Complex>, par().mf12);
+
+    envGetTmp(std::vector<PropagatorField>, tmpWWVV);
+
+    unsigned int dt = par().dt;
+    unsigned int nt = env().getDim(Tp);
+
+    if (par().allContr)
+    {
+        LOG(Message) << "Computing 4 quark contraction for " << getName()
+                     << " for all t0 time translations "
+                     << "with nt = " << nt_ << " and dt = " << dt << std::endl;
+
+        auto &WWVV = envGet(std::vector<PropagatorField>, getName());
+        A2Autils<FImpl>::ContractWWVV(tmpWWVV, mf12, &v1[0], &v2[0]);
+        for(unsigned int t = 0; t < nt_; t++){
+            unsigned int t0 = (t + dt) % nt_;
+            WWVV[t] = tmpWWVV[t0];
+        }
+    }
+    else
+    {
+        LOG(Message) << "Computing 4 quark contraction for: " << getName()
+                     << " for time dt = " << dt << std::endl;
+
+        auto &WWVV = envGet(PropagatorField, getName());
+        int ni = v1.size();
+        int nj = v2.size();
+        Eigen::Matrix<Complex, -1, -1, Eigen::RowMajor> mf;
+        mf = mf12[dt];
+        Eigen::TensorMap<Eigen::Tensor<Complex, 3, Eigen::RowMajor>> mfT(mf.data(), 1, ni, nj);
+        A2Autils<FImpl>::ContractWWVV(tmpWWVV, mfT, &v1[0], &v2[0]);
+        WWVV = tmpWWVV[0];
+    }
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MContraction_A2AFourQuarkContraction_hpp_
@@ -7,7 +7,7 @@ Source file: Hadrons/Modules/MContraction/Baryon.hpp
 Copyright (C) 2015-2019

 Author: Antonin Portelli <antonin.portelli@me.com>
-Author: Lanny91 <andrew.lawson@gmail.com>
+Author: Felix Erben <felix.erben@ed.ac.uk>

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -33,6 +33,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/Global.hpp>
 #include <Hadrons/Module.hpp>
 #include <Hadrons/ModuleFactory.hpp>
+#include <Grid/qcd/utils/BaryonUtils.h>

 BEGIN_HADRONS_NAMESPACE

@@ -41,6 +42,9 @@ BEGIN_HADRONS_NAMESPACE
 ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MContraction)

+typedef std::pair<Gamma::Algebra, Gamma::Algebra> GammaAB;
+typedef std::pair<GammaAB, GammaAB> GammaABPair;
+
 class BaryonPar: Serializable
 {
 public:
@@ -48,6 +52,11 @@ public:
                                    std::string, q1,
                                    std::string, q2,
                                    std::string, q3,
+                                    std::string, gammas,
+                                    std::string, quarks,
+                                    std::string, prefactors,
+                                    std::string, parity,
+                                    std::string, sink,
                                    std::string, output);
 };

@@ -58,12 +67,21 @@ public:
    FERM_TYPE_ALIASES(FImpl1, 1);
    FERM_TYPE_ALIASES(FImpl2, 2);
    FERM_TYPE_ALIASES(FImpl3, 3);
-    class Result: Serializable
+    BASIC_TYPE_ALIASES(ScalarImplCR, Scalar);
+    SINK_TYPE_ALIASES(Scalar);
+    class Metadata: Serializable
    {
    public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
-                                        std::vector<std::vector<std::vector<Complex>>>, corr);
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Metadata,
+                                        Gamma::Algebra, gammaA_left,
+                                        Gamma::Algebra, gammaB_left,
+                                        Gamma::Algebra, gammaA_right,
+                                        Gamma::Algebra, gammaB_right,
+                                        std::string, quarks,
+                                        std::string, prefactors,
+                                        int, parity);
    };
+    typedef Correlator<Metadata> Result;
 public:
    // constructor
    TBaryon(const std::string name);
@@ -72,11 +90,14 @@ public:
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
+    virtual void parseGammaString(std::vector<GammaABPair> &gammaList);
 protected:
    // setup
    virtual void setup(void);
    // execution
    virtual void execute(void);
+    // Which gamma algebra was specified
+    Gamma::Algebra  al;
 };

 MODULE_REGISTER_TMP(Baryon, ARG(TBaryon<FIMPL, FIMPL, FIMPL>), MContraction);
@@ -94,7 +115,7 @@ TBaryon<FImpl1, FImpl2, FImpl3>::TBaryon(const std::string name)
 template <typename FImpl1, typename FImpl2, typename FImpl3>
 std::vector<std::string> TBaryon<FImpl1, FImpl2, FImpl3>::getInput(void)
 {
-    std::vector<std::string> input = {par().q1, par().q2, par().q3};
+    std::vector<std::string> input = {par().q1, par().q2, par().q3, par().sink};
    
    return input;
 }
@@ -107,30 +128,199 @@ std::vector<std::string> TBaryon<FImpl1, FImpl2, FImpl3>::getOutput(void)
    return out;
 }

+template <typename FImpl1, typename FImpl2, typename FImpl3>
+void TBaryon<FImpl1, FImpl2,FImpl3>::parseGammaString(std::vector<GammaABPair> &gammaList)
+{
+    gammaList.clear();
+    
+    std::string gammaString = par().gammas;
+    //Shorthands for standard baryon operators
+    gammaString = regex_replace(gammaString, std::regex("j12"),"(Identity SigmaXZ)");
+    gammaString = regex_replace(gammaString, std::regex("j32X"),"(Identity MinusGammaZGamma5)");
+    gammaString = regex_replace(gammaString, std::regex("j32Y"),"(Identity GammaT)");
+    gammaString = regex_replace(gammaString, std::regex("j32Z"),"(Identity GammaXGamma5)");
+    //Shorthands for less common baryon operators
+    gammaString = regex_replace(gammaString, std::regex("j12_alt1"),"(Gamma5 MinusSigmaYT)");
+    gammaString = regex_replace(gammaString, std::regex("j12_alt2"),"(Identity GammaYGamma5)");
+    
+    //A single gamma matrix 
+    std::regex rex_g("([0-9a-zA-Z]+)");
+    //The full string we expect
+    std::regex rex("( *\\(( *\\(([0-9a-zA-Z]+) +([0-9a-zA-Z]+) *\\)){2} *\\) *)+");
+    std::smatch sm;
+    std::regex_match(gammaString, sm, rex);
+    assert(sm[0].matched && "invalid gamma structure.");
+
+    auto gamma_begin = std::sregex_iterator(gammaString.begin(), gammaString.end(), rex_g);
+    auto gamma_end = std::sregex_iterator();
+
+    int nGamma = std::distance(gamma_begin, gamma_end); 
+    //couldn't find out how to count the size in the iterator, other than looping through it...
+  /*  int nGamma=0;
+    for (std::sregex_iterator i = gamma_begin; i != gamma_end; ++i) {
+	nGamma++;
+    }
+*/   
+    gammaList.resize(nGamma/4);
+    std::vector<std::string> gS;
+    gS.resize(nGamma);
+    //even more ugly workarounds here...
+    int iG=0;
+    for (std::sregex_iterator i = gamma_begin; i != gamma_end; ++i) {
+        std::smatch match = *i;                                                 
+        gS[iG] = match.str(); 
+	iG++;
+    }
+    for (int i = 0; i < gammaList.size(); i++){
+	std::vector<Gamma::Algebra> gS1 = strToVec<Gamma::Algebra>(gS[4*i]);
+	std::vector<Gamma::Algebra> gS2 = strToVec<Gamma::Algebra>(gS[4*i+1]);
+	std::vector<Gamma::Algebra> gS3 = strToVec<Gamma::Algebra>(gS[4*i+2]);
+	std::vector<Gamma::Algebra> gS4 = strToVec<Gamma::Algebra>(gS[4*i+3]);
+        gammaList[i].first.first=gS1[0];
+        gammaList[i].first.second=gS2[0];
+        gammaList[i].second.first=gS3[0];
+        gammaList[i].second.second=gS4[0];
+    }
+}
+
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename FImpl1, typename FImpl2, typename FImpl3>
 void TBaryon<FImpl1, FImpl2, FImpl3>::setup(void)
 {
    envTmpLat(LatticeComplex, "c");
+    envTmpLat(LatticeComplex, "c2");
 }

 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl1, typename FImpl2, typename FImpl3>
 void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)
 {
-    LOG(Message) << "Computing baryon contractions '" << getName() << "' using"
-                 << " quarks '" << par().q1 << "', '" << par().q2 << "', and '"
-                 << par().q3 << "'" << std::endl;
-    
-    auto       &q1 = envGet(PropagatorField1, par().q1);
-    auto       &q2 = envGet(PropagatorField2, par().q2);
-    auto       &q3 = envGet(PropagatorField3, par().q2);
+
+    std::vector<std::string> quarks = strToVec<std::string>(par().quarks);    
+    std::vector<double> prefactors = strToVec<double>(par().prefactors);    
+    int nQ=quarks.size();
+    const int  parity {par().parity.size()>0 ? std::stoi(par().parity) : 1};
+
+    std::vector<GammaABPair> gammaList;
+    parseGammaString(gammaList);
+
+    assert(prefactors.size()==nQ && "number of prefactors needs to match number of quark-structures.");
+    for (int iQ = 0; iQ < nQ; iQ++)
+        assert(quarks[iQ].size()==3 && "quark-structures must consist of 3 quarks each.");
+
+    LOG(Message) << "Computing baryon contractions '" << getName() << "'" << std::endl;
+    for (int iQ1 = 0; iQ1 < nQ; iQ1++)
+        for (int iQ2 = 0; iQ2 < nQ; iQ2++)
+            LOG(Message) << prefactors[iQ1]*prefactors[iQ2] << "*<" << quarks[iQ1] << "|" << quarks[iQ2] << ">" << std::endl;
+    LOG(Message) << " using quarks " << par().q1 << "', " << par().q2 << "', and '" << par().q3 << std::endl;
+     for (int iG = 0; iG < gammaList.size(); iG++)
+         LOG(Message) << "' with (Gamma^A,Gamma^B)_left = ( " << gammaList[iG].first.first << " , " << gammaList[iG].first.second << "') and (Gamma^A,Gamma^B)_right = ( " << gammaList[iG].second.first << " , " << gammaList[iG].second.second << ")" << std::endl; 
+      LOG(Message) << "and parity " << parity << " using sink " << par().sink << "." << std::endl;
+        
    envGetTmp(LatticeComplex, c);
-    Result     result;
-    
-    // FIXME: do contractions
-    
-    // saveResult(par().output, "meson", result);
+    envGetTmp(LatticeComplex, c2);
+    int nt = env().getDim(Tp);
+    std::vector<TComplex> buf;
+    TComplex cs;
+    TComplex ch;
+
+    std::vector<Result> result;
+    Result              r;
+    r.info.parity = parity;
+    r.info.quarks = par().quarks;
+    r.info.prefactors = par().prefactors;
+
+    if (envHasType(SlicedPropagator1, par().q1) and
+        envHasType(SlicedPropagator2, par().q2) and
+        envHasType(SlicedPropagator3, par().q3))
+    {
+        auto &q1 = envGet(SlicedPropagator1, par().q1);
+        auto &q2 = envGet(SlicedPropagator2, par().q2);
+        auto &q3 = envGet(SlicedPropagator3, par().q3);
+        for (unsigned int i = 0; i < gammaList.size(); ++i)
+        {
+            r.info.gammaA_left = gammaList[i].first.first;
+            r.info.gammaB_left = gammaList[i].first.second;
+            r.info.gammaA_right = gammaList[i].second.first;
+            r.info.gammaB_right = gammaList[i].second.second;
+
+            Gamma gAl(gammaList[i].first.first);
+            Gamma gBl(gammaList[i].first.second);
+            Gamma gAr(gammaList[i].second.first);
+            Gamma gBr(gammaList[i].second.second);
+        
+            LOG(Message) << "(propagator already sinked)" << std::endl;
+            r.corr.clear();
+            for (unsigned int t = 0; t < buf.size(); ++t)
+            {
+                cs = Zero();
+                for (int iQ1 = 0; iQ1 < nQ; iQ1++){
+                    for (int iQ2 = 0; iQ2 < nQ; iQ2++){
+                        BaryonUtils<FIMPL>::ContractBaryons_Sliced(q1[t],q2[t],q3[t],gAl,gBl,gAr,gBr,quarks[iQ1].c_str(),quarks[iQ2].c_str(),parity,ch);
+                        cs += prefactors[iQ1]*prefactors[iQ2]*ch;
+                    }
+                }
+                r.corr.push_back(TensorRemove(cs));
+            }
+            result.push_back(r);
+        }
+    }
+    else
+    {
+        auto       &q1 = envGet(PropagatorField1, par().q1);
+        auto       &q2 = envGet(PropagatorField2, par().q2);
+        auto       &q3 = envGet(PropagatorField3, par().q3);
+        for (unsigned int i = 0; i < gammaList.size(); ++i)
+        {
+            r.info.gammaA_left = gammaList[i].first.first;
+            r.info.gammaB_left = gammaList[i].first.second;
+            r.info.gammaA_right = gammaList[i].second.first;
+            r.info.gammaB_right = gammaList[i].second.second;
+
+            Gamma gAl(gammaList[i].first.first);
+            Gamma gBl(gammaList[i].first.second);
+            Gamma gAr(gammaList[i].second.first);
+            Gamma gBr(gammaList[i].second.second);
+        
+            std::string ns;
+                
+            ns = vm().getModuleNamespace(env().getObjectModule(par().sink));
+            if (ns == "MSource")
+            {
+                c=Zero();
+                for (int iQ1 = 0; iQ1 < nQ; iQ1++){
+                    for (int iQ2 = 0; iQ2 < nQ; iQ2++){
+                        BaryonUtils<FIMPL>::ContractBaryons(q1,q2,q3,gAl,gBl,gAr,gBr,quarks[iQ1].c_str(),quarks[iQ2].c_str(),parity,c2);
+                        c+=prefactors[iQ1]*prefactors[iQ2]*c2;
+                    }
+                }
+                PropagatorField1 &sink = envGet(PropagatorField1, par().sink);
+                auto test = closure(trace(sink*c));     
+                sliceSum(test, buf, Tp); 
+            }
+            else if (ns == "MSink")
+            {
+                c=Zero();
+                for (int iQ1 = 0; iQ1 < nQ; iQ1++){
+                    for (int iQ2 = 0; iQ2 < nQ; iQ2++){
+                        BaryonUtils<FIMPL>::ContractBaryons(q1,q2,q3,gAl,gBl,gAr,gBr,quarks[iQ1].c_str(),quarks[iQ2].c_str(),parity,c2);
+                        c+=prefactors[iQ1]*prefactors[iQ2]*c2;
+                    }
+                }
+                SinkFnScalar &sink = envGet(SinkFnScalar, par().sink);
+                buf = sink(c);
+            } 
+            r.corr.clear();
+            for (unsigned int t = 0; t < buf.size(); ++t)
+            {
+                r.corr.push_back(TensorRemove(buf[t]));
+            }
+            result.push_back(r);
+        }
+    }
+
+    saveResult(par().output, "baryon", result);
+
 }

 END_MODULE_NAMESPACE
@@ -57,7 +57,8 @@ BEGIN_HADRONS_NAMESPACE
 *   - q1: sink smeared propagator, source at i
 *   - q2: propagator, source at i
 *   - q3: propagator, source at f
- *   - gamma: gamma matrix to insert
+ *   - gammas: gamma matrices to insert
+ *             (space-separated strings e.g. "GammaT GammaX GammaY") 
 *   - tSnk: sink position for propagator q1.
 *
 */
@@ -71,12 +72,12 @@ class Gamma3ptPar: Serializable
 {
 public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(Gamma3ptPar,
-                                    std::string,    q1,
-                                    std::string,    q2,
-                                    std::string,    q3,
-                                    Gamma::Algebra, gamma,
-                                    unsigned int,   tSnk,
-                                    std::string,    output);
+                                    std::string,  q1,
+                                    std::string,  q2,
+                                    std::string,  q3,
+                                    std::string,  gamma,
+                                    unsigned int, tSnk,
+                                    std::string,  output);
 };

 template <typename FImpl1, typename FImpl2, typename FImpl3>
@@ -100,6 +101,7 @@ public:
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
+    virtual void parseGammaString(std::vector<Gamma::Algebra> &gammaList);
 protected:
    // setup
    virtual void setup(void);
@@ -142,37 +144,67 @@ void TGamma3pt<FImpl1, FImpl2, FImpl3>::setup(void)
    envTmpLat(LatticeComplex, "c");
 }

+template <typename FImpl1, typename FImpl2, typename FImpl3>
+void TGamma3pt<FImpl1, FImpl2, FImpl3>::parseGammaString(std::vector<Gamma::Algebra> &gammaList)
+{
+    gammaList.clear();
+    // Determine gamma matrices to insert at source/sink.
+    if (par().gamma.compare("all") == 0)
+    {
+        // Do all contractions.
+        for (unsigned int i = 1; i < Gamma::nGamma; i += 2)
+        {
+            gammaList.push_back((Gamma::Algebra)i);
+        }
+    }
+    else
+    {
+        // Parse individual contractions from input string.
+        gammaList = strToVec<Gamma::Algebra>(par().gamma);
+    } 
+}
 // execution ///////////////////////////////////////////////////////////////////
 template <typename FImpl1, typename FImpl2, typename FImpl3>
 void TGamma3pt<FImpl1, FImpl2, FImpl3>::execute(void)
 {
    LOG(Message) << "Computing 3pt contractions '" << getName() << "' using"
                 << " quarks '" << par().q1 << "', '" << par().q2 << "' and '"
-                 << par().q3 << "', with " << par().gamma << " insertion." 
+                 << par().q3 << "', with " << par().gamma << " insertions." 
                 << std::endl;

    // Initialise variables. q2 and q3 are normal propagators, q1 may be 
    // sink smeared.
-    auto                  &q1 = envGet(SlicedPropagator1, par().q1);
-    auto                  &q2 = envGet(PropagatorField2, par().q2);
-    auto                  &q3 = envGet(PropagatorField2, par().q3);
-    Gamma                 g5(Gamma::Algebra::Gamma5);
-    Gamma                 gamma(par().gamma);
-    std::vector<TComplex> buf;
-    Result                result;
+    auto                        &q1 = envGet(SlicedPropagator1, par().q1);
+    auto                        &q2 = envGet(PropagatorField2, par().q2);
+    auto                        &q3 = envGet(PropagatorField2, par().q3);
+    Gamma                       g5(Gamma::Algebra::Gamma5);
+    std::vector<Gamma::Algebra> gammaList;
+    std::vector<TComplex>       buf;
+    std::vector<Result>         result;
+    int                         nt = env().getDim(Tp);
+
+
+    parseGammaString(gammaList);
+    result.resize(gammaList.size());
+    for (unsigned int i = 0; i < result.size(); ++i)
+    {
+        result[i].gamma = gammaList[i];
+        result[i].corr.resize(nt);
+    }
    
    // Extract relevant timeslice of sinked propagator q1, then contract &
    // sum over all spacial positions of gamma insertion.
    SitePropagator1 q1Snk = q1[par().tSnk];
    envGetTmp(LatticeComplex, c);
-    c = trace(g5*q1Snk*adj(q2)*(g5*gamma)*q3);
-    sliceSum(c, buf, Tp);
-
-    result.gamma = par().gamma;
-    result.corr.resize(buf.size());
-    for (unsigned int t = 0; t < buf.size(); ++t)
+    for (unsigned int i = 0; i < result.size(); ++i)
    {
-        result.corr[t] = TensorRemove(buf[t]);
+        Gamma gamma(gammaList[i]);
+        c = trace(g5*q1Snk*adj(q2)*(g5*gamma)*q3);
+        sliceSum(c, buf, Tp);
+        for (unsigned int t = 0; t < buf.size(); ++t)
+        {
+            result[i].corr[t] = TensorRemove(buf[t]);
+        }
    }
    saveResult(par().output, "gamma3pt", result);
 }
@@ -0,0 +1,7 @@
+#include <Hadrons/Modules/MIO/LoadA2AMatrixDiskVector.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MIO;
+
+template class Grid::Hadrons::MIO::TLoadA2AMatrixDiskVector<FIMPL>;
@@ -0,0 +1,115 @@
+#ifndef Hadrons_MIO_LoadA2AMatrixDiskVector_hpp_
+#define Hadrons_MIO_LoadA2AMatrixDiskVector_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+#include <Hadrons/A2AMatrix.hpp>
+#include <Hadrons/DiskVector.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         LoadA2AMatrixDiskVector                                 *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MIO)
+
+class LoadA2AMatrixDiskVectorPar: Serializable
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(LoadA2AMatrixDiskVectorPar,
+                                    std::string,  file,
+                                    std::string,  dataset,
+                                    std::string,  diskVectorDir,
+                                    int,  cacheSize);
+};
+
+template <typename FImpl>
+class TLoadA2AMatrixDiskVector: public Module<LoadA2AMatrixDiskVectorPar>
+{
+  public:
+    FERM_TYPE_ALIASES(FImpl, );
+    // constructor
+    TLoadA2AMatrixDiskVector(const std::string name);
+    // destructor
+    virtual ~TLoadA2AMatrixDiskVector(void) {};
+    // dependency relation
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup
+    virtual void setup(void);
+    // execution
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(LoadA2AMatrixDiskVector, TLoadA2AMatrixDiskVector<FIMPL>, MIO);
+
+/******************************************************************************
+ *                 TLoadA2AMatrixDiskVector implementation                             *
+ ******************************************************************************/
+// constructor /////////////////////////////////////////////////////////////////
+template <typename FImpl>
+TLoadA2AMatrixDiskVector<FImpl>::TLoadA2AMatrixDiskVector(const std::string name)
+: Module<LoadA2AMatrixDiskVectorPar>(name)
+{}
+
+// dependencies/products ///////////////////////////////////////////////////////
+template <typename FImpl>
+std::vector<std::string> TLoadA2AMatrixDiskVector<FImpl>::getInput(void)
+{
+    std::vector<std::string> in;
+    
+    return in;
+}
+
+template <typename FImpl>
+std::vector<std::string> TLoadA2AMatrixDiskVector<FImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {getName()};
+    
+    return out;
+}
+
+// setup ///////////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TLoadA2AMatrixDiskVector<FImpl>::setup(void)
+{
+    int Ls = 1;
+
+    std::string dvDir = par().diskVectorDir;
+    std::string dataset = par().dataset;
+    std::string dvFile = dvDir + "/" + getName() + "." + std::to_string(vm().getTrajectory());
+
+    int nt = env().getDim(Tp);
+    int cacheSize = par().cacheSize;
+    bool clean = true;
+    GridBase *grid = envGetGrid(FermionField);
+
+    envCreate(EigenDiskVector<ComplexD>, getName(), Ls, dvFile, nt, cacheSize, clean, grid);
+}
+
+// execution ///////////////////////////////////////////////////////////////////
+template <typename FImpl>
+void TLoadA2AMatrixDiskVector<FImpl>::execute(void)
+{
+    int nt = env().getDim(Tp);
+    std::string file  = par().file;
+    std::string dataset  = par().dataset;
+    GridBase *grid = envGetGrid(FermionField);
+
+    auto &mesonFieldDV = envGet(EigenDiskVector<ComplexD>, getName());
+
+    int traj = vm().getTrajectory();
+    tokenReplace(file, "traj", traj);
+    LOG(Message) << "-- Loading '" << file << "'-- " << std::endl;
+    double t;
+    A2AMatrixIo<HADRONS_A2AM_IO_TYPE> mfIO(file, dataset, nt);
+    mfIO.load(mesonFieldDV, &t, grid);
+    LOG(Message) << "Read " << mfIO.getSize() << " bytes in " << t << " usec, " << mfIO.getSize() / t * 1.0e6 / 1024 / 1024 << " MB/s" << std::endl;
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MIO_LoadA2AMatrixDiskVector_hpp_
@@ -1,157 +1,158 @@
 modules_cc =\
-  Modules/MSource/SeqConserved.cc \
-  Modules/MSource/Convolution.cc \
-  Modules/MSource/SeqAslash.cc \
-  Modules/MSource/Wall.cc \
-  Modules/MSource/Point.cc \
-  Modules/MSource/Z2.cc \
-  Modules/MSource/Gauss.cc \
-  Modules/MSource/SeqGamma.cc \
-  Modules/MSource/Momentum.cc \
-  Modules/MSource/MomentumPhase.cc \
-  Modules/MScalarSUN/TwoPoint.cc \
-  Modules/MScalarSUN/TransProj.cc \
-  Modules/MScalarSUN/TwoPointNPR.cc \
-  Modules/MScalarSUN/EMT.cc \
-  Modules/MScalarSUN/TrKinetic.cc \
-  Modules/MScalarSUN/TrPhi.cc \
-  Modules/MScalarSUN/Div.cc \
-  Modules/MScalarSUN/Grad.cc \
-  Modules/MScalarSUN/StochFreeField.cc \
-  Modules/MScalarSUN/TrMag.cc \
-  Modules/MNoise/FullVolumeSpinColorDiagonal.cc \
-  Modules/MNoise/TimeDilutedSpinColorDiagonal.cc \
-  Modules/MScalar/FreeProp.cc \
-  Modules/MScalar/ChargedProp.cc \
-  Modules/MAction/Wilson.cc \
-  Modules/MAction/DWF.cc \
-  Modules/MAction/MobiusDWF.cc \
-  Modules/MAction/ScaledDWF.cc \
-  Modules/MAction/WilsonClover.cc \
-  Modules/MAction/ZMobiusDWF.cc \
-  Modules/MGauge/FundtoHirep.cc \
-  Modules/MGauge/StochEm.cc \
-  Modules/MGauge/Unit.cc \
-  Modules/MGauge/StoutSmearing.cc \
-  Modules/MGauge/Electrify.cc \
-  Modules/MGauge/Random.cc \
-  Modules/MGauge/UnitEm.cc \
-  Modules/MGauge/GaugeFix.cc \
-  Modules/MUtilities/PrecisionCast.cc \
-  Modules/MUtilities/RandomVectors.cc \
+  Modules/MFermion/FreeProp.cc \
+  Modules/MFermion/GaugeProp.cc \
+  Modules/MFermion/EMLepton.cc \
  Modules/MIO/LoadA2AVectors.cc \
-  Modules/MIO/LoadNersc.cc \
-  Modules/MIO/LoadBinary.cc \
-  Modules/MIO/LoadCoarseEigenPack.cc \
  Modules/MIO/LoadEigenPack.cc \
  Modules/MIO/LoadCosmHol.cc \
+  Modules/MIO/LoadBinary.cc \
+  Modules/MIO/LoadA2AMatrixDiskVector.cc \
+  Modules/MIO/LoadCoarseEigenPack.cc \
+  Modules/MIO/LoadNersc.cc \
+  Modules/MAction/ZMobiusDWF.cc \
+  Modules/MAction/ScaledDWF.cc \
+  Modules/MAction/Wilson.cc \
+  Modules/MAction/DWF.cc \
+  Modules/MAction/WilsonClover.cc \
+  Modules/MAction/MobiusDWF.cc \
+  Modules/MUtilities/RandomVectors.cc \
  Modules/MIO/LoadPerambulator.cc \
-  Modules/MContraction/A2ALoop.cc \
-  Modules/MContraction/WeakNonEye3pt.cc \
-  Modules/MContraction/WeakMesonDecayKl2.cc \
-  Modules/MContraction/A2AMesonField.cc \
+  Modules/MUtilities/PrecisionCast.cc \
+  Modules/MNoise/FullVolumeSpinColorDiagonal.cc \
+  Modules/MNoise/TimeDilutedSpinColorDiagonal.cc \
  Modules/MContraction/Gamma3pt.cc \
+  Modules/MContraction/A2AFourQuarkContraction.cc \
  Modules/MContraction/A2AAslashField.cc \
-  Modules/MContraction/Baryon.cc \
-  Modules/MContraction/DiscLoop.cc \
-  Modules/MContraction/Meson.cc \
+  Modules/MContraction/A2ALoop.cc \
  Modules/MContraction/WeakEye3pt.cc \
+  Modules/MContraction/WeakNonEye3pt.cc \
+  Modules/MContraction/A2AMesonField.cc \
+  Modules/MContraction/DiscLoop.cc \
+  Modules/MContraction/Baryon.cc \
+  Modules/MContraction/WeakMesonDecayKl2.cc \
+  Modules/MContraction/Meson.cc \
  Modules/MDistil/DistilVectors.cc \
  Modules/MDistil/LapEvec.cc \
  Modules/MDistil/Noises.cc \
  Modules/MDistil/PerambFromSolve.cc \
  Modules/MDistil/Perambulator.cc \
+  Modules/MSink/Point.cc \
+  Modules/MSink/Smear.cc \
+  Modules/MScalarSUN/TrPhi.cc \
+  Modules/MScalarSUN/TrMag.cc \
+  Modules/MScalarSUN/TrKinetic.cc \
+  Modules/MScalarSUN/TwoPoint.cc \
+  Modules/MScalarSUN/Grad.cc \
+  Modules/MScalarSUN/TwoPointNPR.cc \
+  Modules/MScalarSUN/StochFreeField.cc \
+  Modules/MScalarSUN/TransProj.cc \
+  Modules/MScalarSUN/EMT.cc \
+  Modules/MScalarSUN/Div.cc \
+  Modules/MNPR/Amputate.cc \
  Modules/MNPR/Bilinear.cc \
  Modules/MNPR/FourQuark.cc \
-  Modules/MNPR/Amputate.cc \
+  Modules/MSource/SeqGamma.cc \
+  Modules/MSource/Z2.cc \
+  Modules/MSource/Convolution.cc \
+  Modules/MSource/Momentum.cc \
+  Modules/MSource/Wall.cc \
+  Modules/MSource/Point.cc \
+  Modules/MSource/SeqAslash.cc \
+  Modules/MSource/Gauss.cc \
+  Modules/MSource/SeqConserved.cc \
+  Modules/MGauge/StoutSmearing.cc \
+  Modules/MGauge/GaugeFix.cc \
+  Modules/MGauge/Electrify.cc \
+  Modules/MGauge/Random.cc \
+  Modules/MGauge/Unit.cc \
+  Modules/MGauge/UnitEm.cc \
+  Modules/MGauge/FundtoHirep.cc \
+  Modules/MGauge/StochEm.cc \
+  Modules/MSolver/RBPrecCG.cc \
  Modules/MSolver/A2AAslashVectors.cc \
  Modules/MSolver/MixedPrecisionRBPrecCG.cc \
-  Modules/MSolver/RBPrecCG.cc \
-  Modules/MSolver/LocalCoherenceLanczos.cc \
  Modules/MSolver/A2AVectors.cc \
-  Modules/MFermion/FreeProp.cc \
-  Modules/MFermion/GaugeProp.cc \
-  Modules/MFermion/EMLepton.cc \
-  Modules/MSink/Smear.cc \
-  Modules/MSink/Point.cc
+  Modules/MSolver/LocalCoherenceLanczos.cc \
+  Modules/MScalar/ChargedProp.cc \
+  Modules/MScalar/FreeProp.cc

 modules_hpp =\
-  Modules/MSource/Gauss.hpp \
-  Modules/MSource/Momentum.hpp \
-  Modules/MSource/MomentumPhase.hpp \
-  Modules/MSource/SeqAslash.hpp \
-  Modules/MSource/Z2.hpp \
-  Modules/MSource/Point.hpp \
-  Modules/MSource/SeqGamma.hpp \
-  Modules/MSource/Convolution.hpp \
-  Modules/MSource/Wall.hpp \
-  Modules/MSource/SeqConserved.hpp \
-  Modules/MScalarSUN/Div.hpp \
-  Modules/MScalarSUN/TrKinetic.hpp \
-  Modules/MScalarSUN/TrPhi.hpp \
-  Modules/MScalarSUN/TwoPoint.hpp \
-  Modules/MScalarSUN/Grad.hpp \
-  Modules/MScalarSUN/Utils.hpp \
-  Modules/MScalarSUN/StochFreeField.hpp \
-  Modules/MScalarSUN/EMT.hpp \
-  Modules/MScalarSUN/TrMag.hpp \
-  Modules/MScalarSUN/TwoPointNPR.hpp \
-  Modules/MScalarSUN/TransProj.hpp \
-  Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp \
-  Modules/MNoise/FullVolumeSpinColorDiagonal.hpp \
-  Modules/MScalar/FreeProp.hpp \
-  Modules/MScalar/Scalar.hpp \
-  Modules/MScalar/ChargedProp.hpp \
-  Modules/MAction/Wilson.hpp \
-  Modules/MAction/ScaledDWF.hpp \
-  Modules/MAction/MobiusDWF.hpp \
-  Modules/MAction/WilsonClover.hpp \
-  Modules/MAction/ZMobiusDWF.hpp \
-  Modules/MAction/DWF.hpp \
-  Modules/MGauge/UnitEm.hpp \
-  Modules/MGauge/Electrify.hpp \
-  Modules/MGauge/StoutSmearing.hpp \
-  Modules/MGauge/Random.hpp \
-  Modules/MGauge/FundtoHirep.hpp \
-  Modules/MGauge/GaugeFix.hpp \
-  Modules/MGauge/Unit.hpp \
-  Modules/MGauge/StochEm.hpp \
-  Modules/MUtilities/RandomVectors.hpp \
-  Modules/MUtilities/PrecisionCast.hpp \
+  Modules/MFermion/GaugeProp.hpp \
+  Modules/MFermion/EMLepton.hpp \
+  Modules/MFermion/FreeProp.hpp \
  Modules/MIO/LoadCosmHol.hpp \
-  Modules/MIO/LoadA2AVectors.hpp \
  Modules/MIO/LoadEigenPack.hpp \
+  Modules/MIO/LoadA2AVectors.hpp \
+  Modules/MIO/LoadA2AMatrixDiskVector.hpp \
+  Modules/MIO/LoadCoarseEigenPack.hpp \
  Modules/MIO/LoadNersc.hpp \
  Modules/MIO/LoadBinary.hpp \
-  Modules/MIO/LoadCoarseEigenPack.hpp \
  Modules/MIO/LoadPerambulator.hpp \
-  Modules/MContraction/WeakEye3pt.hpp \
-  Modules/MContraction/WeakMesonDecayKl2.hpp \
-  Modules/MContraction/Gamma3pt.hpp \
-  Modules/MContraction/A2AMesonField.hpp \
-  Modules/MContraction/A2ALoop.hpp \
-  Modules/MContraction/WeakNonEye3pt.hpp \
+  Modules/MAction/ZMobiusDWF.hpp \
+  Modules/MAction/MobiusDWF.hpp \
+  Modules/MAction/Wilson.hpp \
+  Modules/MAction/DWF.hpp \
+  Modules/MAction/WilsonClover.hpp \
+  Modules/MAction/ScaledDWF.hpp \
+  Modules/MUtilities/RandomVectors.hpp \
+  Modules/MUtilities/PrecisionCast.hpp \
+  Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp \
+  Modules/MNoise/FullVolumeSpinColorDiagonal.hpp \
  Modules/MContraction/DiscLoop.hpp \
-  Modules/MContraction/A2AAslashField.hpp \
-  Modules/MContraction/Baryon.hpp \
  Modules/MContraction/Meson.hpp \
+  Modules/MContraction/WeakMesonDecayKl2.hpp \
+  Modules/MContraction/A2AMesonField.hpp \
+  Modules/MContraction/WeakNonEye3pt.hpp \
+  Modules/MContraction/Gamma3pt.hpp \
+  Modules/MContraction/A2AAslashField.hpp \
+  Modules/MContraction/A2AFourQuarkContraction.hpp \
+  Modules/MContraction/Baryon.hpp \
  Modules/MDistil/DistilVectors.hpp \
  Modules/MDistil/LapEvec.hpp \
  Modules/MDistil/Noises.hpp \
  Modules/MDistil/PerambFromSolve.hpp \
  Modules/MDistil/Perambulator.hpp \
+  Modules/MContraction/WeakEye3pt.hpp \
+  Modules/MContraction/A2ALoop.hpp \
+  Modules/MSink/Point.hpp \
+  Modules/MSink/Smear.hpp \
+  Modules/MScalarSUN/StochFreeField.hpp \
+  Modules/MScalarSUN/EMT.hpp \
+  Modules/MScalarSUN/Utils.hpp \
+  Modules/MScalarSUN/TwoPoint.hpp \
+  Modules/MScalarSUN/TransProj.hpp \
+  Modules/MScalarSUN/TwoPointNPR.hpp \
+  Modules/MScalarSUN/TrPhi.hpp \
+  Modules/MScalarSUN/Grad.hpp \
+  Modules/MScalarSUN/TrMag.hpp \
+  Modules/MScalarSUN/Div.hpp \
+  Modules/MScalarSUN/TrKinetic.hpp \
+  Modules/MNPR/Amputate.hpp \
  Modules/MNPR/FourQuark.hpp \
  Modules/MNPR/Bilinear.hpp \
-  Modules/MNPR/Amputate.hpp \
-  Modules/MSolver/A2AAslashVectors.hpp \
-  Modules/MSolver/RBPrecCG.hpp \
-  Modules/MSolver/Guesser.hpp \
-  Modules/MSolver/LocalCoherenceLanczos.hpp \
+  Modules/MSource/SeqAslash.hpp \
+  Modules/MSource/Momentum.hpp \
+  Modules/MSource/Z2.hpp \
+  Modules/MSource/Point.hpp \
+  Modules/MSource/Gauss.hpp \
+  Modules/MSource/SeqConserved.hpp \
+  Modules/MSource/Wall.hpp \
+  Modules/MSource/SeqGamma.hpp \
+  Modules/MSource/Convolution.hpp \
+  Modules/MGauge/Random.hpp \
+  Modules/MGauge/FundtoHirep.hpp \
+  Modules/MGauge/StochEm.hpp \
+  Modules/MGauge/UnitEm.hpp \
+  Modules/MGauge/GaugeFix.hpp \
+  Modules/MGauge/StoutSmearing.hpp \
+  Modules/MGauge/Unit.hpp \
+  Modules/MGauge/Electrify.hpp \
  Modules/MSolver/A2AVectors.hpp \
+  Modules/MSolver/RBPrecCG.hpp \
+  Modules/MSolver/LocalCoherenceLanczos.hpp \
+  Modules/MSolver/Guesser.hpp \
  Modules/MSolver/MixedPrecisionRBPrecCG.hpp \
-  Modules/MFermion/FreeProp.hpp \
-  Modules/MFermion/GaugeProp.hpp \
-  Modules/MFermion/EMLepton.hpp \
-  Modules/MSink/Smear.hpp \
-  Modules/MSink/Point.hpp
-
+  Modules/MSolver/A2AAslashVectors.hpp \
+  Modules/MScalar/ChargedProp.hpp \
+  Modules/MScalar/Scalar.hpp \
+  Modules/MScalar/FreeProp.hpp
@@ -49,6 +49,10 @@ GCC   v4.9.x (recommended)

 GCC   v6.3 and later

+### Specific machine compilation instructions - Summit, Tesseract
+
+[The Wiki contains specific instructions for some Summit, Tesseract and GPU compilation](https://github.com/paboyle/Grid/wiki)
+
 ### Important: 

 Some versions of GCC appear to have a bug under high optimisation (-O2, -O3).
@@ -46,6 +46,7 @@ int main(int argc, char *argv[])
    // run setup ///////////////////////////////////////////////////////////////
    Application              application;
    std::vector<std::string> flavour = {"l", "s", "c1", "c2", "c3"};
+    std::vector<std::string> flavour_baryon = {"l", "s", "a", "b", "c"}; //needs to be a single character
    std::vector<double>      mass    = {.01, .04, .2  , .25 , .3  };
    
    // global parameters
@@ -134,6 +135,10 @@ int main(int argc, char *argv[])
        barPar.q1     = "Qpt_" + flavour[i];
        barPar.q2     = "Qpt_" + flavour[j];
        barPar.q3     = "Qpt_" + flavour[k];
+        barPar.gammas     = "(j12 j12) (j32X j32Y)";
+        barPar.quarks     = flavour_baryon[i] + flavour_baryon[j] + flavour_baryon[k];
+        barPar.prefactors     = "1.0";
+        barPar.sink    = "sink";
        application.createModule<MContraction::Baryon>(
            "baryon_pt_" + flavour[i] + flavour[j] + flavour[k], barPar);
    }
@@ -0,0 +1,604 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./tests/qdpxx/Test_qdpxx_wilson.cc
+
+    Copyright (C) 2017
+
+    Author: Felix Erben <felix.erben@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+/*  END LEGAL */
+
+#include <chroma.h>
+#include <Grid/Grid.h>
+#include <Grid/qcd/utils/BaryonUtils.h>
+
+typedef Grid::LatticeGaugeField GaugeField;
+
+namespace Chroma
+{
+
+class ChromaWrapper
+{
+public:
+  typedef multi1d<LatticeColorMatrix> U;
+  typedef LatticeFermion T4;
+
+  static void ImportGauge(GaugeField &gr,
+                          QDP::multi1d<QDP::LatticeColorMatrix> &ch)
+  {
+    Grid::LorentzColourMatrix LCM;
+    Grid::Complex cc;
+    QDP::ColorMatrix cm;
+    QDP::Complex c;
+
+    std::vector<int> x(4);
+    QDP::multi1d<int> cx(4);
+    Grid::Coordinate gd = gr.Grid()->GlobalDimensions();
+
+    for (x[0] = 0; x[0] < gd[0]; x[0]++)
+    {
+      for (x[1] = 0; x[1] < gd[1]; x[1]++)
+      {
+        for (x[2] = 0; x[2] < gd[2]; x[2]++)
+        {
+          for (x[3] = 0; x[3] < gd[3]; x[3]++)
+          {
+            cx[0] = x[0];
+            cx[1] = x[1];
+            cx[2] = x[2];
+            cx[3] = x[3];
+            Grid::peekSite(LCM, gr, x);
+
+            for (int mu = 0; mu < 4; mu++)
+            {
+              for (int i = 0; i < 3; i++)
+              {
+                for (int j = 0; j < 3; j++)
+                {
+                  cc = LCM(mu)()(i, j);
+                  c = QDP::cmplx(QDP::Real(real(cc)), QDP::Real(imag(cc)));
+                  QDP::pokeColor(cm, c, i, j);
+                }
+              }
+              QDP::pokeSite(ch[mu], cm, cx);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  static void ExportGauge(GaugeField &gr,
+                          QDP::multi1d<QDP::LatticeColorMatrix> &ch)
+  {
+    Grid::LorentzColourMatrix LCM;
+    Grid::Complex cc;
+    QDP::ColorMatrix cm;
+    QDP::Complex c;
+
+    std::vector<int> x(4);
+    QDP::multi1d<int> cx(4);
+    Grid::Coordinate gd = gr.Grid()->GlobalDimensions();
+
+    for (x[0] = 0; x[0] < gd[0]; x[0]++)
+    {
+      for (x[1] = 0; x[1] < gd[1]; x[1]++)
+      {
+        for (x[2] = 0; x[2] < gd[2]; x[2]++)
+        {
+          for (x[3] = 0; x[3] < gd[3]; x[3]++)
+          {
+            cx[0] = x[0];
+            cx[1] = x[1];
+            cx[2] = x[2];
+            cx[3] = x[3];
+
+            for (int mu = 0; mu < 4; mu++)
+            {
+              for (int i = 0; i < 3; i++)
+              {
+                for (int j = 0; j < 3; j++)
+                {
+                  cm = QDP::peekSite(ch[mu], cx);
+                  c = QDP::peekColor(cm, i, j);
+                  cc = Grid::Complex(toDouble(real(c)), toDouble(imag(c)));
+                  LCM(mu)
+                  ()(i, j) = cc;
+                }
+              }
+            }
+            Grid::pokeSite(LCM, gr, x);
+          }
+        }
+      }
+    }
+  }
+
+  // Specific for Wilson Fermions
+  static void ImportPropagator(Grid::LatticePropagator &gr,
+                            QDP::LatticePropagator &ch)
+  {
+    Grid::LatticeSpinColourVector LF(gr.Grid());
+    QDP::LatticeFermion cLF;
+
+    int Nspin=4;
+    int Ncolour=3;
+
+    for (int is = 0; is < Nspin; is++){
+      for (int ic = 0; ic < Ncolour; ic++){
+	  Grid::PropToFerm<Grid::WilsonImplR>(LF,gr,is,ic);
+          ImportFermion(LF,cLF);
+          Chroma::FermToProp(cLF,ch,ic,is);
+      }
+    }
+  }
+  
+  static void ExportPropagator(Grid::LatticePropagator &gr,
+                            QDP::LatticePropagator &ch)
+  {
+    Grid::LatticeSpinColourVector LF(gr.Grid());
+    QDP::LatticeFermion cLF;
+
+    int Nspin=4;
+    int Ncolour=3;
+
+    for (int is = 0; is < Nspin; is++){
+      for (int ic = 0; ic < Ncolour; ic++){
+          Chroma::PropToFerm(ch,cLF,ic,is);
+          ExportFermion(LF,cLF);
+	  Grid::FermToProp<Grid::WilsonImplR>(gr,LF,is,ic);
+      }
+    }
+  }
+
+  // Specific for Wilson Fermions
+  static void ImportFermion(Grid::LatticeFermion &gr,
+                            QDP::LatticeFermion &ch)
+  {
+    Grid::SpinColourVector F;
+    Grid::Complex c;
+
+    QDP::Fermion cF;
+    QDP::SpinVector cS;
+    QDP::Complex cc;
+
+    std::vector<int> x(4); // explicit 4d fermions in Grid
+    QDP::multi1d<int> cx(4);
+    Grid::Coordinate gd = gr.Grid()->GlobalDimensions();
+
+    for (x[0] = 0; x[0] < gd[0]; x[0]++)
+    {
+      for (x[1] = 0; x[1] < gd[1]; x[1]++)
+      {
+        for (x[2] = 0; x[2] < gd[2]; x[2]++)
+        {
+          for (x[3] = 0; x[3] < gd[3]; x[3]++)
+          {
+            cx[0] = x[0];
+            cx[1] = x[1];
+            cx[2] = x[2];
+            cx[3] = x[3];
+
+            Grid::peekSite(F, gr, x);
+
+            for (int j = 0; j < 3; j++)
+            {
+              for (int sp = 0; sp < 4; sp++)
+              {
+
+                c = F()(sp)(j);
+
+                cc = QDP::cmplx(QDP::Real(real(c)), QDP::Real(imag(c)));
+
+                QDP::pokeSpin(cS, cc, sp);
+              }
+              QDP::pokeColor(cF, cS, j);
+            }
+            QDP::pokeSite(ch, cF, cx);
+          }
+        }
+      }
+    }
+  }
+
+  // Specific for 4d Wilson fermions
+  static void ExportFermion(Grid::LatticeFermion &gr,
+                            QDP::LatticeFermion &ch)
+  {
+    Grid::SpinColourVector F;
+    Grid::Complex c;
+
+    QDP::Fermion cF;
+    QDP::SpinVector cS;
+    QDP::Complex cc;
+
+    std::vector<int> x(4); // 4d fermions
+    QDP::multi1d<int> cx(4);
+    Grid::Coordinate gd = gr.Grid()->GlobalDimensions();
+
+    for (x[0] = 0; x[0] < gd[0]; x[0]++)
+    {
+      for (x[1] = 0; x[1] < gd[1]; x[1]++)
+      {
+        for (x[2] = 0; x[2] < gd[2]; x[2]++)
+        {
+          for (x[3] = 0; x[3] < gd[3]; x[3]++)
+          {
+            cx[0] = x[0];
+            cx[1] = x[1];
+            cx[2] = x[2];
+            cx[3] = x[3];
+
+            cF = QDP::peekSite(ch, cx);
+            for (int sp = 0; sp < 4; sp++)
+            {
+              for (int j = 0; j < 3; j++)
+              {
+                cS = QDP::peekColor(cF, j);
+                cc = QDP::peekSpin(cS, sp);
+                c = Grid::Complex(QDP::toDouble(QDP::real(cc)),
+                                  QDP::toDouble(QDP::imag(cc)));
+                F()
+                (sp)(j) = c;
+              }
+            }
+            Grid::pokeSite(F, gr, x);
+          }
+        }
+      }
+    }
+  }
+
+};
+} // namespace Chroma
+
+void make_gauge(GaugeField &Umu, Grid::LatticePropagator &q1,Grid::LatticePropagator &q2,Grid::LatticePropagator &q3)
+{
+  using namespace Grid;
+  using namespace Grid::QCD;
+
+  std::vector<int> seeds4({1, 2, 3, 4});
+
+  Grid::GridCartesian *UGrid = (Grid::GridCartesian *)Umu.Grid();
+  Grid::GridParallelRNG RNG4(UGrid);
+  RNG4.SeedFixedIntegers(seeds4);
+  Grid::SU3::HotConfiguration(RNG4, Umu);
+
+  // Propagator
+  Grid::gaussian(RNG4, q1);
+  Grid::gaussian(RNG4, q2);
+  Grid::gaussian(RNG4, q3);
+}
+
+void calc_chroma(GaugeField &lat, Grid::LatticePropagator &qU,Grid::LatticePropagator &qD,Grid::LatticePropagator &qS, std::vector<QDP::Complex> &res, std::string baryon)
+{
+  QDP::multi1d<QDP::LatticeColorMatrix> u(4);
+  Chroma::ChromaWrapper::ImportGauge(lat, u);
+
+  QDP::LatticePropagator check;
+  QDP::LatticePropagator result;
+  QDP::LatticePropagator  psiU;
+  QDP::LatticePropagator  psiD;
+  QDP::LatticePropagator  psiS;
+
+
+  Chroma::ChromaWrapper::ImportPropagator(qU, psiU);
+  Chroma::ChromaWrapper::ImportPropagator(qD, psiD);
+  Chroma::ChromaWrapper::ImportPropagator(qS, psiS);
+
+  if(0){
+    std::cout << "Testing ImportPropagator(): "  << std::endl;
+    Grid::GridCartesian *UGrid = (Grid::GridCartesian *)lat.Grid();
+    std::vector<Grid::TComplex> buf;
+    Grid::LatticeComplex tmp(UGrid);
+    tmp = Grid::trace(qU);  
+    Grid::sliceSum(tmp,buf,Grid::Nd-1);
+    for (unsigned int t = 0; t < buf.size(); ++t)
+    {
+  	  std::cout << "Grid qU " << t << " " << Grid::TensorRemove(buf[t]) << std::endl;
+    }
+    QDP::LatticeComplex ctmp;
+    ctmp = QDP::trace(psiU);
+    Chroma::SftMom phases0(0,true,3); //How do I circumvent this? sliceSum equivalent?
+    QDP::multi2d<DComplex> hsum0;
+    hsum0 = phases0.sft(ctmp);
+    for(int t = 0; t < phases0.numSubsets(); ++t){
+  	  std::cout << "Chroma qU " << t << " " << hsum0[0][t] << std::endl;
+    }
+  }
+
+  SpinMatrix C;
+  SpinMatrix C_5;
+  SpinMatrix C_4_5;
+  SpinMatrix CG_1;
+  SpinMatrix CG_2;
+  SpinMatrix CG_3;
+  SpinMatrix CG_4;
+
+     
+  SpinMatrix g_one = 1.0;
+  //C = \gamma_2\gamma_4
+  C = (Gamma(10)*g_one);
+  
+  //C_5 = C*gamma_5
+  C_5 = (Gamma(5)*g_one);
+           
+  //C_4_5 = C*gamma_4*gamma_5
+  C_4_5 = (Gamma(13)*g_one);
+  
+  //CG_1 = C*gamma_1
+  CG_1 = (Gamma(11)*g_one);
+                     
+  //CG_2 = C*gamma_2
+  CG_2 = (Gamma(8)*g_one);
+                           
+  //CG_3 = C*gamma_3
+  CG_3 = (Gamma(14)*g_one);
+  
+  //CG_4 = C*gamma_4
+  CG_4 = (Gamma(2)*g_one);
+                                     
+  // S_proj_unpol = (1/2)(1 + gamma_4)
+  SpinMatrix S_proj_unpol = 0.5 * (g_one + (g_one * Gamma(8)));
+  
+  QDP::LatticeComplex b_prop;
+  QDP::LatticePropagator di_quark;
+
+  if(! baryon.compare("OmegaX")){
+    // Omega_x - this esentially is degenerate (s C\gamma_1 s)s
+    // C gamma_1 = Gamma(10) * Gamma(1) = Gamma(11)
+    di_quark = QDP::quarkContract13(psiS * CG_1, CG_1 * psiS);
+    b_prop = QDP::trace(S_proj_unpol * QDP::traceColor(psiS * QDP::traceSpin(di_quark)))
+		      	     + 2.0 * QDP::trace(S_proj_unpol * QDP::traceColor(psiS * di_quark));
+  } else if (! baryon.compare("OmegaY")){
+    // Omega_x - this esentially is degenerate (s C\gamma_3 s)s
+    // C gamma_1 = Gamma(10) * Gamma(2) = Gamma(8)
+    di_quark = QDP::quarkContract13(psiS * CG_2, CG_2 * psiS);
+    b_prop = QDP::trace(S_proj_unpol * QDP::traceColor(psiS * QDP::traceSpin(di_quark)))
+		      	     + 2.0 * QDP::trace(S_proj_unpol * QDP::traceColor(psiS * di_quark));
+  } else if (! baryon.compare("OmegaZ")){
+    // Omega_x - this esentially is degenerate (s C\gamma_3 s)s
+    // C gamma_1 = Gamma(10) * Gamma(4) = Gamma(14)
+    di_quark = QDP::quarkContract13(psiS * CG_3, CG_3 * psiS);
+    b_prop = QDP::trace(S_proj_unpol * QDP::traceColor(psiS * QDP::traceSpin(di_quark)))
+		      	     + 2.0 * QDP::trace(S_proj_unpol * QDP::traceColor(psiS * di_quark));
+  } else if (! baryon.compare("Proton")){
+    // Proton - this esentially is degenerate (d C\gamma_5 u)u
+// This is how the UKHadron code is written - diquarks are swapped when compared to coment above code.	  
+    //di_quark = QDP::quarkContract13(psiU * C_5, C_5 * psiD);
+    di_quark = QDP::quarkContract13(psiD * C_5, C_5 * psiU);
+    b_prop = QDP::trace(S_proj_unpol * QDP::traceColor(psiU * QDP::traceSpin(di_quark)))
+		      	     + QDP::trace(S_proj_unpol * QDP::traceColor(psiU * di_quark));
+  } else if (! baryon.compare("Lambda")){
+    // Lambda (octet) - This is the totally antisymmetric 
+    // one from the middle of the octet
+    // Lambda - (d C\gamma_5 s)u - (u C\gamma_5 s)d
+    // This is given by: 
+    // 1/3[ <us>d + <ds>u + 4<ud>s - (usd) - (dsu) + 2(sud) + 2(sdu) + 2(uds) + 2(dus) ]
+   
+/* This is how the UKHadron code is written - diquarks are swapped when compared to coments above code.	  
+    // This gives <us>d - (usd) -- yes
+    di_quark = QDP::quarkContract13(psiU * C_5, C_5 * psiS);
+    b_prop = QDP::trace(S_proj_unpol * QDP::traceColor(psiD * QDP::traceSpin(di_quark)))
+      	     - QDP::trace(S_proj_unpol * QDP::traceColor(psiD * di_quark));
+
+    // This gives <ds>u - (dsu) -- yes
+    di_quark = quarkContract13(psiD * C_5,C_5 * psiS);
+    b_prop += QDP::trace(S_proj_unpol * QDP::traceColor(psiU * QDP::traceSpin(di_quark)))
+       	      - QDP::trace(S_proj_unpol * QDP::traceColor(psiU * di_quark));
+
+    // This gives 4<ud>s  -- yes
+    di_quark = quarkContract13(psiU * C_5,C_5 * psiD);
+    b_prop += 4.0 * QDP::trace(S_proj_unpol * QDP::traceColor(psiS * QDP::traceSpin(di_quark)));
+
+    //This gives 2(sud) -- yes
+    di_quark = quarkContract13(psiS * C_5,C_5 * psiU);
+    b_prop += 2.0 * QDP::trace(S_proj_unpol * QDP::traceColor(psiD * di_quark));
+
+    // This gives 2(sdu) -- yes
+    di_quark = quarkContract13(psiS * C_5,C_5 * psiD);
+    b_prop += 2.0 * QDP::trace(S_proj_unpol * QDP::traceColor(psiU * di_quark));
+
+    // This gives 2(uds) -- yes
+    di_quark = quarkContract13(psiU * C_5,C_5 * psiD);
+    b_prop += 2.0 * QDP::trace(S_proj_unpol * QDP::traceColor(psiS * di_quark));
+
+    // This gives 2(dus) -- yes
+    di_quark = quarkContract13(psiD * C_5,C_5 * psiU);
+    b_prop += 2.0 * QDP::trace(S_proj_unpol * QDP::traceColor(psiS * di_quark));*/
+
+    // This gives <us>d - (usd) -- yes
+    di_quark = QDP::quarkContract13(psiS * C_5, C_5 * psiU);
+    b_prop = QDP::trace(S_proj_unpol * QDP::traceColor(psiD * QDP::traceSpin(di_quark)))
+      	     - QDP::trace(S_proj_unpol * QDP::traceColor(psiD * di_quark));
+
+    // This gives <ds>u - (dsu) -- yes
+    di_quark = quarkContract13(psiS * C_5,C_5 * psiD);
+    b_prop += QDP::trace(S_proj_unpol * QDP::traceColor(psiU * QDP::traceSpin(di_quark)))
+       	      - QDP::trace(S_proj_unpol * QDP::traceColor(psiU * di_quark));
+
+    // This gives 4<ud>s  -- yes
+    di_quark = quarkContract13(psiD * C_5,C_5 * psiU);
+    b_prop += 4.0 * QDP::trace(S_proj_unpol * QDP::traceColor(psiS * QDP::traceSpin(di_quark)));
+
+    //This gives 2(sud) -- yes
+    di_quark = quarkContract13(psiU * C_5,C_5 * psiS);
+    b_prop += 2.0 * QDP::trace(S_proj_unpol * QDP::traceColor(psiD * di_quark));
+
+    // This gives 2(sdu) -- yes
+    di_quark = quarkContract13(psiD * C_5,C_5 * psiS);
+    b_prop += 2.0 * QDP::trace(S_proj_unpol * QDP::traceColor(psiU * di_quark));
+
+    // This gives 2(uds) -- yes
+    di_quark = quarkContract13(psiD * C_5,C_5 * psiU);
+    b_prop += 2.0 * QDP::trace(S_proj_unpol * QDP::traceColor(psiS * di_quark));
+
+    // This gives 2(dus) -- yes
+    di_quark = quarkContract13(psiU * C_5,C_5 * psiD);
+    b_prop += 2.0 * QDP::trace(S_proj_unpol * QDP::traceColor(psiS * di_quark));
+  } else {
+    std::cout << "baryon not part of test " << std::endl;
+    return;
+  }
+  std::cout<< "Chroma computing " << baryon << std::endl;
+
+  Chroma::SftMom phases(0,true,3); //How do I circumvent this? sliceSum equivalent?
+  QDP::multi2d<DComplex> hsum;
+  hsum = phases.sft(b_prop);
+  int length = phases.numSubsets();
+  res.resize(length);
+  for(int t = 0; t < length; ++t){
+    res[t] = hsum[0][t]; //Should I test momentum?
+  }
+
+}
+
+
+void calc_grid(Grid::LatticeGaugeField &Umu, Grid::LatticePropagator &qU, Grid::LatticePropagator &qD, Grid::LatticePropagator &qS, std::vector<Grid::Complex> &res, std::string baryon)
+{
+  using namespace Grid;
+  using namespace Grid::QCD;
+
+  Grid::GridCartesian *UGrid = (Grid::GridCartesian *)Umu.Grid();
+
+  Grid::Gamma G_A = Grid::Gamma(Grid::Gamma::Algebra::Identity);
+  Grid::Gamma G_B = Grid::Gamma(Grid::Gamma::Algebra::GammaZGamma5); // OmegaX: C*GammaX = i* GammaZ*Gamma5
+
+  Grid::LatticeComplex c(UGrid);
+  Grid::LatticeComplex c1(UGrid);
+
+  if(! baryon.compare("OmegaX")){
+    BaryonUtils<Grid::WilsonImplR>::ContractBaryons(qS,qS,qS,G_A,G_B,G_A,G_B,"sss","sss",1,c);  
+    c*=0.5;
+    std::cout << "Grid-Omega factor 2 larger than Chroma-Omega!!!" << std::endl; 
+  } else if (! baryon.compare("OmegaY")){
+    G_B = Grid::Gamma(Grid::Gamma::Algebra::GammaT);
+    BaryonUtils<Grid::WilsonImplR>::ContractBaryons(qS,qS,qS,G_A,G_B,G_A,G_B,"sss","sss",1,c);  
+    c*=0.5;
+    std::cout << "Grid-Omega factor 2 larger than Chroma-Omega!!!" << std::endl; 
+  } else if (! baryon.compare("OmegaZ")){
+    G_B = Grid::Gamma(Grid::Gamma::Algebra::GammaXGamma5);
+    BaryonUtils<Grid::WilsonImplR>::ContractBaryons(qS,qS,qS,G_A,G_B,G_A,G_B,"sss","sss",1,c);  
+    c*=0.5;
+    std::cout << "Grid-Omega factor 2 larger than Chroma-Omega!!!" << std::endl; 
+  } else if (! baryon.compare("Proton")){
+    G_B = Grid::Gamma(Grid::Gamma::Algebra::SigmaXZ);
+    BaryonUtils<Grid::WilsonImplR>::ContractBaryons(qU,qD,qU,G_A,G_B,G_A,G_B,"udu","udu",1,c); 
+    std::cout << "UKHadron-Proton has flipped diquarks in original code." << std::endl; 
+  } else if (! baryon.compare("Lambda")){
+    G_B = Grid::Gamma(Grid::Gamma::Algebra::SigmaXZ);
+    BaryonUtils<Grid::WilsonImplR>::ContractBaryons(qS,qU,qD,G_A,G_B,G_A,G_B,"sud","sud",1,c1); //<ud>s 
+    c = 4.*c1; 
+    BaryonUtils<Grid::WilsonImplR>::ContractBaryons(qD,qU,qS,G_A,G_B,G_A,G_B,"dus","dus",1,c1); //<us>d 
+    c += c1; 
+    BaryonUtils<Grid::WilsonImplR>::ContractBaryons(qU,qD,qS,G_A,G_B,G_A,G_B,"uds","uds",1,c1); //<ds>u  
+    c += c1; 
+    BaryonUtils<Grid::WilsonImplR>::ContractBaryons(qD,qU,qS,G_A,G_B,G_A,G_B,"dus","sud",1,c1); //(sud) 
+    c += 2.*c1; 
+    BaryonUtils<Grid::WilsonImplR>::ContractBaryons(qU,qD,qS,G_A,G_B,G_A,G_B,"uds","sud",1,c1); //(sdu) 
+    c -= 2.*c1; 
+    BaryonUtils<Grid::WilsonImplR>::ContractBaryons(qS,qU,qD,G_A,G_B,G_A,G_B,"sud","dus",1,c1); //(dus) 
+    c += 2.*c1; 
+    BaryonUtils<Grid::WilsonImplR>::ContractBaryons(qU,qD,qS,G_A,G_B,G_A,G_B,"uds","dus",1,c1); //-(dsu) 
+    c -= c1; 
+    BaryonUtils<Grid::WilsonImplR>::ContractBaryons(qS,qU,qD,G_A,G_B,G_A,G_B,"sud","uds",1,c1); //(uds) 
+    c -= 2.*c1; 
+    BaryonUtils<Grid::WilsonImplR>::ContractBaryons(qD,qU,qS,G_A,G_B,G_A,G_B,"dus","uds",1,c1); //-(usd) 
+    c -= c1; 
+    std::cout << "UKHadron-Lambda has flipped diquarks in original code." << std::endl; 
+  } else {
+    std::cout << "baryon not part of test " << std::endl;
+    return;
+  }
+  std::cout<< "Grid computing " << baryon << std::endl;
+
+  std::vector<Grid::TComplex> buf;
+  Grid::sliceSum(c,buf,Grid::Nd-1);
+  res.resize(buf.size());
+  for (unsigned int t = 0; t < buf.size(); ++t)
+  {
+    res[t]=Grid::TensorRemove(buf[t]);
+  }
+
+}
+
+int main(int argc, char **argv)
+{
+
+  /********************************************************
+   * Setup QDP
+   *********************************************************/
+  Chroma::initialize(&argc, &argv);
+  Chroma::WilsonTypeFermActs4DEnv::registerAll();
+
+  /********************************************************
+   * Setup Grid
+   *********************************************************/
+  Grid::Grid_init(&argc, &argv);
+  Grid::GridCartesian *UGrid = Grid::SpaceTimeGrid::makeFourDimGrid(Grid::GridDefaultLatt(),
+                                                                         Grid::GridDefaultSimd(Grid::Nd, Grid::vComplex::Nsimd()),
+                                                                         Grid::GridDefaultMpi());
+
+  Grid::Coordinate gd = UGrid->GlobalDimensions();
+  QDP::multi1d<int> nrow(QDP::Nd);
+  for (int mu = 0; mu < 4; mu++)
+    nrow[mu] = gd[mu];
+
+  QDP::Layout::setLattSize(nrow);
+  QDP::Layout::create();
+
+  GaugeField Ug(UGrid);
+  typedef Grid::LatticePropagator PropagatorField;
+  PropagatorField up(UGrid);
+  PropagatorField down(UGrid);
+  PropagatorField strange(UGrid);
+  std::vector<ComplexD> res_chroma;
+  std::vector<Grid::Complex> res_grid;
+  Grid::Complex res_chroma_g;
+
+  std::vector<std::string> baryons({"OmegaX","OmegaY","OmegaZ","Proton","Lambda"});
+  int nBaryon=baryons.size();
+
+  for (int iB = 0; iB < nBaryon; iB++)
+  {
+      make_gauge(Ug, up, down, strange); // fills the gauge field and the propagator with random numbers
+
+      calc_chroma(Ug, up, down, strange, res_chroma,baryons[iB]);
+ 
+      for(int t=0;t<res_chroma.size();t++){
+        std::cout << " Chroma baryon "<<t<<" "<< res_chroma[t] << std::endl;
+      }
+         
+      calc_grid(Ug, up, down, strange, res_grid,baryons[iB]);
+
+      for(int t=0;t<res_chroma.size();t++){
+        std::cout << " Grid baryon "<<t<<" "<< res_grid[t] << std::endl;
+      }
+      for(int t=0;t<res_chroma.size();t++){
+	res_chroma_g = Grid::Complex(toDouble(real(res_chroma[t])), toDouble(imag(res_chroma[t])));
+        std::cout << " Difference "<<t<<" "<< res_chroma_g - res_grid[t] << std::endl;
+      }
+
+      std::cout << "Finished test " << std::endl;
+
+  }
+  Chroma::finalize();
+}