From f4033ad8cb32c34debe1623d84eab7c0d79116d5 Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Mon, 27 Apr 2020 17:46:14 +0100
Subject: [PATCH 01/19] baryon speedup by a factor of 2

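---
Notes (below the '---' marker, so not part of the commit message):
 * epsilon_sgn changes from Complex to double, so the product of the two
   epsilon signs (ee) is a plain real scalar.
 * Adds baryon_site_macro and baryon_site_template. Both fuse the six
   Wick-contraction terms into a single spin loop nest and select the active
   terms through compile-time flags (template ints w0..w5, or bits of the
   template parameter mask).
 * ContractBaryons now runs the site loop three times (baryon_site,
   baryon_site_macro, baryon_site_template), each pass overwriting
   vbaryon_corr, and prints wall-clock timings to std::cout. This looks like
   benchmarking scaffolding rather than production behaviour.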
 Grid/qcd/utils/BaryonUtils.h | 416 ++++++++++++++++++++++++++++++++++-
 1 file changed, 407 insertions(+), 9 deletions(-)
diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h
index d65b9176..18d6f84b 100644
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -46,7 +46,8 @@ public:
   typedef typename SpinMatrixField::vector_object sobj;
 
   static const int epsilon[6][3] ;
-  static const Complex epsilon_sgn[6];
+  //static const Complex epsilon_sgn[6];
+  static const double epsilon_sgn[6];
 
   private: 
   template <class mobj, class robj>
@@ -60,6 +61,62 @@ public:
 				 const int parity,
 				 const int * wick_contractions,
   				 robj &result);
+  template <class mobj, class robj, int w0, int w1, int w2, int w3, int w4, int w5>
+  static void baryon_site_macro(const mobj &D1,
+				 const mobj &D2,
+				 const mobj &D3,
+				 const Gamma GammaA_left,
+				 const Gamma GammaB_left,
+				 const Gamma GammaA_right,
+				 const Gamma GammaB_right,
+				 const int parity,
+  				 robj &result);
+  template <class mobj, class robj>
+  static void baryon_site_macro(const mobj &D1,
+				 const mobj &D2,
+				 const mobj &D3,
+				 const Gamma GammaA_left,
+				 const Gamma GammaB_left,
+				 const Gamma GammaA_right,
+				 const Gamma GammaB_right,
+				 const int parity,
+				 const int * wick_contractions,
+  				 robj &result);
+  template <class mobj, class robj>
+  static inline void baryon_site_template(unsigned int mask, const mobj &D1,
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 robj &result);
+  template <unsigned int mask, class mobj, class robj>
+  static inline void baryon_site_template(const mobj &D1,
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 robj &result);
+						 
+  template <unsigned int maxMask>
+  struct BaryonSiteHelper
+  {
+  template <class mobj, class robj>
+    static inline void function(const unsigned int mask, const mobj &D1,
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 robj &result);
+						 };
   public:
   static void ContractBaryons(const PropagatorField &q1_left,
 				 const PropagatorField &q2_left,
@@ -151,14 +208,18 @@ public:
 
 template <class FImpl> 
 const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
-template <class FImpl> 
+/*template <class FImpl> 
 const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
 						    Complex(1),
 						    Complex(1),
 						    Complex(-1),
 						    Complex(-1),
 						    Complex(-1)};
+*/
+template <class FImpl> 
+const double BaryonUtils<FImpl>::epsilon_sgn[6] = {1.0,1.0,1.0,-1.0,-1.0,-1.0};
 
+//This is the old version
 template <class FImpl>
 template <class mobj, class robj>
 void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
@@ -188,13 +249,15 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
         int a_right = epsilon[ie_right][0]; //a'
         int b_right = epsilon[ie_right][1]; //b'
         int c_right = epsilon[ie_right][2]; //c'
+	//complex<double> ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
+	double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
         //This is the \delta_{456}^{123} part
 	if (wick_contraction[0]){
           auto D2g = D2 * GammaB_left;
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
+	    result()()() += ee * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
           }}}
   	}	  
         //This is the \delta_{456}^{231} part
@@ -203,7 +266,7 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
+	    result()()() += ee * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
           }}}
         }	  
         //This is the \delta_{456}^{312} part
@@ -212,7 +275,7 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
+	    result()()() += ee * pD1()(gamma_left,beta_left)(c_right,b_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
           }}}
         }	  
         //This is the \delta_{456}^{132} part
@@ -221,7 +284,7 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
+	    result()()() -= ee * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
           }}}
         }	  
         //This is the \delta_{456}^{321} part
@@ -230,7 +293,7 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1()(gamma_left,beta_left)(c_right,b_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
+	    result()()() -= ee * pD1()(gamma_left,beta_left)(c_right,b_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
           }}}
         }	  
         //This is the \delta_{456}^{213} part
@@ -239,13 +302,284 @@ void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= epsilon_sgn[ie_left] * epsilon_sgn[ie_right] * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
+	    result()()() -= ee * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
           }}}
         }	  
       }
     }
 }
 
+template <class FImpl>
+template <class mobj, class robj, int w0, int w1, int w2, int w3, int w4, int w5>
+void BaryonUtils<FImpl>::baryon_site_macro(const mobj &D1,
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 robj &result)
+{
+
+  Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
+
+    auto gD1a = GammaA_left * GammaA_right * D1;
+    auto gD1b = GammaA_left * g4 * GammaA_right * D1;
+    auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
+    auto gD3 = GammaB_right * D3;
+
+    auto D2g = D2 * GammaB_left;
+    auto pD1g = pD1 * GammaB_left;
+    auto gD3g = gD3 * GammaB_left;
+
+    for (int ie_left=0; ie_left < 6 ; ie_left++){
+      int a_left = epsilon[ie_left][0]; //a
+      int b_left = epsilon[ie_left][1]; //b
+      int c_left = epsilon[ie_left][2]; //c
+      for (int ie_right=0; ie_right < 6 ; ie_right++){
+        int a_right = epsilon[ie_right][0]; //a'
+        int b_right = epsilon[ie_right][1]; //b'
+        int c_right = epsilon[ie_right][2]; //c'
+	double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
+        //All parts together
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+            auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+            auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
+	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+            auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
+            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
+	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
+	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
+	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
+	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
+	    if(w0){
+	        result()()() += eepD1*D2g_ab*gD3_ab;
+	    }
+  	    if(w1){
+		result()()() += eepD1g_gb*D2_ab*gD3_ag;
+	    }
+	    if(w2){
+		result()()() += eepD1_gb*D2_ag*gD3g_ab;
+	    }
+	    if(w3){
+    		result()()() -= eepD1*D2_ab*gD3g_ab;
+	    }
+    	    if(w4){
+		result()()() -= eepD1_gb*D2g_ab*gD3_ag;
+	    }
+            if(w5){
+    	        result()()() -= eepD1g_gb*D2_ag*gD3_ab;
+            }
+  	  }}}
+      }
+    }
+}
+
+#define BARYON_SITE(w0, w1, w2, w3, w4, w5, D1, D2, D3, GA_l, GB_l, GA_r, GB_r, parity, wick_contraction, result) \
+	if((wick_contraction[0] == w0) && (wick_contraction[1] == w1) &&  (wick_contraction[2] == w2) &&  (wick_contraction[3] == w3) &&  (wick_contraction[4] == w4) &&  (wick_contraction[5] == w5)) \
+{\
+ baryon_site_macro<mobj, robj, w0, w1, w2, w3, w4, w5>( D1, D2, D3, GA_l, GB_l, GA_r, GB_r, parity, result );\
+}
+
+template <class FImpl>
+template <class mobj, class robj>
+void BaryonUtils<FImpl>::baryon_site_macro(const mobj &D1,
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 const int * wick_contraction,
+						 robj &result)
+{
+BARYON_SITE( 0 , 0 , 0 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 0 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 0 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 0 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 0 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 0 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 0 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 0 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 1 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 1 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 1 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 1 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 1 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 1 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 1 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 0 , 1 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 0 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 0 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 0 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 0 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 0 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 0 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 0 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 0 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 1 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 1 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 1 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 1 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 1 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 1 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 1 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 0 , 1 , 1 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 0 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 0 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 0 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 0 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 0 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 0 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 0 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 0 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 1 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 1 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 1 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 1 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 1 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 1 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 1 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 0 , 1 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 0 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 0 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 0 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 0 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 0 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 0 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 0 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 0 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 1 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 1 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 1 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 1 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 1 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 1 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 1 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+BARYON_SITE( 1 , 1 , 1 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
+ 
+}
+
+
+template <class FImpl>
+template <unsigned int mask, class mobj, class robj>
+inline void BaryonUtils<FImpl>::baryon_site_template(const mobj &D1,
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 robj &result)
+{
+    constexpr bool wick_contraction_0 = ((mask & (1 << 5)) >> 5);
+    constexpr bool wick_contraction_1 = ((mask & (1 << 4)) >> 4);
+    constexpr bool wick_contraction_2 = ((mask & (1 << 3)) >> 3);
+    constexpr bool wick_contraction_3 = ((mask & (1 << 2)) >> 2);
+    constexpr bool wick_contraction_4 = ((mask & (1 << 1)) >> 1);
+    constexpr bool wick_contraction_5 = ((mask & (1 << 0)) >> 0);
+
+  Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
+
+    auto gD1a = GammaA_left * GammaA_right * D1;
+    auto gD1b = GammaA_left * g4 * GammaA_right * D1;
+    auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
+    auto gD3 = GammaB_right * D3;
+
+    auto D2g = D2 * GammaB_left;
+    auto pD1g = pD1 * GammaB_left;
+    auto gD3g = gD3 * GammaB_left;
+
+    for (int ie_left=0; ie_left < 6 ; ie_left++){
+      int a_left = epsilon[ie_left][0]; //a
+      int b_left = epsilon[ie_left][1]; //b
+      int c_left = epsilon[ie_left][2]; //c
+      for (int ie_right=0; ie_right < 6 ; ie_right++){
+        int a_right = epsilon[ie_right][0]; //a'
+        int b_right = epsilon[ie_right][1]; //b'
+        int c_right = epsilon[ie_right][2]; //c'
+	double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
+        //All parts together
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+            auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+            auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
+	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+            auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
+            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
+	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
+	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
+	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
+	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
+	    if(wick_contraction_0){
+	      result()()() += eepD1*D2g_ab*gD3_ab;
+	    }
+  	    if(wick_contraction_1){
+		    result()()() += eepD1g_gb*D2_ab*gD3_ag;
+	    }
+	    if(wick_contraction_2){
+		    result()()() += eepD1_gb*D2_ag*gD3g_ab;
+	    }
+            if(wick_contraction_3){
+    		result()()() -= eepD1*D2_ab*gD3g_ab;
+	    }
+    	    if(wick_contraction_4){
+		result()()() -= eepD1_gb*D2g_ab*gD3_ag;
+	    }
+            if(wick_contraction_5){
+    	        result()()() -= eepD1g_gb*D2_ag*gD3_ab;
+            }
+  	  }}}
+      }
+    }
+}
+
+template <class FImpl>
+template <unsigned int maxMask>
+template <class mobj, class robj>
+inline void BaryonUtils<FImpl>::BaryonSiteHelper<maxMask>::function(const unsigned int mask, const mobj &D1,
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 robj &result)
+{
+    if (mask == maxMask)
+    {
+        baryon_site_template<maxMask,decltype(D1),decltype(result)>(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);
+    }
+    else
+    {
+        BaryonSiteHelper<(maxMask>0) ? maxMask-1 : 0>::function(mask, D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);
+    }
+}
+
+// top-level function
+template <class FImpl>
+template <class mobj, class robj>
+inline void BaryonUtils<FImpl>::baryon_site_template(const unsigned int mask, const mobj &D1,
+						 const mobj &D2,
+						 const mobj &D3,
+				                 const Gamma GammaA_left,
+				                 const Gamma GammaB_left,
+				                 const Gamma GammaA_right,
+		                 		 const Gamma GammaB_right,
+						 const int parity,
+						 robj &result)
+{
+    BaryonSiteHelper<63>::function(mask, D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);
+}
+
+
 template<class FImpl>
 void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 						 const PropagatorField &q2_left,
@@ -259,6 +593,10 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 						 const int parity,
 						 ComplexField &baryon_corr)
 {
+    const std::chrono::system_clock::time_point start{ std::chrono::system_clock::now() };
+    std::time_t now = std::chrono::system_clock::to_time_t( start );
+    std::cout << "Setup start " << std::ctime( &now );
+
   std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
     std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
     std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
@@ -278,6 +616,16 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
   auto v2 = q2_left.View();
   auto v3 = q3_left.View();
 
+    const std::chrono::system_clock::time_point stop{ std::chrono::system_clock::now() };
+    now = std::chrono::system_clock::to_time_t( stop );
+    const std::chrono::duration<double> duration_seconds = stop - start;
+    const double seconds{ ( duration_seconds.count() ) };
+    std::cout << "Setup stop " << std::ctime( &now )
+              << "Total duration " << std::fixed << std::setprecision(5) << seconds << " seconds." << std::endl;
+
+    const std::chrono::system_clock::time_point start2{ std::chrono::system_clock::now() };
+    now = std::chrono::system_clock::to_time_t( start2 );
+    std::cout << "Normal Loop start " << std::ctime( &now );
  // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
   thread_for(ss,grid->oSites(),{
   //for(int ss=0; ss < grid->oSites(); ss++){
@@ -290,6 +638,56 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
     baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
     vbaryon_corr[ss] = result; 
   }  );//end loop over lattice sites
+    const std::chrono::system_clock::time_point stop2{ std::chrono::system_clock::now() };
+    now = std::chrono::system_clock::to_time_t( stop2 );
+    const std::chrono::duration<double> duration_seconds2 = stop2 - start2;
+    const double seconds2{ ( duration_seconds2.count() ) };
+    std::cout << "Normal Loop stop " << std::ctime( &now )
+              << "Total duration " << std::fixed << std::setprecision(5) << seconds2 << " seconds." << std::endl;
+	      const std::chrono::system_clock::time_point start4{ std::chrono::system_clock::now() };
+    now = std::chrono::system_clock::to_time_t( start4 );
+    std::cout << "Opt-macro Loop start " << std::ctime( &now );
+ // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+  thread_for(ss,grid->oSites(),{
+  //for(int ss=0; ss < grid->oSites(); ss++){
+
+    auto D1 = v1[ss];
+    auto D2 = v2[ss];
+    auto D3 = v3[ss];
+
+    vobj result=Zero();
+    baryon_site_macro(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
+    vbaryon_corr[ss] = result; 
+  }  );//end loop over lattice sites
+    const std::chrono::system_clock::time_point stop4{ std::chrono::system_clock::now() };
+    now = std::chrono::system_clock::to_time_t( stop4 );
+    const std::chrono::duration<double> duration_seconds4 = stop4 - start4;
+    const double seconds4{ ( duration_seconds4.count() ) };
+    std::cout << "Opt-macro Loop stop " << std::ctime( &now )
+              << "Total duration " << std::fixed << std::setprecision(5) << seconds4 << " seconds." << std::endl; 
+    const std::chrono::system_clock::time_point start3{ std::chrono::system_clock::now() };
+    now = std::chrono::system_clock::to_time_t( start3 );
+    int wick_id=32*wick_contraction[0]+16*wick_contraction[1]+8*wick_contraction[2]+4*wick_contraction[3]+2*wick_contraction[4]+wick_contraction[5];
+    std::cout << "Opt-template Loop start " << std::ctime( &now );
+ // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+  thread_for(ss,grid->oSites(),{
+  //for(int ss=0; ss < grid->oSites(); ss++){
+
+    auto D1 = v1[ss];
+    auto D2 = v2[ss];
+    auto D3 = v3[ss];
+
+    vobj result=Zero();
+    baryon_site_template(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result);
+    vbaryon_corr[ss] = result; 
+  }  );//end loop over lattice sites
+    const std::chrono::system_clock::time_point stop3{ std::chrono::system_clock::now() };
+    now = std::chrono::system_clock::to_time_t( stop3 );
+    const std::chrono::duration<double> duration_seconds3 = stop3 - start3;
+    const double seconds3{ ( duration_seconds3.count() ) };
+    std::cout << "Opt-template Loop stop " << std::ctime( &now )
+              << "Total duration " << std::fixed << std::setprecision(5) << seconds3 << " seconds." << std::endl;
+
 }
 template <class FImpl>
 template <class mobj, class robj>
@@ -318,7 +716,7 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
     wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
 
      result=Zero();
-     baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
+     baryon_site<decltype(D1),decltype(result)>(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
 }
 
 /***********************************************************************

From 6240e02619be0b0c5bda84173c0dc6cba62aef84 Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Mon, 27 Apr 2020 18:50:53 +0100
Subject: [PATCH 02/19] added assertion to avoid potential infinite loop

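---
Notes (below the '---' marker, so not part of the commit message):
 * assert() is compiled out when NDEBUG is defined, so this guard only
   protects debug builds. BaryonSiteHelper<0>::function still calls itself
   when mask != 0, because the recursion argument is
   (maxMask>0) ? maxMask-1 : 0.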
 Grid/qcd/utils/BaryonUtils.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h
index 18d6f84b..a392f223 100644
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -553,6 +553,7 @@ inline void BaryonUtils<FImpl>::BaryonSiteHelper<maxMask>::function(const unsign
 						 const int parity,
 						 robj &result)
 {
+    assert(mask <= maxMask);
     if (mask == maxMask)
     {
         baryon_site_template<maxMask,decltype(D1),decltype(result)>(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);

From dee96cbf8296b9f16dc378c78cdcb74302da77c5 Mon Sep 17 00:00:00 2001
From: Christopher Kelly <giltirn@gmail.com>
Date: Wed, 29 Apr 2020 10:37:11 -0400
Subject: [PATCH 03/19] Added workaround in configure to still catch CUDA
 compiler when nvcc with extra arguments (e.g. -ccbin) is used as CXX

---
 configure.ac | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/configure.ac b/configure.ac
index fb69ca0e..46559507 100644
--- a/configure.ac
+++ b/configure.ac
@@ -274,12 +274,20 @@ case ${ac_gen_scalar} in
 esac
 
 ##################### Compiler dependent choices
-case ${CXX} in 
+
+#Strip any optional compiler arguments from nvcc call (eg -ccbin) for compiler comparison
+CXXBASE=${CXX}
+CXXTEST=${CXX}
+if echo "${CXX}" | grep -q "nvcc"; then
+  CXXTEST="nvcc"
+fi   
+
+case ${CXXTEST} in 
   nvcc) 
 #    CXX="nvcc -keep -v -x cu "
 #    CXXLD="nvcc -v -link"
-    CXX="nvcc -x cu "
-    CXXLD="nvcc -link"
+    CXX="${CXXBASE} -x cu "
+    CXXLD="${CXXBASE} -link"
 #    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr"
     CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
     if test $ac_openmp = yes; then

From 56e2f7d088aa36af4990742822c8cd47363d2391 Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Thu, 7 May 2020 10:03:45 +0100
Subject: [PATCH 04/19] deleted test routines. cleaned up fast version. assert
 Ns=4,Nc=3.

---
 Grid/qcd/utils/BaryonUtils.h | 414 ++++-------------------------------
 1 file changed, 40 insertions(+), 374 deletions(-)

diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h
index a392f223..241395c3 100644
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -46,44 +46,11 @@ public:
   typedef typename SpinMatrixField::vector_object sobj;
 
   static const int epsilon[6][3] ;
-  //static const Complex epsilon_sgn[6];
   static const double epsilon_sgn[6];
 
   private: 
   template <class mobj, class robj>
-  static void baryon_site(const mobj &D1,
-				 const mobj &D2,
-				 const mobj &D3,
-				 const Gamma GammaA_left,
-				 const Gamma GammaB_left,
-				 const Gamma GammaA_right,
-				 const Gamma GammaB_right,
-				 const int parity,
-				 const int * wick_contractions,
-  				 robj &result);
-  template <class mobj, class robj, int w0, int w1, int w2, int w3, int w4, int w5>
-  static void baryon_site_macro(const mobj &D1,
-				 const mobj &D2,
-				 const mobj &D3,
-				 const Gamma GammaA_left,
-				 const Gamma GammaB_left,
-				 const Gamma GammaA_right,
-				 const Gamma GammaB_right,
-				 const int parity,
-  				 robj &result);
-  template <class mobj, class robj>
-  static void baryon_site_macro(const mobj &D1,
-				 const mobj &D2,
-				 const mobj &D3,
-				 const Gamma GammaA_left,
-				 const Gamma GammaB_left,
-				 const Gamma GammaA_right,
-				 const Gamma GammaB_right,
-				 const int parity,
-				 const int * wick_contractions,
-  				 robj &result);
-  template <class mobj, class robj>
-  static inline void baryon_site_template(unsigned int mask, const mobj &D1,
+  static inline void baryon_site(unsigned int mask, const mobj &D1,
 						 const mobj &D2,
 						 const mobj &D3,
 				                 const Gamma GammaA_left,
@@ -93,7 +60,7 @@ public:
 						 const int parity,
 						 robj &result);
   template <unsigned int mask, class mobj, class robj>
-  static inline void baryon_site_template(const mobj &D1,
+  static inline void baryon_site(const mobj &D1,
 						 const mobj &D2,
 						 const mobj &D3,
 				                 const Gamma GammaA_left,
@@ -208,266 +175,12 @@ public:
 
 template <class FImpl> 
 const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
-/*template <class FImpl> 
-const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
-						    Complex(1),
-						    Complex(1),
-						    Complex(-1),
-						    Complex(-1),
-						    Complex(-1)};
-*/
 template <class FImpl> 
 const double BaryonUtils<FImpl>::epsilon_sgn[6] = {1.0,1.0,1.0,-1.0,-1.0,-1.0};
 
-//This is the old version
-template <class FImpl>
-template <class mobj, class robj>
-void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const int parity,
-						 const int * wick_contraction,
-						 robj &result)
-{
-
-  Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
-
-    auto gD1a = GammaA_left * GammaA_right * D1;
-    auto gD1b = GammaA_left * g4 * GammaA_right * D1;
-    auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
-    auto gD3 = GammaB_right * D3;
-
-    for (int ie_left=0; ie_left < 6 ; ie_left++){
-      int a_left = epsilon[ie_left][0]; //a
-      int b_left = epsilon[ie_left][1]; //b
-      int c_left = epsilon[ie_left][2]; //c
-      for (int ie_right=0; ie_right < 6 ; ie_right++){
-        int a_right = epsilon[ie_right][0]; //a'
-        int b_right = epsilon[ie_right][1]; //b'
-        int c_right = epsilon[ie_right][2]; //c'
-	//complex<double> ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
-	double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
-        //This is the \delta_{456}^{123} part
-	if (wick_contraction[0]){
-          auto D2g = D2 * GammaB_left;
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += ee * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
-          }}}
-  	}	  
-        //This is the \delta_{456}^{231} part
-	if (wick_contraction[1]){
-          auto pD1g = pD1 * GammaB_left;
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += ee * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
-          }}}
-        }	  
-        //This is the \delta_{456}^{312} part
-	if (wick_contraction[2]){
-          auto gD3g = gD3 * GammaB_left;
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() += ee * pD1()(gamma_left,beta_left)(c_right,b_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
-          }}}
-        }	  
-        //This is the \delta_{456}^{132} part
-	if (wick_contraction[3]){
-          auto gD3g = gD3 * GammaB_left;
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= ee * pD1()(gamma_left,gamma_left)(c_right,c_left)*D2()(alpha_right,beta_left)(a_right,b_left)*gD3g()(alpha_right,beta_left)(b_right,a_left);
-          }}}
-        }	  
-        //This is the \delta_{456}^{321} part
-	if (wick_contraction[4]){
-          auto D2g = D2 * GammaB_left;
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= ee * pD1()(gamma_left,beta_left)(c_right,b_left)*D2g()(alpha_right,beta_left)(a_right,a_left)*gD3()(alpha_right,gamma_left)(b_right,c_left);
-          }}}
-        }	  
-        //This is the \delta_{456}^{213} part
-	if (wick_contraction[5]){
-          auto pD1g = pD1 * GammaB_left;
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-	    result()()() -= ee * pD1g()(gamma_left,beta_left)(c_right,a_left)*D2()(alpha_right,gamma_left)(a_right,c_left)*gD3()(alpha_right,beta_left)(b_right,b_left);
-          }}}
-        }	  
-      }
-    }
-}
-
-template <class FImpl>
-template <class mobj, class robj, int w0, int w1, int w2, int w3, int w4, int w5>
-void BaryonUtils<FImpl>::baryon_site_macro(const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const int parity,
-						 robj &result)
-{
-
-  Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
-
-    auto gD1a = GammaA_left * GammaA_right * D1;
-    auto gD1b = GammaA_left * g4 * GammaA_right * D1;
-    auto pD1 = 0.5* (gD1a + (double)parity * gD1b);
-    auto gD3 = GammaB_right * D3;
-
-    auto D2g = D2 * GammaB_left;
-    auto pD1g = pD1 * GammaB_left;
-    auto gD3g = gD3 * GammaB_left;
-
-    for (int ie_left=0; ie_left < 6 ; ie_left++){
-      int a_left = epsilon[ie_left][0]; //a
-      int b_left = epsilon[ie_left][1]; //b
-      int c_left = epsilon[ie_left][2]; //c
-      for (int ie_right=0; ie_right < 6 ; ie_right++){
-        int a_right = epsilon[ie_right][0]; //a'
-        int b_right = epsilon[ie_right][1]; //b'
-        int c_right = epsilon[ie_right][2]; //c'
-	double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
-        //All parts together
-	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
-            auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
-	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
-            auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
-	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
-	  for (int beta_left=0; beta_left<Ns; beta_left++){
-            auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
-            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
-	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
-	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
-	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
-	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
-	    if(w0){
-	        result()()() += eepD1*D2g_ab*gD3_ab;
-	    }
-  	    if(w1){
-		result()()() += eepD1g_gb*D2_ab*gD3_ag;
-	    }
-	    if(w2){
-		result()()() += eepD1_gb*D2_ag*gD3g_ab;
-	    }
-	    if(w3){
-    		result()()() -= eepD1*D2_ab*gD3g_ab;
-	    }
-    	    if(w4){
-		result()()() -= eepD1_gb*D2g_ab*gD3_ag;
-	    }
-            if(w5){
-    	        result()()() -= eepD1g_gb*D2_ag*gD3_ab;
-            }
-  	  }}}
-      }
-    }
-}
-
-#define BARYON_SITE(w0, w1, w2, w3, w4, w5, D1, D2, D3, GA_l, GB_l, GA_r, GB_r, parity, wick_contraction, result) \
-	if((wick_contraction[0] == w0) && (wick_contraction[1] == w1) &&  (wick_contraction[2] == w2) &&  (wick_contraction[3] == w3) &&  (wick_contraction[4] == w4) &&  (wick_contraction[5] == w5)) \
-{\
- baryon_site_macro<mobj, robj, w0, w1, w2, w3, w4, w5>( D1, D2, D3, GA_l, GB_l, GA_r, GB_r, parity, result );\
-}
-
-template <class FImpl>
-template <class mobj, class robj>
-void BaryonUtils<FImpl>::baryon_site_macro(const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const int parity,
-						 const int * wick_contraction,
-						 robj &result)
-{
-BARYON_SITE( 0 , 0 , 0 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 0 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 0 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 0 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 0 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 0 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 0 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 0 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 1 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 1 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 1 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 1 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 1 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 1 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 1 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 0 , 1 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 0 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 0 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 0 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 0 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 0 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 0 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 0 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 0 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 1 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 1 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 1 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 1 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 1 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 1 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 1 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 0 , 1 , 1 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 0 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 0 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 0 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 0 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 0 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 0 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 0 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 0 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 1 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 1 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 1 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 1 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 1 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 1 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 1 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 0 , 1 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 0 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 0 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 0 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 0 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 0 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 0 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 0 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 0 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 1 , 0 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 1 , 0 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 1 , 0 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 1 , 0 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 1 , 1 , 0 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 1 , 1 , 0 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 1 , 1 , 1 , 0 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
-BARYON_SITE( 1 , 1 , 1 , 1 , 1 , 1 ,  D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result);
- 
-}
-
-
 template <class FImpl>
 template <unsigned int mask, class mobj, class robj>
-inline void BaryonUtils<FImpl>::baryon_site_template(const mobj &D1,
+inline void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 						 const mobj &D2,
 						 const mobj &D3,
 				                 const Gamma GammaA_left,
@@ -556,7 +269,7 @@ inline void BaryonUtils<FImpl>::BaryonSiteHelper<maxMask>::function(const unsign
     assert(mask <= maxMask);
     if (mask == maxMask)
     {
-        baryon_site_template<maxMask,decltype(D1),decltype(result)>(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);
+        baryon_site<maxMask,decltype(D1),decltype(result)>(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);
     }
     else
     {
@@ -567,7 +280,7 @@ inline void BaryonUtils<FImpl>::BaryonSiteHelper<maxMask>::function(const unsign
 // top-level function
 template <class FImpl>
 template <class mobj, class robj>
-inline void BaryonUtils<FImpl>::baryon_site_template(const unsigned int mask, const mobj &D1,
+inline void BaryonUtils<FImpl>::baryon_site(const unsigned int mask, const mobj &D1,
 						 const mobj &D2,
 						 const mobj &D3,
 				                 const Gamma GammaA_left,
@@ -594,40 +307,30 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 						 const int parity,
 						 ComplexField &baryon_corr)
 {
-    const std::chrono::system_clock::time_point start{ std::chrono::system_clock::now() };
-    std::time_t now = std::chrono::system_clock::to_time_t( start );
-    std::cout << "Setup start " << std::ctime( &now );
+   
+    assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+    assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
 
-  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
+    std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
     std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
     std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
     std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
     std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
  
-  assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
+    assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
 
-  GridBase *grid = q1_left.Grid();
+    GridBase *grid = q1_left.Grid();
 
-  int wick_contraction[6];
-  for (int ie=0; ie < 6 ; ie++)
-    wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
+    int wick_id;
+    for (int ie=0; ie < 6 ; ie++)
+      wick_id = ((quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0) << (5-ie);
 
-  auto vbaryon_corr= baryon_corr.View();
-  auto v1 = q1_left.View();
-  auto v2 = q2_left.View();
-  auto v3 = q3_left.View();
+    auto vbaryon_corr= baryon_corr.View();
+    auto v1 = q1_left.View();
+    auto v2 = q2_left.View();
+    auto v3 = q3_left.View();
 
-    const std::chrono::system_clock::time_point stop{ std::chrono::system_clock::now() };
-    now = std::chrono::system_clock::to_time_t( stop );
-    const std::chrono::duration<double> duration_seconds = stop - start;
-    const double seconds{ ( duration_seconds.count() ) };
-    std::cout << "Setup stop " << std::ctime( &now )
-              << "Total duration " << std::fixed << std::setprecision(5) << seconds << " seconds." << std::endl;
-
-    const std::chrono::system_clock::time_point start2{ std::chrono::system_clock::now() };
-    now = std::chrono::system_clock::to_time_t( start2 );
-    std::cout << "Normal Loop start " << std::ctime( &now );
- // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+    // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
   thread_for(ss,grid->oSites(),{
   //for(int ss=0; ss < grid->oSites(); ss++){
 
@@ -636,58 +339,9 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
     auto D3 = v3[ss];
 
     vobj result=Zero();
-    baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
+    baryon_site(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result);
     vbaryon_corr[ss] = result; 
   }  );//end loop over lattice sites
-    const std::chrono::system_clock::time_point stop2{ std::chrono::system_clock::now() };
-    now = std::chrono::system_clock::to_time_t( stop2 );
-    const std::chrono::duration<double> duration_seconds2 = stop2 - start2;
-    const double seconds2{ ( duration_seconds2.count() ) };
-    std::cout << "Normal Loop stop " << std::ctime( &now )
-              << "Total duration " << std::fixed << std::setprecision(5) << seconds2 << " seconds." << std::endl;
-	      const std::chrono::system_clock::time_point start4{ std::chrono::system_clock::now() };
-    now = std::chrono::system_clock::to_time_t( start4 );
-    std::cout << "Opt-macro Loop start " << std::ctime( &now );
- // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
-  thread_for(ss,grid->oSites(),{
-  //for(int ss=0; ss < grid->oSites(); ss++){
-
-    auto D1 = v1[ss];
-    auto D2 = v2[ss];
-    auto D3 = v3[ss];
-
-    vobj result=Zero();
-    baryon_site_macro(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
-    vbaryon_corr[ss] = result; 
-  }  );//end loop over lattice sites
-    const std::chrono::system_clock::time_point stop4{ std::chrono::system_clock::now() };
-    now = std::chrono::system_clock::to_time_t( stop4 );
-    const std::chrono::duration<double> duration_seconds4 = stop4 - start4;
-    const double seconds4{ ( duration_seconds4.count() ) };
-    std::cout << "Opt-macro Loop stop " << std::ctime( &now )
-              << "Total duration " << std::fixed << std::setprecision(5) << seconds4 << " seconds." << std::endl; 
-    const std::chrono::system_clock::time_point start3{ std::chrono::system_clock::now() };
-    now = std::chrono::system_clock::to_time_t( start3 );
-    int wick_id=32*wick_contraction[0]+16*wick_contraction[1]+8*wick_contraction[2]+4*wick_contraction[3]+2*wick_contraction[4]+wick_contraction[5];
-    std::cout << "Opt-template Loop start " << std::ctime( &now );
- // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
-  thread_for(ss,grid->oSites(),{
-  //for(int ss=0; ss < grid->oSites(); ss++){
-
-    auto D1 = v1[ss];
-    auto D2 = v2[ss];
-    auto D3 = v3[ss];
-
-    vobj result=Zero();
-    baryon_site_template(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result);
-    vbaryon_corr[ss] = result; 
-  }  );//end loop over lattice sites
-    const std::chrono::system_clock::time_point stop3{ std::chrono::system_clock::now() };
-    now = std::chrono::system_clock::to_time_t( stop3 );
-    const std::chrono::duration<double> duration_seconds3 = stop3 - start3;
-    const double seconds3{ ( duration_seconds3.count() ) };
-    std::cout << "Opt-template Loop stop " << std::ctime( &now )
-              << "Total duration " << std::fixed << std::setprecision(5) << seconds3 << " seconds." << std::endl;
 
 }
 template <class FImpl>
@@ -704,20 +358,24 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
 						 const int parity,
 						 robj &result)
 {
-  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
+
+    assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+    assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+    
+    std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
     std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
     std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
     std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
     std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
  
-  assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
+    assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
 
-  int wick_contraction[6];
-  for (int ie=0; ie < 6 ; ie++)
-    wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
-
-     result=Zero();
-     baryon_site<decltype(D1),decltype(result)>(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
+    int wick_id;
+    for (int ie=0; ie < 6 ; ie++)
+      wick_id = ((quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0) << (5-ie);
+  
+    result=Zero();
+    baryon_site(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result);
 }
 
 /***********************************************************************
@@ -957,6 +615,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
 						 const std::string op,
 						 SpinMatrixField &stn_corr)
 {
+
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
   GridBase *grid = qs_ti.Grid();
 
   auto vcorr= stn_corr.View();
@@ -994,6 +656,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
 						 const std::string op,
 						 SpinMatrixField &stn_corr)
 {
+
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
   GridBase *grid = qs_ti.Grid();
 
   auto vcorr= stn_corr.View();

From 253bcc3426a212675cc497147c6142c6700102ee Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Thu, 7 May 2020 18:03:17 +0100
Subject: [PATCH 05/19] back to old baryon_site version (wick_contractions array, Complex epsilon_sgn)

---
 Grid/qcd/utils/BaryonUtils.h | 249 +++++++++++++++--------------------
 1 file changed, 105 insertions(+), 144 deletions(-)

diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h
index 241395c3..6cf526c3 100644
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -46,44 +46,20 @@ public:
   typedef typename SpinMatrixField::vector_object sobj;
 
   static const int epsilon[6][3] ;
-  static const double epsilon_sgn[6];
+  static const Complex epsilon_sgn[6];
 
   private: 
   template <class mobj, class robj>
-  static inline void baryon_site(unsigned int mask, const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const int parity,
-						 robj &result);
-  template <unsigned int mask, class mobj, class robj>
-  static inline void baryon_site(const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const int parity,
-						 robj &result);
-						 
-  template <unsigned int maxMask>
-  struct BaryonSiteHelper
-  {
-  template <class mobj, class robj>
-    static inline void function(const unsigned int mask, const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const int parity,
-						 robj &result);
-						 };
+  static void baryon_site(const mobj &D1,
+				 const mobj &D2,
+				 const mobj &D3,
+				 const Gamma GammaA_left,
+				 const Gamma GammaB_left,
+				 const Gamma GammaA_right,
+				 const Gamma GammaB_right,
+				 const int parity,
+				 const int * wick_contractions,
+  				 robj &result);
   public:
   static void ContractBaryons(const PropagatorField &q1_left,
 				 const PropagatorField &q2_left,
@@ -176,11 +152,17 @@ public:
 template <class FImpl> 
 const int BaryonUtils<FImpl>::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}};
 template <class FImpl> 
-const double BaryonUtils<FImpl>::epsilon_sgn[6] = {1.0,1.0,1.0,-1.0,-1.0,-1.0};
+const Complex BaryonUtils<FImpl>::epsilon_sgn[6] = {Complex(1),
+						    Complex(1),
+						    Complex(1),
+						    Complex(-1),
+						    Complex(-1),
+						    Complex(-1)};
 
+//This is the old version
 template <class FImpl>
-template <unsigned int mask, class mobj, class robj>
-inline void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
+template <class mobj, class robj>
+void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 						 const mobj &D2,
 						 const mobj &D3,
 				                 const Gamma GammaA_left,
@@ -188,14 +170,9 @@ inline void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
 				                 const Gamma GammaA_right,
 		                 		 const Gamma GammaB_right,
 						 const int parity,
+						 const int * wick_contraction,
 						 robj &result)
 {
-    constexpr bool wick_contraction_0 = ((mask & (1 << 5)) >> 5);
-    constexpr bool wick_contraction_1 = ((mask & (1 << 4)) >> 4);
-    constexpr bool wick_contraction_2 = ((mask & (1 << 3)) >> 3);
-    constexpr bool wick_contraction_3 = ((mask & (1 << 2)) >> 2);
-    constexpr bool wick_contraction_4 = ((mask & (1 << 1)) >> 1);
-    constexpr bool wick_contraction_5 = ((mask & (1 << 0)) >> 0);
 
   Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4)
 
@@ -216,84 +193,77 @@ inline void BaryonUtils<FImpl>::baryon_site(const mobj &D1,
         int a_right = epsilon[ie_right][0]; //a'
         int b_right = epsilon[ie_right][1]; //b'
         int c_right = epsilon[ie_right][2]; //c'
-	double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
-        //All parts together
+	Complex ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right];
+        //This is the \delta_{456}^{123} part
+	if (wick_contraction[0]){
 	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
             auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
+	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
+	        result()()() += eepD1*D2g_ab*gD3_ab;
+          }}}
+  	}	  
+        //This is the \delta_{456}^{231} part
+	if (wick_contraction[1]){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
 	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
             auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+            auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
+	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
+		result()()() += eepD1g_gb*D2_ab*gD3_ag;
+          }}}
+        }	  
+        //This is the \delta_{456}^{312} part
+	if (wick_contraction[2]){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
+	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
+		result()()() += eepD1_gb*D2_ag*gD3g_ab;
+          }}}
+        }	  
+        //This is the \delta_{456}^{132} part
+	if (wick_contraction[3]){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+            auto eepD1 = ee * pD1()(gamma_left,gamma_left)(c_right,c_left);
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
+	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
+    		result()()() -= eepD1*D2_ab*gD3g_ab;
+          }}}
+        }	  
+        //This is the \delta_{456}^{321} part
+	if (wick_contraction[4]){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
+            auto gD3_ag = gD3()(alpha_right,gamma_left)(b_right,c_left);
+	  for (int beta_left=0; beta_left<Ns; beta_left++){
+            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
+	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
+		result()()() -= eepD1_gb*D2g_ab*gD3_ag;
+          }}}
+        }	  
+        //This is the \delta_{456}^{213} part
+	if (wick_contraction[5]){
+	  for (int gamma_left=0; gamma_left<Ns; gamma_left++){
+	  for (int alpha_right=0; alpha_right<Ns; alpha_right++){
 	    auto D2_ag = D2()(alpha_right,gamma_left)(a_right,c_left);
 	  for (int beta_left=0; beta_left<Ns; beta_left++){
             auto eepD1g_gb = ee * pD1g()(gamma_left,beta_left)(c_right,a_left);
-            auto eepD1_gb = ee * pD1()(gamma_left,beta_left)(c_right,b_left);
-	    auto D2g_ab = D2g()(alpha_right,beta_left)(a_right,a_left);
-	    auto D2_ab = D2()(alpha_right,beta_left)(a_right,b_left);
 	    auto gD3_ab = gD3()(alpha_right,beta_left)(b_right,b_left);
-	    auto gD3g_ab = gD3g()(alpha_right,beta_left)(b_right,a_left);
-	    if(wick_contraction_0){
-	      result()()() += eepD1*D2g_ab*gD3_ab;
-	    }
-  	    if(wick_contraction_1){
-		    result()()() += eepD1g_gb*D2_ab*gD3_ag;
-	    }
-	    if(wick_contraction_2){
-		    result()()() += eepD1_gb*D2_ag*gD3g_ab;
-	    }
-            if(wick_contraction_3){
-    		result()()() -= eepD1*D2_ab*gD3g_ab;
-	    }
-    	    if(wick_contraction_4){
-		result()()() -= eepD1_gb*D2g_ab*gD3_ag;
-	    }
-            if(wick_contraction_5){
     	        result()()() -= eepD1g_gb*D2_ag*gD3_ab;
-            }
-  	  }}}
+          }}}
+        }	  
       }
     }
 }
 
-template <class FImpl>
-template <unsigned int maxMask>
-template <class mobj, class robj>
-inline void BaryonUtils<FImpl>::BaryonSiteHelper<maxMask>::function(const unsigned int mask, const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const int parity,
-						 robj &result)
-{
-    assert(mask <= maxMask);
-    if (mask == maxMask)
-    {
-        baryon_site<maxMask,decltype(D1),decltype(result)>(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);
-    }
-    else
-    {
-        BaryonSiteHelper<(maxMask>0) ? maxMask-1 : 0>::function(mask, D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);
-    }
-}
-
-// top-level function
-template <class FImpl>
-template <class mobj, class robj>
-inline void BaryonUtils<FImpl>::baryon_site(const unsigned int mask, const mobj &D1,
-						 const mobj &D2,
-						 const mobj &D3,
-				                 const Gamma GammaA_left,
-				                 const Gamma GammaB_left,
-				                 const Gamma GammaA_right,
-		                 		 const Gamma GammaB_right,
-						 const int parity,
-						 robj &result)
-{
-    BaryonSiteHelper<63>::function(mask, D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result);
-}
-
-
 template<class FImpl>
 void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 						 const PropagatorField &q2_left,
@@ -307,30 +277,30 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
 						 const int parity,
 						 ComplexField &baryon_corr)
 {
-   
-    assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
-    assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
 
-    std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
+  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
     std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
     std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
     std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
     std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
  
-    assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
+  assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
 
-    GridBase *grid = q1_left.Grid();
+  GridBase *grid = q1_left.Grid();
 
-    int wick_id;
-    for (int ie=0; ie < 6 ; ie++)
-      wick_id = ((quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0) << (5-ie);
+  int wick_contraction[6];
+  for (int ie=0; ie < 6 ; ie++)
+    wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
 
-    auto vbaryon_corr= baryon_corr.View();
-    auto v1 = q1_left.View();
-    auto v2 = q2_left.View();
-    auto v3 = q3_left.View();
+  auto vbaryon_corr= baryon_corr.View();
+  auto v1 = q1_left.View();
+  auto v2 = q2_left.View();
+  auto v3 = q3_left.View();
 
-    // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
+ // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
   thread_for(ss,grid->oSites(),{
   //for(int ss=0; ss < grid->oSites(); ss++){
 
@@ -339,10 +309,9 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
     auto D3 = v3[ss];
 
     vobj result=Zero();
-    baryon_site(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result);
+    baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
     vbaryon_corr[ss] = result; 
   }  );//end loop over lattice sites
-
 }
 template <class FImpl>
 template <class mobj, class robj>
@@ -359,23 +328,23 @@ void BaryonUtils<FImpl>::ContractBaryons_Sliced(const mobj &D1,
 						 robj &result)
 {
 
-    assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
-    assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
-    
-    std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
+  std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl;
     std::cout << "GammaA (left) " << (GammaA_left.g) <<  std::endl;
     std::cout << "GammaB (left) " << (GammaB_left.g) <<  std::endl;
     std::cout << "GammaA (right) " << (GammaA_right.g) <<  std::endl;
     std::cout << "GammaB (right) " << (GammaB_right.g) <<  std::endl;
  
-    assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
+  assert(parity==1 || parity == -1 && "Parity must be +1 or -1");
 
-    int wick_id;
-    for (int ie=0; ie < 6 ; ie++)
-      wick_id = ((quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0) << (5-ie);
-  
-    result=Zero();
-    baryon_site(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result);
+  int wick_contraction[6];
+  for (int ie=0; ie < 6 ; ie++)
+    wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
+
+     result=Zero();
+     baryon_site<decltype(D1),decltype(result)>(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result);
 }
 
 /***********************************************************************
@@ -615,10 +584,6 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
 						 const std::string op,
 						 SpinMatrixField &stn_corr)
 {
-
-  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
-  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
-
   GridBase *grid = qs_ti.Grid();
 
   auto vcorr= stn_corr.View();
@@ -656,10 +621,6 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
 						 const std::string op,
 						 SpinMatrixField &stn_corr)
 {
-
-  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
-  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
-
   GridBase *grid = qs_ti.Grid();
 
   auto vcorr= stn_corr.View();

From 42bb5f0721de5cfdf0d16c6eda66a9fd9f4d13c6 Mon Sep 17 00:00:00 2001
From: ferben <ferben@debian.felix.com>
Date: Thu, 7 May 2020 18:06:12 +0100
Subject: [PATCH 06/19] assertion

---
 Grid/qcd/utils/BaryonUtils.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h
index 6cf526c3..fa2f3376 100644
--- a/Grid/qcd/utils/BaryonUtils.h
+++ b/Grid/qcd/utils/BaryonUtils.h
@@ -584,6 +584,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
 						 const std::string op,
 						 SpinMatrixField &stn_corr)
 {
+
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
   GridBase *grid = qs_ti.Grid();
 
   auto vcorr= stn_corr.View();
@@ -621,6 +625,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
 						 const std::string op,
 						 SpinMatrixField &stn_corr)
 {
+
+  assert(Ns==4 && "Baryon code only implemented for N_spin = 4");
+  assert(Nc==3 && "Baryon code only implemented for N_colour = 3");
+
   GridBase *grid = qs_ti.Grid();
 
   auto vcorr= stn_corr.View();

From 21ca182c368ba2415d874c744b8def244c0b37dd Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 8 May 2020 09:18:24 -0400
Subject: [PATCH 07/19] Comments remove

---
 Grid/algorithms/LinearOperator.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Grid/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h
index 50600d2d..a7fa1a90 100644
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -257,13 +257,11 @@ public:
       virtual  RealD Mpc      (const Field &in, Field &out) {
       Field tmp(in.Grid());
       tmp.Checkerboard() = !in.Checkerboard();
-	//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << "  _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
 
 	_Mat.Meooe(in,tmp);
 	_Mat.MooeeInv(tmp,out);
 	_Mat.Meooe(out,tmp);
 
-      //std::cout << "cb in " << in.Checkerboard() << "  cb out " << out.Checkerboard() << std::endl;
 	_Mat.Mooee(in,out);
 	return axpy_norm(out,-1.0,tmp,out);
       }

From 6859a3e1d4cf2487b6d2f7f560f845df4fd5a7af Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 8 May 2020 09:19:12 -0400
Subject: [PATCH 08/19] Schur operator

---
 benchmarks/Benchmark_schur.cc | 176 ++++++++++++++++++++++++++++++++++
 1 file changed, 176 insertions(+)
 create mode 100644 benchmarks/Benchmark_schur.cc

diff --git a/benchmarks/Benchmark_schur.cc b/benchmarks/Benchmark_schur.cc
new file mode 100644
index 00000000..afee31b0
--- /dev/null
+++ b/benchmarks/Benchmark_schur.cc
@@ -0,0 +1,176 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./benchmarks/Benchmark_schur.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+
+  Gamma::Algebra Gmu [] = { // one Gamma::Algebra tag per direction X,Y,Z,T; not referenced elsewhere in this file
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+
+void benchDw(std::vector<int> & L, int Ls);
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+  // Benchmark driver: prints a banner, then calls benchDw() once per 4d lattice stored in latts.
+  // Ls is forwarded unchanged to benchDw() as its second argument.
+  const int Ls=12;
+  std::vector< std::vector<int> > latts;
+#if 0 // disabled branch: a list of smaller volumes
+  latts.push_back(std::vector<int> ({24,24,24,24}) );
+  latts.push_back(std::vector<int> ({48,24,24,24}) );
+  latts.push_back(std::vector<int> ({96,24,24,24}) );
+  latts.push_back(std::vector<int> ({96,48,24,24}) );
+  //  latts.push_back(std::vector<int> ({96,48,48,24}) );
+  //  latts.push_back(std::vector<int> ({96,48,48,48}) );
+#else // active branch: a single 96x96x96x192 volume
+  //  latts.push_back(std::vector<int> ({96,48,48,48}) );
+  latts.push_back(std::vector<int> ({96,96,96,192}) );
+#endif
+
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  std::cout<<GridLogMessage << "Volume \t\t\tProcs \t SchurDiagOne "<<std::endl;
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+
+  for (int l=0;l<latts.size();l++){
+    std::vector<int> latt4 = latts[l];
+    std::cout << GridLogMessage <<"\t"; // start the row; the dimensions, Ls and benchDw's columns follow without a newline here
+    for(int d=0;d<Nd;d++){
+      std::cout<<latt4[d]<<"x";
+    }
+    std::cout <<Ls<<"\t" ;
+    benchDw (latt4,Ls); // benchDw writes its own columns and the line ending(s)
+  }
+  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
+  Grid_finalize();
+}
+
+
+void benchDw(std::vector<int> & latt4, int Ls)
+{
+  /////////////////////////////////////////////////////////////////////////////////////
+  // Flop-count bookkeeping behind the rates printed by this function; numbers are for Nc=3
+  /////////////////////////////////////////////////////////////////////////////////////
+  // Dw :  Ls*24*(7+48)= Ls*1320 
+  //
+  // M5D:  Ls*(4*2*Nc mul + 4*2*Nc madd ) = 3*4*2*Nc*Ls = Ls*72
+  // Meo:  Ls*24*(7+48) + Ls*72 = Ls*1392 
+  //
+  // Mee:  3*Ns*2*Nc*Ls  // Chroma 6*N5*Nc*Ns 
+  //
+  // LeemInv : 2*2*Nc*madd*Ls
+  // LeeInv  : 2*2*Nc*madd*Ls
+  // DeeInv  : 4*2*Nc*mul *Ls
+  // UeeInv  : 2*2*Nc*madd*Ls
+  // UeemInv : 2*2*Nc*madd*Ls = Nc*Ls*(8+8+8+8+8) = 40*Nc*Ls// Chroma (10*N5 - 8)*Nc*Ns ~ (40 N5 - 32)Nc flops
+  // QUDA counts as dense LsxLs real matrix x Ls x NcNsNreim => Nc*4*2 x Ls^2 FMA = 16Nc Ls^2 flops
+  // Mpc => 1452*cbvol*2*Ls flops // 
+  //     => (1344+Ls*48)*Ls*cbvol*2 flops QUDA = 1920 @Ls=12 and 2112 @Ls=16
+  /////////////////////////////////////////////////////////////////////////////////////
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  //  long unsigned int single_site_flops     = 8*Nc*(7+16*Nc)*Ls;
+  long unsigned int single_site_mpc_flops = 8*Nc*(7+16*Nc)*2*Ls + 40*Nc*2*Ls + 4*Nc*2*Ls;
+  long unsigned int single_site_quda_flops = 8*Nc*(7+16*Nc)*2*Ls + 16*Nc*Ls*Ls + 4*Nc*2*Ls;
+  std::vector<int> seeds4({1,2,3,4}); // not used again in this function
+  std::vector<int> seeds5({5,6,7,8}); // not used again in this function
+  // NOTE(review): for Nc=3 single_site_mpc_flops evaluates to 2904*Ls and is later multiplied by the full 4d volume,
+  // whereas the header comment quotes Mpc as 1452*cbvol*2*Ls; if cbvol is half the volume these differ by 2x - confirm.
+  ColourMatrixF cm = ComplexF(1.0,0.0); // not used again in this function
+
+  int ncall=300; // number of timed Mpc applications
+  RealD mass=0.1;
+  RealD M5  =1.8;
+  RealD NP = UGrid->_Nprocessors; // printed ahead of the rates below
+  double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; // product of the 4d extents; Ls enters via the per-site flop counts
+
+  LatticeGaugeFieldF Umu(UGrid); Umu=Zero();
+  MobiusFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.5,0.5);
+  
+  LatticeFermionF src_o (FrbGrid); src_o=1.0;
+  LatticeFermionF r_o   (FrbGrid); r_o=Zero();
+
+  int order =151;
+  SchurDiagOneOperator<MobiusFermionF,LatticeFermionF>  Mpc(Dw);
+  Chebyshev<LatticeFermionF>      Cheby(0.0,60.0,order);
+  // First measurement: three untimed Mpc calls, then ncall timed calls between two usecond() readings.
+  {
+    Mpc.Mpc(src_o,r_o);
+    Mpc.Mpc(src_o,r_o);
+    Mpc.Mpc(src_o,r_o);
+
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Mpc.Mpc(src_o,r_o);
+    }
+    double t1=usecond();
+
+    double flops=(single_site_mpc_flops*volume*ncall); // Mpc has 1 - Moo^-1 Moe Mee^-1 Meo  so CB cancels.
+    std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0); // t1-t0 is presumably in microseconds (see the " s" conversion on the next print)
+    flops=(single_site_quda_flops*volume*ncall);
+    std::cout <<"\t"<<flops/(t1-t0)<<"\t"<<(t1-t0)/1000./1000.<<" s\t";
+
+    // Cheby uses MpcDagMpc so 2x flops; each of the 100 passes makes one untimed call, then one timed call, and ends its output line
+    for(int i=0;i<100;i++){
+    Cheby(Mpc,src_o,r_o);
+    t0=usecond();
+    Cheby(Mpc,src_o,r_o);
+    t1=usecond();
+    flops=(single_site_mpc_flops*volume*2*order);
+    std::cout <<"\t"<<flops/(t1-t0);
+    flops=(single_site_quda_flops*volume*2*order);
+    std::cout <<"\t"<<flops/(t1-t0) << "\t" << (t1-t0)/1000./1000. <<" s";
+    std::cout <<std::endl;
+    }
+  }
+  //  Dw.Report();
+}
+
+
+

From 93920c481146e8ad46e8abacd1c314c4d65b9571 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 8 May 2020 09:19:54 -0400
Subject: [PATCH 09/19] Remove verbose

---
 Grid/qcd/action/fermion/MobiusFermion.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Grid/qcd/action/fermion/MobiusFermion.h b/Grid/qcd/action/fermion/MobiusFermion.h
index 1cbb6609..1e948092 100644
--- a/Grid/qcd/action/fermion/MobiusFermion.h
+++ b/Grid/qcd/action/fermion/MobiusFermion.h
@@ -59,7 +59,7 @@ public:
   {
     RealD eps = 1.0;
 
-    std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
+    //    std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
     Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
     assert(zdata->n==this->Ls);
 	

From 1d65e2f62ccb3555e6fec9413960bd7b8f46b84d Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 8 May 2020 09:20:54 -0400
Subject: [PATCH 10/19] Slightly faster Chebyshev; ifdef'ed out the fastest
 until tested numerics. Lifted from HDCR setup

---
 Grid/algorithms/approx/Chebyshev.h | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h
index 133db2b4..c0b0646d 100644
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -234,10 +234,9 @@ public:
 
     GridBase *grid=in.Grid();
 
-    // std::cout << "Chevyshef(): in.Grid()="<<in.Grid()<<std::endl;
-    //std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
-
     int vol=grid->gSites();
+    typedef typename Field::vector_type vector_type;
+    constexpr int Nsimd = vector_type::Nsimd();
 
     Field T0(grid); T0 = in;  
     Field T1(grid); 
@@ -258,14 +257,27 @@ public:
     //    out = ()*T0 + Coeffs[1]*T1;
     axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1);
     for(int n=2;n<order;n++){
-	
+
       Linop.HermOp(*Tn,y);
-      //     y=xscale*y+mscale*(*Tn);
-      //      *Tnp=2.0*y-(*Tnm);
-      //      out=out+Coeffs[n]* (*Tnp);
+#if 0
+      auto y_v = y.View();
+      auto Tn_v = Tn->View();
+      auto Tnp_v = Tnp->View();
+      auto Tnm_v = Tnm->View();
+      accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
+	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
+	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
+      });
+      if ( Coeffs[n] != 0.0) {
+	axpy(out,Coeffs[n],*Tnp,out);
+      }
+#else
       axpby(y,xscale,mscale,y,(*Tn));
       axpby(*Tnp,2.0,-1.0,y,(*Tnm));
-      axpy(out,Coeffs[n],*Tnp,out);
+      if ( Coeffs[n] != 0.0) {
+	axpy(out,Coeffs[n],*Tnp,out);
+      }
+#endif
       // Cycle pointers to avoid copies
       Field *swizzle = Tnm;
       Tnm    =Tn;

From 0c570824f24e2033b15d5efa291430561a9c2c83 Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Tue, 21 Apr 2020 13:26:43 +0200
Subject: [PATCH 11/19] Add missing declaration of GridCmdOptionInt

---
 Grid/util/Init.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Grid/util/Init.h b/Grid/util/Init.h
index f7f032ba..dad963a0 100644
--- a/Grid/util/Init.h
+++ b/Grid/util/Init.h
@@ -56,6 +56,7 @@ std::string GridCmdVectorIntToString(const VectorInt & vec);
 void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
 template<class VectorInt>
 void GridCmdOptionIntVector(std::string &str,VectorInt & vec);
+void GridCmdOptionInt(std::string &str,int & val);
 
 
 void GridParseLayout(char **argv,int argc,

From 779e3c74425c9d5e0a65abc0128e215a911fe90a Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Tue, 21 Apr 2020 13:30:08 +0200
Subject: [PATCH 12/19] Const-correctness for retrieval routines of
 GridStopWatch

---
 Grid/perfmon/Timer.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Grid/perfmon/Timer.h b/Grid/perfmon/Timer.h
index 88b4e1cc..2a44faee 100644
--- a/Grid/perfmon/Timer.h
+++ b/Grid/perfmon/Timer.h
@@ -110,15 +110,15 @@ public:
 #endif
     accumulator = std::chrono::duration_cast<GridUsecs>(start-start); 
   }
-  GridTime Elapsed(void) {
+  GridTime Elapsed(void) const {
     assert(running == false);
     return std::chrono::duration_cast<GridTime>( accumulator );
   }
-  uint64_t useconds(void){
+  uint64_t useconds(void) const {
     assert(running == false);
     return (uint64_t) accumulator.count();
   }
-  bool isRunning(void){
+  bool isRunning(void) const {
     return running;
   }
 };

From ab0c5d77fbce6a0525fbe4385d45bec58f3bb3cb Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Wed, 22 Apr 2020 19:50:30 +0200
Subject: [PATCH 13/19] Correct NonHermitianSchurOperatorBase

---
 Grid/algorithms/LinearOperator.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Grid/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h
index a7fa1a90..c41f8eef 100644
--- a/Grid/algorithms/LinearOperator.h
+++ b/Grid/algorithms/LinearOperator.h
@@ -364,6 +364,9 @@ public:
         void OpDir(const Field& in, Field& out, int dir, int disp) {
           assert(0);
         }
+        void OpDirAll(const Field& in, std::vector<Field>& out){
+          assert(0);
+        };
     };
 
     template<class Matrix, class Field>

From c83471bfd098c4ad36f5f368231c68dfa48ca6bf Mon Sep 17 00:00:00 2001
From: Daniel Richtmann <daniel.richtmann@gmail.com>
Date: Thu, 23 Apr 2020 10:54:19 +0200
Subject: [PATCH 14/19] Fix missing checkerboards for adj and conjugate

---
 Grid/lattice/Lattice_reality.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Grid/lattice/Lattice_reality.h b/Grid/lattice/Lattice_reality.h
index 7373b2f9..96af8dab 100644
--- a/Grid/lattice/Lattice_reality.h
+++ b/Grid/lattice/Lattice_reality.h
@@ -40,6 +40,7 @@ NAMESPACE_BEGIN(Grid);
 
 template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
   Lattice<vobj> ret(lhs.Grid());
+  ret.Checkerboard()=lhs.Checkerboard();
   auto lhs_v = lhs.View();
   auto ret_v = ret.View();
   accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
@@ -50,6 +51,7 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
 
 template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
   Lattice<vobj> ret(lhs.Grid());
+  ret.Checkerboard() = lhs.Checkerboard();
   auto lhs_v = lhs.View();
   auto ret_v = ret.View();
   accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {

From 2b576fc1852cf0dcb526b21848586f16fc730ce1 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 8 May 2020 18:54:29 -0400
Subject: [PATCH 15/19] Remove commented-out dead code

---
 benchmarks/Benchmark_staggered.cc | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/benchmarks/Benchmark_staggered.cc b/benchmarks/Benchmark_staggered.cc
index 93086927..17b73c57 100644
--- a/benchmarks/Benchmark_staggered.cc
+++ b/benchmarks/Benchmark_staggered.cc
@@ -88,25 +88,6 @@ int main (int argc, char ** argv)
     U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
   }
   ref = Zero();
-  /*  
-  { // Naive wilson implementation
-    ref = Zero();
-    for(int mu=0;mu<Nd;mu++){
-      //    ref =  src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
-      tmp = U[mu]*Cshift(src,mu,1);
-      for(int i=0;i<ref._odata.size();i++){
-	ref[i]+= tmp[i] - Gamma(Gmu[mu])*tmp[i]; ;
-      }
-
-      tmp =adj(U[mu])*src;
-      tmp =Cshift(tmp,mu,-1);
-      for(int i=0;i<ref._odata.size();i++){
-	ref[i]+= tmp[i] + Gamma(Gmu[mu])*tmp[i]; ;
-      }
-    }
-  }
-  ref = -0.5*ref;
-  */
 
   RealD mass=0.1;
   RealD c1=9.0/8.0;

From ee1de82a532858200260cc47f658640b9f73a643 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 8 May 2020 18:54:50 -0400
Subject: [PATCH 16/19] Working ITT benchmark again

---
 benchmarks/Benchmark_ITT.cc | 584 +++++++++++++++---------------------
 1 file changed, 235 insertions(+), 349 deletions(-)

diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
index 7ad4a147..1bb77aff 100644
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -30,7 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 using namespace Grid;
 
-
 std::vector<int> L_list;
 std::vector<int> Ls_list;
 std::vector<double> mflop_list;
@@ -76,7 +75,6 @@ struct controls {
   int Opt;
   int CommsOverlap;
   Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
-  //  int HugePages;
 };
 
 class Benchmark {
@@ -119,14 +117,15 @@ public:
     std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
     comms_header();
 
-    for(int lat=4;lat<=maxlat;lat+=4){
-      for(int Ls=8;Ls<=8;Ls*=2){
+    for(int lat=16;lat<=maxlat;lat+=8){
+      //      for(int Ls=8;Ls<=8;Ls*=2){
+      { int Ls=12;
 
 	Coordinate latt_size  ({lat*mpi_layout[0],
 	      lat*mpi_layout[1],
 	      lat*mpi_layout[2],
 	      lat*mpi_layout[3]});
-
+	std::cout << GridLogMessage<< latt_size <<std::endl;
 	GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 	RealD Nrank = Grid._Nprocessors;
 	RealD Nnode = Grid.NodeCount();
@@ -184,9 +183,6 @@ public:
 	}
 
 	timestat.statistics(t_time);
-	//	for(int i=0;i<t_time.size();i++){
-	//	  std::cout << i<<" "<<t_time[i]<<std::endl;
-	//	}
 
 	dbytes=dbytes*ppn;
 	double xbytes    = dbytes*0.5;
@@ -199,8 +195,6 @@ public:
 		 <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
 		 << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
 		 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
-
- 
 	
 	    }
     }    
@@ -227,14 +221,15 @@ public:
     uint64_t NN;
 
 
-  uint64_t lmax=48;
+  uint64_t lmax=32;
 #define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
 
     GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
-    for(int lat=8;lat<=lmax;lat+=4){
+    for(int lat=8;lat<=lmax;lat+=8){
 
       Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
       int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
       //      NP= Grid.RankCount();
@@ -270,191 +265,8 @@ public:
     }
   };
 
-#if 0
-  static double DWF5(int Ls,int L)
-  {
-    //    RealD mass=0.1;
-    RealD M5  =1.8;
 
-    double mflops;
-    double mflops_best = 0;
-    double mflops_worst= 0;
-    std::vector<double> mflops_all;
-
-    ///////////////////////////////////////////////////////
-    // Set/Get the layout & grid size
-    ///////////////////////////////////////////////////////
-    int threads = GridThread::GetThreads();
-    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
-    Coordinate local({L,L,L,L});
-
-    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}), 
-								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-    uint64_t NP = TmpGrid->RankCount();
-    uint64_t NN = TmpGrid->NodeCount();
-    NN_global=NN;
-    uint64_t SHM=NP/NN;
-
-    Coordinate internal;
-    if      ( SHM == 1 )   internal = Coordinate({1,1,1,1});
-    else if ( SHM == 2 )   internal = Coordinate({2,1,1,1});
-    else if ( SHM == 4 )   internal = Coordinate({2,2,1,1});
-    else if ( SHM == 8 )   internal = Coordinate({2,2,2,1});
-    else assert(0);
-
-    Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
-    Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
-
-    ///////// Welcome message ////////////
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
-    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
-    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
-    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
-    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
-    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
-    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-    ///////// Lattice Init ////////////
-    GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-    GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-    GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
-    GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
-    GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
-    GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
-
-    ///////// RNG Init ////////////
-    std::vector<int> seeds4({1,2,3,4});
-    std::vector<int> seeds5({5,6,7,8});
-    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
-    GridParallelRNG          RNG5(sFGrid);  RNG5.SeedFixedIntegers(seeds5);
-    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
-
-    ///////// Source preparation ////////////
-    LatticeFermion src   (sFGrid); 
-    LatticeFermion tmp   (sFGrid);
-    std::cout << GridLogMessage << "allocated src and tmp" << std::endl;
-    random(RNG5,src);
-    std::cout << GridLogMessage << "intialised random source" << std::endl;
-
-    RealD N2 = 1.0/::sqrt(norm2(src));
-    src = src*N2;
-    
-    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
-
-    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
-    LatticeFermion src_e (sFrbGrid);
-    LatticeFermion src_o (sFrbGrid);
-    LatticeFermion r_e   (sFrbGrid);
-    LatticeFermion r_o   (sFrbGrid);
-    LatticeFermion r_eo  (sFGrid);
-    LatticeFermion err   (sFGrid);
-    {
-
-      pickCheckerboard(Even,src_e,src);
-      pickCheckerboard(Odd,src_o,src);
-
-#if defined(AVX512) 
-      const int num_cases = 6;
-      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
-#else
-      const int num_cases = 4;
-      std::string fmt("U/S ; U/O ; G/S ; G/O ");
-#endif
-      controls Cases [] = {
-#ifdef AVX512
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
-#endif
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
-      }; 
-
-      for(int c=0;c<num_cases;c++) {
-
-	 WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
-	 WilsonKernelsStatic::Opt   = Cases[c].Opt;
-	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
-
-	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
-	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
-	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-	int nwarm = 100;
-	uint64_t ncall = 1000;
-
-	double t0=usecond();
-	sFGrid->Barrier();
-	for(int i=0;i<nwarm;i++){
-	  sDw.DhopEO(src_o,r_e,DaggerNo);
-	}
-	sFGrid->Barrier();
-	double t1=usecond();
-
-	sDw.ZeroCounters();
-	time_statistics timestat;
-	std::vector<double> t_time(ncall);
-	for(uint64_t i=0;i<ncall;i++){
-	  t0=usecond();
-	  sDw.DhopEO(src_o,r_e,DaggerNo);
-	  t1=usecond();
-	  t_time[i] = t1-t0;
-	}
-	sFGrid->Barrier();
-	
-	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-	double flops=(1344.0*volume)/2;
-	double mf_hi, mf_lo, mf_err;
-
-	timestat.statistics(t_time);
-	mf_hi = flops/timestat.min;
-	mf_lo = flops/timestat.max;
-	mf_err= flops/timestat.min * timestat.err/timestat.mean;
-
-	mflops = flops/timestat.mean;
-	mflops_all.push_back(mflops);
-	if ( mflops_best == 0   ) mflops_best = mflops;
-	if ( mflops_worst== 0   ) mflops_worst= mflops;
-	if ( mflops>mflops_best ) mflops_best = mflops;
-	if ( mflops<mflops_worst) mflops_worst= mflops;
-
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank   "<< mflops/NP<<std::endl;
-	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node   "<< mflops/NN<<std::endl;
-
-	sDw.Report();
-
-      }
-      double robust = mflops_worst/mflops_best;;
-      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
-
-      std::cout<<GridLogMessage <<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust <<std::endl;
-      std::cout<<GridLogMessage <<fmt << std::endl;
-      std::cout<<GridLogMessage;
-
-      for(int i=0;i<mflops_all.size();i++){
-	std::cout<<mflops_all[i]/NN<<" ; " ;
-      }
-      std::cout<<std::endl;
-      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-    }
-    return mflops_best;
-  }
-#endif
-
-  static double DWF(int Ls,int L, double & robust)
+  static double DWF(int Ls,int L)
   {
     RealD mass=0.1;
     RealD M5  =1.8;
@@ -471,37 +283,30 @@ public:
     Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
     Coordinate local({L,L,L,L});
 
-    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}), 
-								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), 
+								       GridDefaultSimd(Nd,vComplex::Nsimd()),
+								       GridDefaultMpi());
     uint64_t NP = TmpGrid->RankCount();
     uint64_t NN = TmpGrid->NodeCount();
     NN_global=NN;
     uint64_t SHM=NP/NN;
 
-    Coordinate internal;
-    if      ( SHM == 1 )   internal = Coordinate({1,1,1,1});
-    else if ( SHM == 2 )   internal = Coordinate({2,1,1,1});
-    else if ( SHM == 4 )   internal = Coordinate({2,2,1,1});
-    else if ( SHM == 8 )   internal = Coordinate({2,2,2,1});
-    else assert(0);
-
-    Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
-    Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
 
     ///////// Welcome message ////////////
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
     std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
     std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
     std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
-    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
-    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
-    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
+    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
+    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
+    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
     std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
-
     ///////// Lattice Init ////////////
-    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
     GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
     GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
     GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
@@ -514,74 +319,31 @@ public:
     GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
     std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
 
+    typedef DomainWallFermionF Action;
+    typedef typename Action::FermionField Fermion;
+    typedef LatticeGaugeFieldF Gauge;
+    
     ///////// Source preparation ////////////
-    LatticeFermion src   (FGrid); random(RNG5,src);
-    LatticeFermion ref   (FGrid);
-    LatticeFermion tmp   (FGrid);
+    Gauge Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
+    Fermion src   (FGrid); random(RNG5,src);
+    Fermion src_e (FrbGrid);
+    Fermion src_o (FrbGrid);
+    Fermion r_e   (FrbGrid);
+    Fermion r_o   (FrbGrid);
+    Fermion r_eo  (FGrid);
+    Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 
-    RealD N2 = 1.0/::sqrt(norm2(src));
-    std::cout<<GridLogMessage << "Normalising src  "<< N2 <<std::endl;
-    src = src*N2;
-    
-    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
-    
-
-    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-
-    ////////////////////////////////////
-    // Naive wilson implementation
-    ////////////////////////////////////
-    {
-      LatticeGaugeField Umu5d(FGrid); 
-      std::vector<LatticeColourMatrix> U(4,FGrid);
-      auto Umu_v = Umu.View();
-      auto Umu5d_v = Umu5d.View();
-      for(int ss=0;ss<Umu.Grid()->oSites();ss++){
-	for(int s=0;s<Ls;s++){
-	  Umu5d_v[Ls*ss+s] = Umu_v[ss];
-	}
-      }
-      ref = Zero();
-      for(int mu=0;mu<Nd;mu++){
-	U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
-      }
-      for(int mu=0;mu<Nd;mu++){
-	
-	tmp = U[mu]*Cshift(src,mu+1,1);
-	ref=ref + tmp - Gamma(Gmu[mu])*tmp;
-	
-	tmp =adj(U[mu])*src;
-	tmp =Cshift(tmp,mu+1,-1);
-	ref=ref + tmp + Gamma(Gmu[mu])*tmp;
-      }
-      ref = -0.5*ref;
-    }
-
-    LatticeFermion src_e (FrbGrid);
-    LatticeFermion src_o (FrbGrid);
-    LatticeFermion r_e   (FrbGrid);
-    LatticeFermion r_o   (FrbGrid);
-    LatticeFermion r_eo  (FGrid);
-    LatticeFermion err   (FGrid);
     {
 
       pickCheckerboard(Even,src_e,src);
       pickCheckerboard(Odd,src_o,src);
 
-#if defined(AVX512) 
-      const int num_cases = 6;
-      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
-#else
       const int num_cases = 4;
-      std::string fmt("U/S ; U/O ; G/S ; G/O ");
-#endif
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
+
       controls Cases [] = {
-#ifdef AVX512
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
-#endif
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
-	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
 	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
       }; 
@@ -594,15 +356,12 @@ public:
 
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
-	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
-	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
+	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
-	int nwarm = 200;
+	int nwarm = 10;
 	double t0=usecond();
 	FGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
@@ -610,9 +369,7 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
-	//	if (ncall < 500) ncall = 500;
-	uint64_t ncall = 1000;
+	uint64_t ncall = 50;
 
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));
 
@@ -649,24 +406,11 @@ public:
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
 
-	Dw.Report();
-
-	Dw.DhopEO(src_o,r_e,DaggerNo);
-	Dw.DhopOE(src_e,r_o,DaggerNo);
-	setCheckerboard(r_eo,r_o);
-	setCheckerboard(r_eo,r_e);
-	err = r_eo-ref; 
-	RealD absref = norm2(ref);
-	RealD abserr = norm2(err);
-	std::cout<<GridLogMessage << "norm diff   "<< abserr << " / " << absref<<std::endl;
-	assert(abserr<1.0e-4);
-
       }
-      robust = mflops_worst/mflops_best;
+
       std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
       std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage << std::fixed<<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust  <<std::endl;
       std::cout<<GridLogMessage <<fmt << std::endl;
       std::cout<<GridLogMessage ;
 
@@ -680,8 +424,166 @@ public:
     return mflops_best;
   }
 
+
+  static double Staggered(int L)
+  { // Times Ds.DhopEO on a lattice with L^4 sites per MPI rank for four comms configurations; returns mflops_best
+    double mflops;
+    double mflops_best = 0;
+    double mflops_worst= 0;
+    std::vector<double> mflops_all;
+    // mflops_all receives one entry per benchmark case, in the order of Cases[] below
+    ///////////////////////////////////////////////////////
+    // Set/Get the layout & grid size
+    ///////////////////////////////////////////////////////
+    int threads = GridThread::GetThreads();
+    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
+    Coordinate local({L,L,L,L});
+    // TmpGrid is only queried for rank/node counts; its 72^4 extent is not the benchmarked volume (latt4 is)
+    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), 
+								       GridDefaultSimd(Nd,vComplex::Nsimd()),
+								       GridDefaultMpi());
+    uint64_t NP = TmpGrid->RankCount();
+    uint64_t NN = TmpGrid->NodeCount();
+    NN_global=NN;
+    uint64_t SHM=NP/NN;
+    // Global lattice = local L^4 block scaled by the MPI rank geometry in each direction
+    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
+
+    ///////// Welcome message ////////////
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "Benchmark ImprovedStaggered on "<<L<<"^4 local volume "<<std::endl;
+    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
+    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
+    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
+    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
+    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
+    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+
+    ///////// Lattice Init ////////////
+    GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
+    // NOTE(review): TmpGrid, FGrid and FrbGrid are raw pointers that are never deleted in this function
+    ///////// RNG Init ////////////
+    std::vector<int> seeds4({1,2,3,4});
+    GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4);
+    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+    // Action parameters handed to the ImprovedStaggeredFermionF constructor below
+    RealD mass=0.1;
+    RealD c1=9.0/8.0;
+    RealD c2=-1.0/24.0;
+    RealD u0=1.0;
+
+    typedef ImprovedStaggeredFermionF Action;
+    typedef typename Action::FermionField Fermion; 
+    typedef LatticeGaugeFieldF Gauge;
+    // The same Umu is passed as both of the first two constructor arguments of the action
+    Gauge Umu(FGrid);  SU3::HotConfiguration(RNG4,Umu); 
+
+    typename Action::ImplParams params;
+    Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
+
+    ///////// Source preparation ////////////
+    Fermion src   (FGrid); random(RNG4,src);
+    Fermion src_e (FrbGrid);
+    Fermion src_o (FrbGrid);
+    Fermion r_e   (FrbGrid);
+    Fermion r_o   (FrbGrid);
+    Fermion r_eo  (FGrid);
+    // NOTE(review): r_o and r_eo are not referenced again in this function; src_e is filled below but never applied
+    {
+
+      pickCheckerboard(Even,src_e,src);
+      pickCheckerboard(Odd,src_o,src);
+      // fmt is printed above the per-case list; presumably its labels abbreviate Opt / comms mode / communicator policy - confirm
+      const int num_cases = 4;
+      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
+      
+      controls Cases [] = {
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
+	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
+      }; 
+      // Each case sets the static kernel/comms switches, runs nwarm warm-up calls, then times ncall calls
+      for(int c=0;c<num_cases;c++) {
+	
+	StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
+	StaggeredKernelsStatic::Opt   = Cases[c].Opt;
+	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
+      
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
+	if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+	if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
+	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+	// Warm-up: t0/t1 bracket these calls but are overwritten in the timed loop before ever being read
+	int nwarm = 10;
+	double t0=usecond();
+	FGrid->Barrier();
+	for(int i=0;i<nwarm;i++){
+	  Ds.DhopEO(src_o,r_e,DaggerNo);
+	}
+	FGrid->Barrier();
+	double t1=usecond();
+	uint64_t ncall = 500;
+	// ncall is a fixed constant here; the Broadcast presumably keeps all ranks on rank 0's value - confirm
+	FGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+	Ds.ZeroCounters();
+	// Per-call timing: every DhopEO call is individually bracketed by usecond()
+	time_statistics timestat;
+	std::vector<double> t_time(ncall);
+	for(uint64_t i=0;i<ncall;i++){
+	  t0=usecond();
+	  Ds.DhopEO(src_o,r_e,DaggerNo);
+	  t1=usecond();
+	  t_time[i] = t1-t0;
+	}
+	FGrid->Barrier();
+	// flops = 1146 * (product of latt4 extents) / 2; NOTE(review): 1146 is presumably the per-site flop count and /2 the single checkerboard - confirm
+	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+	double flops=(1146.0*volume)/2;
+	double mf_hi, mf_lo, mf_err;
+	// mf_hi uses timestat.min and mf_lo uses timestat.max; the reported rate uses timestat.mean
+	timestat.statistics(t_time);
+	mf_hi = flops/timestat.min;
+	mf_lo = flops/timestat.max;
+	mf_err= flops/timestat.min * timestat.err/timestat.mean;
+	// A value of 0 marks best/worst as "not yet set", so the first case seeds both
+	mflops = flops/timestat.mean;
+	mflops_all.push_back(mflops);
+	if ( mflops_best == 0   ) mflops_best = mflops;
+	if ( mflops_worst== 0   ) mflops_worst= mflops;
+	if ( mflops>mflops_best ) mflops_best = mflops;
+	if ( mflops<mflops_worst) mflops_worst= mflops;
+	
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
+	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;
+      
+      }
+      // Summary over all cases: best/worst totals, then every case's rate divided by NN
+      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4  Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4  Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
+      std::cout<<GridLogMessage <<fmt << std::endl;
+      std::cout<<GridLogMessage ;
+
+      for(int i=0;i<mflops_all.size();i++){
+	std::cout<<mflops_all[i]/NN<<" ; " ;
+      }
+      std::cout<<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    return mflops_best;
+  }
 };
 
+
+
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
@@ -696,62 +598,50 @@ int main (int argc, char ** argv)
 
   int do_memory=1;
   int do_comms =1;
-  int do_su3   =0;
-  int do_wilson=1;
-  int do_dwf   =1;
 
-  if ( do_su3 ) {
-    // empty for now
-  }
-#if 1
   int sel=2;
-  Coordinate L_list({8,12,16,24});
-#else
-  int sel=1;
-  Coordinate L_list({8,12});
-#endif
+  std::vector<int> L_list({16,24,32});
   int selm1=sel-1;
-  std::vector<double> robust_list;
 
   std::vector<double> wilson;
   std::vector<double> dwf4;
-  std::vector<double> dwf5;
+  std::vector<double> staggered;
 
-  if ( do_wilson ) {
-    int Ls=1;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    for(int l=0;l<L_list.size();l++){
-      double robust;
-      wilson.push_back(Benchmark::DWF(Ls,L_list[l],robust));
-    }
+  int Ls=1;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  for(int l=0;l<L_list.size();l++){
+    wilson.push_back(Benchmark::DWF(Ls,L_list[l]));
   }
 
-  int Ls=16;
-  if ( do_dwf ) {
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
-    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    for(int l=0;l<L_list.size();l++){
-      double robust;
-      double result = Benchmark::DWF(Ls,L_list[l],robust) ;
-      dwf4.push_back(result);
-      robust_list.push_back(robust);
-    }
+  Ls=12;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  for(int l=0;l<L_list.size();l++){
+    double result = Benchmark::DWF(Ls,L_list[l]) ;
+    dwf4.push_back(result);
   }
 
-  if ( do_dwf ) {
+  /*
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
+  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+  for(int l=0;l<L_list.size();l++){
+    double result = Benchmark::Staggered(L_list[l]) ;
+    staggered.push_back(result);
+  }
+  */
 
   std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
   std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 " <<std::endl;
+  std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\tt Staggered" <<std::endl;
   for(int l=0;l<L_list.size();l++){
-    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l] <<std::endl;
+    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] <<std::endl;
   }
   std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  }
 
   int NN=NN_global;
   if ( do_memory ) {
@@ -768,24 +658,20 @@ int main (int argc, char ** argv)
     Benchmark::Comms();
   }
 
-  if ( do_dwf ) {
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  " <<std::endl;
-  for(int l=0;l<L_list.size();l++){
-    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<std::endl;
-  }
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  " <<std::endl;
+    for(int l=0;l<L_list.size();l++){
+      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<std::endl;
+    }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
-  std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
-  std::cout<<std::setprecision(3);
-  std::cout<<GridLogMessage << " Comparison point robustness: "  << robust_list[sel] <<std::endl;
-  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
-  }
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
+    std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
+    std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
+    std::cout<<std::setprecision(3);
+    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
   Grid_finalize();
 }

From efe5bc6a3cb7f2069646883f29a16d3f345f74f8 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 9 May 2020 22:27:56 -0400
Subject: [PATCH 17/19] Split allocator cache into two pools of different sizes

---
 Grid/allocator/AlignedAllocator.cc | 69 +++++++++++++++---------------
 Grid/allocator/AlignedAllocator.h  | 17 ++++----
 2 files changed, 44 insertions(+), 42 deletions(-)

diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc
index d53c4dc2..77646410 100644
--- a/Grid/allocator/AlignedAllocator.cc
+++ b/Grid/allocator/AlignedAllocator.cc
@@ -6,21 +6,19 @@ NAMESPACE_BEGIN(Grid);
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;
 
-#ifdef GRID_NVCC
-#define SMALL_LIMIT (0)
-#else
-#define SMALL_LIMIT (4096)
-#endif
-
-#ifdef POINTER_CACHE
-int PointerCache::victim;
-
+int PointerCache::Victim;
+int PointerCache::VictimSmall;
 PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
+PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmall];
 
-void *PointerCache::Insert(void *ptr,size_t bytes) {
-
-  if (bytes < SMALL_LIMIT ) return ptr;
-
+void *PointerCache::Insert(void *ptr,size_t bytes) 
+{
+  if (bytes < GRID_ALLOC_SMALL_LIMIT ) 
+    return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall);
+  return Insert(ptr,bytes,Entries,Ncache,Victim);  
+}
+void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) 
+{
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
 #endif 
@@ -28,8 +26,8 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
   void * ret = NULL;
   int v = -1;
 
-  for(int e=0;e<Ncache;e++) {
-    if ( Entries[e].valid==0 ) {
+  for(int e=0;e<ncache;e++) {
+    if ( entries[e].valid==0 ) {
       v=e; 
       break;
     }
@@ -37,40 +35,43 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
 
   if ( v==-1 ) {
     v=victim;
-    victim = (victim+1)%Ncache;
+    victim = (victim+1)%ncache;
   }
 
-  if ( Entries[v].valid ) {
-    ret = Entries[v].address;
-    Entries[v].valid = 0;
-    Entries[v].address = NULL;
-    Entries[v].bytes = 0;
+  if ( entries[v].valid ) {
+    ret = entries[v].address;
+    entries[v].valid = 0;
+    entries[v].address = NULL;
+    entries[v].bytes = 0;
   }
 
-  Entries[v].address=ptr;
-  Entries[v].bytes  =bytes;
-  Entries[v].valid  =1;
+  entries[v].address=ptr;
+  entries[v].bytes  =bytes;
+  entries[v].valid  =1;
 
   return ret;
 }
 
-void *PointerCache::Lookup(size_t bytes) {
-
-  if (bytes < SMALL_LIMIT ) return NULL;
-
+void *PointerCache::Lookup(size_t bytes)
+{
+  if (bytes < GRID_ALLOC_SMALL_LIMIT ) 
+    return Lookup(bytes,EntriesSmall,NcacheSmall);
+  return Lookup(bytes,Entries,Ncache);
+}
+void *PointerCache::Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) 
+{
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
 #endif 
-
-  for(int e=0;e<Ncache;e++){
-    if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
-      Entries[e].valid = 0;
-      return Entries[e].address;
+  for(int e=0;e<ncache;e++){
+    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
+      entries[e].valid = 0;
+      return entries[e].address;
     }
   }
   return NULL;
 }
-#endif
+
 
 void check_huge_pages(void *Buf,uint64_t BYTES)
 {
diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
index 8c189be8..d6e2e073 100644
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -42,21 +42,19 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 
 #define POINTER_CACHE
 #define GRID_ALLOC_ALIGN (2*1024*1024)
+#define GRID_ALLOC_SMALL_LIMIT (4096)
 
 NAMESPACE_BEGIN(Grid);
 
 // Move control to configure.ac and Config.h?
-#ifdef POINTER_CACHE
+
 class PointerCache {
 private:
 /*Pinning pages is costly*/
 /*Could maintain separate large and small allocation caches*/
-#ifdef GRID_NVCC 
-  static const int Ncache=128;
-#else
+/* Could make these configurable, perhaps up to a max size*/
+  static const int NcacheSmall=128; 
   static const int Ncache=8;
-#endif
-  static int victim;
 
   typedef struct { 
     void *address;
@@ -65,14 +63,17 @@ private:
   } PointerCacheEntry;
     
   static PointerCacheEntry Entries[Ncache];
+  static int Victim;
+  static PointerCacheEntry EntriesSmall[NcacheSmall];
+  static int VictimSmall;
 
 public:
 
   static void *Insert(void *ptr,size_t bytes) ;
+  static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ;
   static void *Lookup(size_t bytes) ;
-
+  static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ;
 };
-#endif  
 
 std::string sizeString(size_t bytes);
 

From 2bb2c68e15572bcc6012bfe2694bdca10948463f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 9 May 2020 22:57:21 -0400
Subject: [PATCH 18/19] Separate pools for small and large allocations cache

---
 Grid/allocator/AlignedAllocator.cc               | 16 ++++++++++++++--
 Grid/allocator/AlignedAllocator.h                | 12 +++++++-----
 .../CayleyFermion5DImplementation.h              |  3 ++-
 Grid/util/Init.cc                                |  2 ++
 4 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc
index 77646410..976dfbdc 100644
--- a/Grid/allocator/AlignedAllocator.cc
+++ b/Grid/allocator/AlignedAllocator.cc
@@ -6,11 +6,23 @@ NAMESPACE_BEGIN(Grid);
 MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;
 
+int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax;
+int PointerCache::Ncache      = PointerCache::NcacheMax;
 int PointerCache::Victim;
 int PointerCache::VictimSmall;
-PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
-PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmall];
+PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax];
+PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax];
 
+void PointerCache::Init(void)
+{
+  char * str;
+  str= getenv("GRID_ALLOC_NCACHE_LARGE");
+  if ( str ) Ncache = atoi(str);
+  if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax;
+  str= getenv("GRID_ALLOC_NCACHE_SMALL");
+  if ( str ) NcacheSmall = atoi(str);
+  if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax;
+}
 void *PointerCache::Insert(void *ptr,size_t bytes) 
 {
   if (bytes < GRID_ALLOC_SMALL_LIMIT ) 
diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
index d6e2e073..77167299 100644
--- a/Grid/allocator/AlignedAllocator.h
+++ b/Grid/allocator/AlignedAllocator.h
@@ -53,8 +53,10 @@ private:
 /*Pinning pages is costly*/
 /*Could maintain separate large and small allocation caches*/
 /* Could make these configurable, perhaps up to a max size*/
-  static const int NcacheSmall=128; 
-  static const int Ncache=8;
+  static const int NcacheSmallMax=128; 
+  static const int NcacheMax=16;
+  static int NcacheSmall;
+  static int Ncache;
 
   typedef struct { 
     void *address;
@@ -62,13 +64,13 @@ private:
     int valid;
   } PointerCacheEntry;
     
-  static PointerCacheEntry Entries[Ncache];
+  static PointerCacheEntry Entries[NcacheMax];
   static int Victim;
-  static PointerCacheEntry EntriesSmall[NcacheSmall];
+  static PointerCacheEntry EntriesSmall[NcacheSmallMax];
   static int VictimSmall;
 
 public:
-
+  static void Init(void);
   static void *Insert(void *ptr,size_t bytes) ;
   static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ;
   static void *Lookup(size_t bytes) ;
diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
index c80d2425..e379026c 100644
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -779,9 +779,9 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
   assert(mu>=0);
   assert(mu<Nd);
 
-  int tshift = (mu == Nd-1) ? 1 : 0;
 
 #if 0
+  int tshift = (mu == Nd-1) ? 1 : 0;
   ////////////////////////////////////////////////
   // SHAMIR CASE 
   ////////////////////////////////////////////////
@@ -829,6 +829,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
 #endif
 
 #ifndef GRID_NVCC
+  int tshift = (mu == Nd-1) ? 1 : 0;
   ////////////////////////////////////////////////
   // GENERAL CAYLEY CASE
   ////////////////////////////////////////////////
diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index 570f4234..1b672141 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -355,6 +355,8 @@ void Grid_init(int *argc,char ***argv)
   //////////////////////////////////////////////////////////
   GridGpuInit(); // Must come first to set device prior to MPI init
 
+  PointerCache::Init();
+
   if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
     int MB;
     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");

From ea08f193e7bdd7fcb8d18a8713f0f5387def9b2f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sun, 10 May 2020 05:24:26 -0400
Subject: [PATCH 19/19] Allocator cache split into large/small pools

---
 Grid/algorithms/approx/Chebyshev.h |  2 +-
 Grid/allocator/AlignedAllocator.cc | 10 +++++++++-
 Grid/communicator/SharedMemory.cc  |  4 +++-
 benchmarks/Benchmark_schur.cc      |  4 ++--
 4 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h
index c0b0646d..584ed1d5 100644
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -236,7 +236,6 @@ public:
 
     int vol=grid->gSites();
     typedef typename Field::vector_type vector_type;
-    constexpr int Nsimd = vector_type::Nsimd();
 
     Field T0(grid); T0 = in;  
     Field T1(grid); 
@@ -264,6 +263,7 @@ public:
       auto Tn_v = Tn->View();
       auto Tnp_v = Tnp->View();
       auto Tnm_v = Tnm->View();
+      constexpr int Nsimd = vector_type::Nsimd();
       accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc
index 976dfbdc..ef6459ed 100644
--- a/Grid/allocator/AlignedAllocator.cc
+++ b/Grid/allocator/AlignedAllocator.cc
@@ -7,7 +7,11 @@ MemoryStats *MemoryProfiler::stats = nullptr;
 bool         MemoryProfiler::debug = false;
 
 int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax;
-int PointerCache::Ncache      = PointerCache::NcacheMax;
+#ifdef GRID_CUDA
+int PointerCache::Ncache      = 32;
+#else 
+int PointerCache::Ncache      = 8;
+#endif
 int PointerCache::Victim;
 int PointerCache::VictimSmall;
 PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax];
@@ -16,12 +20,16 @@ PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheS
 void PointerCache::Init(void)
 {
   char * str;
+
   str= getenv("GRID_ALLOC_NCACHE_LARGE");
   if ( str ) Ncache = atoi(str);
   if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax;
+
   str= getenv("GRID_ALLOC_NCACHE_SMALL");
   if ( str ) NcacheSmall = atoi(str);
   if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax;
+
+  //  printf("Aligned allocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax);
 }
 void *PointerCache::Insert(void *ptr,size_t bytes) 
 {
diff --git a/Grid/communicator/SharedMemory.cc b/Grid/communicator/SharedMemory.cc
index 5bca9764..de10da3d 100644
--- a/Grid/communicator/SharedMemory.cc
+++ b/Grid/communicator/SharedMemory.cc
@@ -74,7 +74,9 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
   if (heap_bytes >= heap_size) {
     std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
     std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
-    std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
+    std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
+    std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
+    std::cout<< " Current heap  is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
     assert(heap_bytes<heap_size);
   }
   //std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
diff --git a/benchmarks/Benchmark_schur.cc b/benchmarks/Benchmark_schur.cc
index afee31b0..8171998a 100644
--- a/benchmarks/Benchmark_schur.cc
+++ b/benchmarks/Benchmark_schur.cc
@@ -47,7 +47,7 @@ int main (int argc, char ** argv)
 
   const int Ls=12;
   std::vector< std::vector<int> > latts;
-#if 0
+#if 1
   latts.push_back(std::vector<int> ({24,24,24,24}) );
   latts.push_back(std::vector<int> ({48,24,24,24}) );
   latts.push_back(std::vector<int> ({96,24,24,24}) );
@@ -157,7 +157,7 @@ void benchDw(std::vector<int> & latt4, int Ls)
     std::cout <<"\t"<<flops/(t1-t0)<<"\t"<<(t1-t0)/1000./1000.<<" s\t";
 
     // Cheby uses MpcDagMpc so 2x flops
-    for(int i=0;i<100;i++){
+    for(int i=0;i<1;i++){
     Cheby(Mpc,src_o,r_o);
     t0=usecond();
     Cheby(Mpc,src_o,r_o);