From c81d3d422d1bba570f74900d7dae7cc8bc29d318 Mon Sep 17 00:00:00 2001
From: Michael Marshall <43034299+mmphys@users.noreply.github.com>
Date: Mon, 3 Jun 2019 15:25:05 +0100
Subject: [PATCH 1/8] Housekeeping. #include <Grid.h> ---> #include
 <Grid/Grid.h>

---
 .../qcd/action/fermion/ImprovedStaggeredFermion.cc |  2 +-
 Grid/qcd/action/fermion/StaggeredKernelsAsm.cc     | 14 +++++++-------
 Grid/qcd/action/fermion/StaggeredKernelsHand.cc    |  2 +-
 Grid/qcd/action/fermion/WilsonKernelsAsm.cc        |  4 ++--
 documentation/GridXcode/readme.md                  |  5 ++---
 5 files changed, 13 insertions(+), 14 deletions(-)
diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc
index 4a0f7e63..883db902 100644
--- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc
+++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.cc
@@ -26,7 +26,7 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 
 namespace Grid {
 namespace QCD {
diff --git a/Grid/qcd/action/fermion/StaggeredKernelsAsm.cc b/Grid/qcd/action/fermion/StaggeredKernelsAsm.cc
index 990ac126..9711c487 100644
--- a/Grid/qcd/action/fermion/StaggeredKernelsAsm.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernelsAsm.cc
@@ -26,11 +26,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 
 #ifdef AVX512
-#include <simd/Intel512common.h>
-#include <simd/Intel512avx.h>
+#include <Grid/simd/Intel512common.h>
+#include <Grid/simd/Intel512avx.h>
 #endif
 
 // Interleave operations from two directions
@@ -679,7 +679,7 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
   gauge3 =(uint64_t)&UU._odata[sU]( T ); 
   
   // This is the single precision 5th direction vectorised kernel
-#include <simd/Intel512single.h>
+#include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 								    DoubledGaugeField &U, DoubledGaugeField &UUU,
 								    SiteSpinor *buf, int LLs, int sU, 
@@ -732,7 +732,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
    
 }
 
-#include <simd/Intel512double.h>
+#include <Grid/simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 								    DoubledGaugeField &U, DoubledGaugeField &UUU,
 								    SiteSpinor *buf, int LLs, int sU, 
@@ -816,7 +816,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
 
   // This is the single precision 5th direction vectorised kernel
 
-#include <simd/Intel512single.h>
+#include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 							       DoubledGaugeField &U, DoubledGaugeField &UUU,
 							       SiteSpinor *buf, int LLs, int sU, 
@@ -884,7 +884,7 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
 #endif
 }
 
-#include <simd/Intel512double.h>
+#include <Grid/simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 							       DoubledGaugeField &U, DoubledGaugeField &UUU,
 							       SiteSpinor *buf, int LLs, int sU, 
diff --git a/Grid/qcd/action/fermion/StaggeredKernelsHand.cc b/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
index 47ebdd86..f304b00f 100644
--- a/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
+++ b/Grid/qcd/action/fermion/StaggeredKernelsHand.cc
@@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 
 
 #define LOAD_CHI(b)		\
diff --git a/Grid/qcd/action/fermion/WilsonKernelsAsm.cc b/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
index cd5d2430..55911988 100644
--- a/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/Grid/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -81,8 +81,8 @@ WilsonKernels<Impl >::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,Doubl
   assert(0);
 }
 
-#include <qcd/action/fermion/WilsonKernelsAsmAvx512.h>
-#include <qcd/action/fermion/WilsonKernelsAsmQPX.h>
+#include <Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h>
+#include <Grid/qcd/action/fermion/WilsonKernelsAsmQPX.h>
 
 #define INSTANTIATE_ASM(A)\
 template void WilsonKernels<A>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
diff --git a/documentation/GridXcode/readme.md b/documentation/GridXcode/readme.md
index 031ec72a..8d9d7ad8 100644
--- a/documentation/GridXcode/readme.md
+++ b/documentation/GridXcode/readme.md
@@ -262,7 +262,6 @@ Set HEADER_SEARCH_PATHS to:
 
     $Grid/build$(CONFIGURATION)/Grid
     $Grid
-    $Grid/Grid
 
 followed by (***the order is important***) the locations reported by `grid-config --cxxflags`, ignoring duplicates, e.g.:
 
@@ -272,7 +271,7 @@ followed by (***the order is important***) the locations reported by `grid-confi
 
 **Note: the easiest way to set this value is to put it all on one line, space separated, and edit the text to the right of `HEADER_SEARCH_PATHS`**, i.e.:
 
-    $Grid/build$(CONFIGURATION)/Grid $Grid $Grid/Grid $GridPre/openmpi/include $GridPkg/include $GridPre/lime/include
+    $Grid/build$(CONFIGURATION)/Grid $Grid $GridPre/openmpi/include $GridPkg/include $GridPre/lime/include
 
 #### LIBRARY_SEARCH_PATHS
 
@@ -298,7 +297,7 @@ The easiest way to link to all required libraries is to obtain a list of all lib
 
 and pasting the output ***with `-lGrid -lHadrons ` prepended*** (including the `-l` switches) directly into `OTHER_LDFLAGS`, e.g.:
 
-    -lGrid -lHadrons -lmpi -lhdf5_cpp -lz -lcrypto -llime -lfftw3f -lfftw3 -lmpfr -lgmp -lstdc++ -lm -lz -lhdf5
+    -lGrid -lHadrons -lmpi -lhdf5_cpp -lhdf5 -lz -lcrypto -llime -lfftw3f -lfftw3 -lmpfr -lgmp -lm
 
 ## Make additional configurations
 

From 1059189abfa4694e9ad88dd1d71ce86b37801b02 Mon Sep 17 00:00:00 2001
From: fionnoh <fionnoh@gmail.com>
Date: Thu, 27 Jun 2019 13:49:55 +0800
Subject: [PATCH 2/8] Bugfix for A2ALoop module

---
 Hadrons/Modules/MContraction/A2ALoop.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Hadrons/Modules/MContraction/A2ALoop.hpp b/Hadrons/Modules/MContraction/A2ALoop.hpp
index 2ef99354..7e7ffac0 100644
--- a/Hadrons/Modules/MContraction/A2ALoop.hpp
+++ b/Hadrons/Modules/MContraction/A2ALoop.hpp
@@ -112,7 +112,7 @@ void TA2ALoop<FImpl>::execute(void)
     loop = zero;
     for (unsigned int i = 0; i < left.size(); ++i)
     {
-        loop += outerProduct(adj(left[i]), right[i]);
+        loop += outerProduct(left[i], right[i]);
     }
 }
 

From eac6337466c1477cf875c5c8dc489b3ad745a97e Mon Sep 17 00:00:00 2001
From: Vera Guelpers <Vera.Guelpers@ed.ac.uk>
Date: Wed, 3 Jul 2019 14:36:34 +0100
Subject: [PATCH 3/8] Hadrons: EMLepton: multiple source-sink separations at
 once

---
 Hadrons/Modules/MFermion/EMLepton.hpp | 53 +++++++++++++++++++--------
 1 file changed, 38 insertions(+), 15 deletions(-)

diff --git a/Hadrons/Modules/MFermion/EMLepton.hpp b/Hadrons/Modules/MFermion/EMLepton.hpp
index 2d26416d..3fe544d3 100644
--- a/Hadrons/Modules/MFermion/EMLepton.hpp
+++ b/Hadrons/Modules/MFermion/EMLepton.hpp
@@ -54,8 +54,9 @@ BEGIN_HADRONS_NAMESPACE
 *  - action: fermion action used for propagator (string)
 *  - emField: photon field A_mu (string)
 *  - mass: input mass for the lepton propagator
+*  - boundary: boundary conditions for the lepton propagator, e.g. "1 1 1 -1"
 *  - twist: twisted boundary for lepton propagator, e.g. "0.0 0.0 0.0 0.5"
-*  - deltat: source-sink separation
+*  - deltat: list of source-sink separations
 *
 *******************************************************************************/
 
@@ -74,7 +75,7 @@ public:
 				    double, mass,
                                     std::string , boundary,
 				    std::string,  twist,
-                                    unsigned int, deltat);
+				    std::vector<unsigned int>, deltat);
 };
 
 template <typename FImpl>
@@ -124,7 +125,12 @@ std::vector<std::string> TEMLepton<FImpl>::getInput(void)
 template <typename FImpl>
 std::vector<std::string> TEMLepton<FImpl>::getOutput(void)
 {
-    std::vector<std::string> out = {getName(), getName() + "_free"};
+    std::vector<std::string> out = {};
+    for(unsigned int i=0; i<par().deltat.size(); i++) // two outputs per separation: "<deltat>_<name>_free" and "<deltat>_<name>"
+    {
+	out.push_back(std::to_string(par().deltat[i]) + "_" + getName() + "_free");
+	out.push_back(std::to_string(par().deltat[i]) + "_" + getName());
+    }
     
     return out;
 }
@@ -134,8 +140,11 @@ template <typename FImpl>
 void TEMLepton<FImpl>::setup(void)
 {
     Ls_ = env().getObjectLs(par().action);
-    envCreateLat(PropagatorField, getName());
-    envCreateLat(PropagatorField, getName() + "_free");
+    for(unsigned int i=0; i<par().deltat.size(); i++) // create one pair of propagator fields per separation, named as in getOutput()
+    {
+	envCreateLat(PropagatorField, std::to_string(par().deltat[i]) + "_" + getName() + "_free");
+	envCreateLat(PropagatorField, std::to_string(par().deltat[i]) + "_" + getName());
+    }
     envTmpLat(FermionField, "source", Ls_);
     envTmpLat(FermionField, "sol", Ls_);
     envTmpLat(FermionField, "tmp");
@@ -156,9 +165,6 @@ void TEMLepton<FImpl>::execute(void)
     auto        &mat = envGet(FMat, par().action);
     RealD mass = par().mass;
     Complex ci(0.0,1.0);
-
-    PropagatorField &Aslashlep = envGet(PropagatorField, getName());
-    PropagatorField &lep = envGet(PropagatorField, getName() + "_free");
     
     envGetTmp(FermionField, source);
     envGetTmp(FermionField, sol);
@@ -227,6 +233,22 @@ void TEMLepton<FImpl>::execute(void)
         }
     }
 
+    for(unsigned int dt=0;dt<par().deltat.size();dt++){
+	PropagatorField &lep = envGet(PropagatorField, std::to_string(par().deltat[dt]) + "_" + getName() + "_free");
+	for(tl=0;tl<nt;tl++){
+
+	    //shift free propagator to different source positions
+	    //account for possible anti-periodic boundary in time
+	    proptmp = Cshift(freetmp,Tp, -tl);
+	    proptmp = where( tlat < tl, boundary[Tp]*proptmp, proptmp);
+
+            // free propagator for fixed source-sink separation
+	    lep = where(tlat == (tl-par().deltat[dt]+nt)%nt, proptmp, lep);
+	}
+	//account for possible anti-periodic boundary in time
+	lep = where( tlat >= nt-par().deltat[dt], boundary[Tp]*lep, lep); // NOTE(review): deltat entries are unsigned; assumes deltat[dt] <= nt, otherwise the subtraction may wrap - confirm
+    }
+
     for(tl=0;tl<nt;tl++){
 
 	//shift free propagator to different source positions
@@ -234,9 +256,6 @@ void TEMLepton<FImpl>::execute(void)
 	proptmp = Cshift(freetmp,Tp, -tl);
 	proptmp = where( tlat < tl, boundary[Tp]*proptmp, proptmp);
 
-        // free propagator for fixed source-sink separation 
-	lep = where(tlat == (tl-par().deltat+nt)%nt, proptmp, lep);
-
         // i*A_mu*gamma_mu
         sourcetmp = zero;
         for(unsigned int mu=0;mu<=3;mu++)
@@ -276,13 +295,17 @@ void TEMLepton<FImpl>::execute(void)
             }
 	}
 	// keep the result for the desired delta t
-	Aslashlep = where(tlat == (tl-par().deltat+nt)%nt, proptmp, Aslashlep);
+	for(unsigned int dt=0;dt<par().deltat.size();dt++){
+	    PropagatorField &Aslashlep = envGet(PropagatorField, std::to_string(par().deltat[dt]) + "_" + getName());
+	    Aslashlep = where(tlat == (tl-par().deltat[dt]+nt)%nt, proptmp, Aslashlep);
+	}
     }
 
     //account for possible anti-periodic boundary in time
-    Aslashlep = where( tlat >= nt-par().deltat, boundary[Tp]*Aslashlep, Aslashlep);
-    lep = where( tlat >= nt-par().deltat, boundary[Tp]*lep, lep);
-
+    for(unsigned int dt=0;dt<par().deltat.size();dt++){
+	PropagatorField &Aslashlep = envGet(PropagatorField, std::to_string(par().deltat[dt]) + "_" + getName());
+	Aslashlep = where( tlat >= nt-par().deltat[dt], boundary[Tp]*Aslashlep, Aslashlep);
+    }
 }
 
 END_MODULE_NAMESPACE

From c3d0c176abc974e77e2a9ea0bf46bc4c0554c8ab Mon Sep 17 00:00:00 2001
From: Antonin Portelli <antonin.portelli@me.com>
Date: Fri, 24 May 2019 13:08:35 +0100
Subject: [PATCH 4/8] cleaning up Kl2 contraction

---
 Hadrons/Global.hpp                            |  9 ++-
 .../MContraction/WeakMesonDecayKl2.hpp        | 79 +++++++------------
 2 files changed, 32 insertions(+), 56 deletions(-)

diff --git a/Hadrons/Global.hpp b/Hadrons/Global.hpp
index 947d962c..a525a30f 100644
--- a/Hadrons/Global.hpp
+++ b/Hadrons/Global.hpp
@@ -109,10 +109,11 @@ typedef std::vector<typename ComplexField##suffix::vector_object::scalar_object>
 
 #define FERM_TYPE_ALIASES(FImpl, suffix)\
 BASIC_TYPE_ALIASES(FImpl, suffix);\
-typedef FermionOperator<FImpl>            FMat##suffix;\
-typedef typename FImpl::FermionField      FermionField##suffix;\
-typedef typename FImpl::GaugeField        GaugeField##suffix;\
-typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;
+typedef FermionOperator<FImpl>                     FMat##suffix;\
+typedef typename FImpl::FermionField               FermionField##suffix;\
+typedef typename FImpl::GaugeField                 GaugeField##suffix;\
+typedef typename FImpl::DoubledGaugeField          DoubledGaugeField##suffix;\
+typedef Lattice<iSpinMatrix<typename FImpl::Simd>> SpinMatrixField##suffix;
 
 #define GAUGE_TYPE_ALIASES(GImpl, suffix)\
 typedef typename GImpl::GaugeField GaugeField##suffix;
diff --git a/Hadrons/Modules/MContraction/WeakMesonDecayKl2.hpp b/Hadrons/Modules/MContraction/WeakMesonDecayKl2.hpp
index fa97cda3..d7a45108 100644
--- a/Hadrons/Modules/MContraction/WeakMesonDecayKl2.hpp
+++ b/Hadrons/Modules/MContraction/WeakMesonDecayKl2.hpp
@@ -64,7 +64,7 @@ BEGIN_HADRONS_NAMESPACE
 */
 
 /******************************************************************************
- *                                TWeakMesonDecayKl2                             *
+ *                               TWeakMesonDecayKl2                           *
  ******************************************************************************/
 BEGIN_MODULE_NAMESPACE(MContraction)
 
@@ -75,7 +75,7 @@ public:
                                     std::string, q1,
                                     std::string, q2,
                                     std::string, lepton,
-				    std::string, output);
+                                    std::string, output);
 };
 
 template <typename FImpl>
@@ -83,14 +83,13 @@ class TWeakMesonDecayKl2: public Module<WeakMesonDecayKl2Par>
 {
 public:
     FERM_TYPE_ALIASES(FImpl,);
-    class Metadata: Serializable
+    typedef typename SpinMatrixField::vector_object::scalar_object SpinMatrix;
+    class Result: Serializable
     {
     public:
-        GRID_SERIALIZABLE_CLASS_MEMBERS(Metadata,
-                                        int, spinidx1,
-                                        int, spinidx2);
+        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
+                                        std::vector<SpinMatrix>, corr);
     };
-    typedef Correlator<Metadata> Result;
 public:
     // constructor
     TWeakMesonDecayKl2(const std::string name);
@@ -138,10 +137,10 @@ std::vector<std::string> TWeakMesonDecayKl2<FImpl>::getOutput(void)
 template <typename FImpl>
 void TWeakMesonDecayKl2<FImpl>::setup(void)
 {
-    envTmpLat(LatticeComplex, "c");
+    envTmpLat(ComplexField, "c");
     envTmpLat(PropagatorField, "prop_buf");
     envCreateLat(PropagatorField, getName());
-    envTmpLat(LatticeComplex, "buf");
+    envTmpLat(SpinMatrixField, "buf");
 }
 
 // execution ///////////////////////////////////////////////////////////////////
@@ -150,57 +149,33 @@ void TWeakMesonDecayKl2<FImpl>::execute(void)
 {
     LOG(Message) << "Computing QED Kl2 contractions '" << getName() << "' using"
                  << " quarks '" << par().q1 << "' and '" << par().q2 << "' and"
-		 << "lepton '"  << par().lepton << "'" << std::endl;
+                 << " lepton '"  << par().lepton << "'" << std::endl; // leading space separates "and" from "lepton" in the log line
 
+    Gamma                   g5(Gamma::Algebra::Gamma5);
+    int                     nt = env().getDim(Tp);
+    std::vector<SpinMatrix> res_summed;
+    Result                  r;
 
-    auto &res = envGet(PropagatorField, getName()); res = zero;
-    Gamma                  g5(Gamma::Algebra::Gamma5);
-    int                    nt = env().getDim(Tp);
-
-    auto &q1 = envGet(PropagatorField, par().q1);
-    auto &q2 = envGet(PropagatorField, par().q2);
+    auto &res    = envGet(PropagatorField, getName()); res = zero;
+    auto &q1     = envGet(PropagatorField, par().q1);
+    auto &q2     = envGet(PropagatorField, par().q2);
     auto &lepton = envGet(PropagatorField, par().lepton);
-    envGetTmp(LatticeComplex, buf);
-    std::vector<TComplex>  res_summed;
-    envGetTmp(LatticeComplex, c);
+    envGetTmp(SpinMatrixField, buf);
+    envGetTmp(ComplexField, c);
     envGetTmp(PropagatorField, prop_buf);  
 
-    std::vector<Result>    result;
-    Result r;
-
     for (unsigned int mu = 0; mu < 4; ++mu)
     {
-	c = zero;
-	//hadronic part: trace(q1*adj(q2)*g5*gL[mu]) 
-        c   = trace(q1*adj(q2)*g5*GammaL(Gamma::gmu[mu]));
-    	prop_buf = 1.;
-	//multiply lepton part
-	res += c * prop_buf * GammaL(Gamma::gmu[mu]) * lepton;
+        c = zero;
+        //hadronic part: trace(q1*adj(q2)*g5*gL[mu]) 
+        c = trace(q1*adj(q2)*g5*GammaL(Gamma::gmu[mu]));
+        prop_buf = 1.;
+        //multiply lepton part
+        res += c * prop_buf * GammaL(Gamma::gmu[mu]) * lepton;
     }
-
-    //loop over spinor index of lepton part
-    unsigned int i = 0;
-    for (unsigned int s1 = 0; s1 < Ns ; ++s1)
-    for (unsigned int s2 = 0; s2 < Ns ; ++s2)
-    {
-	buf = peekColour(peekSpin(res,s1,s2),0,0);
-
-	sliceSum(buf, res_summed, Tp);
-
-	r.corr.clear();
-	for (unsigned int t = 0; t < nt; ++t)
-	{
-              r.corr.push_back(TensorRemove(res_summed[t]));
-	}
-
-	r.info.spinidx1 = s1;
-	r.info.spinidx2 = s2;
-	result.push_back(r);
-
-	i+=1;
-    }
-
-    saveResult(par().output, "weakdecay", result);
+    buf = peekColour(res, 0, 0);
+    sliceSum(buf, r.corr, Tp);
+    saveResult(par().output, "weakdecay", r);
 }
 
 END_MODULE_NAMESPACE

From 9e926e3fc5f53031f195fa2ecc2ecb715190e4c0 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 19 Jul 2019 10:01:52 +0100
Subject: [PATCH 5/8] Build fix in develop

---
 Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h | 1 +
 HMC/Mobius2p1fEOFA.cc                                | 3 +++
 tests/forces/Test_dwf_force_eofa.cc                  | 2 +-
 tests/forces/Test_dwf_gpforce_eofa.cc                | 2 +-
 tests/forces/Test_mobius_force_eofa.cc               | 2 +-
 tests/forces/Test_mobius_gpforce_eofa.cc             | 2 +-
 6 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
index 25285565..c6746a88 100644
--- a/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
+++ b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h
@@ -66,6 +66,7 @@ namespace QCD{
       FermionField Phi; // the pseudofermion field for this trajectory
 
     public:
+
       ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, 
 					      AbstractEOFAFermion<Impl>& _Rop,
 					      OperatorFunction<FermionField>& HeatbathCG, 
diff --git a/HMC/Mobius2p1fEOFA.cc b/HMC/Mobius2p1fEOFA.cc
index 61b06829..997e76ab 100644
--- a/HMC/Mobius2p1fEOFA.cc
+++ b/HMC/Mobius2p1fEOFA.cc
@@ -30,7 +30,9 @@ directory
 /*  END LEGAL */
 #include <Grid/Grid.h>
 
+#ifdef GRID_DEFAULT_PRECISION_DOUBLE // MIXED_PRECISION is defined only for builds that define GRID_DEFAULT_PRECISION_DOUBLE
 #define MIXED_PRECISION
+#endif
 
 namespace Grid{ 
   namespace QCD{
@@ -346,6 +348,7 @@ int main(int argc, char **argv) {
 #else
   ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
     EOFA(Strange_Op_L, Strange_Op_R, 
+	 ActionCG,
 	 ActionCG, ActionCG,
 	 DerivativeCG, DerivativeCG,
 	 OFRp, true);
diff --git a/tests/forces/Test_dwf_force_eofa.cc b/tests/forces/Test_dwf_force_eofa.cc
index f17579ae..18e339ab 100644
--- a/tests/forces/Test_dwf_force_eofa.cc
+++ b/tests/forces/Test_dwf_force_eofa.cc
@@ -84,7 +84,7 @@ int main (int argc, char** argv)
   DomainWallEOFAFermionR Rop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, -1.0, 1, M5);
   OneFlavourRationalParams Params(0.95, 100.0, 5000, 1.0e-12, 12);
   ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
-  ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, true);
+  ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, true);
 
   Meofa.refresh(U, RNG5);
   RealD S = Meofa.S(U); // pdag M p
diff --git a/tests/forces/Test_dwf_gpforce_eofa.cc b/tests/forces/Test_dwf_gpforce_eofa.cc
index 3afeaa43..32b3ed6b 100644
--- a/tests/forces/Test_dwf_gpforce_eofa.cc
+++ b/tests/forces/Test_dwf_gpforce_eofa.cc
@@ -89,7 +89,7 @@ int main (int argc, char** argv)
   FermionAction Rop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, -1.0, 1, M5, params);
   OneFlavourRationalParams Params(0.95, 100.0, 5000, 1.0e-12, 12);
   ConjugateGradient<FermionField> CG(1.0e-12, 5000);
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, true);
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, true);
 
   Meofa.refresh(U, RNG5);
   RealD S = Meofa.S(U); // pdag M p
diff --git a/tests/forces/Test_mobius_force_eofa.cc b/tests/forces/Test_mobius_force_eofa.cc
index 2a5a7d04..68a2931f 100644
--- a/tests/forces/Test_mobius_force_eofa.cc
+++ b/tests/forces/Test_mobius_force_eofa.cc
@@ -86,7 +86,7 @@ int main (int argc, char** argv)
   MobiusEOFAFermionR Rop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, -1.0, 1, M5, b, c);
   OneFlavourRationalParams Params(0.95, 100.0, 5000, 1.0e-12, 12);
   ConjugateGradient<LatticeFermion> CG(1.0e-12, 5000);
-  ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, Params, false);
+  ExactOneFlavourRatioPseudoFermionAction<WilsonImplR> Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, false);
 
   Meofa.refresh(U, RNG5);
   RealD S = Meofa.S(U); // pdag M p
diff --git a/tests/forces/Test_mobius_gpforce_eofa.cc b/tests/forces/Test_mobius_gpforce_eofa.cc
index 72f1dee2..577b4477 100644
--- a/tests/forces/Test_mobius_gpforce_eofa.cc
+++ b/tests/forces/Test_mobius_gpforce_eofa.cc
@@ -91,7 +91,7 @@ int main (int argc, char** argv)
   FermionAction Rop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mb, mf, mb, -1.0, 1, M5, b, c, params);
   OneFlavourRationalParams Params(0.95, 100.0, 5000, 1.0e-12, 12);
   ConjugateGradient<FermionField> CG(1.0e-12, 5000);
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, Params, false);
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, false);
 
   Meofa.refresh(U, RNG5);
   RealD S = Meofa.S(U); // pdag M p

From ff325376cb19794bcf980fcdd792fb700e306ef4 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Fri, 19 Jul 2019 10:47:44 +0100
Subject: [PATCH 6/8] Fix single precision deriv test fail

---
 tests/forces/Test_rect_force.cc | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/tests/forces/Test_rect_force.cc b/tests/forces/Test_rect_force.cc
index e0ffd28c..259090eb 100644
--- a/tests/forces/Test_rect_force.cc
+++ b/tests/forces/Test_rect_force.cc
@@ -73,7 +73,7 @@ int main (int argc, char ** argv)
   ////////////////////////////////////
   // Modify the gauge field a little 
   ////////////////////////////////////
-  RealD dt = 0.0001;
+  RealD dt = 0.002;
 
   LatticeColourMatrix mommu(&Grid); 
   LatticeColourMatrix forcemu(&Grid); 
@@ -88,13 +88,7 @@ int main (int argc, char ** argv)
 
     // fourth order exponential approx
     parallel_for(auto i=mom.begin();i<mom.end();i++){ // exp(pmu dt) * Umu
-      Uprime[i](mu) = U[i](mu) + mom[i](mu)*U[i](mu)*dt 
-	+ mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt/2.0)
-	+ mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt/6.0)
-	+ mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt*dt/24.0)
-	+ mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt*dt*dt/120.0)
-	+ mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt*dt*dt*dt/720.0);
-
+      Uprime[i](mu) = U[i](mu) + mom[i](mu)*U[i](mu)*dt ; // first-order term only: U + p*U*dt, no higher powers of dt
     }
   }
 

From 76c704b84be7d041064992a80d3769bf1b714c56 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Sat, 20 Jul 2019 16:52:24 +0100
Subject: [PATCH 7/8] Intrinsics for CLANG are now fixed in v6

---
 Grid/simd/Grid_avx512.h | 79 -----------------------------------------
 1 file changed, 79 deletions(-)

diff --git a/Grid/simd/Grid_avx512.h b/Grid/simd/Grid_avx512.h
index cce77a58..7546b22d 100644
--- a/Grid/simd/Grid_avx512.h
+++ b/Grid/simd/Grid_avx512.h
@@ -485,83 +485,6 @@ namespace Optimization {
   // Some Template specialization
 
   // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
-#ifndef __INTEL_COMPILER
-#warning "Slow reduction due to incomplete reduce intrinsics"
-  //Complex float Reduce
-  template<>
-    inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
-    __m512 v1,v2;
-    v1=Optimization::Permute::Permute0(in); // avx 512; quad complex single
-    v1= _mm512_add_ps(v1,in);
-    v2=Optimization::Permute::Permute1(v1); 
-    v1 = _mm512_add_ps(v1,v2);
-    v2=Optimization::Permute::Permute2(v1); 
-    v1 = _mm512_add_ps(v1,v2);
-    u512f conv; conv.v = v1;
-    return Grid::ComplexF(conv.f[0],conv.f[1]);
-  }
-  
-  //Real float Reduce
-  template<>
-    inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
-    __m512 v1,v2;
-    v1 = Optimization::Permute::Permute0(in); // avx 512; octo-double
-    v1 = _mm512_add_ps(v1,in);
-    v2 = Optimization::Permute::Permute1(v1); 
-    v1 = _mm512_add_ps(v1,v2);
-    v2 = Optimization::Permute::Permute2(v1); 
-    v1 = _mm512_add_ps(v1,v2);
-    v2 = Optimization::Permute::Permute3(v1); 
-    v1 = _mm512_add_ps(v1,v2);
-    u512f conv; conv.v=v1;
-    return conv.f[0];
-  }
-  
-  
-  //Complex double Reduce
-  template<>
-    inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
-    __m512d v1;
-    v1 = Optimization::Permute::Permute0(in); // sse 128; paired complex single
-    v1 = _mm512_add_pd(v1,in);
-    v1 = Optimization::Permute::Permute1(in); // sse 128; paired complex single
-    v1 = _mm512_add_pd(v1,in);
-    u512d conv; conv.v = v1;
-    return Grid::ComplexD(conv.f[0],conv.f[1]);
-  }
-  
-  //Real double Reduce
-  template<>
-    inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
-    __m512d v1,v2;
-    v1 = Optimization::Permute::Permute0(in); // avx 512; quad double
-    v1 = _mm512_add_pd(v1,in);
-      v2 = Optimization::Permute::Permute1(v1); 
-      v1 = _mm512_add_pd(v1,v2);
-      v2 = Optimization::Permute::Permute2(v1); 
-      v1 = _mm512_add_pd(v1,v2);
-     u512d conv; conv.v = v1;
-     return conv.f[0];
-  }
-  
-  //Integer Reduce
-  template<>
-  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
-    // No full vector reduce, use AVX to add upper and lower halves of register
-    // and perform AVX reduction.
-    __m256i v1, v2, v3;
-    __m128i u1, u2, ret;
-    v1  = _mm512_castsi512_si256(in);       // upper half
-    v2  = _mm512_extracti32x8_epi32(in, 1); // lower half
-    v3  = _mm256_add_epi32(v1, v2);
-    v1  = _mm256_hadd_epi32(v3, v3);
-    v2  = _mm256_hadd_epi32(v1, v1);
-    u1  = _mm256_castsi256_si128(v2);        // upper half
-    u2  = _mm256_extracti128_si256(v2, 1);  // lower half
-    ret = _mm_add_epi32(u1, u2);
-    return _mm_cvtsi128_si32(ret);
-  }
-#else
   //Complex float Reduce
   template<>
   inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
@@ -590,8 +513,6 @@ namespace Optimization {
   inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
     return _mm512_reduce_add_epi32(in);
   }
-#endif
-  
   
 }
 

From 263dcbababaa49e8b50f8555a4d80be042aee6a5 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 30 Jul 2019 22:51:04 +0100
Subject: [PATCH 8/8] Simplify the comms benchmark

---
 benchmarks/Benchmark_comms.cc | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc
index 6d95bbe2..f87caef0 100644
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -188,9 +188,9 @@ int main (int argc, char ** argv)
 	rbuf[mu].resize(lat*lat*lat*Ls);
 	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
       }
+      uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 
       int ncomm;
-      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 
       for(int i=0;i<Nloop;i++){
       double start=usecond();
@@ -277,15 +277,15 @@ int main (int argc, char ** argv)
       std::vector<HalfSpinColourVectorD *> xbuf(8);
       std::vector<HalfSpinColourVectorD *> rbuf(8);
       Grid.ShmBufferFreeAll();
+      uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
       for(int d=0;d<8;d++){
-	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
+	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
+	bzero((void *)xbuf[d],bytes);
+	bzero((void *)rbuf[d],bytes);
       }
 
       int ncomm;
-      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 
       double dbytes;
       for(int i=0;i<Nloop;i++){
@@ -374,15 +374,15 @@ int main (int argc, char ** argv)
       std::vector<HalfSpinColourVectorD *> xbuf(8);
       std::vector<HalfSpinColourVectorD *> rbuf(8);
       Grid.ShmBufferFreeAll();
+      uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
       for(int d=0;d<8;d++){
-	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
+	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
+	bzero((void *)xbuf[d],bytes);
+	bzero((void *)rbuf[d],bytes);
       }
 
       int ncomm;
-      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
       double dbytes;
       for(int i=0;i<Nloop;i++){
 	double start=usecond();
@@ -472,15 +472,16 @@ int main (int argc, char ** argv)
       std::vector<HalfSpinColourVectorD *> xbuf(8);
       std::vector<HalfSpinColourVectorD *> rbuf(8);
       Grid.ShmBufferFreeAll();
+      //      uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+      uint64_t bytes = 2*1024*1024; // NOTE(review): fixed 2 MiB buffer size, independent of lat and Ls unlike the commented-out formula above - confirm intended
       for(int d=0;d<8;d++){
-	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
+	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
+	bzero((void *)xbuf[d],bytes);
+	bzero((void *)rbuf[d],bytes);
       }
 
       int ncomm;
-      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
       double dbytes;
       for(int i=0;i<Nloop;i++){
 	double start=usecond();