From ad01290545c496e3d4362aa03c0e2bce93193b2c Mon Sep 17 00:00:00 2001
From: Nils Asmussen <n.asmussen@soton.ac.uk>
Date: Mon, 19 Aug 2019 20:30:33 +0100
Subject: [PATCH 01/12] remove remnants of the namespace QCD

---
 Grid/parallelIO/NerscIO.h               | 2 +-
 Grid/qcd/hmc/HMC_GridModules.h          | 1 -
 Grid/qcd/utils/CovariantSmearing.h      | 6 +++---
 Grid/qcd/utils/LinalgUtils.h            | 1 -
 Hadrons/Utilities/HadronsXmlValidate.cc | 1 -
 5 files changed, 4 insertions(+), 7 deletions(-)
diff --git a/Grid/parallelIO/NerscIO.h b/Grid/parallelIO/NerscIO.h
index 499295c3..d3b62d1f 100644
--- a/Grid/parallelIO/NerscIO.h
+++ b/Grid/parallelIO/NerscIO.h
@@ -354,6 +354,6 @@ public:
   }
 };
 
-NAMESPACE_END(QCD);
+NAMESPACE_END(Grid);
 
 #endif
diff --git a/Grid/qcd/hmc/HMC_GridModules.h b/Grid/qcd/hmc/HMC_GridModules.h
index 0c834cf2..45b361b0 100644
--- a/Grid/qcd/hmc/HMC_GridModules.h
+++ b/Grid/qcd/hmc/HMC_GridModules.h
@@ -97,7 +97,6 @@ protected:
 ////////////////////////////////////
 // Classes for the user
 ////////////////////////////////////
-// Note: the space time grid should be out of the QCD namespace
 template <class vector_type>
 class GridFourDimModule : public GridModule
 {
diff --git a/Grid/qcd/utils/CovariantSmearing.h b/Grid/qcd/utils/CovariantSmearing.h
index 7feddea9..9ad7cd50 100644
--- a/Grid/qcd/utils/CovariantSmearing.h
+++ b/Grid/qcd/utils/CovariantSmearing.h
@@ -27,8 +27,7 @@ directory
 *************************************************************************************/
 #pragma once
 
-namespace Grid {
-namespace QCD {
+NAMESPACE_BEGIN(Grid);
 
 template <class Gimpl> class CovariantSmearing : public Gimpl 
 {
@@ -84,4 +83,5 @@ public:
     }
   }
 };
-}}
+
+NAMESPACE_END(Grid);
diff --git a/Grid/qcd/utils/LinalgUtils.h b/Grid/qcd/utils/LinalgUtils.h
index 5974f4ed..56f8f164 100644
--- a/Grid/qcd/utils/LinalgUtils.h
+++ b/Grid/qcd/utils/LinalgUtils.h
@@ -201,7 +201,6 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
   });
 }
 
-// I explicitly need these outside the QCD namespace
 template<typename vobj>
 void G5C(Lattice<vobj> &z, const Lattice<vobj> &x)
 {
diff --git a/Hadrons/Utilities/HadronsXmlValidate.cc b/Hadrons/Utilities/HadronsXmlValidate.cc
index 73cf3139..fa2bfe3a 100644
--- a/Hadrons/Utilities/HadronsXmlValidate.cc
+++ b/Hadrons/Utilities/HadronsXmlValidate.cc
@@ -29,7 +29,6 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/Application.hpp>
 
 using namespace Grid;
-using namespace QCD;
 using namespace Hadrons;
 
 int main(int argc, char *argv[])

From 9210b0aa6ea3750e3eab7c5811d5f6b33cf19848 Mon Sep 17 00:00:00 2001
From: Nils Asmussen <n.asmussen@soton.ac.uk>
Date: Tue, 20 Aug 2019 15:21:23 +0100
Subject: [PATCH 02/12] remove namespace QCD from directory HMC

---
 HMC/Mobius2p1f.cc        |  17 +-
 HMC/Mobius2p1fEOFA.cc    | 373 +++++++++++++++++++--------------------
 HMC/Mobius2p1fEOFA_F1.cc | 342 +++++++++++++++++------------------
 HMC/Mobius2p1fRHMC.cc    |  23 ++-
 4 files changed, 376 insertions(+), 379 deletions(-)

diff --git a/HMC/Mobius2p1f.cc b/HMC/Mobius2p1f.cc
index fe373dcb..5f82e0e7 100644
--- a/HMC/Mobius2p1f.cc
+++ b/HMC/Mobius2p1f.cc
@@ -31,7 +31,6 @@ directory
 
 int main(int argc, char **argv) {
   using namespace Grid;
-  using namespace Grid::QCD;
 
   Grid_init(&argc, &argv);
   int threads = GridThread::GetThreads();
@@ -44,18 +43,18 @@ int main(int argc, char **argv) {
   typedef typename FermionAction::FermionField FermionField;
 
   typedef Grid::XmlReader       Serialiser;
-  
+
   //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
   IntegratorParameters MD;
-  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
   //  MD.name    = std::string("Leap Frog");
-  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
+  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
   //  MD.name    = std::string("Force Gradient");
-  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
   MD.name    = std::string("MinimumNorm2");
   MD.MDsteps = 20;
   MD.trajL   = 1.0;
-  
+
   HMCparameters HMCparams;
   HMCparams.StartTrajectory  = 0;
   HMCparams.Trajectories     = 200;
@@ -67,7 +66,7 @@ int main(int argc, char **argv) {
 
   // Grid from the command line arguments --grid and --mpi
   TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
-  
+
   CheckpointerParameters CPparams;
   CPparams.config_prefix = "ckpoint_EODWF_lat";
   CPparams.rng_prefix    = "ckpoint_EODWF_rng";
@@ -81,7 +80,7 @@ int main(int argc, char **argv) {
   TheHMC.Resources.SetRNGSeeds(RNGpar);
 
   // Construct observables
-  // here there is too much indirection 
+  // here there is too much indirection
   typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
   TheHMC.Resources.AddObservable<PlaqObs>();
   //////////////////////////////////////////////
@@ -118,7 +117,7 @@ int main(int argc, char **argv) {
   // These lines are unecessary if BC are all periodic
   std::vector<Complex> boundary = {1,1,1,-1};
   FermionAction::ImplParams Params(boundary);
-  
+
   double StoppingCondition = 1e-10;
   double MaxCGIterations = 30000;
   ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);
diff --git a/HMC/Mobius2p1fEOFA.cc b/HMC/Mobius2p1fEOFA.cc
index 35fb19cb..4a37bc22 100644
--- a/HMC/Mobius2p1fEOFA.cc
+++ b/HMC/Mobius2p1fEOFA.cc
@@ -2,7 +2,7 @@
 
 Grid physics library, www.github.com/paboyle/Grid
 
-Source file: 
+Source file:
 
 Copyright (C) 2015-2016
 
@@ -34,140 +34,139 @@ directory
 #define MIXED_PRECISION
 #endif
 
-namespace Grid{ 
-  namespace QCD{
+NAMESPACE_BEGIN(Grid);
 
-  /*
-   * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
-   *    -- Store the single prec action operator.
-   *    -- Clone the gauge field from the operator function argument.
-   *    -- Build the mixed precision operator dynamically from the passed operator and single prec clone.
-   */
+/*
+ * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
+ *    -- Store the single prec action operator.
+ *    -- Clone the gauge field from the operator function argument.
+ *    -- Build the mixed precision operator dynamically from the passed operator and single prec clone.
+ */
 
-  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
-  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
-  public:
-    typedef typename FermionOperatorD::FermionField FieldD;
-    typedef typename FermionOperatorF::FermionField FieldF;
+template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF>
+class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
+public:
+  typedef typename FermionOperatorD::FermionField FieldD;
+  typedef typename FermionOperatorF::FermionField FieldF;
 
-    using OperatorFunction<FieldD>::operator();
+  using OperatorFunction<FieldD>::operator();
 
-    RealD   Tolerance;
-    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
-    Integer MaxInnerIterations;
-    Integer MaxOuterIterations;
-    GridBase* SinglePrecGrid4; //Grid for single-precision fields
-    GridBase* SinglePrecGrid5; //Grid for single-precision fields
-    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+  RealD   Tolerance;
+  RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
+  Integer MaxInnerIterations;
+  Integer MaxOuterIterations;
+  GridBase* SinglePrecGrid4; //Grid for single-precision fields
+  GridBase* SinglePrecGrid5; //Grid for single-precision fields
+  RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
 
-    FermionOperatorF &FermOpF;
-    FermionOperatorD &FermOpD;;
-    SchurOperatorF &LinOpF;
-    SchurOperatorD &LinOpD;
+  FermionOperatorF &FermOpF;
+  FermionOperatorD &FermOpD;;
+  SchurOperatorF &LinOpF;
+  SchurOperatorD &LinOpD;
 
-    Integer TotalInnerIterations; //Number of inner CG iterations
-    Integer TotalOuterIterations; //Number of restarts
-    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
+  Integer TotalInnerIterations; //Number of inner CG iterations
+  Integer TotalOuterIterations; //Number of restarts
+  Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
 
-    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
-						    Integer maxinnerit, 
-						    Integer maxouterit, 
-						    GridBase* _sp_grid4, 
-						    GridBase* _sp_grid5, 
-						    FermionOperatorF &_FermOpF,
-						    FermionOperatorD &_FermOpD,
-						    SchurOperatorF   &_LinOpF,
-						    SchurOperatorD   &_LinOpD): 
-      LinOpF(_LinOpF),
-      LinOpD(_LinOpD),
-      FermOpF(_FermOpF),
-      FermOpD(_FermOpD),
-      Tolerance(tol), 
-      InnerTolerance(tol), 
-      MaxInnerIterations(maxinnerit), 
-      MaxOuterIterations(maxouterit), 
-      SinglePrecGrid4(_sp_grid4),
-      SinglePrecGrid5(_sp_grid5),
-      OuterLoopNormMult(100.) 
-    { 
-      /* Debugging instances of objects; references are stored
-      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
-      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
-      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
-      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
-      */
-    };
-
-    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
-
-      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
-
-      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
-      
-      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
-      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
-      // Assumption made in code to extract gauge field
-      // We could avoid storing LinopD reference alltogether ?
-      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
-
-      ////////////////////////////////////////////////////////////////////////////////////
-      // Must snarf a single precision copy of the gauge field in Linop_d argument
-      ////////////////////////////////////////////////////////////////////////////////////
-      typedef typename FermionOperatorF::GaugeField GaugeFieldF;
-      typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
-      typedef typename FermionOperatorD::GaugeField GaugeFieldD;
-      typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
-
-      GridBase * GridPtrF = SinglePrecGrid4;
-      GridBase * GridPtrD = FermOpD.Umu.Grid();
-      GaugeFieldF     U_f  (GridPtrF);
-      GaugeLinkFieldF Umu_f(GridPtrF);
-      //      std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
-      //      std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
-
-      ////////////////////////////////////////////////////////////////////////////////////
-      // Moving this to a Clone method of fermion operator would allow to duplicate the 
-      // physics parameters and decrease gauge field copies
-      ////////////////////////////////////////////////////////////////////////////////////
-      GaugeLinkFieldD Umu_d(GridPtrD);
-      for(int mu=0;mu<Nd*2;mu++){ 
-	Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
-	precisionChange(Umu_f,Umu_d);
-	PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
-      }
-      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
-      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
-
-      ////////////////////////////////////////////////////////////////////////////////////
-      // Could test to make sure that LinOpF and LinOpD agree to single prec?
-      ////////////////////////////////////////////////////////////////////////////////////
-      /*
-      GridBase *Fgrid = psi._grid;
-      FieldD tmp2(Fgrid);
-      FieldD tmp1(Fgrid);
-      LinOpU.Op(src,tmp1);
-      LinOpD.Op(src,tmp2);
-      std::cout << " Double gauge field "<< norm2(FermOpD.Umu)<<std::endl;
-      std::cout << " Single gauge field "<< norm2(FermOpF.Umu)<<std::endl;
-      std::cout << " Test of operators "<<norm2(tmp1)<<std::endl;
-      std::cout << " Test of operators "<<norm2(tmp2)<<std::endl;
-      tmp1=tmp1-tmp2;
-      std::cout << " Test of operators diff "<<norm2(tmp1)<<std::endl;
-      */
-
-      ////////////////////////////////////////////////////////////////////////////////////
-      // Make a mixed precision conjugate gradient
-      ////////////////////////////////////////////////////////////////////////////////////
-      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
-      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
-      MPCG(src,psi);
-    }
+  MixedPrecisionConjugateGradientOperatorFunction(RealD tol,
+                                                  Integer maxinnerit,
+                                                  Integer maxouterit,
+                                                  GridBase* _sp_grid4,
+                                                  GridBase* _sp_grid5,
+                                                  FermionOperatorF &_FermOpF,
+                                                  FermionOperatorD &_FermOpD,
+                                                  SchurOperatorF   &_LinOpF,
+                                                  SchurOperatorD   &_LinOpD):
+    LinOpF(_LinOpF),
+    LinOpD(_LinOpD),
+    FermOpF(_FermOpF),
+    FermOpD(_FermOpD),
+    Tolerance(tol),
+    InnerTolerance(tol),
+    MaxInnerIterations(maxinnerit),
+    MaxOuterIterations(maxouterit),
+    SinglePrecGrid4(_sp_grid4),
+    SinglePrecGrid5(_sp_grid5),
+    OuterLoopNormMult(100.)
+  {
+    /* Debugging instances of objects; references are stored
+    std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
+    std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
+    std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
+    std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
+    */
   };
-}};
+
+  void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
+
+    std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
+
+    SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
+
+    //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
+    //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
+    // Assumption made in code to extract gauge field
+    // We could avoid storing the LinOpD reference altogether?
+    assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Must snarf a single precision copy of the gauge field in Linop_d argument
+    ////////////////////////////////////////////////////////////////////////////////////
+    typedef typename FermionOperatorF::GaugeField GaugeFieldF;
+    typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
+    typedef typename FermionOperatorD::GaugeField GaugeFieldD;
+    typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
+
+    GridBase * GridPtrF = SinglePrecGrid4;
+    GridBase * GridPtrD = FermOpD.Umu.Grid();
+    GaugeFieldF     U_f  (GridPtrF);
+    GaugeLinkFieldF Umu_f(GridPtrF);
+    //      std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
+    //      std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Moving this to a Clone method of the fermion operator would allow duplicating the
+    // physics parameters and reduce the number of gauge field copies
+    ////////////////////////////////////////////////////////////////////////////////////
+    GaugeLinkFieldD Umu_d(GridPtrD);
+    for(int mu=0;mu<Nd*2;mu++){
+      Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
+      precisionChange(Umu_f,Umu_d);
+      PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
+    }
+    pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
+    pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Could test to make sure that LinOpF and LinOpD agree to single prec?
+    ////////////////////////////////////////////////////////////////////////////////////
+    /*
+    GridBase *Fgrid = psi._grid;
+    FieldD tmp2(Fgrid);
+    FieldD tmp1(Fgrid);
+    LinOpU.Op(src,tmp1);
+    LinOpD.Op(src,tmp2);
+    std::cout << " Double gauge field "<< norm2(FermOpD.Umu)<<std::endl;
+    std::cout << " Single gauge field "<< norm2(FermOpF.Umu)<<std::endl;
+    std::cout << " Test of operators "<<norm2(tmp1)<<std::endl;
+    std::cout << " Test of operators "<<norm2(tmp2)<<std::endl;
+    tmp1=tmp1-tmp2;
+    std::cout << " Test of operators diff "<<norm2(tmp1)<<std::endl;
+    */
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Make a mixed precision conjugate gradient
+    ////////////////////////////////////////////////////////////////////////////////////
+    MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
+    std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
+    MPCG(src,psi);
+  }
+};
+
+NAMESPACE_END(Grid);
 
 int main(int argc, char **argv) {
   using namespace Grid;
-  using namespace Grid::QCD;
 
   Grid_init(&argc, &argv);
   int threads = GridThread::GetThreads();
@@ -184,18 +183,18 @@ int main(int argc, char **argv) {
   typedef typename FermionActionF::FermionField FermionFieldF;
 
   typedef Grid::XmlReader       Serialiser;
-  
+
   //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
   IntegratorParameters MD;
-  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
   //  MD.name    = std::string("Leap Frog");
-  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
+  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
   MD.name    = std::string("Force Gradient");
-  //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
+  //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
   //  MD.name    = std::string("MinimumNorm2");
   MD.MDsteps = 6;
   MD.trajL   = 1.0;
-  
+
   HMCparameters HMCparams;
   HMCparams.StartTrajectory  = 590;
   HMCparams.Trajectories     = 1000;
@@ -208,7 +207,7 @@ int main(int argc, char **argv) {
 
   // Grid from the command line arguments --grid and --mpi
   TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
-  
+
   CheckpointerParameters CPparams;
   CPparams.config_prefix = "ckpoint_EODWF_lat";
   CPparams.rng_prefix    = "ckpoint_EODWF_rng";
@@ -222,7 +221,7 @@ int main(int argc, char **argv) {
   TheHMC.Resources.SetRNGSeeds(RNGpar);
 
   // Construct observables
-  // here there is too much indirection 
+  // here there is too much indirection
   typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
   TheHMC.Resources.AddObservable<PlaqObs>();
   //////////////////////////////////////////////
@@ -233,7 +232,7 @@ int main(int argc, char **argv) {
   Real strange_mass = 0.04;
   Real pv_mass      = 1.0;
   RealD M5  = 1.8;
-  RealD b   = 1.0; 
+  RealD b   = 1.0;
   RealD c   = 0.0;
 
   std::vector<Real> hasenbusch({ 0.1, 0.3, 0.6 });
@@ -262,7 +261,7 @@ int main(int argc, char **argv) {
   std::vector<Complex> boundary = {1,1,1,-1};
   FermionAction::ImplParams Params(boundary);
   FermionActionF::ImplParams ParamsF(boundary);
-  
+
   double ActionStoppingCondition     = 1e-10;
   double DerivativeStoppingCondition = 1e-6;
   double MaxCGIterations = 30000;
@@ -293,7 +292,7 @@ int main(int argc, char **argv) {
   OFRp.degree   = 14;
   OFRp.precision= 50;
 
-  
+
   MobiusEOFAFermionR Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
   MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
   MobiusEOFAFermionR Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
@@ -310,50 +309,50 @@ int main(int argc, char **argv) {
   LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF);
 
   MxPCG_EOFA ActionCGL(ActionStoppingCondition,
-		       MX_inner,
-		       MaxCGIterations,
-		       GridPtrF,
-		       FrbGridF,
-		       Strange_Op_LF,Strange_Op_L,
-		       Strange_LinOp_LF,Strange_LinOp_L);
+                       MX_inner,
+                       MaxCGIterations,
+                       GridPtrF,
+                       FrbGridF,
+                       Strange_Op_LF,Strange_Op_L,
+                       Strange_LinOp_LF,Strange_LinOp_L);
 
   MxPCG_EOFA DerivativeCGL(DerivativeStoppingCondition,
-			   MX_inner,
-			   MaxCGIterations,
-			   GridPtrF,
-			   FrbGridF,
-			   Strange_Op_LF,Strange_Op_L,
-			   Strange_LinOp_LF,Strange_LinOp_L);
-  
-  MxPCG_EOFA ActionCGR(ActionStoppingCondition,
-		       MX_inner,
-		       MaxCGIterations,
-		       GridPtrF,
-		       FrbGridF,
-		       Strange_Op_RF,Strange_Op_R,
-		       Strange_LinOp_RF,Strange_LinOp_R);
-  
-  MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition,
-			   MX_inner,
-			   MaxCGIterations,
-			   GridPtrF,
-			   FrbGridF,
-			   Strange_Op_RF,Strange_Op_R,
-			   Strange_LinOp_RF,Strange_LinOp_R);
+                           MX_inner,
+                           MaxCGIterations,
+                           GridPtrF,
+                           FrbGridF,
+                           Strange_Op_LF,Strange_Op_L,
+                           Strange_LinOp_LF,Strange_LinOp_L);
 
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
-    EOFA(Strange_Op_L, Strange_Op_R, 
-	 ActionCG, 
-	 ActionCGL, ActionCGR,
-	 DerivativeCGL, DerivativeCGR,
-	 OFRp, true);
+  MxPCG_EOFA ActionCGR(ActionStoppingCondition,
+                       MX_inner,
+                       MaxCGIterations,
+                       GridPtrF,
+                       FrbGridF,
+                       Strange_Op_RF,Strange_Op_R,
+                       Strange_LinOp_RF,Strange_LinOp_R);
+
+  MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition,
+                           MX_inner,
+                           MaxCGIterations,
+                           GridPtrF,
+                           FrbGridF,
+                           Strange_Op_RF,Strange_Op_R,
+                           Strange_LinOp_RF,Strange_LinOp_R);
+
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
+    EOFA(Strange_Op_L, Strange_Op_R,
+         ActionCG,
+         ActionCGL, ActionCGR,
+         DerivativeCGL, DerivativeCGR,
+         OFRp, true);
 #else
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
-    EOFA(Strange_Op_L, Strange_Op_R, 
-	 ActionCG,
-	 ActionCG, ActionCG,
-	 DerivativeCG, DerivativeCG, 
-	 OFRp, true);
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
+    EOFA(Strange_Op_L, Strange_Op_R,
+         ActionCG,
+         ActionCG, ActionCG,
+         DerivativeCG, DerivativeCG,
+         OFRp, true);
 #endif
   Level1.push_back(&EOFA);
 
@@ -384,7 +383,7 @@ int main(int argc, char **argv) {
   std::vector<MxPCG *> MPCG;
   std::vector<FermionActionF *> DenominatorsF;
   std::vector<LinearOperatorD *> LinOpD;
-  std::vector<LinearOperatorF *> LinOpF; 
+  std::vector<LinearOperatorF *> LinOpF;
 
   for(int h=0;h<n_hasenbusch+1;h++){
 
@@ -403,20 +402,20 @@ int main(int argc, char **argv) {
     LinOpF.push_back(new LinearOperatorF(*DenominatorsF[h]));
 
     MPCG.push_back(new MxPCG(DerivativeStoppingCondition,
-			     MX_inner,
-			     MaxCGIterations,
-			     GridPtrF,
-			     FrbGridF,
-			     *DenominatorsF[h],*Denominators[h],
-			     *LinOpF[h], *LinOpD[h]) );
+                             MX_inner,
+                             MaxCGIterations,
+                             GridPtrF,
+                             FrbGridF,
+                             *DenominatorsF[h],*Denominators[h],
+                             *LinOpF[h], *LinOpD[h]) );
 
     ActionMPCG.push_back(new MxPCG(ActionStoppingCondition,
-				   MX_inner,
-				   MaxCGIterations,
-				   GridPtrF,
-				   FrbGridF,
-				   *DenominatorsF[h],*Denominators[h],
-				   *LinOpF[h], *LinOpD[h]) );
+                                   MX_inner,
+                                   MaxCGIterations,
+                                   GridPtrF,
+                                   FrbGridF,
+                                   *DenominatorsF[h],*Denominators[h],
+                                   *LinOpF[h], *LinOpD[h]) );
 
     // Heatbath not mixed yet. As inverts numerators not so important as raised mass.
     Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],ActionCG));
diff --git a/HMC/Mobius2p1fEOFA_F1.cc b/HMC/Mobius2p1fEOFA_F1.cc
index 3d51b16c..9d006da3 100644
--- a/HMC/Mobius2p1fEOFA_F1.cc
+++ b/HMC/Mobius2p1fEOFA_F1.cc
@@ -2,7 +2,7 @@
 
 Grid physics library, www.github.com/paboyle/Grid
 
-Source file: 
+Source file:
 
 Copyright (C) 2015-2016
 
@@ -34,123 +34,123 @@ directory
 #define MIXED_PRECISION
 #endif
 
-namespace Grid{ 
-  namespace QCD{
+NAMESPACE_BEGIN(Grid);
 
-  /*
-   * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
-   *    -- Store the single prec action operator.
-   *    -- Clone the gauge field from the operator function argument.
-   *    -- Build the mixed precision operator dynamically from the passed operator and single prec clone.
-   */
+/*
+ * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
+ *    -- Store the single prec action operator.
+ *    -- Clone the gauge field from the operator function argument.
+ *    -- Build the mixed precision operator dynamically from the passed operator and single prec clone.
+ */
 
-  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
-  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
-  public:
-    typedef typename FermionOperatorD::FermionField FieldD;
-    typedef typename FermionOperatorF::FermionField FieldF;
+template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF>
+class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
+public:
+  typedef typename FermionOperatorD::FermionField FieldD;
+  typedef typename FermionOperatorF::FermionField FieldF;
 
-    using OperatorFunction<FieldD>::operator();
+  using OperatorFunction<FieldD>::operator();
 
-    RealD   Tolerance;
-    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
-    Integer MaxInnerIterations;
-    Integer MaxOuterIterations;
-    GridBase* SinglePrecGrid4; //Grid for single-precision fields
-    GridBase* SinglePrecGrid5; //Grid for single-precision fields
-    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+  RealD   Tolerance;
+  RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
+  Integer MaxInnerIterations;
+  Integer MaxOuterIterations;
+  GridBase* SinglePrecGrid4; //Grid for single-precision fields
+  GridBase* SinglePrecGrid5; //Grid for single-precision fields
+  RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
 
-    FermionOperatorF &FermOpF;
-    FermionOperatorD &FermOpD;;
-    SchurOperatorF &LinOpF;
-    SchurOperatorD &LinOpD;
+  FermionOperatorF &FermOpF;
+  FermionOperatorD &FermOpD;;
+  SchurOperatorF &LinOpF;
+  SchurOperatorD &LinOpD;
 
-    Integer TotalInnerIterations; //Number of inner CG iterations
-    Integer TotalOuterIterations; //Number of restarts
-    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
+  Integer TotalInnerIterations; //Number of inner CG iterations
+  Integer TotalOuterIterations; //Number of restarts
+  Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
 
-    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
-						    Integer maxinnerit, 
-						    Integer maxouterit, 
-						    GridBase* _sp_grid4, 
-						    GridBase* _sp_grid5, 
-						    FermionOperatorF &_FermOpF,
-						    FermionOperatorD &_FermOpD,
-						    SchurOperatorF   &_LinOpF,
-						    SchurOperatorD   &_LinOpD): 
-      LinOpF(_LinOpF),
-      LinOpD(_LinOpD),
-      FermOpF(_FermOpF),
-      FermOpD(_FermOpD),
-      Tolerance(tol), 
-      InnerTolerance(tol), 
-      MaxInnerIterations(maxinnerit), 
-      MaxOuterIterations(maxouterit), 
-      SinglePrecGrid4(_sp_grid4),
-      SinglePrecGrid5(_sp_grid5),
-      OuterLoopNormMult(100.) 
-    { 
-      /* Debugging instances of objects; references are stored
-      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
-      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
-      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
-      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
-      */
-    };
-
-    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
-
-      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
-
-      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
-      
-      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
-      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
-      // Assumption made in code to extract gauge field
-      // We could avoid storing LinopD reference alltogether ?
-      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
-
-      ////////////////////////////////////////////////////////////////////////////////////
-      // Must snarf a single precision copy of the gauge field in Linop_d argument
-      ////////////////////////////////////////////////////////////////////////////////////
-      typedef typename FermionOperatorF::GaugeField GaugeFieldF;
-      typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
-      typedef typename FermionOperatorD::GaugeField GaugeFieldD;
-      typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
-
-      GridBase * GridPtrF = SinglePrecGrid4;
-      GridBase * GridPtrD = FermOpD.Umu.Grid();
-      GaugeFieldF     U_f  (GridPtrF);
-      GaugeLinkFieldF Umu_f(GridPtrF);
-      //      std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
-      //      std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
-
-      ////////////////////////////////////////////////////////////////////////////////////
-      // Moving this to a Clone method of fermion operator would allow to duplicate the 
-      // physics parameters and decrease gauge field copies
-      ////////////////////////////////////////////////////////////////////////////////////
-      GaugeLinkFieldD Umu_d(GridPtrD);
-      for(int mu=0;mu<Nd*2;mu++){ 
-	Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
-	precisionChange(Umu_f,Umu_d);
-	PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
-      }
-      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
-      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
-
-      ////////////////////////////////////////////////////////////////////////////////////
-      // Make a mixed precision conjugate gradient
-      ////////////////////////////////////////////////////////////////////////////////////
-      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
-      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
-      MPCG(src,psi);
-    }
+  MixedPrecisionConjugateGradientOperatorFunction(RealD tol,
+                                                  Integer maxinnerit,
+                                                  Integer maxouterit,
+                                                  GridBase* _sp_grid4,
+                                                  GridBase* _sp_grid5,
+                                                  FermionOperatorF &_FermOpF,
+                                                  FermionOperatorD &_FermOpD,
+                                                  SchurOperatorF   &_LinOpF,
+                                                  SchurOperatorD   &_LinOpD):
+    LinOpF(_LinOpF),
+    LinOpD(_LinOpD),
+    FermOpF(_FermOpF),
+    FermOpD(_FermOpD),
+    Tolerance(tol),
+    InnerTolerance(tol),
+    MaxInnerIterations(maxinnerit),
+    MaxOuterIterations(maxouterit),
+    SinglePrecGrid4(_sp_grid4),
+    SinglePrecGrid5(_sp_grid5),
+    OuterLoopNormMult(100.)
+  {
+    /* Debugging instances of objects; references are stored
+    std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
+    std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
+    std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
+    std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
+    */
   };
-}};
+
+  void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
+
+    std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
+
+    SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
+
+    //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
+    //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
+    // Assumption made in code to extract gauge field
+    // We could avoid storing LinopD reference altogether ?
+    assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Must snarf a single precision copy of the gauge field in Linop_d argument
+    ////////////////////////////////////////////////////////////////////////////////////
+    typedef typename FermionOperatorF::GaugeField GaugeFieldF;
+    typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
+    typedef typename FermionOperatorD::GaugeField GaugeFieldD;
+    typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
+
+    GridBase * GridPtrF = SinglePrecGrid4;
+    GridBase * GridPtrD = FermOpD.Umu.Grid();
+    GaugeFieldF     U_f  (GridPtrF);
+    GaugeLinkFieldF Umu_f(GridPtrF);
+    //      std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
+    //      std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Moving this to a Clone method of fermion operator would allow duplicating the
+    // physics parameters and decrease gauge field copies
+    ////////////////////////////////////////////////////////////////////////////////////
+    GaugeLinkFieldD Umu_d(GridPtrD);
+    for(int mu=0;mu<Nd*2;mu++){
+      Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
+      precisionChange(Umu_f,Umu_d);
+      PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
+    }
+    pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
+    pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Make a mixed precision conjugate gradient
+    ////////////////////////////////////////////////////////////////////////////////////
+    MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
+    std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
+    MPCG(src,psi);
+  }
+};
+
+NAMESPACE_END(Grid);
+
 
 int main(int argc, char **argv) {
   using namespace Grid;
-  using namespace Grid::QCD;
 
   Grid_init(&argc, &argv);
   int threads = GridThread::GetThreads();
@@ -167,12 +167,12 @@ int main(int argc, char **argv) {
   typedef typename FermionActionF::FermionField FermionFieldF;
 
   typedef Grid::XmlReader       Serialiser;
-  
+
   //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
 
-  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
-  //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
-  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
+  //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
+  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
 
   HMCparameters HMCparams;
   {
@@ -184,7 +184,7 @@ int main(int argc, char **argv) {
 
   // Grid from the command line arguments --grid and --mpi
   TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
-  
+
   CheckpointerParameters CPparams;
   CPparams.config_prefix = "ckpoint_EODWF_lat";
   CPparams.rng_prefix    = "ckpoint_EODWF_rng";
@@ -198,7 +198,7 @@ int main(int argc, char **argv) {
   TheHMC.Resources.SetRNGSeeds(RNGpar);
 
   // Construct observables
-  // here there is too much indirection 
+  // here there is too much indirection
   typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
   TheHMC.Resources.AddObservable<PlaqObs>();
   //////////////////////////////////////////////
@@ -209,7 +209,7 @@ int main(int argc, char **argv) {
   Real strange_mass = 0.02144;
   Real pv_mass      = 1.0;
   RealD M5  = 1.8;
-  RealD b   = 1.5; 
+  RealD b   = 1.5;
   RealD c   = 0.5;
 
   // Copied from paper
@@ -222,7 +222,7 @@ int main(int argc, char **argv) {
   ///////////////////////////////////////////////////////////////////////////////////////////////
   //Bad choices with large dH. Equalising force L2 norm was not wise.
   ///////////////////////////////////////////////////////////////////////////////////////////////
-  //std::vector<Real> hasenbusch({ 0.03, 0.2, 0.3, 0.5, 0.8 }); 
+  //std::vector<Real> hasenbusch({ 0.03, 0.2, 0.3, 0.5, 0.8 });
   //std::vector<Real> hasenbusch({ 0.05, 0.2, 0.4, 0.6, 0.8 });
 
   auto GridPtr   = TheHMC.Resources.GetCartesian();
@@ -249,7 +249,7 @@ int main(int argc, char **argv) {
   std::vector<Complex> boundary = {1,1,1,-1};
   FermionAction::ImplParams Params(boundary);
   FermionActionF::ImplParams ParamsF(boundary);
-  
+
   double ActionStoppingCondition     = 1e-10;
   double DerivativeStoppingCondition = 1e-7;
   double MaxCGIterations = 30000;
@@ -280,7 +280,7 @@ int main(int argc, char **argv) {
   OFRp.degree   = 12;
   OFRp.precision= 50;
 
-  
+
   MobiusEOFAFermionR Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
   MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
   MobiusEOFAFermionR Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
@@ -298,51 +298,51 @@ int main(int argc, char **argv) {
   LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF);
 
   MxPCG_EOFA ActionCGL(ActionStoppingCondition,
-		       MX_inner,
-		       MaxCGIterations,
-		       GridPtrF,
-		       FrbGridF,
-		       Strange_Op_LF,Strange_Op_L,
-		       Strange_LinOp_LF,Strange_LinOp_L);
+                       MX_inner,
+                       MaxCGIterations,
+                       GridPtrF,
+                       FrbGridF,
+                       Strange_Op_LF,Strange_Op_L,
+                       Strange_LinOp_LF,Strange_LinOp_L);
 
   MxPCG_EOFA DerivativeCGL(DerivativeStoppingCondition,
-			   MX_inner,
-			   MaxCGIterations,
-			   GridPtrF,
-			   FrbGridF,
-			   Strange_Op_LF,Strange_Op_L,
-			   Strange_LinOp_LF,Strange_LinOp_L);
-  
-  MxPCG_EOFA ActionCGR(ActionStoppingCondition,
-		       MX_inner,
-		       MaxCGIterations,
-		       GridPtrF,
-		       FrbGridF,
-		       Strange_Op_RF,Strange_Op_R,
-		       Strange_LinOp_RF,Strange_LinOp_R);
-  
-  MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition,
-			   MX_inner,
-			   MaxCGIterations,
-			   GridPtrF,
-			   FrbGridF,
-			   Strange_Op_RF,Strange_Op_R,
-			   Strange_LinOp_RF,Strange_LinOp_R);
+                           MX_inner,
+                           MaxCGIterations,
+                           GridPtrF,
+                           FrbGridF,
+                           Strange_Op_LF,Strange_Op_L,
+                           Strange_LinOp_LF,Strange_LinOp_L);
 
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
-    EOFA(Strange_Op_L, Strange_Op_R, 
-	 ActionCG, 
-	 ActionCGL, ActionCGR,
-	 DerivativeCGL, DerivativeCGR,
-	 OFRp, true);
+  MxPCG_EOFA ActionCGR(ActionStoppingCondition,
+                       MX_inner,
+                       MaxCGIterations,
+                       GridPtrF,
+                       FrbGridF,
+                       Strange_Op_RF,Strange_Op_R,
+                       Strange_LinOp_RF,Strange_LinOp_R);
+
+  MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition,
+                           MX_inner,
+                           MaxCGIterations,
+                           GridPtrF,
+                           FrbGridF,
+                           Strange_Op_RF,Strange_Op_R,
+                           Strange_LinOp_RF,Strange_LinOp_R);
+
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
+    EOFA(Strange_Op_L, Strange_Op_R,
+         ActionCG,
+         ActionCGL, ActionCGR,
+         DerivativeCGL, DerivativeCGR,
+         OFRp, true);
 #else
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
-    EOFA(Strange_Op_L, Strange_Op_R, 
-	 ActionCG, 
-	 ActionCG, ActionCG,
-	 ActionCG, ActionCG,
-	 //	 DerivativeCG, DerivativeCG,
-	 OFRp, true);
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
+    EOFA(Strange_Op_L, Strange_Op_R,
+         ActionCG,
+         ActionCG, ActionCG,
+         ActionCG, ActionCG,
+         //         DerivativeCG, DerivativeCG,
+         OFRp, true);
 #endif
   Level1.push_back(&EOFA);
 
@@ -373,7 +373,7 @@ int main(int argc, char **argv) {
   std::vector<MxPCG *> MPCG;
   std::vector<FermionActionF *> DenominatorsF;
   std::vector<LinearOperatorD *> LinOpD;
-  std::vector<LinearOperatorF *> LinOpF; 
+  std::vector<LinearOperatorF *> LinOpF;
 
   for(int h=0;h<n_hasenbusch+1;h++){
 
@@ -395,20 +395,20 @@ int main(int argc, char **argv) {
     double conv  = DerivativeStoppingCondition;
     if (h<3) conv= DerivativeStoppingConditionLoose; // Relax on first two hasenbusch factors
     MPCG.push_back(new MxPCG(conv,
-			     MX_inner,
-			     MaxCGIterations,
-			     GridPtrF,
-			     FrbGridF,
-			     *DenominatorsF[h],*Denominators[h],
-			     *LinOpF[h], *LinOpD[h]) );
+                             MX_inner,
+                             MaxCGIterations,
+                             GridPtrF,
+                             FrbGridF,
+                             *DenominatorsF[h],*Denominators[h],
+                             *LinOpF[h], *LinOpD[h]) );
 
     ActionMPCG.push_back(new MxPCG(ActionStoppingCondition,
-				   MX_inner,
-				   MaxCGIterations,
-				   GridPtrF,
-				   FrbGridF,
-				   *DenominatorsF[h],*Denominators[h],
-				   *LinOpF[h], *LinOpD[h]) );
+                                   MX_inner,
+                                   MaxCGIterations,
+                                   GridPtrF,
+                                   FrbGridF,
+                                   *DenominatorsF[h],*Denominators[h],
+                                   *LinOpF[h], *LinOpD[h]) );
 
     // Heatbath not mixed yet. As inverts numerators not so important as raised mass.
     Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],ActionCG));
diff --git a/HMC/Mobius2p1fRHMC.cc b/HMC/Mobius2p1fRHMC.cc
index 04fb0ee5..82ca4d37 100644
--- a/HMC/Mobius2p1fRHMC.cc
+++ b/HMC/Mobius2p1fRHMC.cc
@@ -31,7 +31,6 @@ directory
 
 int main(int argc, char **argv) {
   using namespace Grid;
-  using namespace Grid::QCD;
 
   Grid_init(&argc, &argv);
   int threads = GridThread::GetThreads();
@@ -44,18 +43,18 @@ int main(int argc, char **argv) {
   typedef typename FermionAction::FermionField FermionField;
 
   typedef Grid::XmlReader       Serialiser;
-  
+
   //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
   IntegratorParameters MD;
-  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
   //  MD.name    = std::string("Leap Frog");
-  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
+  //  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
   //  MD.name    = std::string("Force Gradient");
-  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
+  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
   MD.name    = std::string("MinimumNorm2");
   MD.MDsteps = 20;
   MD.trajL   = 1.0;
-  
+
   HMCparameters HMCparams;
   HMCparams.StartTrajectory  = 30;
   HMCparams.Trajectories     = 200;
@@ -68,7 +67,7 @@ int main(int argc, char **argv) {
 
   // Grid from the command line arguments --grid and --mpi
   TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
-  
+
   CheckpointerParameters CPparams;
   CPparams.config_prefix = "ckpoint_EODWF_lat";
   CPparams.rng_prefix    = "ckpoint_EODWF_rng";
@@ -82,7 +81,7 @@ int main(int argc, char **argv) {
   TheHMC.Resources.SetRNGSeeds(RNGpar);
 
   // Construct observables
-  // here there is too much indirection 
+  // here there is too much indirection
   typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
   TheHMC.Resources.AddObservable<PlaqObs>();
   //////////////////////////////////////////////
@@ -93,11 +92,11 @@ int main(int argc, char **argv) {
   Real strange_mass = 0.04;
   Real pv_mass      = 1.0;
   RealD M5  = 1.8;
-  RealD b   = 1.0; 
+  RealD b   = 1.0;
   RealD c   = 0.0;
-  
+
   // FIXME:
-  // Same in MC and MD 
+  // Same in MC and MD
   // Need to mix precision too
   OneFlavourRationalParams OFRp;
   OFRp.lo       = 4.0e-3;
@@ -122,7 +121,7 @@ int main(int argc, char **argv) {
   // These lines are unecessary if BC are all periodic
   std::vector<Complex> boundary = {1,1,1,-1};
   FermionAction::ImplParams Params(boundary);
-  
+
   double StoppingCondition = 1e-10;
   double MaxCGIterations = 30000;
   ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations);

From 6b6c5aa626c9b1028eb3dec7ad7cc8a9adac0977 Mon Sep 17 00:00:00 2001
From: Nils Asmussen <n.asmussen@soton.ac.uk>
Date: Tue, 20 Aug 2019 15:35:36 +0100
Subject: [PATCH 03/12] remove namespace QCD from directory tests

---
 tests/hadrons/Test_diskvector.cc          |   1 -
 tests/hmc/Test_hmc_Mobius2p1f.cc          |   1 -
 tests/qdpxx/Test_qdpxx_loops_staples.cc   | 190 +++++++++++-----------
 tests/qdpxx/Test_qdpxx_munprec.cc         |  60 +++----
 tests/qdpxx/Test_qdpxx_stag.cc            |  24 +--
 tests/qdpxx/Test_qdpxx_wilson.cc          |  36 ++--
 tests/smearing/Test_smearing.cc           |   2 -
 tests/solver/Test_wilsonclover_mg_lime.cc |   1 -
 8 files changed, 154 insertions(+), 161 deletions(-)

diff --git a/tests/hadrons/Test_diskvector.cc b/tests/hadrons/Test_diskvector.cc
index 10bc4db1..9ec97a22 100644
--- a/tests/hadrons/Test_diskvector.cc
+++ b/tests/hadrons/Test_diskvector.cc
@@ -28,7 +28,6 @@ See the full license in the file "LICENSE" in the top level distribution directo
 #include <Hadrons/DiskVector.hpp>
 
 using namespace Grid;
-using namespace Grid::QCD;
 using namespace Grid::Hadrons;
 
 GRID_SERIALIZABLE_ENUM(Enum, undef, red, 1, blue, 2, green, 3);
diff --git a/tests/hmc/Test_hmc_Mobius2p1f.cc b/tests/hmc/Test_hmc_Mobius2p1f.cc
index 253727d2..508f5b5e 100644
--- a/tests/hmc/Test_hmc_Mobius2p1f.cc
+++ b/tests/hmc/Test_hmc_Mobius2p1f.cc
@@ -31,7 +31,6 @@ directory
 
 int main(int argc, char **argv) {
   using namespace Grid;
-  using namespace Grid::QCD;
 
   Grid_init(&argc, &argv);
   int threads = GridThread::GetThreads();
diff --git a/tests/qdpxx/Test_qdpxx_loops_staples.cc b/tests/qdpxx/Test_qdpxx_loops_staples.cc
index 53202a4d..bbb41f4e 100644
--- a/tests/qdpxx/Test_qdpxx_loops_staples.cc
+++ b/tests/qdpxx/Test_qdpxx_loops_staples.cc
@@ -27,15 +27,15 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
     /*  END LEGAL */
 #include <Grid/Grid.h>
 
-double calc_grid_p      (Grid::QCD::LatticeGaugeField & lat);
-double calc_chroma_p    (Grid::QCD::LatticeGaugeField & lat);
-double calc_grid_r      (Grid::QCD::LatticeGaugeField & lat);
-double calc_grid_IW     (Grid::QCD::LatticeGaugeField & lat);
-double calc_grid_r_dir  (Grid::QCD::LatticeGaugeField & lat);
-double calc_chroma_r    (Grid::QCD::LatticeGaugeField & lat);
-double calc_chroma_IW   (Grid::QCD::LatticeGaugeField & lat);
-void check_grid_r_staple(Grid::QCD::LatticeGaugeField & Umu);
-void check_grid_p_staple(Grid::QCD::LatticeGaugeField & Umu);
+double calc_grid_p      (Grid::LatticeGaugeField & lat);
+double calc_chroma_p    (Grid::LatticeGaugeField & lat);
+double calc_grid_r      (Grid::LatticeGaugeField & lat);
+double calc_grid_IW     (Grid::LatticeGaugeField & lat);
+double calc_grid_r_dir  (Grid::LatticeGaugeField & lat);
+double calc_chroma_r    (Grid::LatticeGaugeField & lat);
+double calc_chroma_IW   (Grid::LatticeGaugeField & lat);
+void check_grid_r_staple(Grid::LatticeGaugeField & Umu);
+void check_grid_p_staple(Grid::LatticeGaugeField & Umu);
 
 const double beta=2.6;
 const double c1=-0.331;
@@ -53,10 +53,10 @@ public:
   
   typedef multi1d<LatticeColorMatrix> U;
   
-  static void ImportGauge(Grid::QCD::LatticeGaugeField & gr,
+  static void ImportGauge(Grid::LatticeGaugeField & gr,
 			  QDP::multi1d<QDP::LatticeColorMatrix> & ch) 
   {
-    Grid::QCD::LorentzColourMatrix LCM;
+    Grid::LorentzColourMatrix LCM;
     Grid::Complex cc;
     QDP::ColorMatrix cm;
     QDP::Complex c;
@@ -127,9 +127,9 @@ int main (int argc,char **argv )
    * Setup Grid
    *********************************************************/
   Grid::Grid_init(&argc,&argv);
-  Grid::GridCartesian * UGrid   = Grid::QCD::SpaceTimeGrid::makeFourDimGrid(Grid::GridDefaultLatt(), 
-									    Grid::GridDefaultSimd(Grid::QCD::Nd,Grid::vComplex::Nsimd()),
-									    Grid::GridDefaultMpi());
+  Grid::GridCartesian * UGrid   = Grid::SpaceTimeGrid::makeFourDimGrid(Grid::GridDefaultLatt(), 
+                                                                       Grid::GridDefaultSimd(Grid::Nd,Grid::vComplex::Nsimd()),
+                                                                       Grid::GridDefaultMpi());
   
   std::vector<int> gd = UGrid->GlobalDimensions();
   QDP::multi1d<int> nrow(QDP::Nd);
@@ -138,7 +138,7 @@ int main (int argc,char **argv )
   QDP::Layout::setLattSize(nrow);
   QDP::Layout::create();
 
-  Grid::QCD::LatticeGaugeField lat(UGrid);
+  Grid::LatticeGaugeField lat(UGrid);
 
   double s_grid   = calc_grid_p  (lat);
 
@@ -181,7 +181,7 @@ int main (int argc,char **argv )
   Chroma::finalize();
 }
 
-double calc_chroma_IW(Grid::QCD::LatticeGaugeField & lat)
+double calc_chroma_IW(Grid::LatticeGaugeField & lat)
 {
   typedef QDP::multi1d<QDP::LatticeColorMatrix> U;
 
@@ -203,7 +203,7 @@ double calc_chroma_IW(Grid::QCD::LatticeGaugeField & lat)
 
   return s;
 }
-double calc_chroma_r(Grid::QCD::LatticeGaugeField & lat)
+double calc_chroma_r(Grid::LatticeGaugeField & lat)
 {
   typedef QDP::multi1d<QDP::LatticeColorMatrix> U;
 
@@ -245,7 +245,7 @@ double calc_chroma_r(Grid::QCD::LatticeGaugeField & lat)
 // action = beta * Nd*Nd-1*vol*0.5 - beta * Nd*Nd-1*vol*0.5*plaq
 //
 // plaq == sumplaq * 2/(V*Nd*(Nd-1)*Nc)
-double calc_chroma_p(Grid::QCD::LatticeGaugeField & lat)
+double calc_chroma_p(Grid::LatticeGaugeField & lat)
 {
   typedef QDP::multi1d<QDP::LatticeColorMatrix> U;
 
@@ -270,60 +270,60 @@ double calc_chroma_p(Grid::QCD::LatticeGaugeField & lat)
 
 
 
-double calc_grid_p(Grid::QCD::LatticeGaugeField & Umu)
+double calc_grid_p(Grid::LatticeGaugeField & Umu)
 {
   std::vector<int> seeds4({1,2,3,4});
 
   Grid::GridCartesian         * UGrid   = (Grid::GridCartesian *) Umu.Grid();
   Grid::GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 
-  Grid::QCD::SU3::HotConfiguration(RNG4,Umu);
+  Grid::SU3::HotConfiguration(RNG4,Umu);
 
-  Grid::QCD::LatticeColourMatrix tmp(UGrid); 
+  Grid::LatticeColourMatrix tmp(UGrid); 
   tmp = Grid::zero;
 
-  Grid::QCD::PokeIndex<LorentzIndex>(Umu,tmp,2);
-  Grid::QCD::PokeIndex<LorentzIndex>(Umu,tmp,3);
+  Grid::PokeIndex<LorentzIndex>(Umu,tmp,2);
+  Grid::PokeIndex<LorentzIndex>(Umu,tmp,3);
 
-  Grid::QCD::WilsonGaugeActionR Wilson(beta); // Just take beta = 1.0
+  Grid::WilsonGaugeActionR Wilson(beta); // Uses the file-scope beta (2.6)
   
   return Wilson.S(Umu);
 } 
-double calc_grid_r(Grid::QCD::LatticeGaugeField & Umu)
+double calc_grid_r(Grid::LatticeGaugeField & Umu)
 {
   Grid::GridCartesian         * UGrid   = (Grid::GridCartesian *) Umu.Grid();
 
-  Grid::QCD::PlaqPlusRectangleActionR Wilson(0.0,c1); // Just take beta = 0.0
+  Grid::PlaqPlusRectangleActionR Wilson(0.0,c1); // Just take beta = 0.0
   
   return Wilson.S(Umu);
 } 
-double calc_grid_IW(Grid::QCD::LatticeGaugeField & Umu)
+double calc_grid_IW(Grid::LatticeGaugeField & Umu)
 {
   Grid::GridCartesian         * UGrid   = (Grid::GridCartesian *) Umu.Grid();
 
-  Grid::QCD::IwasakiGaugeActionR Iwasaki(beta);
+  Grid::IwasakiGaugeActionR Iwasaki(beta);
   
   return Iwasaki.S(Umu);
 } 
-double calc_grid_r_dir(Grid::QCD::LatticeGaugeField & Umu)
+double calc_grid_r_dir(Grid::LatticeGaugeField & Umu)
 {
   Grid::GridCartesian         * UGrid   = (Grid::GridCartesian *) Umu.Grid();
 
-  std::vector<Grid::QCD::LatticeColourMatrix> U(4,UGrid);
+  std::vector<Grid::LatticeColourMatrix> U(4,UGrid);
   for(int mu=0;mu<Nd;mu++){
     U[mu] = Grid::PeekIndex<LorentzIndex>(Umu,mu);
   }
 
-  Grid::QCD::LatticeComplex rect(UGrid);
-  Grid::QCD::TComplex trect;
-  Grid::QCD::Complex  crect;
+  Grid::LatticeComplex rect(UGrid);
+  Grid::TComplex trect;
+  Grid::Complex  crect;
   Grid::RealD rrect;
   Grid::RealD vol = UGrid->gSites();
-  for(int mu=0;mu<Grid::QCD::Nd;mu++){
-  for(int nu=0;nu<Grid::QCD::Nd;nu++){
+  for(int mu=0;mu<Grid::Nd;mu++){
+  for(int nu=0;nu<Grid::Nd;nu++){
     if ( mu!=nu ) {
 
-      Grid::QCD::ColourWilsonLoops::traceDirRectangle(rect,U,mu,nu);
+      Grid::ColourWilsonLoops::traceDirRectangle(rect,U,mu,nu);
       trect = Grid::sum(rect);
       crect = Grid::TensorRemove(trect);
       rrect = real(crect);
@@ -335,9 +335,9 @@ double calc_grid_r_dir(Grid::QCD::LatticeGaugeField & Umu)
       // Staple test
       Peter.Start();
       {                                                  
-	Grid::QCD::LatticeColourMatrix Stap(UGrid);
-	Grid::QCD::LatticeComplex      SumTrStap(UGrid);
-	Grid::QCD::LatticeComplex      TrStap(UGrid);
+	Grid::LatticeColourMatrix Stap(UGrid);
+	Grid::LatticeComplex      SumTrStap(UGrid);
+	Grid::LatticeComplex      TrStap(UGrid);
 
 	/*
 	 * Make staple for loops centered at coor of link ; this one is ok.     //     |
@@ -346,10 +346,10 @@ double calc_grid_r_dir(Grid::QCD::LatticeGaugeField & Umu)
 	//           __ ___ 
 	//          |    __ |
 	Stap = 
-	  Grid::Cshift(Grid::QCD::PeriodicBC::CovShiftForward (U[mu],mu,
-		       Grid::QCD::PeriodicBC::CovShiftForward (U[nu],nu,
-		       Grid::QCD::PeriodicBC::CovShiftBackward(U[mu],mu,
-                       Grid::QCD::PeriodicBC::CovShiftBackward(U[mu],mu,
+	  Grid::Cshift(Grid::PeriodicBC::CovShiftForward (U[mu],mu,
+		       Grid::PeriodicBC::CovShiftForward (U[nu],nu,
+		       Grid::PeriodicBC::CovShiftBackward(U[mu],mu,
+                       Grid::PeriodicBC::CovShiftBackward(U[mu],mu,
 		       Grid::Cshift(adj(U[nu]),nu,-1))))) , mu, 1);
 
 	TrStap = Grid::trace (U[mu]*Stap);
@@ -364,10 +364,10 @@ double calc_grid_r_dir(Grid::QCD::LatticeGaugeField & Umu)
 	//              __ 
 	//          |__ __ |
 
-	Stap = Grid::Cshift(Grid::QCD::PeriodicBC::CovShiftForward (U[mu],mu,
-		            Grid::QCD::PeriodicBC::CovShiftBackward(U[nu],nu,
-   		            Grid::QCD::PeriodicBC::CovShiftBackward(U[mu],mu,
-                            Grid::QCD::PeriodicBC::CovShiftBackward(U[mu],mu, U[nu])))) , mu, 1);
+	Stap = Grid::Cshift(Grid::PeriodicBC::CovShiftForward (U[mu],mu,
+		            Grid::PeriodicBC::CovShiftBackward(U[nu],nu,
+   		            Grid::PeriodicBC::CovShiftBackward(U[mu],mu,
+                            Grid::PeriodicBC::CovShiftBackward(U[mu],mu, U[nu])))) , mu, 1);
 
 	TrStap = Grid::trace (U[mu]*Stap);
 
@@ -379,10 +379,10 @@ double calc_grid_r_dir(Grid::QCD::LatticeGaugeField & Umu)
 	//           __ 
 	//          |__ __ |
 
-	Stap = Grid::Cshift(Grid::QCD::PeriodicBC::CovShiftBackward(U[nu],nu,
-		            Grid::QCD::PeriodicBC::CovShiftBackward(U[mu],mu,
-                            Grid::QCD::PeriodicBC::CovShiftBackward(U[mu],mu,
-   		            Grid::QCD::PeriodicBC::CovShiftForward(U[nu],nu,U[mu])))) , mu, 1);
+	Stap = Grid::Cshift(Grid::PeriodicBC::CovShiftBackward(U[nu],nu,
+		            Grid::PeriodicBC::CovShiftBackward(U[mu],mu,
+                            Grid::PeriodicBC::CovShiftBackward(U[mu],mu,
+   		            Grid::PeriodicBC::CovShiftForward(U[nu],nu,U[mu])))) , mu, 1);
 
 	TrStap = Grid::trace (U[mu]*Stap);
 
@@ -395,10 +395,10 @@ double calc_grid_r_dir(Grid::QCD::LatticeGaugeField & Umu)
 	//           __ ___ 
 	//          |__    |
 
-	Stap = Grid::Cshift(Grid::QCD::PeriodicBC::CovShiftForward (U[nu],nu,
-		            Grid::QCD::PeriodicBC::CovShiftBackward(U[mu],mu,
-                            Grid::QCD::PeriodicBC::CovShiftBackward(U[mu],mu,
-                            Grid::QCD::PeriodicBC::CovShiftBackward(U[nu],nu,U[mu])))) , mu, 1);
+	Stap = Grid::Cshift(Grid::PeriodicBC::CovShiftForward (U[nu],nu,
+		            Grid::PeriodicBC::CovShiftBackward(U[mu],mu,
+                            Grid::PeriodicBC::CovShiftBackward(U[mu],mu,
+                            Grid::PeriodicBC::CovShiftBackward(U[nu],nu,U[mu])))) , mu, 1);
 
 
 	TrStap = Grid::trace (U[mu]*Stap);
@@ -418,12 +418,12 @@ double calc_grid_r_dir(Grid::QCD::LatticeGaugeField & Umu)
 	 * Make staple for loops centered at coor of link ; this one is ok.     //     |
 	 */
 	//	Stap = 
-	//	  Grid::Cshift(Grid::QCD::PeriodicBC::CovShiftForward(U[nu],nu,U[nu]),mu,1)* // ->||
-	//	  Grid::adj(Grid::QCD::PeriodicBC::CovShiftForward(U[nu],nu,Grid::QCD::PeriodicBC::CovShiftForward(U[nu],nu,U[mu]))) ;
-	Stap = Grid::Cshift(Grid::QCD::PeriodicBC::CovShiftForward(U[nu],nu,
-		            Grid::QCD::PeriodicBC::CovShiftForward(U[nu],nu,
-                            Grid::QCD::PeriodicBC::CovShiftBackward(U[mu],mu,
-                            Grid::QCD::PeriodicBC::CovShiftBackward(U[nu],nu,  Grid::Cshift(adj(U[nu]),nu,-1))))) , mu, 1);
+	//	  Grid::Cshift(Grid::PeriodicBC::CovShiftForward(U[nu],nu,U[nu]),mu,1)* // ->||
+	//	  Grid::adj(Grid::PeriodicBC::CovShiftForward(U[nu],nu,Grid::PeriodicBC::CovShiftForward(U[nu],nu,U[mu]))) ;
+	Stap = Grid::Cshift(Grid::PeriodicBC::CovShiftForward(U[nu],nu,
+		            Grid::PeriodicBC::CovShiftForward(U[nu],nu,
+                            Grid::PeriodicBC::CovShiftBackward(U[mu],mu,
+                            Grid::PeriodicBC::CovShiftBackward(U[nu],nu,  Grid::Cshift(adj(U[nu]),nu,-1))))) , mu, 1);
 	  
 	TrStap = Grid::trace (U[mu]*Stap);
 	SumTrStap += TrStap;
@@ -440,10 +440,10 @@ double calc_grid_r_dir(Grid::QCD::LatticeGaugeField & Umu)
 	//      |  | 
 	//       -- 
 
-	Stap = Grid::Cshift(Grid::QCD::PeriodicBC::CovShiftBackward(U[nu],nu,
-		            Grid::QCD::PeriodicBC::CovShiftBackward(U[nu],nu,
-                            Grid::QCD::PeriodicBC::CovShiftBackward(U[mu],mu,
-                            Grid::QCD::PeriodicBC::CovShiftForward (U[nu],nu,U[nu])))) , mu, 1);
+	Stap = Grid::Cshift(Grid::PeriodicBC::CovShiftBackward(U[nu],nu,
+		            Grid::PeriodicBC::CovShiftBackward(U[nu],nu,
+                            Grid::PeriodicBC::CovShiftBackward(U[mu],mu,
+                            Grid::PeriodicBC::CovShiftForward (U[nu],nu,U[nu])))) , mu, 1);
 
 	TrStap = Grid::trace (U[mu]*Stap);
 	trect = Grid::sum(TrStap);
@@ -459,20 +459,20 @@ double calc_grid_r_dir(Grid::QCD::LatticeGaugeField & Umu)
       Peter.Stop();
       Azusa.Start();
       {
-	Grid::QCD::LatticeComplex RectPlaq_d(UGrid);
-	Grid::QCD::LatticeColourMatrix ds_U(UGrid);
-	Grid::QCD::LatticeColourMatrix left_2(UGrid);
-	Grid::QCD::LatticeColourMatrix upper_l(UGrid);
-	Grid::QCD::LatticeColourMatrix upper_staple(UGrid);
-	Grid::QCD::LatticeColourMatrix down_l(UGrid);
-	Grid::QCD::LatticeColourMatrix down_staple(UGrid);
-	Grid::QCD::LatticeColourMatrix tmp(UGrid);
+	Grid::LatticeComplex RectPlaq_d(UGrid);
+	Grid::LatticeColourMatrix ds_U(UGrid);
+	Grid::LatticeColourMatrix left_2(UGrid);
+	Grid::LatticeColourMatrix upper_l(UGrid);
+	Grid::LatticeColourMatrix upper_staple(UGrid);
+	Grid::LatticeColourMatrix down_l(UGrid);
+	Grid::LatticeColourMatrix down_staple(UGrid);
+	Grid::LatticeColourMatrix tmp(UGrid);
 	
 	// 2 (mu)x1(nu)
-	left_2=  Grid::QCD::PeriodicBC::CovShiftForward(U[mu],mu,U[mu]);   // Umu(x) Umu(x+mu)
+	left_2=  Grid::PeriodicBC::CovShiftForward(U[mu],mu,U[mu]);   // Umu(x) Umu(x+mu)
 	tmp=Grid::Cshift(U[nu],mu,2);                          // Unu(x+2mu)
 
-	upper_l=  Grid::QCD::PeriodicBC::CovShiftForward(tmp,nu,Grid::adj(left_2)); //  Unu(x+2mu) Umu^dag(x+mu+nu) Umu^dag(x+nu) 
+	upper_l=  Grid::PeriodicBC::CovShiftForward(tmp,nu,Grid::adj(left_2)); //  Unu(x+2mu) Umu^dag(x+mu+nu) Umu^dag(x+nu) 
 	//                 __ __ 
 	//              =       |
 	
@@ -546,9 +546,9 @@ double calc_grid_r_dir(Grid::QCD::LatticeGaugeField & Umu)
 	//   _
 	//  | |
 	//  | |
-	Grid::QCD::LatticeColourMatrix up2= Grid::QCD::PeriodicBC::CovShiftForward(U[nu],nu,U[nu]);
+	Grid::LatticeColourMatrix up2= Grid::PeriodicBC::CovShiftForward(U[nu],nu,U[nu]);
 
-	upper_l= Grid::QCD::PeriodicBC::CovShiftForward(Grid::Cshift(up2,mu,1),nu,Grid::Cshift(adj(U[mu]),nu,1));
+	upper_l= Grid::PeriodicBC::CovShiftForward(Grid::Cshift(up2,mu,1),nu,Grid::Cshift(adj(U[mu]),nu,1));
 	ds_U= upper_l*Grid::adj(up2);
 
 	RectPlaq_d = Grid::trace(U[mu]*ds_U);
@@ -569,7 +569,7 @@ double calc_grid_r_dir(Grid::QCD::LatticeGaugeField & Umu)
    downer_l=           |  
                (x)<----V                 
 */    
-	down_l= Grid::adj(Grid::QCD::PeriodicBC::CovShiftForward(U[mu],mu,up2)); //downer_l
+	down_l= Grid::adj(Grid::PeriodicBC::CovShiftForward(U[mu],mu,up2)); //downer_l
 /*
                      ^     |
    down_staple  =    |     V 
@@ -601,23 +601,23 @@ double calc_grid_r_dir(Grid::QCD::LatticeGaugeField & Umu)
 
     }
   }}
-  Grid::QCD::PlaqPlusRectangleActionR Wilson(0.0,c1); // Just take beta = 0.0
+  Grid::PlaqPlusRectangleActionR Wilson(0.0,c1); // Just take beta = 0.0
   
   return Wilson.S(Umu);
 };
 
-void check_grid_r_staple(Grid::QCD::LatticeGaugeField & Umu)
+void check_grid_r_staple(Grid::LatticeGaugeField & Umu)
 {
 
   std::vector<int> seeds4({1,2,3,4});
 
   Grid::GridCartesian         * UGrid   = (Grid::GridCartesian *) Umu.Grid();
 
-  Grid::QCD::PlaqPlusRectangleActionR Wilson(0.0,c1); // Just take beta = 0.0
+  Grid::PlaqPlusRectangleActionR Wilson(0.0,c1); // Just take beta = 0.0
 
-  Grid::QCD::LatticeColourMatrix staple(UGrid);
-  Grid::QCD::LatticeColourMatrix link(UGrid);
-  Grid::QCD::LatticeComplex Traced(UGrid);
+  Grid::LatticeColourMatrix staple(UGrid);
+  Grid::LatticeColourMatrix link(UGrid);
+  Grid::LatticeComplex Traced(UGrid);
   Grid::Complex Rplaq(0.0);
 
   for(int mu=0;mu<Nd;mu++){
@@ -630,12 +630,12 @@ void check_grid_r_staple(Grid::QCD::LatticeGaugeField & Umu)
     // Vol as for each site
     Grid::RealD RectScale(1.0/vol/12.0/6.0/3.0); 
 
-    Grid::QCD::ColourWilsonLoops::RectStaple(staple,Umu,mu);
+    Grid::ColourWilsonLoops::RectStaple(staple,Umu,mu);
     
-    link = Grid::QCD::PeekIndex<LorentzIndex>(Umu,mu);
+    link = Grid::PeekIndex<LorentzIndex>(Umu,mu);
 
     Traced = Grid::trace( link*staple) * RectScale;
-    Grid::QCD::TComplex Tp = Grid::sum(Traced);
+    Grid::TComplex Tp = Grid::sum(Traced);
     Grid::Complex p   = Grid::TensorRemove(Tp);
 
     std::cout<< "Rect from RectStaple "<<p<<std::endl;
@@ -645,18 +645,18 @@ void check_grid_r_staple(Grid::QCD::LatticeGaugeField & Umu)
   std::cout<< "Rect from RectStaple "<<Rplaq<<std::endl;
 } 
 
-void check_grid_p_staple(Grid::QCD::LatticeGaugeField & Umu)
+void check_grid_p_staple(Grid::LatticeGaugeField & Umu)
 {
 
   std::vector<int> seeds4({1,2,3,4});
 
   Grid::GridCartesian         * UGrid   = (Grid::GridCartesian *) Umu.Grid();
 
-  Grid::QCD::PlaqPlusRectangleActionR Wilson(1.0,0.0); // Just take c1 = 0.0
+  Grid::PlaqPlusRectangleActionR Wilson(1.0,0.0); // Just take c1 = 0.0
 
-  Grid::QCD::LatticeColourMatrix staple(UGrid);
-  Grid::QCD::LatticeColourMatrix link(UGrid);
-  Grid::QCD::LatticeComplex Traced(UGrid);
+  Grid::LatticeColourMatrix staple(UGrid);
+  Grid::LatticeColourMatrix link(UGrid);
+  Grid::LatticeComplex Traced(UGrid);
   Grid::Complex plaq(0.0);
 
   for(int mu=0;mu<Nd;mu++){
@@ -669,12 +669,12 @@ void check_grid_p_staple(Grid::QCD::LatticeGaugeField & Umu)
     // Vol as for each site
     Grid::RealD Scale(1.0/vol/12.0/2.0/3.0); 
 
-    Grid::QCD::ColourWilsonLoops::Staple(staple,Umu,mu);
+    Grid::ColourWilsonLoops::Staple(staple,Umu,mu);
     
-    link = Grid::QCD::PeekIndex<LorentzIndex>(Umu,mu);
+    link = Grid::PeekIndex<LorentzIndex>(Umu,mu);
 
     Traced = Grid::trace( link*staple) * Scale;
-    Grid::QCD::TComplex Tp = Grid::sum(Traced);
+    Grid::TComplex Tp = Grid::sum(Traced);
     Grid::Complex p   = Grid::TensorRemove(Tp);
 
     std::cout<< "Plaq from PlaqStaple "<<p<<std::endl;
diff --git a/tests/qdpxx/Test_qdpxx_munprec.cc b/tests/qdpxx/Test_qdpxx_munprec.cc
index 3c80a287..fbc1ec82 100644
--- a/tests/qdpxx/Test_qdpxx_munprec.cc
+++ b/tests/qdpxx/Test_qdpxx_munprec.cc
@@ -52,8 +52,8 @@ enum ChromaAction {
 		 HtContFracZolo
 };
 
-void calc_grid      (ChromaAction action,Grid::QCD::LatticeGaugeField & lat, Grid::QCD::LatticeFermion &src, Grid::QCD::LatticeFermion &res,int dag);
-void calc_chroma    (ChromaAction action,Grid::QCD::LatticeGaugeField & lat, Grid::QCD::LatticeFermion &src, Grid::QCD::LatticeFermion &res,int dag);
+void calc_grid      (ChromaAction action,Grid::LatticeGaugeField & lat, Grid::LatticeFermion &src, Grid::LatticeFermion &res,int dag);
+void calc_chroma    (ChromaAction action,Grid::LatticeGaugeField & lat, Grid::LatticeFermion &src, Grid::LatticeFermion &res,int dag);
 
 #include <chroma.h>
 #include <actions/ferm/invert/syssolver_linop_cg_array.h>
@@ -71,10 +71,10 @@ public:
   typedef LatticeFermion T4;
   typedef multi1d<LatticeFermion> T5;
   
-  static void ImportGauge(Grid::QCD::LatticeGaugeField & gr,
+  static void ImportGauge(Grid::LatticeGaugeField & gr,
 			  QDP::multi1d<QDP::LatticeColorMatrix> & ch) 
   {
-    Grid::QCD::LorentzColourMatrix LCM;
+    Grid::LorentzColourMatrix LCM;
     Grid::Complex cc;
     QDP::ColorMatrix cm;
     QDP::Complex c;
@@ -112,10 +112,10 @@ public:
     }}}}
   }
   
-  static void ImportFermion(Grid::QCD::LatticeFermion & gr,
+  static void ImportFermion(Grid::LatticeFermion & gr,
 			    QDP::multi1d<QDP::LatticeFermion> & ch  ) 
   {
-    Grid::QCD::SpinColourVector F;
+    Grid::SpinColourVector F;
     Grid::Complex c;
 
     QDP::Fermion cF;
@@ -154,10 +154,10 @@ public:
       QDP::pokeSite(ch[s],cF,cx);
     }}}}}
   }
-  static void ExportFermion(Grid::QCD::LatticeFermion & gr,
+  static void ExportFermion(Grid::LatticeFermion & gr,
 			    QDP::multi1d<QDP::LatticeFermion> & ch  ) 
   {
-    Grid::QCD::SpinColourVector F;
+    Grid::SpinColourVector F;
     Grid::Complex c;
 
     QDP::Fermion cF;
@@ -384,9 +384,9 @@ int main (int argc,char **argv )
    * Setup Grid
    *********************************************************/
   Grid::Grid_init(&argc,&argv);
-  Grid::GridCartesian * UGrid   = Grid::QCD::SpaceTimeGrid::makeFourDimGrid(Grid::GridDefaultLatt(), 
-									    Grid::GridDefaultSimd(Grid::QCD::Nd,Grid::vComplex::Nsimd()),
-									    Grid::GridDefaultMpi());
+  Grid::GridCartesian * UGrid   = Grid::SpaceTimeGrid::makeFourDimGrid(Grid::GridDefaultLatt(), 
+                                                                       Grid::GridDefaultSimd(Grid::Nd,Grid::vComplex::Nsimd()),
+                                                                       Grid::GridDefaultMpi());
   
   std::vector<int> gd = UGrid->GlobalDimensions();
   QDP::multi1d<int> nrow(QDP::Nd);
@@ -395,11 +395,11 @@ int main (int argc,char **argv )
   QDP::Layout::setLattSize(nrow);
   QDP::Layout::create();
 
-  Grid::GridCartesian         * FGrid   = Grid::QCD::SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-  Grid::QCD::LatticeGaugeField lat(UGrid);
-  Grid::QCD::LatticeFermion    src(FGrid);
-  Grid::QCD::LatticeFermion    res_chroma(FGrid);
-  Grid::QCD::LatticeFermion    res_grid  (FGrid);
+  Grid::GridCartesian         * FGrid   = Grid::SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  Grid::LatticeGaugeField lat(UGrid);
+  Grid::LatticeFermion    src(FGrid);
+  Grid::LatticeFermion    res_chroma(FGrid);
+  Grid::LatticeFermion    res_grid  (FGrid);
   
   std::vector<ChromaAction> ActionList({
 		 HtCayleyTanh, // Plain old DWF.
@@ -446,7 +446,7 @@ int main (int argc,char **argv )
   Chroma::finalize();
 }
 
-void calc_chroma(ChromaAction action,Grid::QCD::LatticeGaugeField & lat, Grid::QCD::LatticeFermion &src, Grid::QCD::LatticeFermion &res,int dag)
+void calc_chroma(ChromaAction action,Grid::LatticeGaugeField & lat, Grid::LatticeFermion &src, Grid::LatticeFermion &res,int dag)
 {
   QDP::multi1d<QDP::LatticeColorMatrix> u(4);
 
@@ -483,7 +483,7 @@ void calc_chroma(ChromaAction action,Grid::QCD::LatticeGaugeField & lat, Grid::Q
 
 
 
-void calc_grid(ChromaAction action,Grid::QCD::LatticeGaugeField & Umu, Grid::QCD::LatticeFermion &src, Grid::QCD::LatticeFermion &res,int dag)
+void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeFermion &src, Grid::LatticeFermion &res,int dag)
 {
   using namespace Grid;
    ;
@@ -493,8 +493,8 @@ void calc_grid(ChromaAction action,Grid::QCD::LatticeGaugeField & Umu, Grid::QCD
 
   Grid::GridCartesian         * UGrid   = (Grid::GridCartesian *) Umu.Grid();
   Grid::GridCartesian         * FGrid   = (Grid::GridCartesian *) src.Grid();
-  Grid::GridRedBlackCartesian * UrbGrid = Grid::QCD::SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-  Grid::GridRedBlackCartesian * FrbGrid = Grid::QCD::SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+  Grid::GridRedBlackCartesian * UrbGrid = Grid::SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  Grid::GridRedBlackCartesian * FrbGrid = Grid::SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 
   Grid::GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
   Grid::GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
@@ -502,12 +502,12 @@ void calc_grid(ChromaAction action,Grid::QCD::LatticeGaugeField & Umu, Grid::QCD
   Grid::gaussian(RNG5,src);
   Grid::gaussian(RNG5,res);
 
-  Grid::QCD::SU3::HotConfiguration(RNG4,Umu);
+  Grid::SU3::HotConfiguration(RNG4,Umu);
 
   /*
-  Grid::QCD::LatticeColourMatrix U(UGrid);
+  Grid::LatticeColourMatrix U(UGrid);
   U=Grid::zero;
-  for(int nn=0;nn<Grid::QCD::Nd;nn++){
+  for(int nn=0;nn<Grid::Nd;nn++){
     if ( nn>=4 ) {
       Grid::PokeIndex<LorentzIndex>(Umu,U,nn);
     }
@@ -519,7 +519,7 @@ void calc_grid(ChromaAction action,Grid::QCD::LatticeGaugeField & Umu, Grid::QCD
 
   if ( action == HtCayleyTanh ) { 
 
-    Grid::QCD::DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5);
+    Grid::DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5);
 
     std::cout << Grid::GridLogMessage <<" Calling domain wall multiply "<<std::endl;
 
@@ -535,7 +535,7 @@ void calc_grid(ChromaAction action,Grid::QCD::LatticeGaugeField & Umu, Grid::QCD
 
     Grid::Real _b = 0.5*(mobius_scale +1.0);
     Grid::Real _c = 0.5*(mobius_scale -1.0);
-    Grid::QCD::MobiusZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,_b,_c,zolo_lo,zolo_hi);
+    Grid::MobiusZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,_b,_c,zolo_lo,zolo_hi);
 
     std::cout << Grid::GridLogMessage <<" Calling mobius zolo multiply "<<std::endl;
 
@@ -549,7 +549,7 @@ void calc_grid(ChromaAction action,Grid::QCD::LatticeGaugeField & Umu, Grid::QCD
 
   if ( action == HtCayleyZolo ) {
 
-    Grid::QCD::ShamirZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
+    Grid::ShamirZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
 
     std::cout << Grid::GridLogMessage <<" Calling shamir zolo multiply "<<std::endl;
 
@@ -565,7 +565,7 @@ void calc_grid(ChromaAction action,Grid::QCD::LatticeGaugeField & Umu, Grid::QCD
   if ( action == HmCayleyTanh ) {
     Grid::Real _b = 0.5*(mobius_scale +1.0);
     Grid::Real _c = 0.5*(mobius_scale -1.0);
-    Grid::QCD::MobiusFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,_b,_c);
+    Grid::MobiusFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,_b,_c);
 
     std::cout << Grid::GridLogMessage <<" Calling mobius tanh multiply "<<std::endl;
 
@@ -581,7 +581,7 @@ void calc_grid(ChromaAction action,Grid::QCD::LatticeGaugeField & Umu, Grid::QCD
 
   if ( action == HmCayleyTanh ) {
 
-    Grid::QCD::ScaledShamirFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,mobius_scale);
+    Grid::ScaledShamirFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,mobius_scale);
 
     std::cout << Grid::GridLogMessage <<" Calling scaled shamir multiply "<<std::endl;
 
@@ -595,7 +595,7 @@ void calc_grid(ChromaAction action,Grid::QCD::LatticeGaugeField & Umu, Grid::QCD
 
   if ( action == HwCayleyTanh ) {
 
-    Grid::QCD::OverlapWilsonCayleyTanhFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,1.0);
+    Grid::OverlapWilsonCayleyTanhFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,1.0);
 
     if ( dag ) 
       D.Mdag(src,res);  
@@ -607,7 +607,7 @@ void calc_grid(ChromaAction action,Grid::QCD::LatticeGaugeField & Umu, Grid::QCD
 
   if ( action == HwCayleyZolo ) {
 
-    Grid::QCD::OverlapWilsonCayleyZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
+    Grid::OverlapWilsonCayleyZolotarevFermionR D(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,_mass,_M5,zolo_lo,zolo_hi);
 
     if ( dag ) 
       D.Mdag(src,res);  
diff --git a/tests/qdpxx/Test_qdpxx_stag.cc b/tests/qdpxx/Test_qdpxx_stag.cc
index 24ac3c2a..f283d5a9 100644
--- a/tests/qdpxx/Test_qdpxx_stag.cc
+++ b/tests/qdpxx/Test_qdpxx_stag.cc
@@ -30,8 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 
 double mq=0.1;
 
-typedef Grid::QCD::StaggeredImplR::FermionField FermionField;
-typedef Grid::QCD::LatticeGaugeField GaugeField;
+typedef Grid::StaggeredImplR::FermionField FermionField;
+typedef Grid::LatticeGaugeField GaugeField;
 
 void make_gauge     (GaugeField & lat, FermionField &src);
 void calc_grid      (GaugeField & lat, GaugeField & uthin,GaugeField & ufat, FermionField &src, FermionField &res,int dag);
@@ -53,7 +53,7 @@ public:
   static void ImportGauge(GaugeField & gr,
 			  QDP::multi1d<QDP::LatticeColorMatrix> & ch) 
   {
-    Grid::QCD::LorentzColourMatrix LCM;
+    Grid::LorentzColourMatrix LCM;
     Grid::Complex cc;
     QDP::ColorMatrix cm;
     QDP::Complex c;
@@ -88,7 +88,7 @@ public:
   static void ExportGauge(GaugeField & gr,
 			  QDP::multi1d<QDP::LatticeColorMatrix> & ch) 
   {
-    Grid::QCD::LorentzColourMatrix LCM;
+    Grid::LorentzColourMatrix LCM;
     Grid::Complex cc;
     QDP::ColorMatrix cm;
     QDP::Complex c;
@@ -124,7 +124,7 @@ public:
   static void ImportFermion(FermionField & gr,
 			    QDP::LatticeStaggeredFermion & ch  ) 
   {
-    Grid::QCD::ColourVector F;
+    Grid::ColourVector F;
     Grid::Complex c;
 
 
@@ -157,7 +157,7 @@ public:
   static void ExportFermion(FermionField & gr,
 			    QDP::LatticeStaggeredFermion & ch  ) 
   {
-    Grid::QCD::ColourVector F;
+    Grid::ColourVector F;
     Grid::Complex c;
 
     std::vector<int> x(5);
@@ -222,9 +222,9 @@ int main (int argc,char **argv )
    * Setup Grid
    *********************************************************/
   Grid::Grid_init(&argc,&argv);
-  Grid::GridCartesian * UGrid   = Grid::QCD::SpaceTimeGrid::makeFourDimGrid(Grid::GridDefaultLatt(), 
-									    Grid::GridDefaultSimd(Grid::QCD::Nd,Grid::vComplex::Nsimd()),
-									    Grid::GridDefaultMpi());
+  Grid::GridCartesian * UGrid   = Grid::SpaceTimeGrid::makeFourDimGrid(Grid::GridDefaultLatt(), 
+                                                                       Grid::GridDefaultSimd(Grid::Nd,Grid::vComplex::Nsimd()),
+                                                                       Grid::GridDefaultMpi());
   
   std::vector<int> gd = UGrid->GlobalDimensions();
   QDP::multi1d<int> nrow(QDP::Nd);
@@ -333,7 +333,7 @@ void make_gauge(GaugeField & Umu,FermionField &src)
 
   Grid::GridCartesian         * UGrid   = (Grid::GridCartesian *) Umu.Grid();
   Grid::GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
-  Grid::QCD::SU3::HotConfiguration(RNG4,Umu);
+  Grid::SU3::HotConfiguration(RNG4,Umu);
   Grid::gaussian(RNG4,src);
 }
 
@@ -343,9 +343,9 @@ void calc_grid(GaugeField & Uthin, GaugeField & Utriple, GaugeField & Ufat, Ferm
    ;
 
   Grid::GridCartesian         * UGrid   = (Grid::GridCartesian *) Uthin.Grid();
-  Grid::GridRedBlackCartesian * UrbGrid = Grid::QCD::SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  Grid::GridRedBlackCartesian * UrbGrid = Grid::SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 
-  Grid::QCD::ImprovedStaggeredFermionR Dstag(Uthin,Utriple,Ufat,*UGrid,*UrbGrid,mq*2.0);
+  Grid::ImprovedStaggeredFermionR Dstag(Uthin,Utriple,Ufat,*UGrid,*UrbGrid,mq*2.0);
 
   std::cout << Grid::GridLogMessage <<" Calling Grid staggered multiply "<<std::endl;
 
diff --git a/tests/qdpxx/Test_qdpxx_wilson.cc b/tests/qdpxx/Test_qdpxx_wilson.cc
index 29e9c9ce..fdf59982 100644
--- a/tests/qdpxx/Test_qdpxx_wilson.cc
+++ b/tests/qdpxx/Test_qdpxx_wilson.cc
@@ -35,8 +35,8 @@
 double mq = 0.1;
 
 // Define Wilson Types
-typedef Grid::QCD::WilsonImplR::FermionField FermionField;
-typedef Grid::QCD::LatticeGaugeField GaugeField;
+typedef Grid::WilsonImplR::FermionField FermionField;
+typedef Grid::LatticeGaugeField GaugeField;
 
 enum ChromaAction
 {
@@ -56,7 +56,7 @@ public:
   static void ImportGauge(GaugeField &gr,
                           QDP::multi1d<QDP::LatticeColorMatrix> &ch)
   {
-    Grid::QCD::LorentzColourMatrix LCM;
+    Grid::LorentzColourMatrix LCM;
     Grid::Complex cc;
     QDP::ColorMatrix cm;
     QDP::Complex c;
@@ -101,7 +101,7 @@ public:
   static void ExportGauge(GaugeField &gr,
                           QDP::multi1d<QDP::LatticeColorMatrix> &ch)
   {
-    Grid::QCD::LorentzColourMatrix LCM;
+    Grid::LorentzColourMatrix LCM;
     Grid::Complex cc;
     QDP::ColorMatrix cm;
     QDP::Complex c;
@@ -145,10 +145,10 @@ public:
   }
 
   // Specific for Wilson Fermions
-  static void ImportFermion(Grid::QCD::LatticeFermion &gr,
+  static void ImportFermion(Grid::LatticeFermion &gr,
                             QDP::LatticeFermion &ch)
   {
-    Grid::QCD::SpinColourVector F;
+    Grid::SpinColourVector F;
     Grid::Complex c;
 
     QDP::Fermion cF;
@@ -195,10 +195,10 @@ public:
   }
 
   // Specific for 4d Wilson fermions
-  static void ExportFermion(Grid::QCD::LatticeFermion &gr,
+  static void ExportFermion(Grid::LatticeFermion &gr,
                             QDP::LatticeFermion &ch)
   {
-    Grid::QCD::SpinColourVector F;
+    Grid::SpinColourVector F;
     Grid::Complex c;
 
     QDP::Fermion cF;
@@ -342,19 +342,18 @@ void calc_chroma(ChromaAction action, GaugeField &lat, FermionField &src, Fermio
 void make_gauge(GaugeField &Umu, FermionField &src)
 {
   using namespace Grid;
-  using namespace Grid::QCD;
 
   std::vector<int> seeds4({1, 2, 3, 4});
 
   Grid::GridCartesian *UGrid = (Grid::GridCartesian *)Umu._grid;
   Grid::GridParallelRNG RNG4(UGrid);
   RNG4.SeedFixedIntegers(seeds4);
-  Grid::QCD::SU3::HotConfiguration(RNG4, Umu);
+  Grid::SU3::HotConfiguration(RNG4, Umu);
 
   // Fermion field
   Grid::gaussian(RNG4, src);
   /*
-  Grid::QCD::SpinColourVector F;
+  Grid::SpinColourVector F;
   Grid::Complex c;
 
   
@@ -391,13 +390,12 @@ void make_gauge(GaugeField &Umu, FermionField &src)
   */
 }
 
-void calc_grid(ChromaAction action, Grid::QCD::LatticeGaugeField &Umu, Grid::QCD::LatticeFermion &src, Grid::QCD::LatticeFermion &res, int dag)
+void calc_grid(ChromaAction action, Grid::LatticeGaugeField &Umu, Grid::LatticeFermion &src, Grid::LatticeFermion &res, int dag)
 {
   using namespace Grid;
-  using namespace Grid::QCD;
 
   Grid::GridCartesian *UGrid = (Grid::GridCartesian *)Umu._grid;
-  Grid::GridRedBlackCartesian *UrbGrid = Grid::QCD::SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  Grid::GridRedBlackCartesian *UrbGrid = Grid::SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 
   Grid::RealD _mass = mq;
 
@@ -409,7 +407,7 @@ void calc_grid(ChromaAction action, Grid::QCD::LatticeGaugeField &Umu, Grid::QCD
     anis.xi_0 = 2.0;
     anis.nu = 1.0;
     WilsonImplParams iParam;
-    Grid::QCD::WilsonFermionR Wf(Umu, *UGrid, *UrbGrid, _mass, iParam, anis);
+    Grid::WilsonFermionR Wf(Umu, *UGrid, *UrbGrid, _mass, iParam, anis);
 
     std::cout << Grid::GridLogMessage << " Calling Grid Wilson Fermion multiply " << std::endl;
 
@@ -430,7 +428,7 @@ void calc_grid(ChromaAction action, Grid::QCD::LatticeGaugeField &Umu, Grid::QCD
     anis.xi_0 = 2.0;
     anis.nu = 1.0;
     WilsonImplParams CloverImplParam;
-    Grid::QCD::WilsonCloverFermionR Wf(Umu, *UGrid, *UrbGrid, _mass, _csw_r, _csw_t, anis, CloverImplParam);
+    Grid::WilsonCloverFermionR Wf(Umu, *UGrid, *UrbGrid, _mass, _csw_r, _csw_t, anis, CloverImplParam);
     Wf.ImportGauge(Umu);
 
     std::cout << Grid::GridLogMessage << " Calling Grid Wilson Clover Fermion multiply " << std::endl;
@@ -458,9 +456,9 @@ int main(int argc, char **argv)
    * Setup Grid
    *********************************************************/
   Grid::Grid_init(&argc, &argv);
-  Grid::GridCartesian *UGrid = Grid::QCD::SpaceTimeGrid::makeFourDimGrid(Grid::GridDefaultLatt(),
-                                                                         Grid::GridDefaultSimd(Grid::QCD::Nd, Grid::vComplex::Nsimd()),
-                                                                         Grid::GridDefaultMpi());
+  Grid::GridCartesian *UGrid = Grid::SpaceTimeGrid::makeFourDimGrid(Grid::GridDefaultLatt(),
+                                                                    Grid::GridDefaultSimd(Grid::Nd, Grid::vComplex::Nsimd()),
+                                                                    Grid::GridDefaultMpi());
 
   std::vector<int> gd = UGrid->GlobalDimensions();
   QDP::multi1d<int> nrow(QDP::Nd);
diff --git a/tests/smearing/Test_smearing.cc b/tests/smearing/Test_smearing.cc
index efc336c7..c1c7c457 100644
--- a/tests/smearing/Test_smearing.cc
+++ b/tests/smearing/Test_smearing.cc
@@ -30,8 +30,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 
 using namespace std;
 using namespace Grid;
-using namespace Grid::QCD;
-
 
 int main (int argc, char ** argv)
 {
diff --git a/tests/solver/Test_wilsonclover_mg_lime.cc b/tests/solver/Test_wilsonclover_mg_lime.cc
index 687ec83f..bd2990d4 100644
--- a/tests/solver/Test_wilsonclover_mg_lime.cc
+++ b/tests/solver/Test_wilsonclover_mg_lime.cc
@@ -32,7 +32,6 @@
 
 using namespace std;
 using namespace Grid;
-using namespace Grid::QCD;
 
 int main(int argc, char **argv) {
 

From 3ef519aaa4291b83708209cfe6f271ed2de20be5 Mon Sep 17 00:00:00 2001
From: gfilaci <gianluca.filaci@gmail.com>
Date: Wed, 7 Aug 2019 16:34:01 +0100
Subject: [PATCH 04/12] fast MooeeInv

---
 .../implementation/CayleyFermion5Dcache.h     | 104 ++++++++++--------
 1 file changed, 56 insertions(+), 48 deletions(-)

diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
index 8af3e7c0..9dc9ba02 100644
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
@@ -10,6 +10,7 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Gianluca Filaci <g.filaci@ed.ac.uk>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -139,39 +140,41 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
   accelerator_for(sss,nloop,Simd::Nsimd(),{
     uint64_t ss=sss*Ls;
     typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp;
+    spinor tmp, acc, res;
 
-    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
-    // Apply (L^{\prime})^{-1}
-    coalescedWrite(chi[ss],psi(ss)); // chi[0]=psi[0]
-    for(int s=1;s<Ls;s++){
-      spProj5p(tmp,chi(ss+s-1));  
-      coalescedWrite(chi[ss+s] , psi(ss+s)-plee[s-1]*tmp);
+    // X = Nc*Ns
+    // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
+    // Apply (L^{\prime})^{-1} L_m^{-1}
+    res = psi(ss);
+    spProj5m(tmp,res);
+    acc = pleem[0]*tmp;
+    spProj5p(tmp,res);
+    coalescedWrite(chi[ss],res);
+    
+    for(int s=1;s<Ls-1;s++){
+      res = psi(ss+s);
+      res -= plee[s-1]*tmp;
+      spProj5m(tmp,res);
+      acc += pleem[s]*tmp;
+      spProj5p(tmp,res);
+      coalescedWrite(chi[ss+s],res);
     }
-
-    // L_m^{-1} 
-    for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -pleem[s]P_- chi
-      spProj5m(tmp,chi(ss+s));    
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp);
-    }
-
-    // U_m^{-1} D^{-1}
-    for (int s=0;s<Ls-1;s++){
-      // Chi[s] + 1/d chi[s] 
-      spProj5p(tmp,chi(ss+Ls-1)); 
-      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s)-(pueem[s]/pdee[Ls-1])*tmp);
-    }	
-    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
-      
-    // Apply U^{-1}
+    res = psi(ss+Ls-1) - plee[Ls-2]*tmp - acc;
+    
+    // Apply U_m^{-1} D^{-1} U^{-1}
+    res = (1.0/pdee[Ls-1])*res;
+    coalescedWrite(chi[ss+Ls-1],res);
+    spProj5p(acc,res);
+    spProj5m(tmp,res);
     for (int s=Ls-2;s>=0;s--){
-      spProj5m(tmp,chi(ss+s+1));  
-      coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp);
+      res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc;
+      spProj5m(tmp,res);
+      coalescedWrite(chi[ss+s],res);
     }
   });
 
   MooeeInvTime+=usecond();
-
+  
 }
 
 template<class Impl>
@@ -201,31 +204,36 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
   accelerator_for(sss,nloop,Simd::Nsimd(),{
     uint64_t ss=sss*Ls;
     typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp;
+    spinor tmp, acc, res;
 
-    // Apply (U^{\prime})^{-dagger}
-    coalescedWrite(chi[ss],psi(ss));
-    for (int s=1;s<Ls;s++){
-      spProj5m(tmp,chi(ss+s-1));
-      coalescedWrite(chi[ss+s], psi(ss+s)-conjugate(puee[s-1])*tmp);
+    // X = Nc*Ns
+    // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
+    // Apply (U^{\prime})^{-dagger} U_m^{-\dagger}
+    res = psi(ss);
+    spProj5p(tmp,res);
+    acc = conjugate(pueem[0])*tmp;
+    spProj5m(tmp,res);
+    coalescedWrite(chi[ss],res);
+    
+    for(int s=1;s<Ls-1;s++){
+      res = psi(ss+s);
+      res -= conjugate(puee[s-1])*tmp;
+      spProj5p(tmp,res);
+      acc += conjugate(pueem[s])*tmp;
+      spProj5m(tmp,res);
+      coalescedWrite(chi[ss+s],res);
     }
-    // U_m^{-\dagger} 
-    for (int s=0;s<Ls-1;s++){
-      spProj5p(tmp,chi(ss+s));
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - conjugate(pueem[s])*tmp);
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    for (int s=0;s<Ls-1;s++){
-      spProj5m(tmp,chi(ss+Ls-1));
-      coalescedWrite(chi[ss+s], conjugate(1.0/pdee[s])*chi(ss+s)-conjugate(pleem[s]/pdee[Ls-1])*tmp);
-    }	
-    coalescedWrite(chi[ss+Ls-1], conjugate(1.0/pdee[Ls-1])*chi(ss+Ls-1));
-  
-    // Apply L^{-dagger}
+    res = psi(ss+Ls-1) - conjugate(puee[Ls-2])*tmp - acc;
+    
+    // Apply L_m^{-\dagger} D^{-dagger} L^{-dagger} (1/dee is conjugated too, as in the unfused version)
+    res = conjugate(1.0/pdee[Ls-1])*res;
+    coalescedWrite(chi[ss+Ls-1],res);
+    spProj5m(acc,res);
+    spProj5p(tmp,res);
     for (int s=Ls-2;s>=0;s--){
-      spProj5p(tmp,chi(ss+s+1));
-      coalescedWrite(chi[ss+s], chi(ss+s) - conjugate(plee[s])*tmp);
+      res = conjugate(1.0/pdee[s])*chi(ss+s) - conjugate(plee[s])*tmp - conjugate(pleem[s])*acc;
+      spProj5p(tmp,res);
+      coalescedWrite(chi[ss+s],res);
     }
   });
   MooeeInvTime+=usecond();

From 0efaf3c4fa76b125596e9ea1d0cfeeb2bff1acf2 Mon Sep 17 00:00:00 2001
From: gfilaci <gianluca.filaci@gmail.com>
Date: Mon, 2 Sep 2019 11:33:00 +0100
Subject: [PATCH 05/12] access M5D coeffs through pointers

---
 .../implementation/CayleyFermion5Dcache.h     | 12 +++++--
 .../DomainWallEOFAFermionCache.h              | 11 +++++--
 .../implementation/MobiusEOFAFermionCache.h   | 31 +++++++++++++++----
 3 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
index 9dc9ba02..9fe5f9f8 100644
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
@@ -55,6 +55,10 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
   auto chi = chi_i.View();
   assert(phi.Checkerboard() == psi.Checkerboard());
 
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
+
   int Ls =this->Ls;
 
   // 10 = 3 complex mult + 2 complex add
@@ -72,7 +76,7 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
       uint64_t idx_l = ss+((s+Ls-1)%Ls);
       spProj5m(tmp1,psi(idx_u));
       spProj5p(tmp2,psi(idx_l));
-      coalescedWrite(chi[ss+s],diag[s]*phi(ss+s)+upper[s]*tmp1+lower[s]*tmp2);
+      coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
     }
   });
   M5Dtime+=usecond();
@@ -94,6 +98,10 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
   auto chi = chi_i.View();
   assert(phi.Checkerboard() == psi.Checkerboard());
 
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
+
   int Ls=this->Ls;
 
   // Flops = 6.0*(Nc*Ns) *Ls*vol
@@ -110,7 +118,7 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
       uint64_t idx_l = ss+((s+Ls-1)%Ls);
       spProj5p(tmp1,psi(idx_u));
       spProj5m(tmp2,psi(idx_l));
-      coalescedWrite(chi[ss+s],diag[s]*phi(ss+s)+upper[s]*tmp1+lower[s]*tmp2);
+      coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
     }
   });
   M5Dtime+=usecond();
diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
index a3eca650..2ffb89b8 100644
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
@@ -11,6 +11,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: David Murphy <dmurphy@phys.columbia.edu>
+Author: Gianluca Filaci <g.filaci@ed.ac.uk>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -49,6 +50,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
   auto psi = psi_i.View();
   auto chi = chi_i.View();
   assert(phi.Checkerboard() == psi.Checkerboard());
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
   // Flops = 6.0*(Nc*Ns) *Ls*vol
   this->M5Dcalls++;
   this->M5Dtime -= usecond();
@@ -63,7 +67,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
       uint64_t idx_l = ss+((s+Ls-1)%Ls);
       spProj5m(tmp1, psi(idx_u));
       spProj5p(tmp2, psi(idx_l));
-      coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+      coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
     }
   });
 
@@ -82,6 +86,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
   auto phi = phi_i.View();
   auto chi = chi_i.View();
   assert(phi.Checkerboard() == psi.Checkerboard());
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
 
   // Flops = 6.0*(Nc*Ns) *Ls*vol
   this->M5Dcalls++;
@@ -97,7 +104,7 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
       uint64_t idx_l = ss+((s+Ls-1)%Ls);
       spProj5p(tmp1, psi(idx_u));
       spProj5m(tmp2, psi(idx_l));
-      coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+      coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
     }
   });
 
diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
index 650435fc..4078267d 100644
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
@@ -11,6 +11,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: David Murphy <dmurphy@phys.columbia.edu>
+Author: Gianluca Filaci <g.filaci@ed.ac.uk>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -49,6 +50,10 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
 
   assert(phi.Checkerboard() == psi.Checkerboard());
 
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
+
   // Flops = 6.0*(Nc*Ns) *Ls*vol
   this->M5Dcalls++;
   this->M5Dtime -= usecond();
@@ -64,7 +69,7 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
       uint64_t idx_l = ss+((s+Ls-1)%Ls);
       spProj5m(tmp1, psi(idx_u));
       spProj5p(tmp2, psi(idx_l));
-      coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+      coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
     }
   });
 
@@ -88,6 +93,11 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
 
   assert(phi.Checkerboard() == psi.Checkerboard());
 
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
+  auto pshift_coeffs = &shift_coeffs[0];
+
   // Flops = 6.0*(Nc*Ns) *Ls*vol
   this->M5Dcalls++;
   this->M5Dtime -= usecond();
@@ -108,7 +118,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
       if(pm == 1){ spProj5p(tmp, psi(ss+shift_s)); }
       else       { spProj5m(tmp, psi(ss+shift_s)); }
 
-      coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 +lower[s]*tmp2 + shift_coeffs[s]*tmp);
+      coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 +plower[s]*tmp2 + pshift_coeffs[s]*tmp);
     }
   });
 
@@ -128,6 +138,10 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
 
   assert(phi.Checkerboard() == psi.Checkerboard());
 
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
+
   // Flops = 6.0*(Nc*Ns) *Ls*vol
   this->M5Dcalls++;
   this->M5Dtime -= usecond();
@@ -144,7 +158,7 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
       uint64_t idx_l = ss+((s+Ls-1)%Ls);
       spProj5p(tmp1, psi(idx_u));
       spProj5m(tmp2, psi(idx_l));
-      coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+      coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
     }
   });
 
@@ -166,6 +180,11 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
 
   assert(phi.Checkerboard() == psi.Checkerboard());
 
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
+  auto pshift_coeffs = &shift_coeffs[0];
+
   // Flops = 6.0*(Nc*Ns) *Ls*vol
   this->M5Dcalls++;
   this->M5Dtime -= usecond();
@@ -189,12 +208,12 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
       spProj5p(tmp1, psi(idx_u));
       spProj5m(tmp2, psi(idx_l));
 
-      if(s==(Ls-1)) coalescedWrite(chi[ss+s], chi(ss+s)+ diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
-      else          coalescedWrite(chi[ss+s], diag[s]*phi(ss+s) + upper[s]*tmp1 + lower[s]*tmp2);
+      if(s==(Ls-1)) coalescedWrite(chi[ss+s], chi(ss+s)+ pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
+      else          coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
       if(pm == 1){ spProj5p(tmp, psi(ss+s)); }
       else       { spProj5m(tmp, psi(ss+s)); }
 
-      coalescedWrite(chi[ss+shift_s],chi(ss+shift_s)+shift_coeffs[s]*tmp);
+      coalescedWrite(chi[ss+shift_s],chi(ss+shift_s)+pshift_coeffs[s]*tmp);
     }
   });
 

From e66669d30015e502278d65aaa14f2c1d19f4a05d Mon Sep 17 00:00:00 2001
From: gfilaci <gianluca.filaci@gmail.com>
Date: Mon, 2 Sep 2019 14:26:13 +0100
Subject: [PATCH 06/12] fast MooeeInv for EOFA

---
 .../implementation/CayleyFermion5Dcache.h     |   2 +-
 .../implementation/MobiusEOFAFermionCache.h   | 246 +++++++++---------
 2 files changed, 126 insertions(+), 122 deletions(-)

diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
index 9fe5f9f8..2f58a027 100644
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
@@ -148,7 +148,7 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
   accelerator_for(sss,nloop,Simd::Nsimd(),{
     uint64_t ss=sss*Ls;
     typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp, acc, res;;
+    spinor tmp, acc, res;
 
     // X = Nc*Ns
     // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
index 4078267d..ddf852de 100644
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
@@ -242,36 +242,38 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
 
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
-
-    uint64_t ss = sss*Ls;
-
+    uint64_t ss=sss*Ls;
     typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp;
+    spinor tmp, acc, res, tmp2_spProj;
 
-    // Apply (L^{\prime})^{-1}
-    coalescedWrite(chi[ss], psi(ss)); // chi[0]=psi[0]
-    for(int s=1; s<Ls; s++){
-      spProj5p(tmp, chi(ss+s-1));
-      coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp);
+    // X = Nc*Ns
+    // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
+    // Apply (L^{\prime})^{-1} L_m^{-1}
+    res = psi(ss);
+    spProj5m(tmp,res);
+    acc = pleem[0]*tmp;
+    spProj5p(tmp,res);
+    coalescedWrite(chi[ss],res);
+    
+    for(int s=1;s<Ls-1;s++){
+      res = psi(ss+s);
+      res -= plee[s-1]*tmp;
+      spProj5m(tmp,res);
+      acc += pleem[s]*tmp;
+      spProj5p(tmp,res);
+      coalescedWrite(chi[ss+s],res);
     }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      spProj5m(tmp, chi(ss+s));
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp);
-    }
-
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-      spProj5p(tmp, chi(ss+Ls-1));
-      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls-1])*tmp);
-    }
-    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
-
-    // Apply U^{-1}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5m(tmp, chi(ss+s+1));
-      coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp);
+    res = psi(ss+Ls-1) - plee[Ls-2]*tmp - acc;
+    
+    // Apply U_m^{-1} D^{-1} U^{-1}
+    res = (1.0/pdee[Ls-1])*res;
+    coalescedWrite(chi[ss+Ls-1],res);
+    spProj5p(acc,res);
+    spProj5m(tmp,res);
+    for (int s=Ls-2;s>=0;s--){
+      res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc;
+      spProj5m(tmp,res);
+      coalescedWrite(chi[ss+s],res);
     }
   });
    
@@ -300,45 +302,45 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
 
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
+      uint64_t ss=sss*Ls;
+      typedef decltype(coalescedRead(psi[0])) spinor;
+      spinor tmp, acc, res, tmp_spProj;
 
-    uint64_t ss = sss*Ls;
+      // Apply (L^{\prime})^{-1} L_m^{-1}
+      res = psi(ss);
+      spProj5m(tmp,res);
+      acc = pleem[0]*tmp;
+      spProj5p(tmp,res);
+      coalescedWrite(chi[ss],res);
+      tmp_spProj = pMooeeInv_shift_lc[0]*res;
 
-    typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp1,tmp2,tmp2_spProj;
+      for(int s=1;s<Ls-1;s++){
+	res = psi(ss+s);
+	tmp_spProj += pMooeeInv_shift_lc[s]*res;
+	res -= plee[s-1]*tmp;
+	spProj5m(tmp,res);
+	acc += pleem[s]*tmp;
+	spProj5p(tmp,res);
+	coalescedWrite(chi[ss+s],res);
+      }
+      res = psi(ss+Ls-1);
 
-    // Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
-    coalescedWrite(chi[ss], psi(ss)); // chi[0]=psi[0]
-    tmp2 = pMooeeInv_shift_lc[0]*psi(ss);
-    for(int s=1; s<Ls; s++){
-      spProj5p(tmp1, chi(ss+s-1));
-      coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp1);
-      tmp2 = tmp2 + pMooeeInv_shift_lc[s]*psi(ss+s);
-    }
-    if(pm == 1){ spProj5p(tmp2_spProj, tmp2);}
-    else       { spProj5m(tmp2_spProj, tmp2); }
+      tmp_spProj += pMooeeInv_shift_lc[Ls-1]*res;
+      if(pm == 1){ spProj5p(tmp_spProj, tmp_spProj);}
+      else       { spProj5m(tmp_spProj, tmp_spProj); }
 
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      spProj5m(tmp1, chi(ss+s));
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp1);
-    }
+      res = res - plee[Ls-2]*tmp - acc;
 
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-      spProj5p(tmp1, chi(ss+Ls-1));
-      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls-1])*tmp1);
-    }
-    // chi[ss+Ls-1] = (1.0/pdee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
-    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
-    spProj5m(tmp1, chi(ss+Ls-1));
-    coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) + pMooeeInv_shift_norm[Ls-1]*tmp2_spProj);
-
-    // Apply U^{-1} and add shift term
-    for(int s=Ls-2; s>=0; s--){
-      coalescedWrite(chi[ss+s] , chi(ss+s) - puee[s]*tmp1);
-      spProj5m(tmp1, chi(ss+s));
-      coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInv_shift_norm[s]*tmp2_spProj);
-    }
+      // Apply U_m^{-1} D^{-1} U^{-1}
+      res = (1.0/pdee[Ls-1])*res;
+      spProj5p(acc,res);
+      spProj5m(tmp,res);
+      coalescedWrite(chi[ss+Ls-1], res + pMooeeInv_shift_norm[Ls-1]*tmp_spProj);
+      for (int s=Ls-2;s>=0;s--){
+	res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc;
+	spProj5m(tmp,res);
+	coalescedWrite(chi[ss+s], res + pMooeeInv_shift_norm[s]*tmp_spProj);
+      }
   });
 
   this->MooeeInvTime += usecond();
@@ -366,36 +368,38 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
 
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
-
-    uint64_t ss = sss*Ls;
-
+uint64_t ss=sss*Ls;
     typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp;
+    spinor tmp, acc, res;
 
-    // Apply (U^{\prime})^{-dag}
-    coalescedWrite(chi[ss], psi(ss));
-    for(int s=1; s<Ls; s++){
-      spProj5m(tmp, chi(ss+s-1));
-      coalescedWrite(chi[ss+s], psi(ss+s) - puee[s-1]*tmp);
-    }
+    // X = Nc*Ns
+    // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
+    // Apply (L^{\prime})^{-1} L_m^{-1}
+    res = psi(ss);
+    spProj5p(tmp,res);
+    acc = pueem[0]*tmp;
+    spProj5m(tmp,res);
+    coalescedWrite(chi[ss],res);
     
-    // U_m^{-\dag}
-    for(int s=0; s<Ls-1; s++){
-      spProj5p(tmp, chi(ss+s));
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pueem[s]*tmp);
+    for(int s=1;s<Ls-1;s++){
+      res = psi(ss+s);
+      res -= puee[s-1]*tmp;
+      spProj5p(tmp,res);
+      acc += pueem[s]*tmp;
+      spProj5m(tmp,res);
+      coalescedWrite(chi[ss+s],res);
     }
-
-    // L_m^{-\dag} D^{-dag}
-    for(int s=0; s<Ls-1; s++){
-      spProj5m(tmp, chi(ss+Ls-1));
-      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pleem[s]/pdee[Ls-1])*tmp);
-    }
-    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
-
-    // Apply L^{-dag}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5p(tmp, chi(ss+s+1));
-      coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp);
+    res = psi(ss+Ls-1) - puee[Ls-2]*tmp - acc;
+    
+    // Apply U_m^{-1} D^{-1} U^{-1}
+    res = (1.0/pdee[Ls-1])*res;
+    coalescedWrite(chi[ss+Ls-1],res);
+    spProj5m(acc,res);
+    spProj5p(tmp,res);
+    for (int s=Ls-2;s>=0;s--){
+      res = (1.0/pdee[s])*chi(ss+s) - plee[s]*tmp - pleem[s]*acc;
+      spProj5p(tmp,res);
+      coalescedWrite(chi[ss+s],res);
     }
   });
 
@@ -425,45 +429,45 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
 
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
+      uint64_t ss=sss*Ls;
+      typedef decltype(coalescedRead(psi[0])) spinor;
+      spinor tmp, acc, res, tmp_spProj;
 
-    uint64_t ss = sss*Ls;
+      // Apply (L^{\prime})^{-1} L_m^{-1}
+      res = psi(ss);
+      spProj5p(tmp,res);
+      acc = pueem[0]*tmp;
+      spProj5m(tmp,res);
+      coalescedWrite(chi[ss],res);
+      tmp_spProj = pMooeeInvDag_shift_lc[0]*res;
 
-    typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp1,tmp2,tmp2_spProj;
+      for(int s=1;s<Ls-1;s++){
+	res = psi(ss+s);
+	tmp_spProj += pMooeeInvDag_shift_lc[s]*res;
+	res -= puee[s-1]*tmp;
+	spProj5p(tmp,res);
+	acc += pueem[s]*tmp;
+	spProj5m(tmp,res);
+	coalescedWrite(chi[ss+s],res);
+      }
+      res = psi(ss+Ls-1);
 
-    // Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2
-    coalescedWrite(chi[ss], psi(ss));
-    tmp2 = pMooeeInvDag_shift_lc[0]*psi(ss);
-    for(int s=1; s<Ls; s++){
-      spProj5m(tmp1, chi(ss+s-1));
-      coalescedWrite(chi[ss+s],psi(ss+s) - puee[s-1]*tmp1);
-      tmp2 = tmp2 + pMooeeInvDag_shift_lc[s]*psi(ss+s);
-    }
+      tmp_spProj += pMooeeInvDag_shift_lc[Ls-1]*res;
+      if(pm == 1){ spProj5p(tmp_spProj, tmp_spProj); }
+      else       { spProj5m(tmp_spProj, tmp_spProj); }
 
-    if(pm == 1){ spProj5p(tmp2_spProj, tmp2);}
-    else       { spProj5m(tmp2_spProj, tmp2);}
+      res = res - puee[Ls-2]*tmp - acc;
 
-    // U_m^{-\dag}
-    for(int s=0; s<Ls-1; s++){
-      spProj5p(tmp1, chi(ss+s));
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pueem[s]*tmp1);
-    }
-
-    // L_m^{-\dag} D^{-dag}
-    for(int s=0; s<Ls-1; s++){
-      spProj5m(tmp1, chi(ss+Ls-1));
-      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pleem[s]/pdee[Ls-1])*tmp1);
-    }
-    coalescedWrite(chi[ss+Ls-1], (1.0/pdee[Ls-1])*chi(ss+Ls-1));
-    spProj5p(tmp1, chi(ss+Ls-1));
-    coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) + pMooeeInvDag_shift_norm[Ls-1]*tmp2_spProj);
-
-    // Apply L^{-dag}
-    for(int s=Ls-2; s>=0; s--){
-      coalescedWrite(chi[ss+s], chi(ss+s) - plee[s]*tmp1);
-      spProj5p(tmp1, chi(ss+s));
-      coalescedWrite(chi[ss+s], chi(ss+s) + pMooeeInvDag_shift_norm[s]*tmp2_spProj);
-    }
+      // Apply U_m^{-1} D^{-1} U^{-1}
+      res = (1.0/pdee[Ls-1])*res;
+      spProj5m(acc,res);
+      spProj5p(tmp,res);
+      coalescedWrite(chi[ss+Ls-1], res + pMooeeInvDag_shift_norm[Ls-1]*tmp_spProj);
+      for (int s=Ls-2;s>=0;s--){
+	res = (1.0/pdee[s])*chi(ss+s) - plee[s]*tmp - pleem[s]*acc;
+	spProj5p(tmp,res);
+	coalescedWrite(chi[ss+s], res + pMooeeInvDag_shift_norm[s]*tmp_spProj);
+      }
   });
 
   this->MooeeInvTime += usecond();

From fdd9b14e82dc25045cfb8db7b86f98154a26e8e1 Mon Sep 17 00:00:00 2001
From: gfilaci <gianluca.filaci@gmail.com>
Date: Mon, 2 Sep 2019 14:49:51 +0100
Subject: [PATCH 07/12] speed up MooeeInvDag for DWF EOFA

---
 .../DomainWallEOFAFermionCache.h              | 32 +++++++------------
 1 file changed, 12 insertions(+), 20 deletions(-)

diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
index 2ffb89b8..8bdab03f 100644
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
@@ -175,23 +175,15 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
   auto chi = chi_i.View();
   int Ls = this->Ls;
 
+  auto plee  = & this->lee[0];
+  auto pdee  = & this->dee[0];
+  auto puee  = & this->uee[0];
+
+  auto pleem = & this->leem[0];
+  auto pueem = & this->ueem[0];
+
   assert(psi.Checkerboard() == psi.Checkerboard());
 
-  Vector<Coeff_t> ueec(Ls);
-  Vector<Coeff_t> deec(Ls+1);
-  Vector<Coeff_t> leec(Ls);
-  Vector<Coeff_t> ueemc(Ls);
-  Vector<Coeff_t> leemc(Ls);
-
-  for(int s=0; s<ueec.size(); s++){
-    ueec[s]  = conjugate(this->uee[s]);
-    deec[s]  = conjugate(this->dee[s]);
-    leec[s]  = conjugate(this->lee[s]);
-    ueemc[s] = conjugate(this->ueem[s]);
-    leemc[s] = conjugate(this->leem[s]);
-  }
-  deec[Ls] = conjugate(this->dee[Ls]);
-
   this->MooeeInvCalls++;
   this->MooeeInvTime -= usecond();
   auto nloop = grid->oSites()/Ls;
@@ -204,27 +196,27 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
     coalescedWrite(chi[ss], psi(ss));
     for(int s=1; s<Ls; s++){
       spProj5m(tmp1, chi(ss+s-1));
-      coalescedWrite(chi[ss+s], psi(ss+s) - ueec[s-1]*tmp1);
+      coalescedWrite(chi[ss+s], psi(ss+s) - conjugate(puee[s-1])*tmp1);
     }
 
     // U_m^{-\dagger}
     for(int s=0; s<Ls-1; s++){
       spProj5p(tmp1, chi(ss+s));
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - ueemc[s]*tmp1);
+      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - conjugate(pueem[s])*tmp1);
     }
 
     // L_m^{-\dagger} D^{-dagger}
     for(int s=0; s<Ls-1; s++){
       spProj5m(tmp1, chi(ss+Ls-1));
-      coalescedWrite(chi[ss+s] ,(1.0/deec[s])*chi(ss+s) - (leemc[s]/deec[Ls-1])*tmp1);
+      coalescedWrite(chi[ss+s] ,conjugate(1.0/pdee[s])*chi(ss+s) - conjugate(pleem[s]/pdee[Ls-1])*tmp1);
     }
     spProj5p(tmp2, chi(ss+Ls-1));
-    coalescedWrite(chi[ss+Ls-1], (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2);
+    coalescedWrite(chi[ss+Ls-1], conjugate(1.0/pdee[Ls-1])*tmp1 + conjugate(1.0/pdee[Ls])*tmp2);
 
     // Apply L^{-dagger}
     for(int s=Ls-2; s>=0; s--){
       spProj5p(tmp1, chi(ss+s+1));
-      coalescedWrite(chi[ss+s],chi(ss+s) - leec[s]*tmp1);
+      coalescedWrite(chi[ss+s],chi(ss+s) - conjugate(plee[s])*tmp1);
     }
   });
 

From 0c1efa523582a2b9f6121b2da036016ada623894 Mon Sep 17 00:00:00 2001
From: gfilaci <gianluca.filaci@gmail.com>
Date: Wed, 7 Aug 2019 12:11:18 +0100
Subject: [PATCH 08/12] pass OpenMP flag to host compiler

---
 configure.ac | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/configure.ac b/configure.ac
index ed7b9a17..b8f94fce 100644
--- a/configure.ac
+++ b/configure.ac
@@ -287,6 +287,9 @@ case ${CXX} in
     CXX="nvcc -x cu "
     CXXLD="nvcc -link"
     CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr"
+    if test $ac_openmp = yes; then
+       CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
+    fi
     ;;
   *)
     CXXLD=${CXX}

From a7fa86dc29e154ac3339dc8b15be831c80ecc0b9 Mon Sep 17 00:00:00 2001
From: gfilaci <gianluca.filaci@gmail.com>
Date: Thu, 5 Sep 2019 12:05:21 +0100
Subject: [PATCH 09/12] MooeeInv improvement for DW EOFA + comments

---
 .../implementation/CayleyFermion5Dcache.h     |   4 +-
 .../DomainWallEOFAFermionCache.h              | 109 +++++++++---------
 .../implementation/MobiusEOFAFermionCache.h   |  13 +--
 3 files changed, 64 insertions(+), 62 deletions(-)

diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
index 2f58a027..35402994 100644
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h
@@ -216,7 +216,7 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
 
     // X = Nc*Ns
     // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
-    // Apply (L^{\prime})^{-1} L_m^{-1}
+    // Apply (U^{\prime})^{-\dagger} U_m^{-\dagger}
     res = psi(ss);
     spProj5p(tmp,res);
     acc = conjugate(pueem[0])*tmp;
@@ -233,7 +233,7 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
     }
     res = psi(ss+Ls-1) - conjugate(puee[Ls-2])*tmp - acc;
     
-    // Apply U_m^{-1} D^{-1} U^{-1}
+    // Apply L_m^{-\dagger} D^{-\dagger} L^{-\dagger}
     res = (1.0/pdee[Ls-1])*res;
     coalescedWrite(chi[ss+Ls-1],res);
     spProj5m(acc,res);
diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
index 8bdab03f..46d3fa1f 100644
--- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h
@@ -131,36 +131,37 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
   this->MooeeInvTime -= usecond();
   uint64_t nloop=grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
-    auto ss=sss*Ls;
+    uint64_t ss=sss*Ls;
     typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp1,tmp2;
+    spinor tmp, acc, res;
 
-    // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
-    // Apply (L^{\prime})^{-1}
-    coalescedWrite(chi[ss],psi(ss)); // chi[0]=psi[0]
-    for(int s=1; s<Ls; s++){
-      spProj5p(tmp1, chi(ss+s-1));
-      coalescedWrite(chi[ss+s], psi(ss+s) - plee[s-1]*tmp1);
+    // Apply (L^{\prime})^{-1} L_m^{-1}
+    res = psi(ss);
+    spProj5m(tmp,res);
+    acc = pleem[0]*tmp;
+    spProj5p(tmp,res);
+    coalescedWrite(chi[ss],res);
+    
+    for(int s=1;s<Ls-1;s++){
+      res = psi(ss+s);
+      res -= plee[s-1]*tmp;
+      spProj5m(tmp,res);
+      acc += pleem[s]*tmp;
+      spProj5p(tmp,res);
+      coalescedWrite(chi[ss+s],res);
     }
-
-    // L_m^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
-      spProj5m(tmp1, chi(ss+s));
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - pleem[s]*tmp1);
-    }
-
-    // U_m^{-1} D^{-1}
-    for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
-      spProj5p(tmp1, chi(ss+Ls-1));
-      coalescedWrite(chi[ss+s], (1.0/pdee[s])*chi(ss+s) - (pueem[s]/pdee[Ls])*tmp1);
-    }
-    spProj5m(tmp2, chi(ss+Ls-1));
-    coalescedWrite(chi[ss+Ls-1],(1.0/pdee[Ls])*tmp1 + (1.0/pdee[Ls-1])*tmp2);
-
-    // Apply U^{-1}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5m(tmp1, chi(ss+s+1));
-      coalescedWrite(chi[ss+s], chi(ss+s) - puee[s]*tmp1);
+    res = psi(ss+Ls-1) - plee[Ls-2]*tmp - acc;
+    
+    // Apply U_m^{-1} D^{-1} U^{-1}
+    acc = (1.0/pdee[Ls  ])*res;
+    tmp = (1.0/pdee[Ls-1])*res;
+    spProj5p(acc,acc);
+    spProj5m(tmp,tmp);
+    coalescedWrite(chi[ss+Ls-1], acc + tmp);
+    for (int s=Ls-2;s>=0;s--){
+      res = (1.0/pdee[s])*chi(ss+s) - puee[s]*tmp - pueem[s]*acc;
+      spProj5m(tmp,res);
+      coalescedWrite(chi[ss+s],res);
     }
   });
   this->MooeeInvTime += usecond();
@@ -188,35 +189,37 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
   this->MooeeInvTime -= usecond();
   auto nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
+    uint64_t ss=sss*Ls;
     typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp1,tmp2;
-    auto ss=sss*Ls;
+    spinor tmp, acc, res;
 
-    // Apply (U^{\prime})^{-dagger}
-    coalescedWrite(chi[ss], psi(ss));
-    for(int s=1; s<Ls; s++){
-      spProj5m(tmp1, chi(ss+s-1));
-      coalescedWrite(chi[ss+s], psi(ss+s) - conjugate(puee[s-1])*tmp1);
+    // Apply (U^{\prime})^{-\dagger} U_m^{-\dagger}
+    res = psi(ss);
+    spProj5p(tmp,res);
+    acc = conjugate(pueem[0])*tmp;
+    spProj5m(tmp,res);
+    coalescedWrite(chi[ss],res);
+    
+    for(int s=1;s<Ls-1;s++){
+      res = psi(ss+s);
+      res -= conjugate(puee[s-1])*tmp;
+      spProj5p(tmp,res);
+      acc += conjugate(pueem[s])*tmp;
+      spProj5m(tmp,res);
+      coalescedWrite(chi[ss+s],res);
     }
-
-    // U_m^{-\dagger}
-    for(int s=0; s<Ls-1; s++){
-      spProj5p(tmp1, chi(ss+s));
-      coalescedWrite(chi[ss+Ls-1], chi(ss+Ls-1) - conjugate(pueem[s])*tmp1);
-    }
-
-    // L_m^{-\dagger} D^{-dagger}
-    for(int s=0; s<Ls-1; s++){
-      spProj5m(tmp1, chi(ss+Ls-1));
-      coalescedWrite(chi[ss+s] ,conjugate(1.0/pdee[s])*chi(ss+s) - conjugate(pleem[s]/pdee[Ls-1])*tmp1);
-    }
-    spProj5p(tmp2, chi(ss+Ls-1));
-    coalescedWrite(chi[ss+Ls-1], conjugate(1.0/pdee[Ls-1])*tmp1 + conjugate(1.0/pdee[Ls])*tmp2);
-
-    // Apply L^{-dagger}
-    for(int s=Ls-2; s>=0; s--){
-      spProj5p(tmp1, chi(ss+s+1));
-      coalescedWrite(chi[ss+s],chi(ss+s) - conjugate(plee[s])*tmp1);
+    res = psi(ss+Ls-1) - conjugate(puee[Ls-2])*tmp - acc;
+    
+    // Apply L_m^{-\dagger} D^{-\dagger} L^{-\dagger}
+    acc = conjugate(1.0/pdee[Ls-1])*res;
+    tmp = conjugate(1.0/pdee[Ls  ])*res;
+    spProj5m(acc,acc);
+    spProj5p(tmp,tmp);
+    coalescedWrite(chi[ss+Ls-1], acc + tmp);
+    for (int s=Ls-2;s>=0;s--){
+      res = conjugate(1.0/pdee[s])*chi(ss+s) - conjugate(plee[s])*tmp - conjugate(pleem[s])*acc;
+      spProj5p(tmp,res);
+      coalescedWrite(chi[ss+s],res);
     }
   });
 
diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
index ddf852de..f74c7a51 100644
--- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
+++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h
@@ -244,7 +244,7 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
   accelerator_for(sss,nloop,Simd::Nsimd(),{
     uint64_t ss=sss*Ls;
     typedef decltype(coalescedRead(psi[0])) spinor;
-    spinor tmp, acc, res, tmp2_spProj;
+    spinor tmp, acc, res;
 
     // X = Nc*Ns
     // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
@@ -368,13 +368,13 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
 
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
-uint64_t ss=sss*Ls;
+    uint64_t ss=sss*Ls;
     typedef decltype(coalescedRead(psi[0])) spinor;
     spinor tmp, acc, res;
 
     // X = Nc*Ns
     // flops = 2X + (Ls-2)(4X + 4X) + 6X + 1 + 2X + (Ls-1)(10X + 1) = -16X + Ls(1+18X) = -192 + 217*Ls flops
-    // Apply (L^{\prime})^{-1} L_m^{-1}
+    // Apply (U^{\prime})^{-\dagger} U_m^{-\dagger}
     res = psi(ss);
     spProj5p(tmp,res);
     acc = pueem[0]*tmp;
@@ -391,7 +391,7 @@ uint64_t ss=sss*Ls;
     }
     res = psi(ss+Ls-1) - puee[Ls-2]*tmp - acc;
     
-    // Apply U_m^{-1} D^{-1} U^{-1}
+    // Apply L_m^{-\dagger} D^{-\dagger} L^{-\dagger}
     res = (1.0/pdee[Ls-1])*res;
     coalescedWrite(chi[ss+Ls-1],res);
     spProj5m(acc,res);
@@ -402,7 +402,6 @@ uint64_t ss=sss*Ls;
       coalescedWrite(chi[ss+s],res);
     }
   });
-
   this->MooeeInvTime += usecond();
 }
 
@@ -433,7 +432,7 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
       typedef decltype(coalescedRead(psi[0])) spinor;
       spinor tmp, acc, res, tmp_spProj;
 
-      // Apply (L^{\prime})^{-1} L_m^{-1}
+      // Apply (U^{\prime})^{-\dagger} U_m^{-\dagger}
       res = psi(ss);
       spProj5p(tmp,res);
       acc = pueem[0]*tmp;
@@ -458,7 +457,7 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
 
       res = res - puee[Ls-2]*tmp - acc;
 
-      // Apply U_m^{-1} D^{-1} U^{-1}
+      // Apply L_m^{-\dagger} D^{-\dagger} L^{-\dagger}
       res = (1.0/pdee[Ls-1])*res;
       spProj5m(acc,res);
       spProj5p(tmp,res);

From 317645aaeb8b82a7bdd88649d1e715f35cf9e0a1 Mon Sep 17 00:00:00 2001
From: Nils Asmussen <n.asmussen@soton.ac.uk>
Date: Wed, 2 Oct 2019 16:25:23 +0100
Subject: [PATCH 10/12] undo (most) whitespace changes in the two files
 HMC/Mobius2p1fEOFA{,_F1}.cc

---
 HMC/Mobius2p1fEOFA.cc    | 348 +++++++++++++++++++--------------------
 HMC/Mobius2p1fEOFA_F1.cc | 316 +++++++++++++++++------------------
 2 files changed, 332 insertions(+), 332 deletions(-)

diff --git a/HMC/Mobius2p1fEOFA.cc b/HMC/Mobius2p1fEOFA.cc
index 4a37bc22..b1294da5 100644
--- a/HMC/Mobius2p1fEOFA.cc
+++ b/HMC/Mobius2p1fEOFA.cc
@@ -2,7 +2,7 @@
 
 Grid physics library, www.github.com/paboyle/Grid
 
-Source file:
+Source file: 
 
 Copyright (C) 2015-2016
 
@@ -36,132 +36,132 @@ directory
 
 NAMESPACE_BEGIN(Grid);
 
-/*
- * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
- *    -- Store the single prec action operator.
- *    -- Clone the gauge field from the operator function argument.
- *    -- Build the mixed precision operator dynamically from the passed operator and single prec clone.
- */
+  /*
+   * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
+   *    -- Store the single prec action operator.
+   *    -- Clone the gauge field from the operator function argument.
+   *    -- Build the mixed precision operator dynamically from the passed operator and single prec clone.
+   */
 
-template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF>
-class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
-public:
-  typedef typename FermionOperatorD::FermionField FieldD;
-  typedef typename FermionOperatorF::FermionField FieldF;
+  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
+  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
+  public:
+    typedef typename FermionOperatorD::FermionField FieldD;
+    typedef typename FermionOperatorF::FermionField FieldF;
 
-  using OperatorFunction<FieldD>::operator();
+    using OperatorFunction<FieldD>::operator();
 
-  RealD   Tolerance;
-  RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
-  Integer MaxInnerIterations;
-  Integer MaxOuterIterations;
-  GridBase* SinglePrecGrid4; //Grid for single-precision fields
-  GridBase* SinglePrecGrid5; //Grid for single-precision fields
-  RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+    RealD   Tolerance;
+    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
+    Integer MaxInnerIterations;
+    Integer MaxOuterIterations;
+    GridBase* SinglePrecGrid4; //Grid for single-precision fields
+    GridBase* SinglePrecGrid5; //Grid for single-precision fields
+    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
 
-  FermionOperatorF &FermOpF;
-  FermionOperatorD &FermOpD;;
-  SchurOperatorF &LinOpF;
-  SchurOperatorD &LinOpD;
+    FermionOperatorF &FermOpF;
+    FermionOperatorD &FermOpD;;
+    SchurOperatorF &LinOpF;
+    SchurOperatorD &LinOpD;
 
-  Integer TotalInnerIterations; //Number of inner CG iterations
-  Integer TotalOuterIterations; //Number of restarts
-  Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
+    Integer TotalInnerIterations; //Number of inner CG iterations
+    Integer TotalOuterIterations; //Number of restarts
+    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
 
-  MixedPrecisionConjugateGradientOperatorFunction(RealD tol,
-                                                  Integer maxinnerit,
-                                                  Integer maxouterit,
-                                                  GridBase* _sp_grid4,
-                                                  GridBase* _sp_grid5,
-                                                  FermionOperatorF &_FermOpF,
-                                                  FermionOperatorD &_FermOpD,
-                                                  SchurOperatorF   &_LinOpF,
-                                                  SchurOperatorD   &_LinOpD):
-    LinOpF(_LinOpF),
-    LinOpD(_LinOpD),
-    FermOpF(_FermOpF),
-    FermOpD(_FermOpD),
-    Tolerance(tol),
-    InnerTolerance(tol),
-    MaxInnerIterations(maxinnerit),
-    MaxOuterIterations(maxouterit),
-    SinglePrecGrid4(_sp_grid4),
-    SinglePrecGrid5(_sp_grid5),
-    OuterLoopNormMult(100.)
-  {
-    /* Debugging instances of objects; references are stored
-    std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
-    std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
-    std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
-    std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
-    */
-  };
+    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
+						    Integer maxinnerit, 
+						    Integer maxouterit, 
+						    GridBase* _sp_grid4, 
+						    GridBase* _sp_grid5, 
+						    FermionOperatorF &_FermOpF,
+						    FermionOperatorD &_FermOpD,
+						    SchurOperatorF   &_LinOpF,
+						    SchurOperatorD   &_LinOpD): 
+      LinOpF(_LinOpF),
+      LinOpD(_LinOpD),
+      FermOpF(_FermOpF),
+      FermOpD(_FermOpD),
+      Tolerance(tol), 
+      InnerTolerance(tol), 
+      MaxInnerIterations(maxinnerit), 
+      MaxOuterIterations(maxouterit), 
+      SinglePrecGrid4(_sp_grid4),
+      SinglePrecGrid5(_sp_grid5),
+      OuterLoopNormMult(100.) 
+    { 
+      /* Debugging instances of objects; references are stored
+      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
+      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
+      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
+      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
+      */
+    };
 
-  void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
+    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
 
-    std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
+      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
 
-    SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
+      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
+      
+      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
+      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
+      // Assumption made in code to extract gauge field
+      // We could avoid storing LinopD reference alltogether ?
+      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
 
-    //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
-    //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
-    // Assumption made in code to extract gauge field
-    // We could avoid storing LinopD reference alltogether ?
-    assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Must snarf a single precision copy of the gauge field in Linop_d argument
+      ////////////////////////////////////////////////////////////////////////////////////
+      typedef typename FermionOperatorF::GaugeField GaugeFieldF;
+      typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
+      typedef typename FermionOperatorD::GaugeField GaugeFieldD;
+      typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
 
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Must snarf a single precision copy of the gauge field in Linop_d argument
-    ////////////////////////////////////////////////////////////////////////////////////
-    typedef typename FermionOperatorF::GaugeField GaugeFieldF;
-    typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
-    typedef typename FermionOperatorD::GaugeField GaugeFieldD;
-    typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
+      GridBase * GridPtrF = SinglePrecGrid4;
+      GridBase * GridPtrD = FermOpD.Umu.Grid();
+      GaugeFieldF     U_f  (GridPtrF);
+      GaugeLinkFieldF Umu_f(GridPtrF);
+      //      std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
+      //      std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
 
-    GridBase * GridPtrF = SinglePrecGrid4;
-    GridBase * GridPtrD = FermOpD.Umu.Grid();
-    GaugeFieldF     U_f  (GridPtrF);
-    GaugeLinkFieldF Umu_f(GridPtrF);
-    //      std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
-    //      std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Moving this to a Clone method of fermion operator would allow to duplicate the 
+      // physics parameters and decrease gauge field copies
+      ////////////////////////////////////////////////////////////////////////////////////
+      GaugeLinkFieldD Umu_d(GridPtrD);
+      for(int mu=0;mu<Nd*2;mu++){ 
+	Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
+	precisionChange(Umu_f,Umu_d);
+	PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
+      }
+      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
+      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
 
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Moving this to a Clone method of fermion operator would allow to duplicate the
-    // physics parameters and decrease gauge field copies
-    ////////////////////////////////////////////////////////////////////////////////////
-    GaugeLinkFieldD Umu_d(GridPtrD);
-    for(int mu=0;mu<Nd*2;mu++){
-      Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
-      precisionChange(Umu_f,Umu_d);
-      PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Could test to make sure that LinOpF and LinOpD agree to single prec?
+      ////////////////////////////////////////////////////////////////////////////////////
+      /*
+      GridBase *Fgrid = psi._grid;
+      FieldD tmp2(Fgrid);
+      FieldD tmp1(Fgrid);
+      LinOpU.Op(src,tmp1);
+      LinOpD.Op(src,tmp2);
+      std::cout << " Double gauge field "<< norm2(FermOpD.Umu)<<std::endl;
+      std::cout << " Single gauge field "<< norm2(FermOpF.Umu)<<std::endl;
+      std::cout << " Test of operators "<<norm2(tmp1)<<std::endl;
+      std::cout << " Test of operators "<<norm2(tmp2)<<std::endl;
+      tmp1=tmp1-tmp2;
+      std::cout << " Test of operators diff "<<norm2(tmp1)<<std::endl;
+      */
+
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Make a mixed precision conjugate gradient
+      ////////////////////////////////////////////////////////////////////////////////////
+      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
+      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
+      MPCG(src,psi);
     }
-    pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
-    pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
-
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Could test to make sure that LinOpF and LinOpD agree to single prec?
-    ////////////////////////////////////////////////////////////////////////////////////
-    /*
-    GridBase *Fgrid = psi._grid;
-    FieldD tmp2(Fgrid);
-    FieldD tmp1(Fgrid);
-    LinOpU.Op(src,tmp1);
-    LinOpD.Op(src,tmp2);
-    std::cout << " Double gauge field "<< norm2(FermOpD.Umu)<<std::endl;
-    std::cout << " Single gauge field "<< norm2(FermOpF.Umu)<<std::endl;
-    std::cout << " Test of operators "<<norm2(tmp1)<<std::endl;
-    std::cout << " Test of operators "<<norm2(tmp2)<<std::endl;
-    tmp1=tmp1-tmp2;
-    std::cout << " Test of operators diff "<<norm2(tmp1)<<std::endl;
-    */
-
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Make a mixed precision conjugate gradient
-    ////////////////////////////////////////////////////////////////////////////////////
-    MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
-    std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
-    MPCG(src,psi);
-  }
-};
+  };
 
 NAMESPACE_END(Grid);
 
@@ -183,18 +183,18 @@ int main(int argc, char **argv) {
   typedef typename FermionActionF::FermionField FermionFieldF;
 
   typedef Grid::XmlReader       Serialiser;
-
+  
   //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
   IntegratorParameters MD;
-  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
   //  MD.name    = std::string("Leap Frog");
-  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
+  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
   MD.name    = std::string("Force Gradient");
-  //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
+  //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
   //  MD.name    = std::string("MinimumNorm2");
   MD.MDsteps = 6;
   MD.trajL   = 1.0;
-
+  
   HMCparameters HMCparams;
   HMCparams.StartTrajectory  = 590;
   HMCparams.Trajectories     = 1000;
@@ -207,7 +207,7 @@ int main(int argc, char **argv) {
 
   // Grid from the command line arguments --grid and --mpi
   TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
-
+  
   CheckpointerParameters CPparams;
   CPparams.config_prefix = "ckpoint_EODWF_lat";
   CPparams.rng_prefix    = "ckpoint_EODWF_rng";
@@ -221,7 +221,7 @@ int main(int argc, char **argv) {
   TheHMC.Resources.SetRNGSeeds(RNGpar);
 
   // Construct observables
-  // here there is too much indirection
+  // here there is too much indirection 
   typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
   TheHMC.Resources.AddObservable<PlaqObs>();
   //////////////////////////////////////////////
@@ -232,7 +232,7 @@ int main(int argc, char **argv) {
   Real strange_mass = 0.04;
   Real pv_mass      = 1.0;
   RealD M5  = 1.8;
-  RealD b   = 1.0;
+  RealD b   = 1.0; 
   RealD c   = 0.0;
 
   std::vector<Real> hasenbusch({ 0.1, 0.3, 0.6 });
@@ -261,7 +261,7 @@ int main(int argc, char **argv) {
   std::vector<Complex> boundary = {1,1,1,-1};
   FermionAction::ImplParams Params(boundary);
   FermionActionF::ImplParams ParamsF(boundary);
-
+  
   double ActionStoppingCondition     = 1e-10;
   double DerivativeStoppingCondition = 1e-6;
   double MaxCGIterations = 30000;
@@ -292,7 +292,7 @@ int main(int argc, char **argv) {
   OFRp.degree   = 14;
   OFRp.precision= 50;
 
-
+  
   MobiusEOFAFermionR Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
   MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
   MobiusEOFAFermionR Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
@@ -309,50 +309,50 @@ int main(int argc, char **argv) {
   LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF);
 
   MxPCG_EOFA ActionCGL(ActionStoppingCondition,
-                       MX_inner,
-                       MaxCGIterations,
-                       GridPtrF,
-                       FrbGridF,
-                       Strange_Op_LF,Strange_Op_L,
-                       Strange_LinOp_LF,Strange_LinOp_L);
+		       MX_inner,
+		       MaxCGIterations,
+		       GridPtrF,
+		       FrbGridF,
+		       Strange_Op_LF,Strange_Op_L,
+		       Strange_LinOp_LF,Strange_LinOp_L);
 
   MxPCG_EOFA DerivativeCGL(DerivativeStoppingCondition,
-                           MX_inner,
-                           MaxCGIterations,
-                           GridPtrF,
-                           FrbGridF,
-                           Strange_Op_LF,Strange_Op_L,
-                           Strange_LinOp_LF,Strange_LinOp_L);
-
+			   MX_inner,
+			   MaxCGIterations,
+			   GridPtrF,
+			   FrbGridF,
+			   Strange_Op_LF,Strange_Op_L,
+			   Strange_LinOp_LF,Strange_LinOp_L);
+  
   MxPCG_EOFA ActionCGR(ActionStoppingCondition,
-                       MX_inner,
-                       MaxCGIterations,
-                       GridPtrF,
-                       FrbGridF,
-                       Strange_Op_RF,Strange_Op_R,
-                       Strange_LinOp_RF,Strange_LinOp_R);
-
+		       MX_inner,
+		       MaxCGIterations,
+		       GridPtrF,
+		       FrbGridF,
+		       Strange_Op_RF,Strange_Op_R,
+		       Strange_LinOp_RF,Strange_LinOp_R);
+  
   MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition,
-                           MX_inner,
-                           MaxCGIterations,
-                           GridPtrF,
-                           FrbGridF,
-                           Strange_Op_RF,Strange_Op_R,
-                           Strange_LinOp_RF,Strange_LinOp_R);
+			   MX_inner,
+			   MaxCGIterations,
+			   GridPtrF,
+			   FrbGridF,
+			   Strange_Op_RF,Strange_Op_R,
+			   Strange_LinOp_RF,Strange_LinOp_R);
 
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
-    EOFA(Strange_Op_L, Strange_Op_R,
-         ActionCG,
-         ActionCGL, ActionCGR,
-         DerivativeCGL, DerivativeCGR,
-         OFRp, true);
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
+    EOFA(Strange_Op_L, Strange_Op_R, 
+	 ActionCG, 
+	 ActionCGL, ActionCGR,
+	 DerivativeCGL, DerivativeCGR,
+	 OFRp, true);
 #else
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
-    EOFA(Strange_Op_L, Strange_Op_R,
-         ActionCG,
-         ActionCG, ActionCG,
-         DerivativeCG, DerivativeCG,
-         OFRp, true);
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
+    EOFA(Strange_Op_L, Strange_Op_R, 
+	 ActionCG,
+	 ActionCG, ActionCG,
+	 DerivativeCG, DerivativeCG, 
+	 OFRp, true);
 #endif
   Level1.push_back(&EOFA);
 
@@ -383,7 +383,7 @@ int main(int argc, char **argv) {
   std::vector<MxPCG *> MPCG;
   std::vector<FermionActionF *> DenominatorsF;
   std::vector<LinearOperatorD *> LinOpD;
-  std::vector<LinearOperatorF *> LinOpF;
+  std::vector<LinearOperatorF *> LinOpF; 
 
   for(int h=0;h<n_hasenbusch+1;h++){
 
@@ -402,20 +402,20 @@ int main(int argc, char **argv) {
     LinOpF.push_back(new LinearOperatorF(*DenominatorsF[h]));
 
     MPCG.push_back(new MxPCG(DerivativeStoppingCondition,
-                             MX_inner,
-                             MaxCGIterations,
-                             GridPtrF,
-                             FrbGridF,
-                             *DenominatorsF[h],*Denominators[h],
-                             *LinOpF[h], *LinOpD[h]) );
+			     MX_inner,
+			     MaxCGIterations,
+			     GridPtrF,
+			     FrbGridF,
+			     *DenominatorsF[h],*Denominators[h],
+			     *LinOpF[h], *LinOpD[h]) );
 
     ActionMPCG.push_back(new MxPCG(ActionStoppingCondition,
-                                   MX_inner,
-                                   MaxCGIterations,
-                                   GridPtrF,
-                                   FrbGridF,
-                                   *DenominatorsF[h],*Denominators[h],
-                                   *LinOpF[h], *LinOpD[h]) );
+				   MX_inner,
+				   MaxCGIterations,
+				   GridPtrF,
+				   FrbGridF,
+				   *DenominatorsF[h],*Denominators[h],
+				   *LinOpF[h], *LinOpD[h]) );
 
     // Heatbath not mixed yet. As inverts numerators not so important as raised mass.
     Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],ActionCG));
diff --git a/HMC/Mobius2p1fEOFA_F1.cc b/HMC/Mobius2p1fEOFA_F1.cc
index 9d006da3..3f0a7bf6 100644
--- a/HMC/Mobius2p1fEOFA_F1.cc
+++ b/HMC/Mobius2p1fEOFA_F1.cc
@@ -2,7 +2,7 @@
 
 Grid physics library, www.github.com/paboyle/Grid
 
-Source file:
+Source file: 
 
 Copyright (C) 2015-2016
 
@@ -36,115 +36,115 @@ directory
 
 NAMESPACE_BEGIN(Grid);
 
-/*
- * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
- *    -- Store the single prec action operator.
- *    -- Clone the gauge field from the operator function argument.
- *    -- Build the mixed precision operator dynamically from the passed operator and single prec clone.
- */
+  /*
+   * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up)
+   *    -- Store the single prec action operator.
+   *    -- Clone the gauge field from the operator function argument.
+   *    -- Build the mixed precision operator dynamically from the passed operator and single prec clone.
+   */
 
-template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF>
-class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
-public:
-  typedef typename FermionOperatorD::FermionField FieldD;
-  typedef typename FermionOperatorF::FermionField FieldF;
+  template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF> 
+  class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> {
+  public:
+    typedef typename FermionOperatorD::FermionField FieldD;
+    typedef typename FermionOperatorF::FermionField FieldF;
 
-  using OperatorFunction<FieldD>::operator();
+    using OperatorFunction<FieldD>::operator();
 
-  RealD   Tolerance;
-  RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
-  Integer MaxInnerIterations;
-  Integer MaxOuterIterations;
-  GridBase* SinglePrecGrid4; //Grid for single-precision fields
-  GridBase* SinglePrecGrid5; //Grid for single-precision fields
-  RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
+    RealD   Tolerance;
+    RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
+    Integer MaxInnerIterations;
+    Integer MaxOuterIterations;
+    GridBase* SinglePrecGrid4; //Grid for single-precision fields
+    GridBase* SinglePrecGrid5; //Grid for single-precision fields
+    RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
 
-  FermionOperatorF &FermOpF;
-  FermionOperatorD &FermOpD;;
-  SchurOperatorF &LinOpF;
-  SchurOperatorD &LinOpD;
+    FermionOperatorF &FermOpF;
+    FermionOperatorD &FermOpD;;
+    SchurOperatorF &LinOpF;
+    SchurOperatorD &LinOpD;
 
-  Integer TotalInnerIterations; //Number of inner CG iterations
-  Integer TotalOuterIterations; //Number of restarts
-  Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
+    Integer TotalInnerIterations; //Number of inner CG iterations
+    Integer TotalOuterIterations; //Number of restarts
+    Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
 
-  MixedPrecisionConjugateGradientOperatorFunction(RealD tol,
-                                                  Integer maxinnerit,
-                                                  Integer maxouterit,
-                                                  GridBase* _sp_grid4,
-                                                  GridBase* _sp_grid5,
-                                                  FermionOperatorF &_FermOpF,
-                                                  FermionOperatorD &_FermOpD,
-                                                  SchurOperatorF   &_LinOpF,
-                                                  SchurOperatorD   &_LinOpD):
-    LinOpF(_LinOpF),
-    LinOpD(_LinOpD),
-    FermOpF(_FermOpF),
-    FermOpD(_FermOpD),
-    Tolerance(tol),
-    InnerTolerance(tol),
-    MaxInnerIterations(maxinnerit),
-    MaxOuterIterations(maxouterit),
-    SinglePrecGrid4(_sp_grid4),
-    SinglePrecGrid5(_sp_grid5),
-    OuterLoopNormMult(100.)
-  {
-    /* Debugging instances of objects; references are stored
-    std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
-    std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
-    std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
-    std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
-    */
-  };
+    MixedPrecisionConjugateGradientOperatorFunction(RealD tol, 
+						    Integer maxinnerit, 
+						    Integer maxouterit, 
+						    GridBase* _sp_grid4, 
+						    GridBase* _sp_grid5, 
+						    FermionOperatorF &_FermOpF,
+						    FermionOperatorD &_FermOpD,
+						    SchurOperatorF   &_LinOpF,
+						    SchurOperatorD   &_LinOpD): 
+      LinOpF(_LinOpF),
+      LinOpD(_LinOpD),
+      FermOpF(_FermOpF),
+      FermOpD(_FermOpD),
+      Tolerance(tol), 
+      InnerTolerance(tol), 
+      MaxInnerIterations(maxinnerit), 
+      MaxOuterIterations(maxouterit), 
+      SinglePrecGrid4(_sp_grid4),
+      SinglePrecGrid5(_sp_grid5),
+      OuterLoopNormMult(100.) 
+    { 
+      /* Debugging instances of objects; references are stored
+      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpF " <<std::hex<< &LinOpF<<std::dec <<std::endl;
+      std::cout << GridLogMessage << " Mixed precision CG wrapper LinOpD " <<std::hex<< &LinOpD<<std::dec <<std::endl;
+      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpF " <<std::hex<< &FermOpF<<std::dec <<std::endl;
+      std::cout << GridLogMessage << " Mixed precision CG wrapper FermOpD " <<std::hex<< &FermOpD<<std::dec <<std::endl;
+      */
+    };
 
-  void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
+    void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) {
 
-    std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
+      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl;
 
-    SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
+      SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU);
+      
+      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
+      //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
+      // Assumption made in code to extract gauge field
+      // We could avoid storing LinopD reference alltogether ?
+      assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
 
-    //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpU " <<std::hex<< &(SchurOpU->_Mat)<<std::dec <<std::endl;
-    //      std::cout << GridLogMessage << " Mixed precision CG wrapper operator() FermOpD " <<std::hex<< &(LinOpD._Mat) <<std::dec <<std::endl;
-    // Assumption made in code to extract gauge field
-    // We could avoid storing LinopD reference alltogether ?
-    assert(&(SchurOpU->_Mat)==&(LinOpD._Mat));
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Must snarf a single precision copy of the gauge field in Linop_d argument
+      ////////////////////////////////////////////////////////////////////////////////////
+      typedef typename FermionOperatorF::GaugeField GaugeFieldF;
+      typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
+      typedef typename FermionOperatorD::GaugeField GaugeFieldD;
+      typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
 
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Must snarf a single precision copy of the gauge field in Linop_d argument
-    ////////////////////////////////////////////////////////////////////////////////////
-    typedef typename FermionOperatorF::GaugeField GaugeFieldF;
-    typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF;
-    typedef typename FermionOperatorD::GaugeField GaugeFieldD;
-    typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD;
+      GridBase * GridPtrF = SinglePrecGrid4;
+      GridBase * GridPtrD = FermOpD.Umu.Grid();
+      GaugeFieldF     U_f  (GridPtrF);
+      GaugeLinkFieldF Umu_f(GridPtrF);
+      //      std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
+      //      std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
 
-    GridBase * GridPtrF = SinglePrecGrid4;
-    GridBase * GridPtrD = FermOpD.Umu.Grid();
-    GaugeFieldF     U_f  (GridPtrF);
-    GaugeLinkFieldF Umu_f(GridPtrF);
-    //      std::cout << " Dim gauge field "<<GridPtrF->Nd()<<std::endl; // 4d
-    //      std::cout << " Dim gauge field "<<GridPtrD->Nd()<<std::endl; // 4d
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Moving this to a Clone method of fermion operator would allow to duplicate the 
+      // physics parameters and decrease gauge field copies
+      ////////////////////////////////////////////////////////////////////////////////////
+      GaugeLinkFieldD Umu_d(GridPtrD);
+      for(int mu=0;mu<Nd*2;mu++){ 
+	Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
+	precisionChange(Umu_f,Umu_d);
+	PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
+      }
+      pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
+      pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
 
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Moving this to a Clone method of fermion operator would allow to duplicate the
-    // physics parameters and decrease gauge field copies
-    ////////////////////////////////////////////////////////////////////////////////////
-    GaugeLinkFieldD Umu_d(GridPtrD);
-    for(int mu=0;mu<Nd*2;mu++){
-      Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu);
-      precisionChange(Umu_f,Umu_d);
-      PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu);
+      ////////////////////////////////////////////////////////////////////////////////////
+      // Make a mixed precision conjugate gradient
+      ////////////////////////////////////////////////////////////////////////////////////
+      MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
+      std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
+      MPCG(src,psi);
     }
-    pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu);
-    pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu);
-
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Make a mixed precision conjugate gradient
-    ////////////////////////////////////////////////////////////////////////////////////
-    MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD);
-    std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl;
-    MPCG(src,psi);
-  }
-};
+  };
 
 NAMESPACE_END(Grid);
 
@@ -167,12 +167,12 @@ int main(int argc, char **argv) {
   typedef typename FermionActionF::FermionField FermionFieldF;
 
   typedef Grid::XmlReader       Serialiser;
-
+  
   //::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
 
-  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;
-  //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
-  typedef GenericHMCRunner<ForceGradient> HMCWrapper;
+  //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; 
+  //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; 
+  typedef GenericHMCRunner<ForceGradient> HMCWrapper; 
 
   HMCparameters HMCparams;
   {
@@ -184,7 +184,7 @@ int main(int argc, char **argv) {
 
   // Grid from the command line arguments --grid and --mpi
   TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
-
+  
   CheckpointerParameters CPparams;
   CPparams.config_prefix = "ckpoint_EODWF_lat";
   CPparams.rng_prefix    = "ckpoint_EODWF_rng";
@@ -198,7 +198,7 @@ int main(int argc, char **argv) {
   TheHMC.Resources.SetRNGSeeds(RNGpar);
 
   // Construct observables
-  // here there is too much indirection
+  // here there is too much indirection 
   typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
   TheHMC.Resources.AddObservable<PlaqObs>();
   //////////////////////////////////////////////
@@ -209,7 +209,7 @@ int main(int argc, char **argv) {
   Real strange_mass = 0.02144;
   Real pv_mass      = 1.0;
   RealD M5  = 1.8;
-  RealD b   = 1.5;
+  RealD b   = 1.5; 
   RealD c   = 0.5;
 
   // Copied from paper
@@ -222,7 +222,7 @@ int main(int argc, char **argv) {
   ///////////////////////////////////////////////////////////////////////////////////////////////
   //Bad choices with large dH. Equalising force L2 norm was not wise.
   ///////////////////////////////////////////////////////////////////////////////////////////////
-  //std::vector<Real> hasenbusch({ 0.03, 0.2, 0.3, 0.5, 0.8 });
+  //std::vector<Real> hasenbusch({ 0.03, 0.2, 0.3, 0.5, 0.8 }); 
   //std::vector<Real> hasenbusch({ 0.05, 0.2, 0.4, 0.6, 0.8 });
 
   auto GridPtr   = TheHMC.Resources.GetCartesian();
@@ -249,7 +249,7 @@ int main(int argc, char **argv) {
   std::vector<Complex> boundary = {1,1,1,-1};
   FermionAction::ImplParams Params(boundary);
   FermionActionF::ImplParams ParamsF(boundary);
-
+  
   double ActionStoppingCondition     = 1e-10;
   double DerivativeStoppingCondition = 1e-7;
   double MaxCGIterations = 30000;
@@ -280,7 +280,7 @@ int main(int argc, char **argv) {
   OFRp.degree   = 12;
   OFRp.precision= 50;
 
-
+  
   MobiusEOFAFermionR Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
   MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *GridPtrF, *GridRBPtrF, strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
   MobiusEOFAFermionR Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c);
@@ -298,51 +298,51 @@ int main(int argc, char **argv) {
   LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF);
 
   MxPCG_EOFA ActionCGL(ActionStoppingCondition,
-                       MX_inner,
-                       MaxCGIterations,
-                       GridPtrF,
-                       FrbGridF,
-                       Strange_Op_LF,Strange_Op_L,
-                       Strange_LinOp_LF,Strange_LinOp_L);
+		       MX_inner,
+		       MaxCGIterations,
+		       GridPtrF,
+		       FrbGridF,
+		       Strange_Op_LF,Strange_Op_L,
+		       Strange_LinOp_LF,Strange_LinOp_L);
 
   MxPCG_EOFA DerivativeCGL(DerivativeStoppingCondition,
-                           MX_inner,
-                           MaxCGIterations,
-                           GridPtrF,
-                           FrbGridF,
-                           Strange_Op_LF,Strange_Op_L,
-                           Strange_LinOp_LF,Strange_LinOp_L);
-
+			   MX_inner,
+			   MaxCGIterations,
+			   GridPtrF,
+			   FrbGridF,
+			   Strange_Op_LF,Strange_Op_L,
+			   Strange_LinOp_LF,Strange_LinOp_L);
+  
   MxPCG_EOFA ActionCGR(ActionStoppingCondition,
-                       MX_inner,
-                       MaxCGIterations,
-                       GridPtrF,
-                       FrbGridF,
-                       Strange_Op_RF,Strange_Op_R,
-                       Strange_LinOp_RF,Strange_LinOp_R);
-
+		       MX_inner,
+		       MaxCGIterations,
+		       GridPtrF,
+		       FrbGridF,
+		       Strange_Op_RF,Strange_Op_R,
+		       Strange_LinOp_RF,Strange_LinOp_R);
+  
   MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition,
-                           MX_inner,
-                           MaxCGIterations,
-                           GridPtrF,
-                           FrbGridF,
-                           Strange_Op_RF,Strange_Op_R,
-                           Strange_LinOp_RF,Strange_LinOp_R);
+			   MX_inner,
+			   MaxCGIterations,
+			   GridPtrF,
+			   FrbGridF,
+			   Strange_Op_RF,Strange_Op_R,
+			   Strange_LinOp_RF,Strange_LinOp_R);
 
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
-    EOFA(Strange_Op_L, Strange_Op_R,
-         ActionCG,
-         ActionCGL, ActionCGR,
-         DerivativeCGL, DerivativeCGR,
-         OFRp, true);
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
+    EOFA(Strange_Op_L, Strange_Op_R, 
+	 ActionCG, 
+	 ActionCGL, ActionCGR,
+	 DerivativeCGL, DerivativeCGR,
+	 OFRp, true);
 #else
-  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
-    EOFA(Strange_Op_L, Strange_Op_R,
-         ActionCG,
-         ActionCG, ActionCG,
-         ActionCG, ActionCG,
-         //         DerivativeCG, DerivativeCG,
-         OFRp, true);
+  ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy> 
+    EOFA(Strange_Op_L, Strange_Op_R, 
+	 ActionCG, 
+	 ActionCG, ActionCG,
+	 ActionCG, ActionCG,
+	 //         DerivativeCG, DerivativeCG,
+	 OFRp, true);
 #endif
   Level1.push_back(&EOFA);
 
@@ -373,7 +373,7 @@ int main(int argc, char **argv) {
   std::vector<MxPCG *> MPCG;
   std::vector<FermionActionF *> DenominatorsF;
   std::vector<LinearOperatorD *> LinOpD;
-  std::vector<LinearOperatorF *> LinOpF;
+  std::vector<LinearOperatorF *> LinOpF; 
 
   for(int h=0;h<n_hasenbusch+1;h++){
 
@@ -395,20 +395,20 @@ int main(int argc, char **argv) {
     double conv  = DerivativeStoppingCondition;
     if (h<3) conv= DerivativeStoppingConditionLoose; // Relax on first two hasenbusch factors
     MPCG.push_back(new MxPCG(conv,
-                             MX_inner,
-                             MaxCGIterations,
-                             GridPtrF,
-                             FrbGridF,
-                             *DenominatorsF[h],*Denominators[h],
-                             *LinOpF[h], *LinOpD[h]) );
+			     MX_inner,
+			     MaxCGIterations,
+			     GridPtrF,
+			     FrbGridF,
+			     *DenominatorsF[h],*Denominators[h],
+			     *LinOpF[h], *LinOpD[h]) );
 
     ActionMPCG.push_back(new MxPCG(ActionStoppingCondition,
-                                   MX_inner,
-                                   MaxCGIterations,
-                                   GridPtrF,
-                                   FrbGridF,
-                                   *DenominatorsF[h],*Denominators[h],
-                                   *LinOpF[h], *LinOpD[h]) );
+				   MX_inner,
+				   MaxCGIterations,
+				   GridPtrF,
+				   FrbGridF,
+				   *DenominatorsF[h],*Denominators[h],
+				   *LinOpF[h], *LinOpD[h]) );
 
     // Heatbath not mixed yet. As inverts numerators not so important as raised mass.
     Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],ActionCG));

From a9867861926722431f21158c6799c58f3dac5c13 Mon Sep 17 00:00:00 2001
From: Nils Asmussen <n.asmussen@soton.ac.uk>
Date: Mon, 8 Apr 2019 15:29:53 +0100
Subject: [PATCH 11/12] bootstrap.sh: verify checksum of Eigen tar file if
 sha256sum is installed

---
 bootstrap.sh | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/bootstrap.sh b/bootstrap.sh
index 49f6b89e..4bd3de5e 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -1,10 +1,21 @@
 #!/usr/bin/env bash
+set -e
 
 EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2'
+EIGEN_SHA256SUM='685adf14bd8e9c015b78097c1dc22f2f01343756f196acdc76a678e1ae352e11'
+
 
 echo "-- deploying Eigen source..."
 ARC=`basename ${EIGEN_URL}`
-wget ${EIGEN_URL} --no-check-certificate && ./scripts/update_eigen.sh ${ARC} && rm ${ARC}
+wget "${EIGEN_URL}" --no-check-certificate
+if command -v sha256sum >/dev/null 2>&1; then # probe only: do not echo the tool's path
+   echo "${EIGEN_SHA256SUM}  ${ARC}" \
+      | sha256sum --check || exit 1
+else
+   echo "WARNING: could not verify checksum, please install sha256sum" >&2
+fi
+./scripts/update_eigen.sh "${ARC}"
+rm "${ARC}"
 # patch for non-portable includes in Eigen 3.3.5
 # apparently already fixed in Eigen HEAD so it should not be 
 # a problem in the future (A.P.)

From bbe48998a8ee402cf4900d81f5edb320fe5fce07 Mon Sep 17 00:00:00 2001
From: Nils Asmussen <n.asmussen@soton.ac.uk>
Date: Mon, 20 May 2019 17:25:19 +0100
Subject: [PATCH 12/12] sort Modules.hpp and modules.inc + add module
 JacobiSmear

---
 Hadrons/Modules.hpp                     |  13 +--
 Hadrons/Modules/MSource/JacobiSmear.cc  |   7 ++
 Hadrons/Modules/MSource/JacobiSmear.hpp | 105 ++++++++++++++++++++++++
 Hadrons/modules.inc                     |  22 ++---
 4 files changed, 131 insertions(+), 16 deletions(-)
 create mode 100644 Hadrons/Modules/MSource/JacobiSmear.cc
 create mode 100644 Hadrons/Modules/MSource/JacobiSmear.hpp

diff --git a/Hadrons/Modules.hpp b/Hadrons/Modules.hpp
index 3c30f206..c828e120 100644
--- a/Hadrons/Modules.hpp
+++ b/Hadrons/Modules.hpp
@@ -1,8 +1,8 @@
 #include <Hadrons/Modules/MAction/DWF.hpp>
 #include <Hadrons/Modules/MAction/MobiusDWF.hpp>
 #include <Hadrons/Modules/MAction/ScaledDWF.hpp>
-#include <Hadrons/Modules/MAction/Wilson.hpp>
 #include <Hadrons/Modules/MAction/WilsonClover.hpp>
+#include <Hadrons/Modules/MAction/Wilson.hpp>
 #include <Hadrons/Modules/MAction/ZMobiusDWF.hpp>
 #include <Hadrons/Modules/MContraction/A2AAslashField.hpp>
 #include <Hadrons/Modules/MContraction/A2AFourQuarkContraction.hpp>
@@ -33,8 +33,8 @@
 #include <Hadrons/Modules/MGauge/Random.hpp>
 #include <Hadrons/Modules/MGauge/StochEm.hpp>
 #include <Hadrons/Modules/MGauge/StoutSmearing.hpp>
-#include <Hadrons/Modules/MGauge/Unit.hpp>
 #include <Hadrons/Modules/MGauge/UnitEm.hpp>
+#include <Hadrons/Modules/MGauge/Unit.hpp>
 #include <Hadrons/Modules/MIO/LoadA2AMatrixDiskVector.hpp>
 #include <Hadrons/Modules/MIO/LoadA2AVectors.hpp>
 #include <Hadrons/Modules/MIO/LoadBinary.hpp>
@@ -44,12 +44,12 @@
 #include <Hadrons/Modules/MIO/LoadEigenPack.hpp>
 #include <Hadrons/Modules/MIO/LoadNersc.hpp>
 #include <Hadrons/Modules/MIO/LoadPerambulator.hpp>
-#include <Hadrons/Modules/MNPR/Amputate.hpp>
-#include <Hadrons/Modules/MNPR/Bilinear.hpp>
-#include <Hadrons/Modules/MNPR/FourQuark.hpp>
 #include <Hadrons/Modules/MNoise/FullVolumeSpinColorDiagonal.hpp>
 #include <Hadrons/Modules/MNoise/SparseSpinColorDiagonal.hpp>
 #include <Hadrons/Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp>
+#include <Hadrons/Modules/MNPR/Amputate.hpp>
+#include <Hadrons/Modules/MNPR/Bilinear.hpp>
+#include <Hadrons/Modules/MNPR/FourQuark.hpp>
 #include <Hadrons/Modules/MScalar/ChargedProp.hpp>
 #include <Hadrons/Modules/MScalar/FreeProp.hpp>
 #include <Hadrons/Modules/MScalar/Scalar.hpp>
@@ -57,10 +57,10 @@
 #include <Hadrons/Modules/MScalarSUN/EMT.hpp>
 #include <Hadrons/Modules/MScalarSUN/Grad.hpp>
 #include <Hadrons/Modules/MScalarSUN/StochFreeField.hpp>
+#include <Hadrons/Modules/MScalarSUN/TransProj.hpp>
 #include <Hadrons/Modules/MScalarSUN/TrKinetic.hpp>
 #include <Hadrons/Modules/MScalarSUN/TrMag.hpp>
 #include <Hadrons/Modules/MScalarSUN/TrPhi.hpp>
-#include <Hadrons/Modules/MScalarSUN/TransProj.hpp>
 #include <Hadrons/Modules/MScalarSUN/TwoPoint.hpp>
 #include <Hadrons/Modules/MScalarSUN/TwoPointNPR.hpp>
 #include <Hadrons/Modules/MScalarSUN/Utils.hpp>
@@ -74,6 +74,7 @@
 #include <Hadrons/Modules/MSolver/RBPrecCG.hpp>
 #include <Hadrons/Modules/MSource/Convolution.hpp>
 #include <Hadrons/Modules/MSource/Gauss.hpp>
+#include <Hadrons/Modules/MSource/JacobiSmear.hpp>
 #include <Hadrons/Modules/MSource/Momentum.hpp>
 #include <Hadrons/Modules/MSource/MomentumPhase.hpp>
 #include <Hadrons/Modules/MSource/Point.hpp>
diff --git a/Hadrons/Modules/MSource/JacobiSmear.cc b/Hadrons/Modules/MSource/JacobiSmear.cc
new file mode 100644
index 00000000..e9b37f1b
--- /dev/null
+++ b/Hadrons/Modules/MSource/JacobiSmear.cc
@@ -0,0 +1,7 @@
+#include <Hadrons/Modules/MSource/JacobiSmear.hpp>
+
+using namespace Grid;
+using namespace Hadrons;
+using namespace MSource;
+
+template class Grid::Hadrons::MSource::TJacobiSmear<FIMPL>; // explicit instantiation of the specialisation registered in JacobiSmear.hpp
diff --git a/Hadrons/Modules/MSource/JacobiSmear.hpp b/Hadrons/Modules/MSource/JacobiSmear.hpp
new file mode 100644
index 00000000..5aa5c705
--- /dev/null
+++ b/Hadrons/Modules/MSource/JacobiSmear.hpp
@@ -0,0 +1,105 @@
+#ifndef Hadrons_MSource_JacobiSmear_hpp_
+#define Hadrons_MSource_JacobiSmear_hpp_
+
+#include <Hadrons/Global.hpp>
+#include <Hadrons/Module.hpp>
+#include <Hadrons/ModuleFactory.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+/******************************************************************************
+ *                         JacobiSmear                                 *
+ ******************************************************************************/
+BEGIN_MODULE_NAMESPACE(MSource)
+
+class JacobiSmearPar: Serializable // parameters TJacobiSmear reads through par()
+{
+public:
+    GRID_SERIALIZABLE_CLASS_MEMBERS(JacobiSmearPar,
+                                    std::string, gauge,   // name of the gauge field object to fetch
+                                    double, width,        // forwarded unchanged to GaussianSmear
+                                    int, iterations,      // forwarded unchanged to GaussianSmear
+                                    int, orthog,          // forwarded to GaussianSmear (presumably the unsmeared direction - TODO confirm)
+                                    std::string, source); // name of the propagator to smear
+};
+
+template <typename FImpl> // FImpl supplies the field types used by this module
+class TJacobiSmear: public Module<JacobiSmearPar> // smears a propagator with CovariantSmearing::GaussianSmear
+{
+public:
+    FERM_TYPE_ALIASES(FImpl,);
+    typedef typename FImpl::GaugeLinkField GaugeMat; // one Lorentz component of the gauge field
+public:
+    // constructor: forwards the module name to Module<JacobiSmearPar>
+    TJacobiSmear(const std::string name);
+    // destructor: nothing to release
+    virtual ~TJacobiSmear(void) {};
+    // dependency relation: objects read (getInput) and produced (getOutput)
+    virtual std::vector<std::string> getInput(void);
+    virtual std::vector<std::string> getOutput(void);
+    // setup: creates the output propagator and the temporary gauge links
+    virtual void setup(void);
+    // execution: copies the source propagator and smears the copy
+    virtual void execute(void);
+};
+
+MODULE_REGISTER_TMP(JacobiSmear, TJacobiSmear<FIMPL>, MSource);
+
+/******************************************************************************
+ *                 TJacobiSmear implementation                             *
+ ******************************************************************************/
+// constructor: only forwards the module name to the base class ///////////////
+template <typename FImpl>
+TJacobiSmear<FImpl>::TJacobiSmear(const std::string name)
+: Module<JacobiSmearPar>(name)
+{}
+
+// dependencies: the propagator to smear and the gauge field //////////////////
+template <typename FImpl>
+std::vector<std::string> TJacobiSmear<FImpl>::getInput(void)
+{
+    std::vector<std::string> in = {par().source, par().gauge};
+    
+    return in;
+}
+
+template <typename FImpl>
+std::vector<std::string> TJacobiSmear<FImpl>::getOutput(void)
+{
+    std::vector<std::string> out = {getName()}; // sole product: the field created in setup() under the module's name
+    
+    return out;
+}
+
+// setup: output propagator plus one temporary gauge link per Lorentz direction
+template <typename FImpl>
+void TJacobiSmear<FImpl>::setup(void)
+{
+    envCreateLat(PropagatorField, getName());
+    envTmp(std::vector<GaugeMat>, "Umu", 1, Nd, envGetGrid(LatticeColourMatrix));
+}
+
+// execution: copy the source, then smear the copy /////////////////////////////
+template <typename FImpl>
+void TJacobiSmear<FImpl>::execute(void)
+{
+    auto &out = envGet(PropagatorField, getName());
+    auto &src = envGet(PropagatorField, par().source);
+    auto &U = envGet(GaugeField, par().gauge);
+    envGetTmp(std::vector<GaugeMat>, Umu);
+    for(int mu=0; mu<Nd; mu++) // Nd rather than a literal 4: same bound as the vector sized in setup()
+    {
+       Umu.at(mu)=peekLorentz(U,mu);
+    }
+    CovariantSmearing<FImpl> covsmear;
+    out=src; // the smearing is applied to this copy, so src itself is never passed on
+    startTimer("Jacobi iteration");
+    covsmear.GaussianSmear(Umu, out, par().width, par().iterations, par().orthog);
+    stopTimer("Jacobi iteration");
+}
+
+END_MODULE_NAMESPACE
+
+END_HADRONS_NAMESPACE
+
+#endif // Hadrons_MSource_JacobiSmear_hpp_
diff --git a/Hadrons/modules.inc b/Hadrons/modules.inc
index 2eb1923d..8bc87fd3 100644
--- a/Hadrons/modules.inc
+++ b/Hadrons/modules.inc
@@ -44,22 +44,22 @@ modules_cc =\
   Modules/MIO/LoadEigenPack.cc \
   Modules/MIO/LoadNersc.cc \
   Modules/MIO/LoadPerambulator.cc \
-  Modules/MNPR/Amputate.cc \
-  Modules/MNPR/Bilinear.cc \
-  Modules/MNPR/FourQuark.cc \
   Modules/MNoise/FullVolumeSpinColorDiagonal.cc \
   Modules/MNoise/SparseSpinColorDiagonal.cc \
   Modules/MNoise/TimeDilutedSpinColorDiagonal.cc \
+  Modules/MNPR/Amputate.cc \
+  Modules/MNPR/Bilinear.cc \
+  Modules/MNPR/FourQuark.cc \
   Modules/MScalar/ChargedProp.cc \
   Modules/MScalar/FreeProp.cc \
   Modules/MScalarSUN/Div.cc \
   Modules/MScalarSUN/EMT.cc \
   Modules/MScalarSUN/Grad.cc \
   Modules/MScalarSUN/StochFreeField.cc \
+  Modules/MScalarSUN/TransProj.cc \
   Modules/MScalarSUN/TrKinetic.cc \
   Modules/MScalarSUN/TrMag.cc \
   Modules/MScalarSUN/TrPhi.cc \
-  Modules/MScalarSUN/TransProj.cc \
   Modules/MScalarSUN/TwoPoint.cc \
   Modules/MScalarSUN/TwoPointNPR.cc \
   Modules/MSink/Point.cc \
@@ -71,6 +71,7 @@ modules_cc =\
   Modules/MSolver/RBPrecCG.cc \
   Modules/MSource/Convolution.cc \
   Modules/MSource/Gauss.cc \
+  Modules/MSource/JacobiSmear.cc \
   Modules/MSource/Momentum.cc \
   Modules/MSource/MomentumPhase.cc \
   Modules/MSource/Point.cc \
@@ -86,8 +87,8 @@ modules_hpp =\
   Modules/MAction/DWF.hpp \
   Modules/MAction/MobiusDWF.hpp \
   Modules/MAction/ScaledDWF.hpp \
-  Modules/MAction/Wilson.hpp \
   Modules/MAction/WilsonClover.hpp \
+  Modules/MAction/Wilson.hpp \
   Modules/MAction/ZMobiusDWF.hpp \
   Modules/MContraction/A2AAslashField.hpp \
   Modules/MContraction/A2AFourQuarkContraction.hpp \
@@ -118,8 +119,8 @@ modules_hpp =\
   Modules/MGauge/Random.hpp \
   Modules/MGauge/StochEm.hpp \
   Modules/MGauge/StoutSmearing.hpp \
-  Modules/MGauge/Unit.hpp \
   Modules/MGauge/UnitEm.hpp \
+  Modules/MGauge/Unit.hpp \
   Modules/MIO/LoadA2AMatrixDiskVector.hpp \
   Modules/MIO/LoadA2AVectors.hpp \
   Modules/MIO/LoadBinary.hpp \
@@ -129,12 +130,12 @@ modules_hpp =\
   Modules/MIO/LoadEigenPack.hpp \
   Modules/MIO/LoadNersc.hpp \
   Modules/MIO/LoadPerambulator.hpp \
-  Modules/MNPR/Amputate.hpp \
-  Modules/MNPR/Bilinear.hpp \
-  Modules/MNPR/FourQuark.hpp \
   Modules/MNoise/FullVolumeSpinColorDiagonal.hpp \
   Modules/MNoise/SparseSpinColorDiagonal.hpp \
   Modules/MNoise/TimeDilutedSpinColorDiagonal.hpp \
+  Modules/MNPR/Amputate.hpp \
+  Modules/MNPR/Bilinear.hpp \
+  Modules/MNPR/FourQuark.hpp \
   Modules/MScalar/ChargedProp.hpp \
   Modules/MScalar/FreeProp.hpp \
   Modules/MScalar/Scalar.hpp \
@@ -142,10 +143,10 @@ modules_hpp =\
   Modules/MScalarSUN/EMT.hpp \
   Modules/MScalarSUN/Grad.hpp \
   Modules/MScalarSUN/StochFreeField.hpp \
+  Modules/MScalarSUN/TransProj.hpp \
   Modules/MScalarSUN/TrKinetic.hpp \
   Modules/MScalarSUN/TrMag.hpp \
   Modules/MScalarSUN/TrPhi.hpp \
-  Modules/MScalarSUN/TransProj.hpp \
   Modules/MScalarSUN/TwoPoint.hpp \
   Modules/MScalarSUN/TwoPointNPR.hpp \
   Modules/MScalarSUN/Utils.hpp \
@@ -159,6 +160,7 @@ modules_hpp =\
   Modules/MSolver/RBPrecCG.hpp \
   Modules/MSource/Convolution.hpp \
   Modules/MSource/Gauss.hpp \
+  Modules/MSource/JacobiSmear.hpp \
   Modules/MSource/Momentum.hpp \
   Modules/MSource/MomentumPhase.hpp \
   Modules/MSource/Point.hpp \