Updating the feature/clover branch with the newest Hadron package

2026-07-17 23:53:27 +01:00 · 2018-01-12 13:35:51 +00:00
parent e199fda9dc 7bb405e790
commit 3923683e9b
158 changed files with 9659 additions and 5972 deletions
@@ -495,6 +495,14 @@ namespace QCD {
      return traceIndex<ColourIndex>(lhs);
    }

+    //////////////////////////////////////////
+    // Current types: selector passed as curr_type to the conserved-current routines (Vector = 0, Axial = 1, Tadpole = 2).
+    //////////////////////////////////////////
+    GRID_SERIALIZABLE_ENUM(Current, undef,  // NOTE(review): `undef` is presumably the name of the unset value - confirm against the macro definition
+                           Vector,  0,
+                           Axial,   1,
+                           Tadpole, 2);
+
 }   //namespace QCD
 } // Grid

@@ -47,6 +47,7 @@ namespace Grid {
      INHERIT_IMPL_TYPES(Impl);

      FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {};
+      virtual ~FermionOperator(void) = default;

      virtual FermionField &tmp(void) = 0;

@@ -112,6 +113,21 @@ namespace Grid {
      ///////////////////////////////////////////////
      virtual void ImportGauge(const GaugeField & _U)=0;

+      //////////////////////////////////////////////////////////////////////
+      // Conserved currents, either contract two propagators at the sink or insert the current sequentially into one propagator.
+      //////////////////////////////////////////////////////////////////////
+      virtual void ContractConservedCurrent(PropagatorField &q_in_1,   // first input propagator
+                                            PropagatorField &q_in_2,   // second input propagator
+                                            PropagatorField &q_out,    // result of the contraction
+                                            Current curr_type,         // which current (see the Current enum)
+                                            unsigned int mu)=0;        // direction of the current
+      virtual void SeqConservedCurrent(PropagatorField &q_in,          // input propagator
+                                       PropagatorField &q_out,         // result of the insertion
+                                       Current curr_type,              // which current (see the Current enum)
+                                       unsigned int mu,                // direction of the current
+                                       std::vector<Real> mom,          // momentum used for the phase factor in the Wilson implementations
+                                       unsigned int tmin,              // first timeslice (inclusive in the Wilson implementations)
+                                       unsigned int tmax)=0;           // last timeslice (inclusive in the Wilson implementations)
    };

  }
@@ -212,6 +212,13 @@ namespace QCD {
                         StencilImpl &St) {
      mult(&phi(), &U(mu), &chi());
    }
+    
+    inline void multLinkProp(SitePropagator &phi,            // output site propagator
+                             const SiteDoubledGaugeField &U, // doubled gauge field at this site
+                             const SitePropagator &chi,      // input site propagator
+                             int mu) {                       // index of the link taken from U
+       mult(&phi(), &U(mu), &chi()); // presumably phi = U(mu) * chi (Grid mult convention - confirm)
+    }
      
    template <class ref>
    inline void loadLinkElement(Simd &reg, ref &memory) {
@@ -354,7 +361,20 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
    }
    mult(&phi(), &UU(), &chi());
  }
-      
+
+  // Propagator counterpart of the link multiply above: every entry of U(mu) is
+  // copied into a local SiteGaugeLink with vsplat, and that link is then passed
+  // to mult together with chi to fill phi.
+  inline void multLinkProp(SitePropagator &phi,
+                           const SiteDoubledGaugeField &U,
+                           const SitePropagator &chi,
+                           int mu) {
+    SiteGaugeLink splatLink;
+    for (int row = 0; row < Nrepresentation; ++row) {
+      for (int col = 0; col < Nrepresentation; ++col) {
+        vsplat(splatLink()()(row, col), U(mu)()(row, col));
+      }
+    }
+    mult(&phi(), &splatLink(), &chi());
+  }
+
  inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) 
  {
    SiteScalarGaugeField  ScalarUmu;
@@ -564,7 +584,12 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
   }
   
 }
-
+    // Fixme: Gparity prop * link is not implemented; this stub only asserts.
+    inline void multLinkProp(SitePropagator &phi, const SiteDoubledGaugeField &U,
+                             const SitePropagator &chi, int mu)
+    {
+        assert(0); // NOTE(review): compiled out when NDEBUG is defined, in which case phi is left unmodified
+    }

 template <class ref>
 inline void loadLinkElement(Simd &reg, ref &memory) {
@@ -393,6 +393,31 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
  }
 };

+////////////////////////////////////////////////////////
+// Conserved current - not yet implemented: this routine is a stub that only asserts.
+////////////////////////////////////////////////////////
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                        PropagatorField &q_in_2,
+                                                        PropagatorField &q_out,
+                                                        Current curr_type,
+                                                        unsigned int mu)
+{
+    assert(0); // NOTE(review): compiled out when NDEBUG is defined, in which case q_out is left unmodified
+}
+
+template <class Impl>
+void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+                                                         PropagatorField &q_out,
+                                                         Current curr_type,
+                                                         unsigned int mu, 
+                                                         std::vector<Real> mom,
+                                                         unsigned int tmin,
+                                                         unsigned int tmax)
+{
+    assert(0); // Not implemented for staggered fermions. NOTE(review): compiled out when NDEBUG is defined, leaving q_out unmodified
+}
+
 FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);

  //AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);
@@ -157,6 +157,22 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS

  LebesgueOrder Lebesgue;
  LebesgueOrder LebesgueEvenOdd;
+  
+  ///////////////////////////////////////////////////////////////
+  // Conserved current utilities (same signatures as the pure virtuals declared in FermionOperator)
+  ///////////////////////////////////////////////////////////////
+  void ContractConservedCurrent(PropagatorField &q_in_1,
+                                PropagatorField &q_in_2,
+                                PropagatorField &q_out,
+                                Current curr_type,
+                                unsigned int mu);        // defined as an assert(0) stub
+  void SeqConservedCurrent(PropagatorField &q_in,
+                           PropagatorField &q_out,
+                           Current curr_type,
+                           unsigned int mu, 
+                           std::vector<Real> mom,
+                           unsigned int tmin,
+                           unsigned int tmax);           // defined as an assert(0) stub
 };

 typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
@@ -405,6 +405,30 @@ void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
  MooeeInv(in, out);
 }

+////////////////////////////////////////////////////////
+// Conserved current - not yet implemented: this routine is a stub that only asserts.
+////////////////////////////////////////////////////////
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                         PropagatorField &q_in_2,
+                                                         PropagatorField &q_out,
+                                                         Current curr_type,
+                                                         unsigned int mu)
+{
+    assert(0); // NOTE(review): compiled out when NDEBUG is defined, in which case q_out is left unmodified
+}
+
+template <class Impl>
+void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
+                                                          PropagatorField &q_out,
+                                                          Current curr_type,
+                                                          unsigned int mu, 
+                                                          std::vector<Real> mom,
+                                                          unsigned int tmin,
+                                                          unsigned int tmax)
+{
+    assert(0); // Not implemented for 5D staggered fermions. NOTE(review): compiled out when NDEBUG is defined, leaving q_out unmodified
+}

 FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
 FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D);
@@ -170,6 +170,21 @@ namespace QCD {
    // Comms buffer
    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
    
+    ///////////////////////////////////////////////////////////////
+    // Conserved current utilities (same signatures as the pure virtuals declared in FermionOperator)
+    ///////////////////////////////////////////////////////////////
+    void ContractConservedCurrent(PropagatorField &q_in_1,
+                                  PropagatorField &q_in_2,
+                                  PropagatorField &q_out,
+                                  Current curr_type,
+                                  unsigned int mu);
+    void SeqConservedCurrent(PropagatorField &q_in,
+                             PropagatorField &q_out,
+                             Current curr_type,
+                             unsigned int mu, 
+                             std::vector<Real> mom,
+                             unsigned int tmin,
+                             unsigned int tmax);
  };

 }}
@@ -265,7 +265,6 @@ public:
    if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
    if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
  }
-  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;

  std::vector<int> same_node;
  std::vector<int> surface_list;
@@ -371,6 +371,112 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
  }
 };

+/*******************************************************************************
+ * Conserved current utilities for Wilson fermions, for contracting propagators
+ * to make a conserved current sink or inserting the conserved current
+ * sequentially.
+ *
+ * ContractConservedCurrent: zeroes q_out, then for every outer site
+ * accumulates the forward and backward site contractions of q_in_1 and q_in_2
+ * in direction mu.
+ *  - q_in_1, q_in_2: input propagators; must be conformable with _grid.
+ *  - q_out:          result; must be conformable with _grid; overwritten.
+ *  - curr_type:      not referenced by this 4D implementation.
+ *  - mu:             direction handed to Cshift and to the site kernels.
+ ******************************************************************************/
+template <class Impl>
+void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                   PropagatorField &q_in_2,
+                                                   PropagatorField &q_out,
+                                                   Current curr_type,
+                                                   unsigned int mu)
+{
+    conformable(_grid, q_in_1._grid);
+    conformable(_grid, q_in_2._grid);
+    conformable(_grid, q_out._grid);
+    PropagatorField tmp1(_grid), tmp2(_grid);
+    q_out = zero;
+
+    // Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
+    // Inefficient comms method but not performance critical.
+    tmp1 = Cshift(q_in_1, mu, 1);
+    tmp2 = Cshift(q_in_2, mu, 1);
+    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
+    {
+        // Both kernels accumulate into the same output site.
+        Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sU],
+                                                 q_in_2._odata[sU],
+                                                 q_out._odata[sU],
+                                                 Umu, sU, mu);
+        Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sU],
+                                                 tmp2._odata[sU],
+                                                 q_out._odata[sU],
+                                                 Umu, sU, mu);
+    }
+}
+
+/*******************************************************************************
+ * Name: SeqConservedCurrent
+ * Builds the sequential conserved-current insertion of q_in into q_out.
+ *  - q_in, q_out: must be conformable with _grid; q_out is zeroed first.
+ *  - curr_type:   not referenced by this 4D implementation.
+ *  - mu:          direction of the current (Cshift direction / link index).
+ *  - mom:         momentum; only mom[0] .. mom[Nd - 2] are read.
+ *  - tmin, tmax:  inclusive timeslice window for the insertion.
+ ******************************************************************************/
+template <class Impl>
+void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in, 
+                                              PropagatorField &q_out,
+                                              Current curr_type,
+                                              unsigned int mu,
+                                              std::vector<Real> mom,
+                                              unsigned int tmin, 
+                                              unsigned int tmax)
+{
+    conformable(_grid, q_in._grid);
+    conformable(_grid, q_out._grid);
+    Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
+    Complex i(0.0,1.0);
+    PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
+    unsigned int tshift = (mu == Tp) ? 1 : 0;
+    // NOTE(review): taken from the global default lattice rather than from
+    // _grid - confirm the two always agree.
+    unsigned int LLt    = GridDefaultLatt()[Tp];
+
+    // Momentum projection. The loop index is named nu so that it does not
+    // shadow the current direction mu (same naming as the 5D implementation).
+    ph = zero;
+    for(unsigned int nu = 0; nu < Nd - 1; nu++)
+    {
+        LatticeCoordinate(coor, nu);
+        ph = ph + mom[nu]*coor*((1./(_grid->_fdimensions[nu])));
+    }
+    ph = exp((Real)(2*M_PI)*i*ph);
+
+    q_out = zero;
+    LatticeInteger coords(_grid);
+    LatticeCoordinate(coords, Tp);
+
+    // Need q(x + mu) and q(x - mu).
+    tmp = Cshift(q_in, mu, 1);
+    tmpFwd = tmp*ph;
+    tmp = ph*q_in;
+    tmpBwd = Cshift(tmp, mu, -1);
+
+    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
+    {
+        // Compute the sequential conserved current insertion only if our simd
+        // object contains a timeslice we need.
+        vInteger t_mask   = ((coords._odata[sU] >= tmin) &&
+                             (coords._odata[sU] <= tmax));
+        Integer timeSlices = Reduce(t_mask);
+
+        if (timeSlices > 0)
+        {
+            Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sU], 
+                                                q_out._odata[sU], 
+                                                Umu, sU, mu, t_mask);
+        }
+
+        // Repeat for backward direction, with the window moved by tshift.
+        t_mask     = ((coords._odata[sU] >= (tmin + tshift)) && 
+                      (coords._odata[sU] <= (tmax + tshift)));
+
+        // If tmax = LLt - 1 (last timeslice), include timeslice 0 when the
+        // time direction is shifted (mu == Tp).
+        unsigned int t0 = 0;
+        if ((tmax == LLt - 1) && (tshift == 1))
+        {
+            t_mask = (t_mask || (coords._odata[sU] == t0));
+        }
+
+        timeSlices = Reduce(t_mask);
+
+        if (timeSlices > 0)
+        {
+            Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sU], 
+                                                q_out._odata[sU], 
+                                                Umu, sU, mu, t_mask);
+        }
+    }
+}
+
 FermOpTemplateInstantiate(WilsonFermion);
 AdjointFermOpTemplateInstantiate(WilsonFermion);
 TwoIndexFermOpTemplateInstantiate(WilsonFermion);
@@ -165,6 +165,22 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
  LebesgueOrder LebesgueEvenOdd;

  WilsonAnisotropyCoefficients anisotropyCoeff;
+  
+  ///////////////////////////////////////////////////////////////
+  // Conserved current utilities (same signatures as the pure virtuals declared in FermionOperator)
+  ///////////////////////////////////////////////////////////////
+  void ContractConservedCurrent(PropagatorField &q_in_1,
+                                PropagatorField &q_in_2,
+                                PropagatorField &q_out,   // zeroed, then filled by the contraction
+                                Current curr_type,        // not referenced by the 4D Wilson definition
+                                unsigned int mu);
+  void SeqConservedCurrent(PropagatorField &q_in,
+                           PropagatorField &q_out,        // zeroed, then filled by the insertion
+                           Current curr_type,             // not referenced by the 4D Wilson definition
+                           unsigned int mu, 
+                           std::vector<Real> mom,         // only the first Nd - 1 entries are read
+                           unsigned int tmin,             // first timeslice, inclusive
+                           unsigned int tmax);            // last timeslice, inclusive
 };

 typedef WilsonFermion<WilsonImplF> WilsonFermionF;
@@ -12,6 +12,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
+Author: Andrew Lawson <andrew.lawson1991@gmail.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -702,6 +703,168 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe

 }

+/*******************************************************************************
+ * Conserved current utilities for Wilson fermions, for contracting propagators
+ * to make a conserved current sink or inserting the conserved current 
+ * sequentially.
+ ******************************************************************************/
+
+// Helper macro to reverse Simd vector. Fixme: slow, generic implementation.
+// Extracts the Nsimd scalar objects of qSite, reverses their order and merges
+// them into qSiteRev. Nsimd is parenthesised at every use so that an
+// expression argument (e.g. Ls / LLs) expands with the intended precedence.
+#define REVERSE_LS(qSite, qSiteRev, Nsimd) \
+{ \
+    std::vector<typename SitePropagator::scalar_object> qSiteVec((Nsimd)); \
+    extract(qSite, qSiteVec); \
+    for (int i = 0; i < (Nsimd) / 2; ++i) \
+    { \
+        typename SitePropagator::scalar_object tmp = qSiteVec[i]; \
+        qSiteVec[i] = qSiteVec[(Nsimd) - i - 1]; \
+        qSiteVec[(Nsimd) - i - 1] = tmp; \
+    } \
+    merge(qSiteRev, qSiteVec); \
+}
+
+/*******************************************************************************
+ * Name: ContractConservedCurrent (5D)
+ * Zeroes the 4D field q_out, then for every 4D outer site accumulates the
+ * forward and backward site contractions over the fifth dimension, pairing
+ * q_in_1 at index s with q_in_2 at the mirrored index.
+ *  - q_in_1, q_in_2: 5D propagators; must be conformable with FermionGrid().
+ *  - q_out:          4D result; must be conformable with _FourDimGrid.
+ *  - curr_type:      Current::Axial flips the sign for s < LLs / 2.
+ *  - mu:             4D direction; mu + 1 is used on the 5D grid.
+ ******************************************************************************/
+template <class Impl>
+void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
+                                                     PropagatorField &q_in_2,
+                                                     PropagatorField &q_out,
+                                                     Current curr_type,
+                                                     unsigned int mu)
+{
+    conformable(q_in_1._grid, FermionGrid());
+    conformable(q_in_1._grid, q_in_2._grid);
+    conformable(_FourDimGrid, q_out._grid);
+    PropagatorField tmp1(FermionGrid()), tmp2(FermionGrid());
+    unsigned int LLs = q_in_1._grid->_rdimensions[0];
+    q_out = zero;
+
+    // Forward, need q1(x + mu, s), q2(x, Ls - 1 - s). Backward, need q1(x, s), 
+    // q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one.
+    tmp1 = Cshift(q_in_1, mu + 1, 1);
+    tmp2 = Cshift(q_in_2, mu + 1, 1);
+    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
+    {
+        // sF1 walks the fifth dimension upwards, sF2 walks it downwards.
+        unsigned int sF1 = sU * LLs;
+        unsigned int sF2 = (sU + 1) * LLs - 1;
+
+        for (unsigned int s = 0; s < LLs; ++s)
+        {
+            // NOTE(review): when LsVectorised, one index s holds Ls / LLs
+            // fifth-dimension entries, yet a single boolean is applied to all
+            // of them - confirm this is the intended axial sign.
+            bool axial_sign = ((curr_type == Current::Axial) &&
+                               (s < (LLs / 2)));
+            SitePropagator qSite2, qmuSite2;
+
+            // If vectorised in 5th dimension, reverse q2 vector to match up
+            // sites correctly.
+            if (Impl::LsVectorised)
+            {
+                REVERSE_LS(q_in_2._odata[sF2], qSite2, Ls / LLs);
+                REVERSE_LS(tmp2._odata[sF2], qmuSite2, Ls / LLs);
+            }
+            else
+            {
+                qSite2   = q_in_2._odata[sF2];
+                qmuSite2 = tmp2._odata[sF2];
+            }
+            Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sF1], 
+                                                     qSite2, 
+                                                     q_out._odata[sU],
+                                                     Umu, sU, mu, axial_sign);
+            Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sF1],
+                                                     qmuSite2,
+                                                     q_out._odata[sU],
+                                                     Umu, sU, mu, axial_sign);
+            sF1++;
+            sF2--;
+        }
+    }
+}
+
+
+template <class Impl>
+void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,   // input propagator; must be conformable with FermionGrid()
+                                                PropagatorField &q_out,
+                                                Current curr_type,       // Current::Axial flips the sign for s < LLs / 2
+                                                unsigned int mu,
+                                                std::vector<Real> mom,
+                                                unsigned int tmin,       // first timeslice of the window, inclusive
+                                                unsigned int tmax)
+{
+    conformable(q_in._grid, FermionGrid());
+    conformable(q_in._grid, q_out._grid);
+    Lattice<iSinglet<Simd>> ph(FermionGrid()), coor(FermionGrid());
+    PropagatorField tmpFwd(FermionGrid()), tmpBwd(FermionGrid()),
+                    tmp(FermionGrid());
+    Complex i(0.0, 1.0);
+    unsigned int tshift = (mu == Tp) ? 1 : 0;
+    unsigned int LLs = q_in._grid->_rdimensions[0];
+    unsigned int LLt    = GridDefaultLatt()[Tp]; // NOTE(review): global default lattice extent, not read from _FourDimGrid - confirm they always agree
+
+    // Momentum projection: ph = exp(2*pi*i * sum over nu < Nd - 1 of mom[nu] * x_nu / L_nu).
+    ph = zero;
+    for(unsigned int nu = 0; nu < Nd - 1; nu++)
+    {
+        // Shift coordinate lattice index by 1 to account for 5th dimension.
+        LatticeCoordinate(coor, nu + 1);
+        ph = ph + mom[nu]*coor*((1./(_FourDimGrid->_fdimensions[nu])));
+    }
+    ph = exp((Real)(2*M_PI)*i*ph);
+
+    q_out = zero;
+    LatticeInteger coords(_FourDimGrid);
+    LatticeCoordinate(coords, Tp);
+
+    // Need q(x + mu, s) and q(x - mu, s). 5D lattice so shift 4D coordinate mu
+    // by one.
+    tmp = Cshift(q_in, mu + 1, 1);
+    tmpFwd = tmp*ph;
+    tmp = ph*q_in;
+    tmpBwd = Cshift(tmp, mu + 1, -1);
+
+    parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
+    {
+        // Compute the sequential conserved current insertion only if our simd
+        // object contains a timeslice we need.
+        vInteger t_mask   = ((coords._odata[sU] >= tmin) &&
+                             (coords._odata[sU] <= tmax));
+        Integer timeSlices = Reduce(t_mask);
+
+        if (timeSlices > 0)
+        {
+            unsigned int sF = sU * LLs; // first 5D site belonging to 4D site sU
+            for (unsigned int s = 0; s < LLs; ++s)
+            {
+                bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
+                Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sF], 
+                                                    q_out._odata[sF], Umu, sU,
+                                                    mu, t_mask, axial_sign);
+                ++sF;
+            }
+        }
+
+        // Repeat for backward direction, with the time window moved by tshift (1 when mu == Tp, else 0).
+        t_mask     = ((coords._odata[sU] >= (tmin + tshift)) && 
+                      (coords._odata[sU] <= (tmax + tshift)));
+
+	// If tmax = LLt-1 (last timeslice), include timeslice 0 when the time direction is shifted (mu == Tp).
+	unsigned int t0 = 0;
+	if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
+
+        timeSlices = Reduce(t_mask);
+
+        if (timeSlices > 0)
+        {
+            unsigned int sF = sU * LLs; // first 5D site belonging to 4D site sU
+            for (unsigned int s = 0; s < LLs; ++s)
+            {
+                bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
+                Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sF], 
+                                                    q_out._odata[sF], Umu, sU,
+                                                    mu, t_mask, axial_sign);
+                ++sF;
+            }
+        }
+    }
+}
+
 FermOpTemplateInstantiate(WilsonFermion5D);
 GparityFermOpTemplateInstantiate(WilsonFermion5D);
  
@@ -214,6 +214,21 @@ namespace QCD {
    // Comms buffer
    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf;
    
+    ///////////////////////////////////////////////////////////////
+    // Conserved current utilities (same signatures as the pure virtuals declared in FermionOperator)
+    ///////////////////////////////////////////////////////////////
+    void ContractConservedCurrent(PropagatorField &q_in_1,
+                                  PropagatorField &q_in_2,
+                                  PropagatorField &q_out,
+                                  Current curr_type, 
+                                  unsigned int mu);
+    void SeqConservedCurrent(PropagatorField &q_in,
+                             PropagatorField &q_out,
+                             Current curr_type,
+                             unsigned int mu,
+                             std::vector<Real> mom,
+                             unsigned int tmin,
+                             unsigned int tmax);
  };

 }}
@@ -281,6 +281,172 @@ void WilsonKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHal
  vstream(out._odata[sF], result);
 }

+/*******************************************************************************
+ * Conserved current utilities for Wilson fermions, for contracting propagators
+ * to make a conserved current sink or inserting the conserved current 
+ * sequentially. Common to both 4D and 5D.
+ ******************************************************************************/
+// N.B. Functions below assume a -1/2 factor within U.
+// expr is parenthesised so that a compound expression expands with the
+// intended precedence; note that expr is still evaluated twice.
+#define WilsonCurrentFwd(expr, mu) ((expr) - Gamma::gmu[mu]*(expr))
+#define WilsonCurrentBwd(expr, mu) ((expr) + Gamma::gmu[mu]*(expr))
+
+/*******************************************************************************
+ * Name: ContractConservedCurrentSiteFwd
+ * Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in_1 shifted in +ve mu direction.
+ *        - The site contraction is accumulated into q_out: subtracted when
+ *          switch_sign is set, added otherwise.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(
+                                                  const SitePropagator &q_in_1,
+                                                  const SitePropagator &q_in_2,
+                                                  SitePropagator &q_out,
+                                                  DoubledGaugeField &U,
+                                                  unsigned int sU,
+                                                  unsigned int mu,
+                                                  bool switch_sign)
+{
+    Gamma gamma5(Gamma::Algebra::Gamma5);
+    SitePropagator linkProp, contraction;
+
+    // Link in direction mu applied to the shifted propagator.
+    Impl::multLinkProp(linkProp, U._odata[sU], q_in_1, mu);
+    contraction = gamma5 * adj(q_in_2) * gamma5 * WilsonCurrentFwd(linkProp, mu);
+
+    if (switch_sign)
+    {
+        q_out -= contraction;
+        return;
+    }
+    q_out += contraction;
+}
+
+/*******************************************************************************
+ * Name: ContractConservedCurrentSiteBwd
+ * Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in_2 shifted in +ve mu direction.
+ *        - The site contraction is accumulated into q_out: added when
+ *          switch_sign is set, subtracted otherwise.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(
+                                                  const SitePropagator &q_in_1,
+                                                  const SitePropagator &q_in_2,
+                                                  SitePropagator &q_out,
+                                                  DoubledGaugeField &U,
+                                                  unsigned int sU,
+                                                  unsigned int mu,
+                                                  bool switch_sign)
+{
+    Gamma gamma5(Gamma::Algebra::Gamma5);
+    SitePropagator linkProp, contraction;
+
+    // The backward link is stored at index mu + Nd of the doubled field.
+    Impl::multLinkProp(linkProp, U._odata[sU], q_in_1, mu + Nd);
+    contraction = gamma5 * adj(q_in_2) * gamma5 * WilsonCurrentBwd(linkProp, mu);
+
+    if (switch_sign)
+    {
+        q_out += contraction;
+        return;
+    }
+    q_out -= contraction;
+}
+
+// G-parity requires more specialised implementation.
+// NO_CURR_SITE(Impl) emits explicit specialisations of both contraction site
+// kernels for the given implementation; each one only asserts. Parameter names
+// follow the declaration order (sU before mu) in both specialisations.
+#define NO_CURR_SITE(Impl) \
+template <> \
+void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd( \
+                                                  const SitePropagator &q_in_1, \
+                                                  const SitePropagator &q_in_2, \
+                                                  SitePropagator &q_out,        \
+                                                  DoubledGaugeField &U,         \
+                                                  unsigned int sU,              \
+                                                  unsigned int mu,              \
+                                                  bool switch_sign)             \
+{ \
+    assert(0); \
+} \
+template <> \
+void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd( \
+                                                  const SitePropagator &q_in_1, \
+                                                  const SitePropagator &q_in_2, \
+                                                  SitePropagator &q_out,        \
+                                                  DoubledGaugeField &U,         \
+                                                  unsigned int sU,              \
+                                                  unsigned int mu,              \
+                                                  bool switch_sign)             \
+{ \
+    assert(0); \
+}
+
+NO_CURR_SITE(GparityWilsonImplF);
+NO_CURR_SITE(GparityWilsonImplD);
+NO_CURR_SITE(GparityWilsonImplFH);
+NO_CURR_SITE(GparityWilsonImplDF);
+
+
+/*******************************************************************************
+ * Name: SeqConservedCurrentSiteFwd
+ * Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in shifted in +ve mu direction.
+ *        - Entries not selected by t_mask are zeroed before the result is
+ *          accumulated into q_out (subtracted when switch_sign is set).
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
+                                                     SitePropagator &q_out,
+                                                     DoubledGaugeField &U,
+                                                     unsigned int sU,
+                                                     unsigned int mu,
+                                                     vInteger t_mask,
+                                                     bool switch_sign)
+{
+    SitePropagator insertion;
+    Impl::multLinkProp(insertion, U._odata[sU], q_in, mu);
+    insertion = WilsonCurrentFwd(insertion, mu);
+
+    // Keep only the timeslice entries selected by the mask.
+    insertion = predicatedWhere(t_mask, insertion, 0.*insertion);
+
+    if (switch_sign)
+    {
+        q_out -= insertion;
+        return;
+    }
+    q_out += insertion;
+}
+
+/*******************************************************************************
+ * Name: SeqConservedCurrentSiteBwd
+ * Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
+ * Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
+ *        - Pass in q_in shifted in -ve mu direction.
+ ******************************************************************************/
+template<class Impl>
+void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in, 
+                                                     SitePropagator &q_out,
+                                                     DoubledGaugeField &U,
+                                                     unsigned int sU,
+                                                     unsigned int mu,
+                                                     vInteger t_mask,
+                                                     bool switch_sign)
+{
+    // The backward link is stored at index mu + Nd of the doubled field.
+    SitePropagator insertion;
+    Impl::multLinkProp(insertion, U._odata[sU], q_in, mu + Nd);
+    insertion = WilsonCurrentBwd(insertion, mu);
+
+    // Keep only the timeslice entries selected by the mask.
+    insertion = predicatedWhere(t_mask, insertion, 0.*insertion);
+
+    // Opposite sign convention to the forward kernel: added when switch_sign
+    // is set, subtracted otherwise.
+    if (switch_sign)
+    {
+        q_out += insertion;
+        return;
+    }
+    q_out -= insertion;
+}
+
 FermOpTemplateInstantiate(WilsonKernels);
 AdjointFermOpTemplateInstantiate(WilsonKernels);
 TwoIndexFermOpTemplateInstantiate(WilsonKernels);
@@ -180,6 +180,38 @@ public:
  void DhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
 		       int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
      
+  //////////////////////////////////////////////////////////////////////////////
+  // Utilities for inserting Wilson conserved current. Each kernel works on one site and accumulates into q_out.
+  //////////////////////////////////////////////////////////////////////////////
+  void ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
+                                       const SitePropagator &q_in_2,
+                                       SitePropagator &q_out,
+                                       DoubledGaugeField &U,
+                                       unsigned int sU,                 // outer site index into U
+                                       unsigned int mu,                 // link direction
+                                       bool switch_sign = false);       // true: subtract from q_out, false: add
+  void ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
+                                       const SitePropagator &q_in_2,
+                                       SitePropagator &q_out,
+                                       DoubledGaugeField &U,
+                                       unsigned int sU,                 // outer site index into U
+                                       unsigned int mu,                 // link direction (mu + Nd is used internally)
+                                       bool switch_sign = false);       // true: add to q_out, false: subtract
+  void SeqConservedCurrentSiteFwd(const SitePropagator &q_in, 
+                                  SitePropagator &q_out,
+                                  DoubledGaugeField &U,
+                                  unsigned int sU,                      // outer site index into U
+                                  unsigned int mu,                      // link direction
+                                  vInteger t_mask,                      // entries where the mask is false are zeroed
+                                  bool switch_sign = false);            // true: subtract from q_out, false: add
+  void SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
+                                  SitePropagator &q_out,
+                                  DoubledGaugeField &U,
+                                  unsigned int sU,                      // outer site index into U
+                                  unsigned int mu,                      // link direction (mu + Nd is used internally)
+                                  vInteger t_mask,                      // entries where the mask is false are zeroed
+                                  bool switch_sign = false);            // true: add to q_out, false: subtract
+
 private:
     // Specialised variants
  void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
@@ -16,12 +16,12 @@ class ScalarImplTypes {
    typedef iImplField<Simd> SiteField;
    typedef SiteField        SitePropagator;
    typedef SiteField        SiteComplex;
-    
+
    typedef Lattice<SiteField> Field;
    typedef Field              ComplexField;
    typedef Field              FermionField;
    typedef Field              PropagatorField;
-    
+    // Fills P by calling gaussian(pRNG, P); nothing else is done here.
    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
      gaussian(pRNG, P);
    }
@@ -47,54 +47,60 @@ class ScalarImplTypes {
    static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
      U = 1.0;
    }
-    
+    // Overwrites out with 1 / (m^2 + sum_mu kmu^2), where kmu = 2 sin(0.5 * (2 pi / L_mu) * c_mu), L_mu = grid->_fdimensions[mu] and c_mu is whatever LatticeCoordinate(kmu, mu) stores (presumably the mu-th site coordinate -- TODO confirm).
    static void MomentumSpacePropagator(Field &out, RealD m)
    {
      GridBase           *grid = out._grid;
      Field              kmu(grid), one(grid);
      const unsigned int nd    = grid->_ndimension;
      std::vector<int>   &l    = grid->_fdimensions;
-      
+      // start from m^2, then accumulate kmu^2 one direction at a time
      one = Complex(1.0,0.0);
      out = m*m;
      for(int mu = 0; mu < nd; mu++)
      {
        Real twoPiL = M_PI*2./l[mu];
-        
+        // kmu is reloaded for every direction before being mapped through 2 sin(.)
        LatticeCoordinate(kmu,mu);
        kmu = 2.*sin(.5*twoPiL*kmu);
        out = out + kmu*kmu;
      }
      out = one/out;
    }
-    
+    // Writes to out the backward FFT of (forward FFT of in) * momKernel, i.e. momKernel is applied between the two transforms.
    static void FreePropagator(const Field &in, Field &out,
                               const Field &momKernel)
    {
      FFT   fft((GridCartesian *)in._grid);
      Field inFT(in._grid);
-      
+      // forward transform over all dimensions, multiply by the kernel, transform back
      fft.FFT_all_dim(inFT, in, FFT::forward);
      inFT = inFT*momKernel;
      fft.FFT_all_dim(out, inFT, FFT::backward);
    }
-    
+    // Convenience overload: builds the kernel for mass m with MomentumSpacePropagator, then calls the three-field FreePropagator.
    static void FreePropagator(const Field &in, Field &out, RealD m)
    {
      Field momKernel(in._grid);
-      
+      // the kernel is rebuilt on every call
      MomentumSpacePropagator(momKernel, m);
      FreePropagator(in, out, momKernel);
    }
-    
+
  };

+  #ifdef  USE_FFT_ACCELERATION // the Fourier-accelerated code paths read the mass M from the FFT_MASS macro, so refuse to build without it
+  #ifndef FFT_MASS
+  #error  "USE_FFT_ACCELERATION is defined but not FFT_MASS"
+  #endif // FFT_MASS
+  #endif // USE_FFT_ACCELERATION
+  
  template <class S, unsigned int N>
  class ScalarAdjMatrixImplTypes {
  public:
    typedef S Simd;
    typedef QCD::SU<N> Group;
-    
+
    template <typename vtype>
    using iImplField   = iScalar<iScalar<iMatrix<vtype, N>>>;
    template <typename vtype>
@@ -103,24 +109,119 @@ class ScalarImplTypes {
    typedef iImplField<Simd>   SiteField;
    typedef SiteField          SitePropagator;
    typedef iImplComplex<Simd> SiteComplex;
-    
+
    typedef Lattice<SiteField>   Field;
    typedef Lattice<SiteComplex> ComplexField;
    typedef Field                FermionField;
    typedef Field                PropagatorField;

-    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
+    static void MomentaSquare(ComplexField &out) // adds sum_mu kmu^2 to out, with kmu = 2 sin(0.5 * (2 pi / L_mu) * c_mu); out is accumulated into and never reset here
+    {
+      GridBase *grid = out._grid;
+      const std::vector<int> &l = grid->FullDimensions(); // L_mu, one extent per direction
+      ComplexField kmu(grid); // scratch field, overwritten for every direction
+
+      for (int mu = 0; mu < grid->Nd(); mu++)
+      {
+        Real twoPiL = M_PI * 2.0 / l[mu]; // 2 pi / L_mu
+        LatticeCoordinate(kmu, mu); // presumably loads the mu-th site coordinate c_mu into kmu -- TODO confirm
+        kmu = 2.0 * sin(0.5 * twoPiL * kmu);
+        out += kmu * kmu; // the caller must have initialised out (the callers visible in this class set it to m*m or to zero first)
+      }
+    }
+
+    static void MomentumSpacePropagator(ComplexField &out, RealD m) // overwrites out with 1 / (m^2 + the sum added by MomentaSquare)
+    {
+      GridBase *grid = out._grid;
+      ComplexField one(grid);
+      one = Complex(1.0, 0.0);
+      out = m * m; // seed with the mass term, because MomentaSquare only accumulates
+      MomentaSquare(out);
+      out = one / out; // invert to obtain the propagator
+    }
+
+    static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) // fills P from pRNG; the build-time switch USE_FFT_ACCELERATION selects how
+    {
+#ifndef USE_FFT_ACCELERATION
      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
+#else
+      // Accelerated draw: P = backward FFT of sqrt(M^2 + sum_mu kmu^2) * (forward FFT of a Gaussian draw), with M = FFT_MASS.
+      Field Pgaussian(P._grid), Pp(P._grid);
+      ComplexField p2(P._grid); p2 = zero; // zeroed first because MomentaSquare only accumulates
+      RealD M = FFT_MASS;
+      
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian);
+
+      FFT theFFT((GridCartesian*)P._grid);
+      theFFT.FFT_all_dim(Pp, Pgaussian, FFT::forward);
+      MomentaSquare(p2);
+      p2 += M * M;
+      p2 = sqrt(p2); // p2 now holds sqrt(M^2 + sum_mu kmu^2), despite its name
+      Pp *= p2;
+      theFFT.FFT_all_dim(P, Pp, FFT::backward);
+
+#endif //USE_FFT_ACCELERATION
    }

    static inline Field projectForce(Field& P) {return P;}

-    static inline void update_field(Field& P, Field& U, double ep) {
-      U += P*ep;
+    static inline void update_field(Field &P, Field &U, double ep) // U += ep * P; with USE_FFT_ACCELERATION, P is first multiplied by 1/(M^2 + sum_mu kmu^2) in momentum space
+    {
+#ifndef USE_FFT_ACCELERATION
+      double t0=usecond(); // start stamp; the /1e6 and the "(s)" label below imply usecond() counts microseconds
+      U += P * ep;
+      double t1=usecond();
+      double total_time = (t1-t0)/1e6;
+      std::cout << GridLogIntegrator << "Total time for updating field (s)       : " << total_time << std::endl; // written on every call
+#else
+      // FFT transform P(x) -> P(p)
+      // divide by (M^2+p^2); M currently comes from the FFT_MASS macro (open question: how to pass it at run time?)
+      // P'(p) = P(p)/(M^2+p^2)
+      // Transform back -> P'(x)
+      // U += P'(x)*ep
+
+      Field Pp(U._grid), P_FFT(U._grid);     // Pp: momentum-space copy of P, P_FFT: filtered P back in position space
+      static ComplexField p2(U._grid); // function-local static: built from the grid of the first call and reused afterwards. NOTE(review): later calls on a different grid are not handled here
+      RealD M = FFT_MASS;
+      
+      FFT theFFT((GridCartesian*)U._grid);
+      theFFT.FFT_all_dim(Pp, P, FFT::forward);
+
+      static bool first_call = true; // plain flag, not synchronised. NOTE(review): concurrent first calls would race on it
+      if (first_call)
+      {
+        // fill p2 with 1/(M^2 + sum_mu kmu^2) once, to avoid recomputing it on every update
+        MomentumSpacePropagator(p2, M);
+        first_call = false;
+      }
+      Pp *= p2;
+      theFFT.FFT_all_dim(P_FFT, Pp, FFT::backward);
+      U += P_FFT * ep;
+
+#endif //USE_FFT_ACCELERATION
    }

-    static inline RealD FieldSquareNorm(Field& U) {
-      return (TensorRemove(sum(trace(U*U))).real());
+    static inline RealD FieldSquareNorm(Field &U) // real part of the summed trace of U*U, or its momentum-space weighted counterpart when USE_FFT_ACCELERATION is defined
+    {
+#ifndef USE_FFT_ACCELERATION
+      return (TensorRemove(sum(trace(U * U))).real());
+#else
+      // In case of Fourier acceleration we have to:
+      // compute sum_p tr[ adj(U(p)) * U(p) ] / (M^2+p^2)   (Parseval theorem)
+      // 1 FFT needed U(x) -> U(p)
+      // M to be passed; for now it is read from the FFT_MASS macro
+
+      FFT theFFT((GridCartesian*)U._grid);
+      Field Up(U._grid);
+
+      theFFT.FFT_all_dim(Up, U, FFT::forward);
+      RealD M = FFT_MASS;
+      ComplexField p2(U._grid); // rebuilt on every call (unlike the cached copy in update_field)
+      MomentumSpacePropagator(p2, M);
+      Field Up2 = Up * p2;
+      // from the definition of the DFT we need to divide by the volume. NOTE(review): this branch uses -adj(Up)*Up2 where the other uses U*U -- presumably consistent for anti-Hermitian U; confirm
+      return (-TensorRemove(sum(trace(adj(Up) * Up2))).real() / U._grid->gSites());
+#endif //USE_FFT_ACCELERATION
    }

    static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
@@ -146,7 +247,7 @@ class ScalarImplTypes {
  typedef ScalarImplTypes<vComplex> ScalarImplCR;
  typedef ScalarImplTypes<vComplexF> ScalarImplCF;
  typedef ScalarImplTypes<vComplexD> ScalarImplCD;
-    
+
  // Hardcoding here the size of the matrices
  typedef ScalarAdjMatrixImplTypes<vComplex,  QCD::Nc> ScalarAdjImplR;
  typedef ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc> ScalarAdjImplF;
@@ -155,7 +256,7 @@ class ScalarImplTypes {
  template <int Colours > using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex,   Colours >;
  template <int Colours > using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes<vComplexF,  Colours >;
  template <int Colours > using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComplexD,  Colours >;
-  
+
  //}
 }

@@ -30,119 +30,179 @@ directory
 #ifndef SCALAR_INT_ACTION_H
 #define SCALAR_INT_ACTION_H

-
 // Note: this action can completely absorb the ScalarAction for real float fields
 // use the scalarObjs to generalise the structure

-namespace Grid {
-  // FIXME drop the QCD namespace everywhere here
+namespace Grid
+{
+// FIXME drop the QCD namespace everywhere here

-  template <class Impl, int Ndim >
-  class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
-  public:
-    INHERIT_FIELD_TYPES(Impl);
-  private:
-    RealD mass_square;
-    RealD lambda;
+template <class Impl, int Ndim>
+class ScalarInteractionAction : public QCD::Action<typename Impl::Field>
+{
+public:
+  INHERIT_FIELD_TYPES(Impl);

+private:
+  RealD mass_square; // enters S() and deriv() through the factor (2*Ndim + mass_square)
+  RealD lambda; // quartic coupling: S() subtracts lambda * phi^4, deriv() subtracts 2 * lambda * phi^3
+  RealD g; // overall normalisation: S() and deriv() scale their result by N / g
+  const unsigned int N = Impl::Group::Dimension;
+
-    typedef typename Field::vector_object vobj;
-    typedef CartesianStencil<vobj,vobj> Stencil;
+  typedef typename Field::vector_object vobj;
+  typedef CartesianStencil<vobj, vobj> Stencil;
+
-    SimpleCompressor<vobj> compressor;
-    int npoint = 2*Ndim;
-    std::vector<int> directions;//    = {0,1,2,3,0,1,2,3};  // forcing 4 dimensions
-    std::vector<int> displacements;//  = {1,1,1,1, -1,-1,-1,-1};
+  SimpleCompressor<vobj> compressor; // handed to Stencil::HaloExchange in S() and deriv()
+  int npoint = 2 * Ndim; // number of stencil points: two per direction
+  std::vector<int> directions;    // direction of each stencil point, filled by the constructor: mu for points mu and mu + Ndim
+  std::vector<int> displacements; // shift of each stencil point, filled by the constructor: +1 for point mu, -1 for point mu + Ndim

-
-  public:
-
-    ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), displacements(2*Ndim,0), directions(2*Ndim,0){
-      for (int mu = 0 ; mu < Ndim; mu++){
-		directions[mu]         = mu; directions[mu+Ndim]    = mu;
-		displacements[mu]      =  1; displacements[mu+Ndim] = -1;
-      }
+public:
+  // Stores the couplings (ms -> mass_square, l -> lambda, gval -> g) and fills
+  // the two stencil tables, each of size 2*Ndim:
+  //   point mu        : direction mu, displacement +1
+  //   point mu + Ndim : direction mu, displacement -1
+  // The initialiser list names directions before displacements because that is
+  // the member declaration order, i.e. the order in which C++ initialises them
+  // regardless of how the list is written (keeps -Wreorder quiet).
+  ScalarInteractionAction(RealD ms, RealD l, RealD gval) : mass_square(ms), lambda(l), g(gval), directions(2 * Ndim, 0), displacements(2 * Ndim, 0)
+  {
+    for (int mu = 0; mu < Ndim; mu++)
+    {
+      directions[mu] = mu;
+      directions[mu + Ndim] = mu;
+      displacements[mu] = 1;
+      displacements[mu + Ndim] = -1;
    }
+  }

-    virtual std::string LogParameters() {
-      std::stringstream sstream;
-      sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda      << std::endl;
-      sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
-      return sstream.str();
-    }
+  virtual std::string LogParameters() // returns lambda, mass_square and g as three log lines; nothing is printed here
+  {
+    std::stringstream sstream;
+    sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda << std::endl;
+    sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
+    sstream << GridLogMessage << "[ScalarAction] g           : " << g << std::endl;
+    return sstream.str(); // the caller decides where the text goes
+  }

-    virtual std::string action_name() {return "ScalarAction";}
+  virtual std::string action_name() { return "ScalarAction"; } // fixed label; the same tag prefixes the LogParameters() lines

-    virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
+  virtual void refresh(const Field &U, GridParallelRNG &pRNG) {} // empty body: neither argument is used

-    virtual RealD S(const Field &p) {
-      assert(p._grid->Nd() == Ndim);
-      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
-      phiStencil.HaloExchange(p, compressor);
-      Field action(p._grid), pshift(p._grid), phisquared(p._grid);
-      phisquared = p*p;
-      action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
-      for (int mu = 0; mu < Ndim; mu++) {
-	//  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
-	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
-	  int permute_type;
-	  StencilEntry *SE;
-	  vobj temp2;
-	  const vobj *temp, *t_p;
-	    
-	  SE = phiStencil.GetEntry(permute_type, mu, i);
-	  t_p  = &p._odata[i];
-	  if ( SE->_is_local ) {
-	    temp = &p._odata[SE->_offset];
-	    if ( SE->_permute ) {
-	      permute(temp2, *temp, permute_type);
-	      action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
-	    } else {
-	      action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
-	    }
-	  } else {
-	    action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
-	  }
-	}
-	//  action -= pshift*p + p*pshift;
-      }
-      // NB the trace in the algebra is normalised to 1/2
-      // minus sign coming from the antihermitian fields
-      return -(TensorRemove(sum(trace(action)))).real();
-    };
+  // Evaluates the action on the field configuration p.
+  //
+  // Per site the integrand is
+  //   (2*Ndim + mass_square) * p^2 - lambda * p^4
+  //     - sum_mu [ n_mu * p + p * n_mu ],
+  // where n_mu is the neighbour value delivered by stencil point mu (points
+  // 0..Ndim-1, i.e. the +1 displacements set up by the constructor).
+  // Returns -Re( sum over sites of tr(integrand) ) * N / g.
+  //
+  // No separate shifted-field temporary is allocated: neighbours are read
+  // straight from p or from the stencil communication buffer.
+  //
+  // NOTE(review): phiStencil is a function-local static, so it is constructed
+  // once from the grid of the first call.
+  virtual RealD S(const Field &p)
+  {
+    assert(p._grid->Nd() == Ndim);
+    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+    phiStencil.HaloExchange(p, compressor);
+    Field action(p._grid), phisquared(p._grid);
+    phisquared = p * p;
+    action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
+    for (int mu = 0; mu < Ndim; mu++)
+    {
+      // Stencil form of: action -= Cshift(p, mu, +1) * p + p * Cshift(p, mu, +1)  (the Cshift form is not efficient)
+      parallel_for(int i = 0; i < p._grid->oSites(); i++)
+      {
+        int permute_type;
+        StencilEntry *SE;
+        vobj temp2;
+        const vobj *temp, *t_p;
+
-    virtual void deriv(const Field &p, Field &force) {
-      assert(p._grid->Nd() == Ndim);
-      force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
-      // move this outside
-      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
-      phiStencil.HaloExchange(p, compressor);
-      
-      //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
-      for (int point = 0; point < npoint; point++) {
-	parallel_for (int i = 0; i < p._grid->oSites(); i++) {
-	  const vobj *temp;
-	  vobj temp2;
-	  int permute_type;
-	  StencilEntry *SE;
-	  SE = phiStencil.GetEntry(permute_type, point, i);
-	  
-	  if ( SE->_is_local ) {
-	    temp = &p._odata[SE->_offset];
-	    if ( SE->_permute ) {
-	      permute(temp2, *temp, permute_type);
-	      force._odata[i] -= temp2;
-	    } else {
-	      force._odata[i] -= *temp;
-	    }
-	  } else {
-	    force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
-	  }
-	}
+        SE = phiStencil.GetEntry(permute_type, mu, i);
+        t_p = &p._odata[i];
+        if (SE->_is_local)
+        {
+          // neighbour is read from p itself, through permute() when the entry asks for it
+          temp = &p._odata[SE->_offset];
+          if (SE->_permute)
+          {
+            permute(temp2, *temp, permute_type);
+            action._odata[i] -= temp2 * (*t_p) + (*t_p) * temp2;
+          }
+          else
+          {
+            action._odata[i] -= (*temp) * (*t_p) + (*t_p) * (*temp);
+          }
+        }
+        else
+        {
+          // neighbour is read from the stencil's communication buffer
+          action._odata[i] -= phiStencil.CommBuf()[SE->_offset] * (*t_p) + (*t_p) * phiStencil.CommBuf()[SE->_offset];
+        }
      }
+      // end of the per-site loop for direction mu
    }
+    // NB the trace in the algebra is normalised to 1/2
+    // minus sign coming from the antihermitian fields
+    return -(TensorRemove(sum(trace(action)))).real() * N / g;
  };
-  
-}  // namespace Grid

-#endif  // SCALAR_INT_ACTION_H
+  virtual void deriv(const Field &p, Field &force) // force = N/g * [ (2*Ndim + mass_square) p - 2 lambda p^3 - sum of the 2*Ndim stencil neighbours of p ]; also logs timings and a flop estimate on every call
+  {
+    double t0 = usecond(); // start stamp; every difference below is divided by 1e6 and reported as seconds
+    assert(p._grid->Nd() == Ndim);
+    force = (2. * Ndim + mass_square) * p - 2. * lambda * p * p * p; // site-local part, no neighbours involved
+    double interm_t = usecond(); // end of the site-local part
+
+    // move this outside. NOTE(review): function-local static, so the stencil is constructed once from the grid of the first call
+    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+
+    phiStencil.HaloExchange(p, compressor);
+    double halo_t = usecond(); // end of the halo exchange
+    int chunk = 128; // chunk size handed to the static OpenMP schedule below
+    // Cshift form of the loops below: for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
+
+    // inverting the order of the loops slows down the code(! g++ 7)
+    // cannot try to reduce the number of  force writes by factor npoint...
+    // use cache blocking
+    for (int point = 0; point < npoint; point++)
+    {
+
+#pragma omp parallel 
+{
+        int permute_type; // declared inside the parallel region, so every thread works on its own copy (same for SE and temp)
+        StencilEntry *SE;
+        const vobj *temp;
+
+#pragma omp for schedule(static, chunk)
+      for (int i = 0; i < p._grid->oSites(); i++)
+      {
+        SE = phiStencil.GetEntry(permute_type, point, i);
+        // prefetch next p?
+
+        if (SE->_is_local)
+        {
+          temp = &p._odata[SE->_offset]; // neighbour is read from p itself
+      
+          if (SE->_permute)
+          {
+            vobj temp2;
+            permute(temp2, *temp, permute_type);
+            force._odata[i] -= temp2;
+          }
+          else
+          {
+            force._odata[i] -= *temp; // slow part. Dominated by this read/write (BW)
+          }
+        }
+        else
+        {
+          force._odata[i] -= phiStencil.CommBuf()[SE->_offset]; // neighbour is read from the stencil's communication buffer
+        }
+      } // end of the omp for over sites (brace depth, not indentation, is authoritative here)
+
+    } // end of the omp parallel region
+  } // end of the loop over stencil points
+  force *= N / g; // same N / g normalisation as S()
+
+  double t1 = usecond();
+  double total_time = (t1 - t0) / 1e6;
+  double interm_time = (interm_t - t0) / 1e6; // site-local part only
+  double halo_time = (halo_t - interm_t) / 1e6; // static stencil setup (first call) plus halo exchange
+  double stencil_time = (t1 - halo_t) / 1e6; // neighbour loops plus the final rescale
+  std::cout << GridLogIntegrator << "Total time for force computation (s)       : " << total_time << std::endl;
+  std::cout << GridLogIntegrator << "Intermediate time for force computation (s): " << interm_time << std::endl;
+  std::cout << GridLogIntegrator << "Halo time in force computation (s)         : " << halo_time << std::endl;
+  std::cout << GridLogIntegrator << "Stencil time in force computation (s)      : " << stencil_time << std::endl;
+  double flops = p._grid->gSites() * (14 * N * N * N + 18 * N * N + 2); // flop-count model for the whole call; the polynomial in N is taken as given -- TODO confirm its derivation
+  double flops_no_stencil = p._grid->gSites() * (14 * N * N * N + 6 * N * N + 2); // same model without the stencil term
+  double Gflops = flops / (total_time * 1e9);
+  double Gflops_no_stencil = flops_no_stencil / (interm_time * 1e9); // NOTE(review): not guarded against a zero interm_time
+  std::cout << GridLogIntegrator << "Flops: " << flops << "  - Gflop/s : " << Gflops << std::endl;
+  std::cout << GridLogIntegrator << "Flops NS: " << flops_no_stencil << "  - Gflop/s NS: " << Gflops_no_stencil << std::endl;
+} // end of deriv()
+};
+
+} // namespace Grid
+
+#endif // SCALAR_INT_ACTION_H
@@ -211,7 +211,7 @@ typedef HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2, ScalarMatrixFields>
    ScalarAdjGenericHMCRunner;

 template <int Colours> 
-using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, MinimumNorm2, ScalarNxNMatrixFields<Colours> >;
+using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, ForceGradient, ScalarNxNMatrixFields<Colours> >; // runner alias parameterised on the number of colours; its second template argument is ForceGradient, whereas ScalarAdjGenericHMCRunner above passes MinimumNorm2

 }  // namespace QCD
 }  // namespace Grid
@@ -746,7 +746,7 @@ template<typename GaugeField,typename GaugeMat>
    }
  }
  template<typename GaugeField>
-  static void ColdConfiguration(GridParallelRNG &pRNG,GaugeField &out){
+  static void ColdConfiguration(GaugeField &out){
    typedef typename GaugeField::vector_type vector_type;
    typedef iSUnMatrix<vector_type> vMatrixType;
    typedef Lattice<vMatrixType> LatticeMatrixType;
@@ -757,6 +757,10 @@ template<typename GaugeField,typename GaugeMat>
      PokeIndex<LorentzIndex>(out,Umu,mu);
    }
  }
+  template<typename GaugeField>
+  static void ColdConfiguration(GridParallelRNG &pRNG,GaugeField &out){ // RNG-taking overload: pRNG is accepted but never used
+    ColdConfiguration(out); // forwards to the overload that takes only the field
+  }

  template<typename LatticeMatrixType>
  static void taProj( const LatticeMatrixType &in,  LatticeMatrixType &out){