Merge branch 'develop' into feature/hadrons

2025-08-10 00:17:05 +01:00 · 2017-05-29 12:58:08 +01:00
parent 064315c00b 7c6cc85df6
commit d8648307ff
11 changed files with 311 additions and 74 deletions
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -31,6 +31,32 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;

+// Mean, standard error of the mean, minimum and maximum of a set of timing samples.
+struct time_statistics{
+  double mean = 0.0;  // arithmetic mean of the samples
+  double err  = 0.0;  // standard error of the mean; 0 with fewer than two samples
+  double min  = 0.0;  // smallest sample
+  double max  = 0.0;  // largest sample
+  // Recomputes all four fields from v; an empty v resets every field to zero.
+  void statistics(const std::vector<double> &v){  // by reference: no copy per call
+      mean = err = min = max = 0.0;
+      if (v.empty()) return;  // avoids 0/0 and dereferencing end() further down
+      const double n = static_cast<double>(v.size());
+      mean = std::accumulate(v.begin(), v.end(), 0.0) / n;
+      double sq_sum = 0.0;  // sum of squared deviations from the mean
+      for (const double x : v) sq_sum += (x - mean) * (x - mean);
+      if (v.size() > 1) err = std::sqrt(sq_sum / (n * (n - 1.0)));  // n-1 is 0 for one sample
+      const auto result = std::minmax_element(v.begin(), v.end());
+      min = *result.first;
+      max = *result.second;
+  }
+};
+// Prints the column-title row; called once before each of the benchmark tables.
+void header(){
+  std::cout << GridLogMessage << " L  " << "\t" << " Ls  " << "\t" << std::setw(11) << "bytes"
+            << "MB/s uni (err/min/max)" << "\t\t" << "MB/s bidi (err/min/max)" << std::endl;
+}
+
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
@@ -40,15 +66,19 @@ int main (int argc, char ** argv)
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

-  int Nloop=10;
+  int Nloop=500;
  int nmu=0;
+  int maxlat=24;
  for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;

+  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
+  std::vector<double> t_time(Nloop);
+  time_statistics timestat;
+
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
-  int maxlat=24;
+  header();
  for(int lat=4;lat<=maxlat;lat+=4){
    for(int Ls=8;Ls<=32;Ls*=2){

@@ -65,8 +95,8 @@ int main (int argc, char ** argv)
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

-      double start=usecond();
      for(int i=0;i<Nloop;i++){
+      double start=usecond();

 	std::vector<CartesianCommunicator::CommsRequest_t> requests;

@@ -102,18 +132,24 @@ int main (int argc, char ** argv)
 	}
 	Grid.SendToRecvFromComplete(requests);
 	Grid.Barrier();
-
+  double stop=usecond();
+  t_time[i] = stop-start; // microseconds
      }
-      double stop=usecond();
+
+      timestat.statistics(t_time);

      double dbytes    = bytes;
-      double xbytes    = Nloop*dbytes*2.0*ncomm;
+      double xbytes    = dbytes*2.0*ncomm;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;

-      double time = stop-start; // microseconds
+      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
+               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
+               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
+               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
+               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;

-      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
    }
  }    

@@ -121,8 +157,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking sequential halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
-
+  header();

  for(int lat=4;lat<=maxlat;lat+=4){
    for(int Ls=8;Ls<=32;Ls*=2){
@@ -138,8 +173,8 @@ int main (int argc, char ** argv)
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

-      double start=usecond();
      for(int i=0;i<Nloop;i++){
+      double start=usecond();
    
 	ncomm=0;
 	for(int mu=0;mu<4;mu++){
@@ -178,27 +213,34 @@ int main (int argc, char ** argv)
 	  }
 	}
 	Grid.Barrier();
+      double stop=usecond();
+    t_time[i] = stop-start; // microseconds
+
      }

-      double stop=usecond();
+      timestat.statistics(t_time);
      
      double dbytes    = bytes;
-      double xbytes    = Nloop*dbytes*2.0*ncomm;
+      double xbytes    = dbytes*2.0*ncomm;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;

-      double time = stop-start;
+    std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
+               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
+               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
+               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
+               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;

-      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+      
    }
  }  


-  Nloop=10;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+  header();

  for(int lat=4;lat<=maxlat;lat+=4){
    for(int Ls=8;Ls<=32;Ls*=2){
@@ -221,8 +263,8 @@ int main (int argc, char ** argv)
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

-      double start=usecond();
      for(int i=0;i<Nloop;i++){
+      double start=usecond();

 	std::vector<CartesianCommunicator::CommsRequest_t> requests;

@@ -258,28 +300,34 @@ int main (int argc, char ** argv)
 	}
 	Grid.StencilSendToRecvFromComplete(requests);
 	Grid.Barrier();
+      double stop=usecond();
+    t_time[i] = stop-start; // microseconds

      }
-      double stop=usecond();
+
+      timestat.statistics(t_time);

      double dbytes    = bytes;
-      double xbytes    = Nloop*dbytes*2.0*ncomm;
+      double xbytes    = dbytes*2.0*ncomm;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;

-      double time = stop-start; // microseconds
+      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
+               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
+               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
+               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
+               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
+

-      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
    }
  }    


-
-  Nloop=100;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+  header();

  for(int lat=4;lat<=maxlat;lat+=4){
    for(int Ls=8;Ls<=32;Ls*=2){
@@ -302,8 +350,8 @@ int main (int argc, char ** argv)
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

-      double start=usecond();
      for(int i=0;i<Nloop;i++){
+      double start=usecond();

 	std::vector<CartesianCommunicator::CommsRequest_t> requests;

@@ -341,19 +389,27 @@ int main (int argc, char ** argv)
 	  
 	  }
 	}
-	Grid.Barrier();
+	    Grid.Barrier();
+      double stop=usecond();
+      t_time[i] = stop-start; // microseconds

      }
-      double stop=usecond();
+
+      timestat.statistics(t_time);

      double dbytes    = bytes;
-      double xbytes    = Nloop*dbytes*2.0*ncomm;
+      double xbytes    = dbytes*2.0*ncomm;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;

-      double time = stop-start; // microseconds

-      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
+      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
+               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
+               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
+               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
+               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
+               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
+ 
    }
  }    

--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -1,4 +1,4 @@
-]#!/usr/bin/env bash
+#!/usr/bin/env bash

 EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2'

--- a/lib/lattice/Lattice_unary.h
+++ b/lib/lattice/Lattice_unary.h
@@ -62,14 +62,20 @@ namespace Grid {
    return ret;
  }

-  template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, ComplexD alpha, Integer Nexp = DEFAULT_MAT_EXP){
+  template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
    Lattice<obj> ret(rhs._grid);
    ret.checkerboard = rhs.checkerboard;
    conformable(ret,rhs);
    parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
      ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp);
    }
+
    return ret;
+
+    
+    
+
+    
  }


--- a/lib/qcd/action/gauge/GaugeImplTypes.h
+++ b/lib/qcd/action/gauge/GaugeImplTypes.h
@@ -59,7 +59,7 @@ public:
  typedef iImplGaugeLink<Simd>  SiteLink;
  typedef iImplGaugeField<Simd> SiteField;

-  typedef Lattice<SiteLink>  LinkField; 
+  typedef Lattice<SiteLink>  LinkField;
  typedef Lattice<SiteField> Field;

  // Guido: we can probably separate the types from the HMC functions
@@ -80,7 +80,7 @@ public:

  ///////////////////////////////////////////////////////////
  // Move these to another class
-  // HMC auxiliary functions 
+  // HMC auxiliary functions
  static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) {
    // specific for SU gauge fields
    LinkField Pmu(P._grid);
@@ -92,14 +92,19 @@ public:
  }

  static inline Field projectForce(Field &P) { return Ta(P); }
-  
+
  static inline void update_field(Field& P, Field& U, double ep){
-    for (int mu = 0; mu < Nd; mu++) {
-      auto Umu = PeekIndex<LorentzIndex>(U, mu);
-      auto Pmu = PeekIndex<LorentzIndex>(P, mu);
-      Umu = expMat(Pmu, ep, Nexp) * Umu;
-      PokeIndex<LorentzIndex>(U, ProjectOnGroup(Umu), mu);
+    //static std::chrono::duration<double> diff;
+
+    //auto start = std::chrono::high_resolution_clock::now();
+    parallel_for(int ss=0;ss<P._grid->oSites();ss++){
+      for (int mu = 0; mu < Nd; mu++) 
+        U[ss]._internal[mu] = ProjectOnGroup(Exponentiate(P[ss]._internal[mu], ep, Nexp) * U[ss]._internal[mu]);
    }
+    
+    //auto end = std::chrono::high_resolution_clock::now();
+   // diff += end - start;
+   // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n";
  }

  static inline RealD FieldSquareNorm(Field& U){
--- a/lib/qcd/action/gauge/WilsonGaugeAction.h
+++ b/lib/qcd/action/gauge/WilsonGaugeAction.h
@@ -71,14 +71,18 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {

    RealD factor = 0.5 * beta / RealD(Nc);

-    GaugeLinkField Umu(U._grid);
+    //GaugeLinkField Umu(U._grid);
    GaugeLinkField dSdU_mu(U._grid);
    for (int mu = 0; mu < Nd; mu++) {
-      Umu = PeekIndex<LorentzIndex>(U, mu);
+      //Umu = PeekIndex<LorentzIndex>(U, mu);

      // Staple in direction mu
-      WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
-      dSdU_mu = Ta(Umu * dSdU_mu) * factor;
+      //WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
+      //dSdU_mu = Ta(Umu * dSdU_mu) * factor;
+
+  
+      WilsonLoops<Gimpl>::StapleMult(dSdU_mu, U, mu);
+      dSdU_mu = Ta(dSdU_mu) * factor;

      PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
    }
--- a/lib/qcd/smearing/StoutSmearing.h
+++ b/lib/qcd/smearing/StoutSmearing.h
@@ -58,6 +58,8 @@ class Smear_Stout : public Smear<Gimpl> {
    SmearBase->smear(C, U);
  };

+
+  // Repetition of code here (use the Tensor_exp.h function)
  void exponentiate_iQ(GaugeLinkField& e_iQ, const GaugeLinkField& iQ) const {
    // Put this outside
    // only valid for SU(3) matrices
--- a/lib/qcd/smearing/WilsonFlow.h
+++ b/lib/qcd/smearing/WilsonFlow.h
@@ -36,8 +36,10 @@ namespace QCD {
 template <class Gimpl>
 class WilsonFlow: public Smear<Gimpl>{
    unsigned int Nstep;
+    unsigned int measure_interval;
    RealD epsilon;

+
    mutable WilsonGaugeAction<Gimpl> SG;

    void evolve_step(typename Gimpl::GaugeField&) const;
@@ -47,9 +49,10 @@ class WilsonFlow: public Smear<Gimpl>{
 public:
    INHERIT_GIMPL_TYPES(Gimpl)

-    explicit WilsonFlow(unsigned int Nstep, RealD epsilon):
+    explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1):
        Nstep(Nstep),
        epsilon(epsilon),
+        measure_interval(interval),
        SG(WilsonGaugeAction<Gimpl>(3.0)) {
            // WilsonGaugeAction with beta 3.0
            assert(epsilon > 0.0);
@@ -104,14 +107,28 @@ RealD WilsonFlow<Gimpl>::energyDensityPlaquette(unsigned int step, const GaugeFi
    return 2.0 * td * td * SG.S(U)/U._grid->gSites();
 }

+// Defining WF_TIMING makes smear() print the elapsed time of every evolve_step call.
+//#define WF_TIMING 
+// Flows `in` into `out` over Nstep steps; logs the energy density each step and the topological charge every measure_interval steps.
 template <class Gimpl>
 void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
     out = in;
-    for (unsigned int step = 0; step < Nstep; step++) {
+    for (unsigned int step = 1; step <= Nstep; step++) {  // 1-based: step = number of flow steps applied so far
+        auto start = std::chrono::high_resolution_clock::now();
         evolve_step(out);
+        auto end = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double> diff = end - start;  // only reported when WF_TIMING is defined
+        #ifdef WF_TIMING
+        std::cout << "Time to evolve " << diff.count() << " s\n";
+        #endif
         std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
-            << step << " "
+            << step << "  "
             << energyDensityPlaquette(step,out) << std::endl;
+         if( measure_interval != 0 && step % measure_interval == 0){  // interval 0 disables this instead of dividing by zero
+         std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : "
+            << step << "  " 
+            << WilsonLoops<Gimpl>::TopologicalCharge(out) << std::endl;  // same Gimpl as the flow itself
+        }
     }
 }

--- a/lib/qcd/utils/WilsonLoops.h
+++ b/lib/qcd/utils/WilsonLoops.h
@@ -188,6 +188,32 @@ public:
    }
  }

+// For the force term: sets `staple` to U[mu] times the sum, over every nu != mu,
+// of the two products accumulated in the loop below (the link multiply is included).
+static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
+    GridBase *grid = Umu._grid;
+    std::vector<GaugeMat> U(Nd, grid);
+    for (int d = 0; d < Nd; d++) {
+      // this operation is taking too much time
+      U[d] = PeekIndex<LorentzIndex>(Umu, d);
+    }
+    staple = zero;
+    GaugeMat tmp1(grid);
+    GaugeMat tmp2(grid);
+    // tmp1 (U[nu] shifted along mu) is shared by both contributions of one nu
+    for (int nu = 0; nu < Nd; nu++) {
+      if (nu != mu) {
+        // this is ~10% faster than the Staple
+        tmp1 = Cshift(U[nu], mu, 1);
+        tmp2 = Cshift(U[mu], nu, 1);
+        staple += tmp1* adj(U[nu]*tmp2);  // first contribution, built at the current site
+        tmp2 = adj(U[mu]*tmp1)*U[nu];
+        staple += Cshift(tmp2, nu, -1);  // second contribution, shifted by -1 along nu
+      }
+    }
+    staple = U[mu]*staple;  // fold the mu link in here so callers need not multiply again
+}
+
  //////////////////////////////////////////////////
  // the sum over all staples on each site
  //////////////////////////////////////////////////
@@ -200,7 +226,6 @@ public:
      U[d] = PeekIndex<LorentzIndex>(Umu, d);
    }
    staple = zero;
-    GaugeMat tmp(grid);

    for (int nu = 0; nu < Nd; nu++) {

@@ -214,7 +239,7 @@ public:
        //      |
        //    __|
        //
-
+     
        staple += Gimpl::ShiftStaple(
            Gimpl::CovShiftForward(
                U[nu], nu,
@@ -227,6 +252,7 @@ public:
        // |__
        //
        //
+
        staple += Gimpl::ShiftStaple(
            Gimpl::CovShiftBackward(U[nu], nu,
                                    Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
@@ -289,8 +315,7 @@ public:
      //
      staple = Gimpl::ShiftStaple(
          Gimpl::CovShiftBackward(U[nu], nu,
-                                  Gimpl::CovShiftBackward(U[mu], mu, U[nu])),
-          mu);
+                                  Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
    }
  }

@@ -307,10 +332,10 @@ public:
      GaugeMat Vup(Umu._grid), Vdn(Umu._grid);
      StapleUpper(Vup, Umu, mu, nu);
      StapleLower(Vdn, Umu, mu, nu);
-      GaugeMat v = adj(Vup) - adj(Vdn);
+      GaugeMat v = Vup - Vdn;
      GaugeMat u = PeekIndex<LorentzIndex>(Umu, mu);  // some redundant copies
      GaugeMat vu = v*u;
-      FS = 0.25*Ta(u*v + Cshift(vu, mu, +1));
+      FS = 0.25*Ta(u*v + Cshift(vu, mu, -1));
  }

  static Real TopologicalCharge(GaugeLorentz &U){
--- a/lib/simd/Grid_generic.h
+++ b/lib/simd/Grid_generic.h
@@ -281,8 +281,8 @@ namespace Optimization {

  struct PrecisionChange {
    static inline vech StoH (const vecf &a,const vecf &b) {
-#ifdef USE_FP16
      vech ret;
+#ifdef USE_FP16
      vech *ha = (vech *)&a;
      vech *hb = (vech *)&b;
      const int nf = W<float>::r;
@@ -493,6 +493,8 @@ namespace Optimization {
    
    return a;
  }
+
+  #undef acc  // EIGEN compatibility
 }

 //////////////////////////////////////////////////////////////////////////////////////
--- a/lib/tensors/Tensor_exp.h
+++ b/lib/tensors/Tensor_exp.h
@@ -37,30 +37,105 @@ namespace Grid {
  /////////////////////////////////////////////// 


-  template<class vtype> inline iScalar<vtype> Exponentiate(const iScalar<vtype>&r, ComplexD alpha ,  Integer Nexp = DEFAULT_MAT_EXP)
+  template<class vtype> inline iScalar<vtype> Exponentiate(const iScalar<vtype>&r, RealD alpha ,  Integer Nexp = DEFAULT_MAT_EXP)
    {
      iScalar<vtype> ret;
      ret._internal = Exponentiate(r._internal, alpha, Nexp);
      return ret;
    }

-
-  template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr> 
-    inline iMatrix<vtype,N> Exponentiate(const iMatrix<vtype,N> &arg, ComplexD alpha  , Integer Nexp = DEFAULT_MAT_EXP )
+template<class vtype, int N> inline iVector<vtype, N> Exponentiate(const iVector<vtype,N>&r, RealD alpha ,  Integer Nexp = DEFAULT_MAT_EXP)
    {
-      iMatrix<vtype,N> unit(1.0);
-      iMatrix<vtype,N> temp(unit);
-      
-      for(int i=Nexp; i>=1;--i){
-	temp *= alpha/ComplexD(i);
-	temp = unit + temp*arg;
-      }
-      
-      return temp;
-      
+      iVector<vtype, N> ret;
+      for (int i = 0; i < N; i++)
+        ret._internal[i] = Exponentiate(r._internal[i], alpha, Nexp);
+      return ret;
    }



+    // Specialisation for 3x3 matrices: Cayley-Hamilton exponential for SU(3)
+    template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr> 
+    inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha  , Integer Nexp = DEFAULT_MAT_EXP )
+    {
+    // Closed-form exp(alpha*arg), returned as f0*1 + (-i*f1)*(alpha*arg) - f2*(alpha*arg)^2.
+    // Original note: for SU(3) 2x faster than the std implementation using Nexp=12.
+    // The matrix is exponentiated as given: any factor of i comes from outside,
+    // so the input must be anti-hermitian, NOT hermitian.
+    // Nexp is not referenced in this body; `temp` and `qt` below are declared but unused.
+      typedef iMatrix<vtype,3> mat;
+      typedef iScalar<vtype> scalar;
+      mat unit(1.0);
+      mat temp(unit);
+      const Complex one_over_three = 1.0 / 3.0;
+      const Complex one_over_two = 1.0 / 2.0;
+
+      scalar c0, c1, tmp, c0max, theta, u, w;
+      scalar xi0, u2, w2, cosw;
+      scalar fden, h0, h1, h2;
+      scalar e2iu, emiu, ixi0, qt;
+      scalar f0, f1, f2;
+      scalar unity(1.0);
+      // iQ2 = (alpha*arg)^2 and iQ3 = (alpha*arg)^3; c0 and c1 are built from their traces
+      mat iQ2 = arg*arg*alpha*alpha;
+      mat iQ3 = arg*iQ2*alpha;   
+      // sign in c0 from the conventions on the Ta
+      c0 = -imag( trace(iQ3) ) * one_over_three;  
+      c1 = -real( trace(iQ2) ) * one_over_two;
+
+      // Cayley Hamilton checks to machine precision, tested
+      tmp = c1 * one_over_three;
+      c0max = 2.0 * pow(tmp, 1.5);
+      // NOTE(review): c0/c0max, sin(w)/w and 1/(9*u2 - w2) below are unguarded; a zero input presumably gives 0/0 - confirm
+      theta = acos(c0 / c0max) * one_over_three;
+      u = sqrt(tmp) * cos(theta);
+      w = sqrt(c1) * sin(theta);
+
+      xi0 = sin(w) / w;
+      u2 = u * u;
+      w2 = w * w;
+      cosw = cos(w);
+
+      ixi0 = timesI(xi0);
+      emiu = cos(u) - timesI(sin(u));
+      e2iu = cos(2.0 * u) + timesI(sin(2.0 * u));
+
+      h0 = e2iu * (u2 - w2) +
+           emiu * ((8.0 * u2 * cosw) + (2.0 * u * (3.0 * u2 + w2) * ixi0));
+      h1 = e2iu * (2.0 * u) - emiu * ((2.0 * u * cosw) - (3.0 * u2 - w2) * ixi0);
+      h2 = e2iu - emiu * (cosw + (3.0 * u) * ixi0);
+
+      fden = unity / (9.0 * u2 - w2);  // reals
+      f0 = h0 * fden;
+      f1 = h1 * fden;
+      f2 = h2 * fden;
+
+      return (f0 * unit + timesMinusI(f1) * arg*alpha - f2 * iQ2);
+    }
+
+
+
+// General exponential: truncated Taylor series, valid for any matrix dimension N
+template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr> 
+    inline iMatrix<vtype,N> Exponentiate(const iMatrix<vtype,N> &arg, RealD alpha  , Integer Nexp = DEFAULT_MAT_EXP )
+    {
+    // Evaluates sum_{k=0..Nexp} (alpha*arg)^k / k! innermost term first (Horner form):
+    //   series <- 1 + (alpha/order) * series * arg,  for order = Nexp down to 1.
+    // The matrix is exponentiated as given: any factor of i comes from outside,
+    // so the input is expected to be anti-hermitian, NOT hermitian.
+      typedef iMatrix<vtype,N> mat;
+      mat identity(1.0);
+      mat series(identity);
+
+      for (int order = Nexp; order > 0; --order) {
+        series *= alpha/RealD(order);
+        series = identity + series*arg;
+      }
+      return series;
+    }
+
+
+
+
 }
 #endif
--- a/tests/smearing/Test_WilsonFlow.cc
+++ b/tests/smearing/Test_WilsonFlow.cc
@@ -28,6 +28,37 @@ directory
 /*  END LEGAL */
 #include <Grid/Grid.h>

+namespace Grid{
+  // Wilson-flow settings: step count, step size and measurement interval.
+  struct WFParameters: Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(WFParameters,
+                                    int, steps,
+                                    double, step_size,
+                                    int, meas_interval);
+
+    // Fills the members by reading the "WilsonFlow" entry from `reader`.
+    template <class ReaderClass>
+    WFParameters(Reader<ReaderClass>& reader){
+      read(reader, "WilsonFlow", *this);
+    }
+  };
+
+  // Checkpoint file prefixes and the range of configuration numbers to process.
+  struct ConfParameters: Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(ConfParameters,
+                                    std::string, conf_prefix,
+                                    std::string, rng_prefix,
+                                    int, StartConfiguration,
+                                    int, EndConfiguration,
+                                    int, Skip);
+    // Fills the members by reading the "Configurations" entry from `reader`.
+    template <class ReaderClass>
+    ConfParameters(Reader<ReaderClass>& reader){
+      read(reader, "Configurations", *this);
+    }
+  };
+}
+
 int main(int argc, char **argv) {
  using namespace Grid;
  using namespace Grid::QCD;
@@ -42,22 +73,36 @@ int main(int argc, char **argv) {
  GridRedBlackCartesian     RBGrid(latt_size, simd_layout, mpi_layout);

  std::vector<int> seeds({1, 2, 3, 4, 5});
+  GridSerialRNG sRNG;
  GridParallelRNG pRNG(&Grid);
  pRNG.SeedFixedIntegers(seeds);

  LatticeGaugeField Umu(&Grid), Uflow(&Grid);
  SU<Nc>::HotConfiguration(pRNG, Umu);
+  
+  typedef Grid::JSONReader       Serialiser;
+  Serialiser Reader("input.json");
+  WFParameters WFPar(Reader);
+  ConfParameters CPar(Reader);
+  CheckpointerParameters CPPar(CPar.conf_prefix, CPar.rng_prefix);
+  BinaryHmcCheckpointer<PeriodicGimplR> CPBin(CPPar);
+
+  for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){
+
+  CPBin.CheckpointRestore(conf, Umu, sRNG, pRNG);

  std::cout << std::setprecision(15);
-  std::cout << GridLogMessage << "Plaquette: "
+  std::cout << GridLogMessage << "Initial plaquette: "
    << WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;

-  WilsonFlow<PeriodicGimplR> WF(200, 0.01);
+  WilsonFlow<PeriodicGimplR> WF(WFPar.steps, WFPar.step_size, WFPar.meas_interval);

  WF.smear(Uflow, Umu);

  RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
-  std::cout << GridLogMessage << "Plaquette: "<< WFlow_plaq << std::endl;
+  RealD WFlow_TC   = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
+  std::cout << GridLogMessage << "Plaquette          "<< conf << "   " << WFlow_plaq << std::endl;
+  std::cout << GridLogMessage << "TopologicalCharge  "<< conf << "   " << WFlow_TC   << std::endl;

  std::cout<< GridLogMessage << " Admissibility check:\n";
  const double sp_adm = 0.067;                // admissible threshold
@@ -73,6 +118,6 @@ int main(int argc, char **argv) {
  std::cout<< GridLogMessage << "   (sp_admissible = "<< sp_adm <<")\n";
  //std::cout<< GridLogMessage << "   sp_admissible - sp_max = "<<sp_adm-sp_max <<"\n";
  std::cout<< GridLogMessage << "   sp_admissible - sp_ave = "<<sp_adm-sp_ave <<"\n";
-
+  }
  Grid_finalize();
 }  // main