diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 99ab190b..ce881ef6 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -31,6 +31,32 @@ using namespace std; using namespace Grid; using namespace Grid::QCD; +struct time_statistics{ + double mean; + double err; + double min; + double max; + + void statistics(std::vector v){ + double sum = std::accumulate(v.begin(), v.end(), 0.0); + mean = sum / v.size(); + + std::vector diff(v.size()); + std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); + double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); + err = std::sqrt(sq_sum / (v.size()*(v.size() - 1))); + + auto result = std::minmax_element(v.begin(), v.end()); + min = *result.first; + max = *result.second; +} +}; + +void header(){ + std::cout <1) nmu++; + std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl; + std::vector t_time(Nloop); + time_statistics timestat; + std::cout< requests; @@ -102,18 +132,24 @@ int main (int argc, char ** argv) } Grid.SendToRecvFromComplete(requests); Grid.Barrier(); - + double stop=usecond(); + t_time[i] = stop-start; // microseconds } - double stop=usecond(); + + timestat.statistics(t_time); double dbytes = bytes; - double xbytes = Nloop*dbytes*2.0*ncomm; + double xbytes = dbytes*2.0*ncomm; double rbytes = xbytes; double bidibytes = xbytes+rbytes; - double time = stop-start; // microseconds + std::cout< requests; @@ -258,28 +300,34 @@ int main (int argc, char ** argv) } Grid.StencilSendToRecvFromComplete(requests); Grid.Barrier(); + double stop=usecond(); + t_time[i] = stop-start; // microseconds } - double stop=usecond(); + + timestat.statistics(t_time); double dbytes = bytes; - double xbytes = Nloop*dbytes*2.0*ncomm; + double xbytes = dbytes*2.0*ncomm; double rbytes = xbytes; double bidibytes = xbytes+rbytes; - double time = stop-start; // microseconds + std::cout< requests; @@ -341,19 +389,27 @@ int main (int argc, char ** argv) } } - Grid.Barrier(); + Grid.Barrier(); + double stop=usecond(); + t_time[i] = stop-start; // microseconds } - double stop=usecond(); + + timestat.statistics(t_time); double dbytes = bytes; - double xbytes = Nloop*dbytes*2.0*ncomm; + double xbytes = dbytes*2.0*ncomm; double rbytes = xbytes; double bidibytes = xbytes+rbytes; - double time = stop-start; // microseconds - std::cout< Lattice expMat(const Lattice &rhs, ComplexD alpha, Integer Nexp = DEFAULT_MAT_EXP){ + template Lattice expMat(const Lattice &rhs, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){ Lattice ret(rhs._grid); ret.checkerboard = rhs.checkerboard; conformable(ret,rhs); parallel_for(int ss=0;ssoSites();ss++){ ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp); } + return ret; + + + + + } diff --git a/lib/qcd/action/gauge/GaugeImplTypes.h b/lib/qcd/action/gauge/GaugeImplTypes.h index 9d36eead..29e79581 100644 --- a/lib/qcd/action/gauge/GaugeImplTypes.h +++ b/lib/qcd/action/gauge/GaugeImplTypes.h @@ -59,7 +59,7 @@ public: typedef iImplGaugeLink SiteLink; typedef iImplGaugeField SiteField; - typedef Lattice LinkField; + typedef Lattice LinkField; typedef Lattice Field; // Guido: we can probably separate the types from the HMC functions @@ -80,7 +80,7 @@ public: /////////////////////////////////////////////////////////// // Move these to another class - // HMC auxiliary functions + // HMC auxiliary functions static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) { // specific for SU gauge fields LinkField Pmu(P._grid); @@ -92,14 +92,19 @@ public: } static inline Field projectForce(Field &P) { return Ta(P); } - + static inline void update_field(Field& P, Field& U, double ep){ - for (int mu = 0; mu < Nd; mu++) { - auto Umu = PeekIndex(U, mu); - auto Pmu = PeekIndex(P, mu); - Umu = expMat(Pmu, ep, Nexp) * Umu; - PokeIndex(U, ProjectOnGroup(Umu), mu); + //static std::chrono::duration diff; + + //auto start = std::chrono::high_resolution_clock::now(); + parallel_for(int ss=0;ssoSites();ss++){ + for (int mu = 0; mu < Nd; mu++) + U[ss]._internal[mu] = ProjectOnGroup(Exponentiate(P[ss]._internal[mu], ep, Nexp) * U[ss]._internal[mu]); } + + //auto end = std::chrono::high_resolution_clock::now(); + // diff += end - start; + // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n"; } static inline RealD FieldSquareNorm(Field& U){ diff --git a/lib/qcd/action/gauge/WilsonGaugeAction.h b/lib/qcd/action/gauge/WilsonGaugeAction.h index 77c2424c..1ea780b7 100644 --- a/lib/qcd/action/gauge/WilsonGaugeAction.h +++ b/lib/qcd/action/gauge/WilsonGaugeAction.h @@ -71,14 +71,18 @@ class WilsonGaugeAction : public Action { RealD factor = 0.5 * beta / RealD(Nc); - GaugeLinkField Umu(U._grid); + //GaugeLinkField Umu(U._grid); GaugeLinkField dSdU_mu(U._grid); for (int mu = 0; mu < Nd; mu++) { - Umu = PeekIndex(U, mu); + //Umu = PeekIndex(U, mu); // Staple in direction mu - WilsonLoops::Staple(dSdU_mu, U, mu); - dSdU_mu = Ta(Umu * dSdU_mu) * factor; + //WilsonLoops::Staple(dSdU_mu, U, mu); + //dSdU_mu = Ta(Umu * dSdU_mu) * factor; + + + WilsonLoops::StapleMult(dSdU_mu, U, mu); + dSdU_mu = Ta(dSdU_mu) * factor; PokeIndex(dSdU, dSdU_mu, mu); } diff --git a/lib/qcd/smearing/StoutSmearing.h b/lib/qcd/smearing/StoutSmearing.h index b50ffe21..bfc37d0d 100644 --- a/lib/qcd/smearing/StoutSmearing.h +++ b/lib/qcd/smearing/StoutSmearing.h @@ -58,6 +58,8 @@ class Smear_Stout : public Smear { SmearBase->smear(C, U); }; + + // Repetion of code here (use the Tensor_exp.h function) void exponentiate_iQ(GaugeLinkField& e_iQ, const GaugeLinkField& iQ) const { // Put this outside // only valid for SU(3) matrices diff --git a/lib/qcd/smearing/WilsonFlow.h b/lib/qcd/smearing/WilsonFlow.h index 8b8f9a81..00b7622e 100644 --- a/lib/qcd/smearing/WilsonFlow.h +++ b/lib/qcd/smearing/WilsonFlow.h @@ -36,8 +36,10 @@ namespace QCD { template class WilsonFlow: public Smear{ unsigned int Nstep; + unsigned int measure_interval; RealD epsilon; + mutable WilsonGaugeAction SG; void evolve_step(typename Gimpl::GaugeField&) const; @@ -47,9 +49,10 @@ class WilsonFlow: public Smear{ public: INHERIT_GIMPL_TYPES(Gimpl) - explicit WilsonFlow(unsigned int Nstep, RealD epsilon): + explicit WilsonFlow(unsigned int Nstep, RealD epsilon, unsigned int interval = 1): Nstep(Nstep), epsilon(epsilon), + measure_interval(interval), SG(WilsonGaugeAction(3.0)) { // WilsonGaugeAction with beta 3.0 assert(epsilon > 0.0); @@ -104,14 +107,28 @@ RealD WilsonFlow::energyDensityPlaquette(unsigned int step, const GaugeFi return 2.0 * td * td * SG.S(U)/U._grid->gSites(); } + +//#define WF_TIMING + template void WilsonFlow::smear(GaugeField& out, const GaugeField& in) const { out = in; - for (unsigned int step = 0; step < Nstep; step++) { + for (unsigned int step = 1; step <= Nstep; step++) { + auto start = std::chrono::high_resolution_clock::now(); evolve_step(out); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = end - start; + #ifdef WF_TIMING + std::cout << "Time to evolve " << diff.count() << " s\n"; + #endif std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " - << step << " " + << step << " " << energyDensityPlaquette(step,out) << std::endl; + if( step % measure_interval == 0){ + std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " + << step << " " + << WilsonLoops::TopologicalCharge(out) << std::endl; + } } } diff --git a/lib/qcd/utils/WilsonLoops.h b/lib/qcd/utils/WilsonLoops.h index 5382882e..3c0e9ee4 100644 --- a/lib/qcd/utils/WilsonLoops.h +++ b/lib/qcd/utils/WilsonLoops.h @@ -188,6 +188,32 @@ public: } } + +// For the force term +static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { + GridBase *grid = Umu._grid; + std::vector U(Nd, grid); + for (int d = 0; d < Nd; d++) { + // this operation is taking too much time + U[d] = PeekIndex(Umu, d); + } + staple = zero; + GaugeMat tmp1(grid); + GaugeMat tmp2(grid); + + for (int nu = 0; nu < Nd; nu++) { + if (nu != mu) { + // this is ~10% faster than the Staple + tmp1 = Cshift(U[nu], mu, 1); + tmp2 = Cshift(U[mu], nu, 1); + staple += tmp1* adj(U[nu]*tmp2); + tmp2 = adj(U[mu]*tmp1)*U[nu]; + staple += Cshift(tmp2, nu, -1); + } + } + staple = U[mu]*staple; +} + ////////////////////////////////////////////////// // the sum over all staples on each site ////////////////////////////////////////////////// @@ -200,7 +226,6 @@ public: U[d] = PeekIndex(Umu, d); } staple = zero; - GaugeMat tmp(grid); for (int nu = 0; nu < Nd; nu++) { @@ -214,7 +239,7 @@ public: // | // __| // - + staple += Gimpl::ShiftStaple( Gimpl::CovShiftForward( U[nu], nu, @@ -227,6 +252,7 @@ public: // |__ // // + staple += Gimpl::ShiftStaple( Gimpl::CovShiftBackward(U[nu], nu, Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu); @@ -289,8 +315,7 @@ public: // staple = Gimpl::ShiftStaple( Gimpl::CovShiftBackward(U[nu], nu, - Gimpl::CovShiftBackward(U[mu], mu, U[nu])), - mu); + Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu); } } @@ -307,10 +332,10 @@ public: GaugeMat Vup(Umu._grid), Vdn(Umu._grid); StapleUpper(Vup, Umu, mu, nu); StapleLower(Vdn, Umu, mu, nu); - GaugeMat v = adj(Vup) - adj(Vdn); + GaugeMat v = Vup - Vdn; GaugeMat u = PeekIndex(Umu, mu); // some redundant copies GaugeMat vu = v*u; - FS = 0.25*Ta(u*v + Cshift(vu, mu, +1)); + FS = 0.25*Ta(u*v + Cshift(vu, mu, -1)); } static Real TopologicalCharge(GaugeLorentz &U){ diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h index bdf5aa01..e1d5f894 100644 --- a/lib/simd/Grid_generic.h +++ b/lib/simd/Grid_generic.h @@ -281,8 +281,8 @@ namespace Optimization { struct PrecisionChange { static inline vech StoH (const vecf &a,const vecf &b) { -#ifdef USE_FP16 vech ret; +#ifdef USE_FP16 vech *ha = (vech *)&a; vech *hb = (vech *)&b; const int nf = W::r; @@ -493,6 +493,8 @@ namespace Optimization { return a; } + + #undef acc // EIGEN compatibility } ////////////////////////////////////////////////////////////////////////////////////// diff --git a/lib/tensors/Tensor_exp.h b/lib/tensors/Tensor_exp.h index ad0d2071..e18fed70 100644 --- a/lib/tensors/Tensor_exp.h +++ b/lib/tensors/Tensor_exp.h @@ -37,30 +37,105 @@ namespace Grid { /////////////////////////////////////////////// - template inline iScalar Exponentiate(const iScalar&r, ComplexD alpha , Integer Nexp = DEFAULT_MAT_EXP) + template inline iScalar Exponentiate(const iScalar&r, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP) { iScalar ret; ret._internal = Exponentiate(r._internal, alpha, Nexp); return ret; } - - template::TensorLevel == 0 >::type * =nullptr> - inline iMatrix Exponentiate(const iMatrix &arg, ComplexD alpha , Integer Nexp = DEFAULT_MAT_EXP ) +template inline iVector Exponentiate(const iVector&r, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP) { - iMatrix unit(1.0); - iMatrix temp(unit); - - for(int i=Nexp; i>=1;--i){ - temp *= alpha/ComplexD(i); - temp = unit + temp*arg; - } - - return temp; - + iVector ret; + for (int i = 0; i < N; i++) + ret._internal[i] = Exponentiate(r._internal[i], alpha, Nexp); + return ret; } + // Specialisation: Cayley-Hamilton exponential for SU(3) + template::TensorLevel == 0>::type * =nullptr> + inline iMatrix Exponentiate(const iMatrix &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP ) + { + // for SU(3) 2x faster than the std implementation using Nexp=12 + // notice that it actually computes + // exp ( input matrix ) + // the i sign is coming from outside + // input matrix is anti-hermitian NOT hermitian + typedef iMatrix mat; + typedef iScalar scalar; + mat unit(1.0); + mat temp(unit); + const Complex one_over_three = 1.0 / 3.0; + const Complex one_over_two = 1.0 / 2.0; + + scalar c0, c1, tmp, c0max, theta, u, w; + scalar xi0, u2, w2, cosw; + scalar fden, h0, h1, h2; + scalar e2iu, emiu, ixi0, qt; + scalar f0, f1, f2; + scalar unity(1.0); + + mat iQ2 = arg*arg*alpha*alpha; + mat iQ3 = arg*iQ2*alpha; + // sign in c0 from the conventions on the Ta + c0 = -imag( trace(iQ3) ) * one_over_three; + c1 = -real( trace(iQ2) ) * one_over_two; + + // Cayley Hamilton checks to machine precision, tested + tmp = c1 * one_over_three; + c0max = 2.0 * pow(tmp, 1.5); + + theta = acos(c0 / c0max) * one_over_three; + u = sqrt(tmp) * cos(theta); + w = sqrt(c1) * sin(theta); + + xi0 = sin(w) / w; + u2 = u * u; + w2 = w * w; + cosw = cos(w); + + ixi0 = timesI(xi0); + emiu = cos(u) - timesI(sin(u)); + e2iu = cos(2.0 * u) + timesI(sin(2.0 * u)); + + h0 = e2iu * (u2 - w2) + + emiu * ((8.0 * u2 * cosw) + (2.0 * u * (3.0 * u2 + w2) * ixi0)); + h1 = e2iu * (2.0 * u) - emiu * ((2.0 * u * cosw) - (3.0 * u2 - w2) * ixi0); + h2 = e2iu - emiu * (cosw + (3.0 * u) * ixi0); + + fden = unity / (9.0 * u2 - w2); // reals + f0 = h0 * fden; + f1 = h1 * fden; + f2 = h2 * fden; + + return (f0 * unit + timesMinusI(f1) * arg*alpha - f2 * iQ2); + } + + + +// General exponential +template::TensorLevel == 0 >::type * =nullptr> + inline iMatrix Exponentiate(const iMatrix &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP ) + { + // notice that it actually computes + // exp ( input matrix ) + // the i sign is coming from outside + // input matrix is anti-hermitian NOT hermitian + typedef iMatrix mat; + mat unit(1.0); + mat temp(unit); + for(int i=Nexp; i>=1;--i){ + temp *= alpha/RealD(i); + temp = unit + temp*arg; + } + return temp; + + } + + + + } #endif diff --git a/tests/smearing/Test_WilsonFlow.cc b/tests/smearing/Test_WilsonFlow.cc index 4e6bd0af..c337bf29 100644 --- a/tests/smearing/Test_WilsonFlow.cc +++ b/tests/smearing/Test_WilsonFlow.cc @@ -28,6 +28,37 @@ directory /* END LEGAL */ #include +namespace Grid{ + struct WFParameters: Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS(WFParameters, + int, steps, + double, step_size, + int, meas_interval); + + + template + WFParameters(Reader& Reader){ + read(Reader, "WilsonFlow", *this); + } + + }; + + struct ConfParameters: Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS(ConfParameters, + std::string, conf_prefix, + std::string, rng_prefix, + int, StartConfiguration, + int, EndConfiguration, + int, Skip); + + template + ConfParameters(Reader& Reader){ + read(Reader, "Configurations", *this); + } + + }; +} + int main(int argc, char **argv) { using namespace Grid; using namespace Grid::QCD; @@ -42,22 +73,36 @@ int main(int argc, char **argv) { GridRedBlackCartesian RBGrid(latt_size, simd_layout, mpi_layout); std::vector seeds({1, 2, 3, 4, 5}); + GridSerialRNG sRNG; GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); LatticeGaugeField Umu(&Grid), Uflow(&Grid); SU::HotConfiguration(pRNG, Umu); + + typedef Grid::JSONReader Serialiser; + Serialiser Reader("input.json"); + WFParameters WFPar(Reader); + ConfParameters CPar(Reader); + CheckpointerParameters CPPar(CPar.conf_prefix, CPar.rng_prefix); + BinaryHmcCheckpointer CPBin(CPPar); + + for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){ + + CPBin.CheckpointRestore(conf, Umu, sRNG, pRNG); std::cout << std::setprecision(15); - std::cout << GridLogMessage << "Plaquette: " + std::cout << GridLogMessage << "Initial plaquette: " << WilsonLoops::avgPlaquette(Umu) << std::endl; - WilsonFlow WF(200, 0.01); + WilsonFlow WF(WFPar.steps, WFPar.step_size, WFPar.meas_interval); WF.smear(Uflow, Umu); RealD WFlow_plaq = WilsonLoops::avgPlaquette(Uflow); - std::cout << GridLogMessage << "Plaquette: "<< WFlow_plaq << std::endl; + RealD WFlow_TC = WilsonLoops::TopologicalCharge(Uflow); + std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl; + std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl; std::cout<< GridLogMessage << " Admissibility check:\n"; const double sp_adm = 0.067; // admissible threshold @@ -73,6 +118,6 @@ int main(int argc, char **argv) { std::cout<< GridLogMessage << " (sp_admissible = "<< sp_adm <<")\n"; //std::cout<< GridLogMessage << " sp_admissible - sp_max = "<