mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

Merge branch 'feature/scalar_adjointFT' into feature/hadrons

Antonin Portelli 2017-12-26 12:59:33 +01:00
commit 6718fa8c4f
9 changed files with 293 additions and 121 deletions

View File

@@ -59,6 +59,7 @@ public:
virtual ~GridBase() = default;
// Physics Grid information.
std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal

View File

@@ -122,6 +122,7 @@ public:
// Use a reduced simd grid
_ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
//std::cout << _ldimensions[d] << " " << _gdimensions[d] << " " << _processors[d] << std::endl;
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
@@ -166,6 +167,7 @@ public:
block = block * _rdimensions[d];
}
};
};
}
#endif
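
As a concrete check of the decomposition guarded by the new assert (the numbers here are illustrative, not taken from this commit): with _gdimensions[d] = 16, _processors[d] = 2 and _simd_layout[d] = 2, one gets _ldimensions[d] = 16 / 2 = 8 (so 8 * 2 == 16 and the assert passes) and, after the simd overdecomposition, _rdimensions[d] = 8 / 2 = 4.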

View File

@@ -167,6 +167,7 @@ class CartesianCommunicator {
#endif
public:
////////////////////////////////////////////////////////////////////////////////////////
// Wraps MPI_Cart routines, or implements equivalent on other impls
////////////////////////////////////////////////////////////////////////////////////////

View File

@@ -830,6 +830,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
};
CartesianCommunicator::~CartesianCommunicator() = default;
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);

View File

@@ -105,6 +105,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
assert(Size==_Nprocessors);
}
CartesianCommunicator::~CartesianCommunicator() = default;
void CartesianCommunicator::GlobalSum(uint32_t &u){
static long long source ;
static long long dest ;

View File

@@ -89,6 +89,12 @@ class ScalarImplTypes {
};
#ifdef USE_FFT_ACCELERATION
#ifndef FFT_MASS
#error "USE_FFT_ACCELERATION is defined but not FFT_MASS"
#endif
#endif
template <class S, unsigned int N>
class ScalarAdjMatrixImplTypes {
public:
@@ -109,18 +115,113 @@ class ScalarImplTypes {
typedef Field FermionField;
typedef Field PropagatorField;
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
static void MomentaSquare(ComplexField &out)
{
GridBase *grid = out._grid;
const std::vector<int> &l = grid->FullDimensions();
ComplexField kmu(grid);
for (int mu = 0; mu < grid->Nd(); mu++)
{
Real twoPiL = M_PI * 2.0 / l[mu];
LatticeCoordinate(kmu, mu);
kmu = 2.0 * sin(0.5 * twoPiL * kmu);
out += kmu * kmu;
}
}
static void MomentumSpacePropagator(ComplexField &out, RealD m)
{
GridBase *grid = out._grid;
ComplexField one(grid);
one = Complex(1.0, 0.0);
out = m * m;
MomentaSquare(out);
out = one / out;
}
static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
{
#ifndef USE_FFT_ACCELERATION
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
#else
Field Pgaussian(P._grid), Pp(P._grid);
ComplexField p2(P._grid); p2 = zero;
RealD M = FFT_MASS;
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian);
FFT theFFT((GridCartesian*)P._grid);
theFFT.FFT_all_dim(Pp, Pgaussian, FFT::forward);
MomentaSquare(p2);
p2 += M * M;
p2 = sqrt(p2);
Pp *= p2;
theFFT.FFT_all_dim(P, Pp, FFT::backward);
#endif //USE_FFT_ACCELERATION
}
static inline Field projectForce(Field& P) {return P;}
static inline void update_field(Field& P, Field& U, double ep) {
U += P*ep;
static inline void update_field(Field &P, Field &U, double ep)
{
#ifndef USE_FFT_ACCELERATION
double t0=usecond();
U += P * ep;
double t1=usecond();
double total_time = (t1-t0)/1e6;
std::cout << GridLogIntegrator << "Total time for updating field (s) : " << total_time << std::endl;
#else
// FFT transform P(x) -> P(p)
// divide by (M^2+p^2) M external parameter (how to pass?)
// P'(p) = P(p)/(M^2+p^2)
// Transform back -> P'(x)
// U += P'(x)*ep
Field Pp(U._grid), P_FFT(U._grid);
static ComplexField p2(U._grid);
RealD M = FFT_MASS;
FFT theFFT((GridCartesian*)U._grid);
theFFT.FFT_all_dim(Pp, P, FFT::forward);
static bool first_call = true;
if (first_call)
{
// avoid recomputing
MomentumSpacePropagator(p2, M);
first_call = false;
}
Pp *= p2;
theFFT.FFT_all_dim(P_FFT, Pp, FFT::backward);
U += P_FFT * ep;
#endif //USE_FFT_ACCELERATION
}
static inline RealD FieldSquareNorm(Field& U) {
return (TensorRemove(sum(trace(U*U))).real());
static inline RealD FieldSquareNorm(Field &U)
{
#ifndef USE_FFT_ACCELERATION
return (TensorRemove(sum(trace(U * U))).real());
#else
// In case of Fourier acceleration we have to:
// compute U(p)*U(p)/(M^2+p^2)) Parseval theorem
// 1 FFT needed U(x) -> U(p)
// M to be passed
FFT theFFT((GridCartesian*)U._grid);
Field Up(U._grid);
theFFT.FFT_all_dim(Up, U, FFT::forward);
RealD M = FFT_MASS;
ComplexField p2(U._grid);
MomentumSpacePropagator(p2, M);
Field Up2 = Up * p2;
// from the definition of the DFT we need to divide by the volume
return (-TensorRemove(sum(trace(adj(Up) * Up2))).real() / U._grid->gSites());
#endif //USE_FFT_ACCELERATION
}
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
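
The Fourier-accelerated branches above all share the same lattice momentum kernel. Reading the code, with k_mu the integer momentum from LatticeCoordinate, L_mu the global extent, M = FFT_MASS, V the number of global sites and F the FFT_all_dim transform (notation introduced only for this summary), they amount to:

\hat p^2(k) = \sum_\mu \left[ 2 \sin(\pi k_\mu / L_\mu) \right]^2

generate_momenta:  P(x) = F^{-1}\!\left[ \sqrt{M^2 + \hat p^2}\; F[G](p) \right](x), with G a Gaussian Lie-algebra field

update_field:      U(x) \mathrel{+}= \epsilon\, F^{-1}\!\left[ \frac{F[P](p)}{M^2 + \hat p^2(p)} \right](x)

FieldSquareNorm:   \lVert U \rVert^2 = -\frac{1}{V} \sum_p \mathrm{tr}\!\left( U^\dagger(p)\, U(p) \right) \frac{1}{M^2 + \hat p^2(p)}

A minimal standalone sketch of the same kernel, assuming nothing from Grid: the 1-D extent L, mass M and step eps below are illustrative, and a naive DFT stands in for FFT_all_dim (normalisation conventions may differ from Grid's FFT).

#include <cmath>
#include <complex>
#include <cstdio>
#include <vector>

int main() {
  const int L = 8;        // illustrative 1-D lattice extent
  const double M = 0.5;   // stands in for FFT_MASS
  const double eps = 0.1; // stands in for the integrator step "ep"

  // a real test momentum field P(x) and the field U(x) to be updated
  std::vector<double> P(L), U(L, 0.0);
  for (int x = 0; x < L; ++x) P[x] = std::cos(2.0 * M_PI * x / L);

  // forward DFT: P(p) = sum_x P(x) exp(-2 pi i p x / L)
  std::vector<std::complex<double>> Pp(L);
  for (int p = 0; p < L; ++p)
    for (int x = 0; x < L; ++x)
      Pp[p] += P[x] * std::polar(1.0, -2.0 * M_PI * p * x / L);

  // divide by M^2 + p_hat^2, with p_hat^2 = (2 sin(pi p / L))^2 as in MomentaSquare
  for (int p = 0; p < L; ++p) {
    const double phat2 = std::pow(2.0 * std::sin(M_PI * p / L), 2);
    Pp[p] /= (M * M + phat2);
  }

  // backward DFT (normalised here by 1/L) and the drift U += eps * P'(x)
  for (int x = 0; x < L; ++x) {
    std::complex<double> s(0.0, 0.0);
    for (int p = 0; p < L; ++p) s += Pp[p] * std::polar(1.0, 2.0 * M_PI * p * x / L);
    U[x] += eps * s.real() / L;
    std::printf("U[%d] = % .6f\n", x, U[x]);
  }
  return 0;
}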

View File

@@ -30,61 +30,71 @@ directory
#ifndef SCALAR_INT_ACTION_H
#define SCALAR_INT_ACTION_H
// Note: this action can completely absorb the ScalarAction for real float fields
// use the scalarObjs to generalise the structure
namespace Grid {
// FIXME drop the QCD namespace everywhere here
namespace Grid
{
// FIXME drop the QCD namespace everywhere here
template <class Impl, int Ndim >
class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
public:
template <class Impl, int Ndim>
class ScalarInteractionAction : public QCD::Action<typename Impl::Field>
{
public:
INHERIT_FIELD_TYPES(Impl);
private:
private:
RealD mass_square;
RealD lambda;
RealD g;
const unsigned int N = Impl::Group::Dimension;
typedef typename Field::vector_object vobj;
typedef CartesianStencil<vobj,vobj> Stencil;
typedef CartesianStencil<vobj, vobj> Stencil;
SimpleCompressor<vobj> compressor;
int npoint = 2*Ndim;
std::vector<int> directions;// = {0,1,2,3,0,1,2,3}; // forcing 4 dimensions
std::vector<int> displacements;// = {1,1,1,1, -1,-1,-1,-1};
int npoint = 2 * Ndim;
std::vector<int> directions; //
std::vector<int> displacements; //
public:
ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), displacements(2*Ndim,0), directions(2*Ndim,0){
for (int mu = 0 ; mu < Ndim; mu++){
directions[mu] = mu; directions[mu+Ndim] = mu;
displacements[mu] = 1; displacements[mu+Ndim] = -1;
public:
ScalarInteractionAction(RealD ms, RealD l, RealD gval) : mass_square(ms), lambda(l), g(gval), displacements(2 * Ndim, 0), directions(2 * Ndim, 0)
{
for (int mu = 0; mu < Ndim; mu++)
{
directions[mu] = mu;
directions[mu + Ndim] = mu;
displacements[mu] = 1;
displacements[mu + Ndim] = -1;
}
}
virtual std::string LogParameters() {
virtual std::string LogParameters()
{
std::stringstream sstream;
sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
sstream << GridLogMessage << "[ScalarAction] g : " << g << std::endl;
return sstream.str();
}
virtual std::string action_name() {return "ScalarAction";}
virtual std::string action_name() { return "ScalarAction"; }
virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
virtual RealD S(const Field &p) {
virtual RealD S(const Field &p)
{
assert(p._grid->Nd() == Ndim);
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
phiStencil.HaloExchange(p, compressor);
Field action(p._grid), pshift(p._grid), phisquared(p._grid);
phisquared = p*p;
action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
for (int mu = 0; mu < Ndim; mu++) {
phisquared = p * p;
action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
for (int mu = 0; mu < Ndim; mu++)
{
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
parallel_for (int i = 0; i < p._grid->oSites(); i++) {
parallel_for(int i = 0; i < p._grid->oSites(); i++)
{
int permute_type;
StencilEntry *SE;
vobj temp2;
@@ -92,56 +102,106 @@ namespace Grid {
SE = phiStencil.GetEntry(permute_type, mu, i);
t_p = &p._odata[i];
if ( SE->_is_local ) {
if (SE->_is_local)
{
temp = &p._odata[SE->_offset];
if ( SE->_permute ) {
if (SE->_permute)
{
permute(temp2, *temp, permute_type);
action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
} else {
action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
action._odata[i] -= temp2 * (*t_p) + (*t_p) * temp2;
}
} else {
action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
else
{
action._odata[i] -= (*temp) * (*t_p) + (*t_p) * (*temp);
}
}
else
{
action._odata[i] -= phiStencil.CommBuf()[SE->_offset] * (*t_p) + (*t_p) * phiStencil.CommBuf()[SE->_offset];
}
}
// action -= pshift*p + p*pshift;
}
// NB the trace in the algebra is normalised to 1/2
// minus sign coming from the antihermitian fields
return -(TensorRemove(sum(trace(action)))).real();
return -(TensorRemove(sum(trace(action)))).real() * N / g;
};
virtual void deriv(const Field &p, Field &force) {
virtual void deriv(const Field &p, Field &force)
{
double t0 = usecond();
assert(p._grid->Nd() == Ndim);
force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
force = (2. * Ndim + mass_square) * p - 2. * lambda * p * p * p;
double interm_t = usecond();
// move this outside
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
phiStencil.HaloExchange(p, compressor);
phiStencil.HaloExchange(p, compressor);
double halo_t = usecond();
int chunk = 128;
//for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
for (int point = 0; point < npoint; point++) {
parallel_for (int i = 0; i < p._grid->oSites(); i++) {
const vobj *temp;
vobj temp2;
// inverting the order of the loops slows down the code(! g++ 7)
// cannot try to reduce the number of force writes by factor npoint...
// use cache blocking
for (int point = 0; point < npoint; point++)
{
#pragma omp parallel
{
int permute_type;
StencilEntry *SE;
SE = phiStencil.GetEntry(permute_type, point, i);
const vobj *temp;
if ( SE->_is_local ) {
#pragma omp for schedule(static, chunk)
for (int i = 0; i < p._grid->oSites(); i++)
{
SE = phiStencil.GetEntry(permute_type, point, i);
// prefetch next p?
if (SE->_is_local)
{
temp = &p._odata[SE->_offset];
if ( SE->_permute ) {
if (SE->_permute)
{
vobj temp2;
permute(temp2, *temp, permute_type);
force._odata[i] -= temp2;
} else {
force._odata[i] -= *temp;
}
} else {
else
{
force._odata[i] -= *temp; // slow part. Dominated by this read/write (BW)
}
}
else
{
force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
}
}
}
}
};
force *= N / g;
double t1 = usecond();
double total_time = (t1 - t0) / 1e6;
double interm_time = (interm_t - t0) / 1e6;
double halo_time = (halo_t - interm_t) / 1e6;
double stencil_time = (t1 - halo_t) / 1e6;
std::cout << GridLogIntegrator << "Total time for force computation (s) : " << total_time << std::endl;
std::cout << GridLogIntegrator << "Intermediate time for force computation (s): " << interm_time << std::endl;
std::cout << GridLogIntegrator << "Halo time in force computation (s) : " << halo_time << std::endl;
std::cout << GridLogIntegrator << "Stencil time in force computation (s) : " << stencil_time << std::endl;
double flops = p._grid->gSites() * (14 * N * N * N + 18 * N * N + 2);
double flops_no_stencil = p._grid->gSites() * (14 * N * N * N + 6 * N * N + 2);
double Gflops = flops / (total_time * 1e9);
double Gflops_no_stencil = flops_no_stencil / (interm_time * 1e9);
std::cout << GridLogIntegrator << "Flops: " << flops << " - Gflop/s : " << Gflops << std::endl;
std::cout << GridLogIntegrator << "Flops NS: " << flops_no_stencil << " - Gflop/s NS: " << Gflops_no_stencil << std::endl;
}
};
} // namespace Grid
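
Written out, the action assembled in S() and the force assembled in deriv() above read as follows (with N_d = Ndim, N = Impl::Group::Dimension, m^2 = mass_square; the overall sign and the N/g factor are taken from the code and the trace-normalisation comment):

S[\phi] = -\frac{N}{g}\,\mathrm{Re} \sum_x \mathrm{tr}\!\left[ (2 N_d + m^2)\,\phi_x^2 - \lambda\,\phi_x^4 - \sum_{\mu=0}^{N_d-1} \left( \phi_{x+\hat\mu}\,\phi_x + \phi_x\,\phi_{x+\hat\mu} \right) \right]

F_x = \frac{N}{g} \left[ (2 N_d + m^2)\,\phi_x - 2\lambda\,\phi_x^3 - \sum_{\pm\mu} \phi_{x\pm\hat\mu} \right]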

View File

@@ -211,7 +211,7 @@ typedef HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2, ScalarMatrixFields>
ScalarAdjGenericHMCRunner;
template <int Colours>
using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, MinimumNorm2, ScalarNxNMatrixFields<Colours> >;
using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, ForceGradient, ScalarNxNMatrixFields<Colours> >;
} // namespace QCD
} // namespace Grid

View File

@@ -31,7 +31,8 @@ class ScalarActionParameters : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarActionParameters,
double, mass_squared,
double, lambda);
double, lambda,
double, g);
template <class ReaderClass >
ScalarActionParameters(Reader<ReaderClass>& Reader){
@@ -140,7 +141,7 @@ int main(int argc, char **argv) {
// Scalar action in adjoint representation
ScalarActionParameters SPar(Reader);
ScalarAction Saction(SPar.mass_squared, SPar.lambda);
ScalarAction Saction(SPar.mass_squared, SPar.lambda, SPar.g);
// Collect actions
ActionLevel<ScalarAction::Field, ScalarNxNMatrixFields<Ncolours>> Level1(1);