diff --git a/lib/cartesian/Cartesian_base.h b/lib/cartesian/Cartesian_base.h
index acc870de..c49dac84 100644
--- a/lib/cartesian/Cartesian_base.h
+++ b/lib/cartesian/Cartesian_base.h
@@ -59,6 +59,7 @@ public:
 
     virtual ~GridBase() = default;
 
+    // Physics Grid information.
     std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
     std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal
 
diff --git a/lib/cartesian/Cartesian_full.h b/lib/cartesian/Cartesian_full.h
index 9273abf3..b2372575 100644
--- a/lib/cartesian/Cartesian_full.h
+++ b/lib/cartesian/Cartesian_full.h
@@ -122,6 +122,7 @@ public:
         // Use a reduced simd grid
         _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
+        //std::cout << _ldimensions[d] << " " << _gdimensions[d] << " " << _processors[d] << std::endl;
         assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
 
         _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
@@ -166,6 +167,7 @@ public:
         block = block * _rdimensions[d];
       }
     };
+
 };
 }
 #endif
diff --git a/lib/communicator/Communicator_base.h b/lib/communicator/Communicator_base.h
index 548515cd..c9dccd11 100644
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -166,6 +166,7 @@ class CartesianCommunicator {
   void InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base);
 #endif
 public:
+  ////////////////////////////////////////////////////////////////////////////////////////
   // Wraps MPI_Cart routines, or implements equivalent on other impls
 
diff --git a/lib/communicator/Communicator_mpi3_leader.cc b/lib/communicator/Communicator_mpi3_leader.cc
index 6e26bd3e..da863508 100644
--- a/lib/communicator/Communicator_mpi3_leader.cc
+++ b/lib/communicator/Communicator_mpi3_leader.cc
@@ -830,6 +830,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
   MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
 };
 
+CartesianCommunicator::~CartesianCommunicator() = default;
+
+
 void CartesianCommunicator::GlobalSum(uint32_t &u){
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
   assert(ierr==0);
diff --git a/lib/communicator/Communicator_shmem.cc b/lib/communicator/Communicator_shmem.cc
index 03e3173e..d63a5af4 100644
--- a/lib/communicator/Communicator_shmem.cc
+++ b/lib/communicator/Communicator_shmem.cc
@@ -105,6 +105,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
   assert(Size==_Nprocessors);
 }
 
+CartesianCommunicator::~CartesianCommunicator() = default;
+
+
 void CartesianCommunicator::GlobalSum(uint32_t &u){
   static long long source ;
   static long long dest ;
diff --git a/lib/qcd/action/scalar/ScalarImpl.h b/lib/qcd/action/scalar/ScalarImpl.h
index f85ab840..55f5049d 100644
--- a/lib/qcd/action/scalar/ScalarImpl.h
+++ b/lib/qcd/action/scalar/ScalarImpl.h
@@ -16,12 +16,12 @@ class ScalarImplTypes {
     typedef iImplField<Simd> SiteField;
     typedef SiteField        SitePropagator;
     typedef SiteField        SiteComplex;
-    
+
     typedef Lattice<SiteField> Field;
     typedef Field              ComplexField;
     typedef Field              FermionField;
     typedef Field              PropagatorField;
-    
+
     static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
       gaussian(pRNG, P);
     }
@@ -47,54 +47,60 @@ class ScalarImplTypes {
     static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
      U = 1.0;
    }
-    
+
    static void MomentumSpacePropagator(Field &out, RealD m) {
      GridBase *grid = out._grid;
      Field kmu(grid), one(grid);
      const unsigned int nd = grid->_ndimension;
      std::vector<int> &l = grid->_fdimensions;
-      
+
      one = Complex(1.0,0.0);
      out = m*m;
      for(int mu = 0; mu < nd; mu++) {
        Real twoPiL = M_PI*2./l[mu];
-        
+
        LatticeCoordinate(kmu,mu);
        kmu = 2.*sin(.5*twoPiL*kmu);
        out = out + kmu*kmu;
      }
      out = one/out;
    }
-    
+
    static void FreePropagator(const Field &in, Field &out, const Field &momKernel) {
      FFT fft((GridCartesian *)in._grid);
      Field inFT(in._grid);
-      
+
      fft.FFT_all_dim(inFT, in, FFT::forward);
      inFT = inFT*momKernel;
      fft.FFT_all_dim(out, inFT, FFT::backward);
    }
-    
+
    static void FreePropagator(const Field &in, Field &out, RealD m) {
      Field momKernel(in._grid);
-      
+
      MomentumSpacePropagator(momKernel, m);
      FreePropagator(in, out, momKernel);
    }
-    
+
 };
 
+#ifdef USE_FFT_ACCELERATION
+#ifndef FFT_MASS
+#error "USE_FFT_ACCELERATION is defined but not FFT_MASS"
+#endif
+#endif
+
 template <class S, unsigned int N>
 class ScalarAdjMatrixImplTypes {
 public:
   typedef S Simd;
   typedef QCD::SU<N> Group;
-  
+
   template <typename vtype>
   using iImplField = iScalar<iScalar<iMatrix<vtype, N>>>;
   template <typename vtype>
@@ -103,24 +109,119 @@ class ScalarImplTypes {
   typedef iImplField<Simd>   SiteField;
   typedef SiteField          SitePropagator;
   typedef iImplComplex<Simd> SiteComplex;
-  
+
   typedef Lattice<SiteField>   Field;
   typedef Lattice<SiteComplex> ComplexField;
   typedef Field                FermionField;
   typedef Field                PropagatorField;
 
-  static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
+  static void MomentaSquare(ComplexField &out)
+  {
+    GridBase *grid = out._grid;
+    const std::vector<int> &l = grid->FullDimensions();
+    ComplexField kmu(grid);
+
+    for (int mu = 0; mu < grid->Nd(); mu++)
+    {
+      Real twoPiL = M_PI * 2.0 / l[mu];
+      LatticeCoordinate(kmu, mu);
+      kmu = 2.0 * sin(0.5 * twoPiL * kmu);
+      out += kmu * kmu;
+    }
+  }
+
+  static void MomentumSpacePropagator(ComplexField &out, RealD m)
+  {
+    GridBase *grid = out._grid;
+    ComplexField one(grid);
+    one = Complex(1.0, 0.0);
+    out = m * m;
+    MomentaSquare(out);
+    out = one / out;
+  }
+
+  static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
+  {
+#ifndef USE_FFT_ACCELERATION
     Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
+#else
+
+    Field Pgaussian(P._grid), Pp(P._grid);
+    ComplexField p2(P._grid); p2 = zero;
+    RealD M = FFT_MASS;
+
+    Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian);
+
+    FFT theFFT((GridCartesian*)P._grid);
+    theFFT.FFT_all_dim(Pp, Pgaussian, FFT::forward);
+    MomentaSquare(p2);
+    p2 += M * M;
+    p2 = sqrt(p2);
+    Pp *= p2;
+    theFFT.FFT_all_dim(P, Pp, FFT::backward);
+
+#endif //USE_FFT_ACCELERATION
   }
 
   static inline Field projectForce(Field& P) {return P;}
 
-  static inline void update_field(Field& P, Field& U, double ep) {
-    U += P*ep;
+  static inline void update_field(Field &P, Field &U, double ep)
+  {
+#ifndef USE_FFT_ACCELERATION
+    double t0=usecond();
+    U += P * ep;
+    double t1=usecond();
+    double total_time = (t1-t0)/1e6;
+    std::cout << GridLogIntegrator << "Total time for updating field (s) : " << total_time << std::endl;
+#else
+    // FFT transform P(x) -> P(p)
+    // divide by (M^2+p^2) M external parameter (how to pass?)
+    // P'(p) = P(p)/(M^2+p^2)
+    // Transform back -> P'(x)
+    // U += P'(x)*ep
+
+    Field Pp(U._grid), P_FFT(U._grid);
+    static ComplexField p2(U._grid);
+    RealD M = FFT_MASS;
+
+    FFT theFFT((GridCartesian*)U._grid);
+    theFFT.FFT_all_dim(Pp, P, FFT::forward);
+
+    static bool first_call = true;
+    if (first_call)
+    {
+      // avoid recomputing
+      MomentumSpacePropagator(p2, M);
+      first_call = false;
+    }
+    Pp *= p2;
+    theFFT.FFT_all_dim(P_FFT, Pp, FFT::backward);
+    U += P_FFT * ep;
+
+#endif //USE_FFT_ACCELERATION
   }
 
-  static inline RealD FieldSquareNorm(Field& U) {
-    return (TensorRemove(sum(trace(U*U))).real());
+  static inline RealD FieldSquareNorm(Field &U)
+  {
+#ifndef USE_FFT_ACCELERATION
+    return (TensorRemove(sum(trace(U * U))).real());
+#else
+    // In case of Fourier acceleration we have to:
+    // compute U(p)*U(p)/(M^2+p^2)) Parseval theorem
+    // 1 FFT needed U(x) -> U(p)
+    // M to be passed
+
+    FFT theFFT((GridCartesian*)U._grid);
+    Field Up(U._grid);
+
+    theFFT.FFT_all_dim(Up, U, FFT::forward);
+    RealD M = FFT_MASS;
+    ComplexField p2(U._grid);
+    MomentumSpacePropagator(p2, M);
+    Field Up2 = Up * p2;
+    // from the definition of the DFT we need to divide by the volume
+    return (-TensorRemove(sum(trace(adj(Up) * Up2))).real() / U._grid->gSites());
+#endif //USE_FFT_ACCELERATION
   }
 
   static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
@@ -146,7 +247,7 @@ class ScalarImplTypes {
   typedef ScalarImplTypes ScalarImplCR;
   typedef ScalarImplTypes ScalarImplCF;
   typedef ScalarImplTypes ScalarImplCD;
-  
+
   // Hardcoding here the size of the matrices
   typedef ScalarAdjMatrixImplTypes ScalarAdjImplR;
   typedef ScalarAdjMatrixImplTypes ScalarAdjImplF;
@@ -155,7 +256,7 @@ class ScalarImplTypes {
   template using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes;
   template using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes;
   template using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes;
-  
+
   //}
 }
 
diff --git a/lib/qcd/action/scalar/ScalarInteractionAction.h b/lib/qcd/action/scalar/ScalarInteractionAction.h
index 4d189352..8738b647 100644
--- a/lib/qcd/action/scalar/ScalarInteractionAction.h
+++ b/lib/qcd/action/scalar/ScalarInteractionAction.h
@@ -30,119 +30,179 @@ directory
 #ifndef SCALAR_INT_ACTION_H
 #define SCALAR_INT_ACTION_H
 
-
 // Note: this action can completely absorb the ScalarAction for real float fields
 // use the scalarObjs to generalise the structure
-namespace Grid {
-  // FIXME drop the QCD namespace everywhere here
+namespace Grid
+{
+// FIXME drop the QCD namespace everywhere here
 
-  template <class Impl, int Ndim>
-  class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
-  public:
-    INHERIT_FIELD_TYPES(Impl);
-  private:
-    RealD mass_square;
-    RealD lambda;
+template <class Impl, int Ndim>
+class ScalarInteractionAction : public QCD::Action<typename Impl::Field>
+{
+public:
+  INHERIT_FIELD_TYPES(Impl);
+private:
+  RealD mass_square;
+  RealD lambda;
+  RealD g;
+  const unsigned int N = Impl::Group::Dimension;
 
-    typedef typename Field::vector_object vobj;
-    typedef CartesianStencil<vobj, vobj> Stencil;
+  typedef typename Field::vector_object vobj;
+  typedef CartesianStencil<vobj, vobj> Stencil;
 
-    SimpleCompressor<vobj> compressor;
-    int npoint = 2*Ndim;
-    std::vector<int> directions;// = {0,1,2,3,0,1,2,3}; // forcing 4 dimensions
-    std::vector<int> displacements;// = {1,1,1,1, -1,-1,-1,-1};
+  SimpleCompressor<vobj> compressor;
+  int npoint = 2 * Ndim;
+  std::vector<int> directions; //
+  std::vector<int> displacements; //
 
-
-  public:
-
-    ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), displacements(2*Ndim,0), directions(2*Ndim,0){
-      for (int mu = 0 ; mu < Ndim; mu++){
-        directions[mu] = mu; directions[mu+Ndim] = mu;
-        displacements[mu] = 1; displacements[mu+Ndim] = -1;
-      }
+public:
+  ScalarInteractionAction(RealD ms, RealD l, RealD gval) : mass_square(ms), lambda(l), g(gval), displacements(2 * Ndim, 0), directions(2 * Ndim, 0)
+  {
+    for (int mu = 0; mu < Ndim; mu++)
+    {
+      directions[mu] = mu;
+      directions[mu + Ndim] = mu;
+      displacements[mu] = 1;
+      displacements[mu + Ndim] = -1;
+    }
+  }
 
-    virtual std::string LogParameters() {
-      std::stringstream sstream;
-      sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
-      sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
-      return sstream.str();
-    }
+  virtual std::string LogParameters()
+  {
+    std::stringstream sstream;
+    sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
+    sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
+    sstream << GridLogMessage << "[ScalarAction] g : " << g << std::endl;
+    return sstream.str();
+  }
 
-    virtual std::string action_name() {return "ScalarAction";}
+  virtual std::string action_name() { return "ScalarAction"; }
 
-    virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
+  virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
 
-    virtual RealD S(const Field &p) {
-      assert(p._grid->Nd() == Ndim);
-      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
-      phiStencil.HaloExchange(p, compressor);
-      Field action(p._grid), pshift(p._grid), phisquared(p._grid);
-      phisquared = p*p;
-      action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
-      for (int mu = 0; mu < Ndim; mu++) {
-        // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
-        parallel_for (int i = 0; i < p._grid->oSites(); i++) {
-          int permute_type;
-          StencilEntry *SE;
-          vobj temp2;
-          const vobj *temp, *t_p;
-
-          SE = phiStencil.GetEntry(permute_type, mu, i);
-          t_p = &p._odata[i];
-          if ( SE->_is_local ) {
-            temp = &p._odata[SE->_offset];
-            if ( SE->_permute ) {
-              permute(temp2, *temp, permute_type);
-              action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
-            } else {
-              action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
-            }
-          } else {
-            action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
-          }
-        }
-        // action -= pshift*p + p*pshift;
-      }
-      // NB the trace in the algebra is normalised to 1/2
-      // minus sign coming from the antihermitian fields
-      return -(TensorRemove(sum(trace(action)))).real();
-    };
-
-    virtual void deriv(const Field &p, Field &force) {
-      assert(p._grid->Nd() == Ndim);
-      force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
-      // move this outside
-      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
-      phiStencil.HaloExchange(p, compressor);
-
-      //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
-      for (int point = 0; point < npoint; point++) {
-        parallel_for (int i = 0; i < p._grid->oSites(); i++) {
-          const vobj *temp;
-          vobj temp2;
-          int permute_type;
-          StencilEntry *SE;
-          SE = phiStencil.GetEntry(permute_type, point, i);
-
-          if ( SE->_is_local ) {
-            temp = &p._odata[SE->_offset];
-            if ( SE->_permute ) {
-              permute(temp2, *temp, permute_type);
-              force._odata[i] -= temp2;
-            } else {
-              force._odata[i] -= *temp;
-            }
-          } else {
-            force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
-          }
-        }
+  virtual RealD S(const Field &p)
+  {
+    assert(p._grid->Nd() == Ndim);
+    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+    phiStencil.HaloExchange(p, compressor);
+    Field action(p._grid), pshift(p._grid), phisquared(p._grid);
+    phisquared = p * p;
+    action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
+    for (int mu = 0; mu < Ndim; mu++)
+    {
+      // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
+      parallel_for(int i = 0; i < p._grid->oSites(); i++)
+      {
+        int permute_type;
+        StencilEntry *SE;
+        vobj temp2;
+        const vobj *temp, *t_p;
+
+        SE = phiStencil.GetEntry(permute_type, mu, i);
+        t_p = &p._odata[i];
+        if (SE->_is_local)
+        {
+          temp = &p._odata[SE->_offset];
+          if (SE->_permute)
+          {
+            permute(temp2, *temp, permute_type);
+            action._odata[i] -= temp2 * (*t_p) + (*t_p) * temp2;
+          }
+          else
+          {
+            action._odata[i] -= (*temp) * (*t_p) + (*t_p) * (*temp);
+          }
+        }
+        else
+        {
+          action._odata[i] -= phiStencil.CommBuf()[SE->_offset] * (*t_p) + (*t_p) * phiStencil.CommBuf()[SE->_offset];
+        }
       }
+      // action -= pshift*p + p*pshift;
     }
+    // NB the trace in the algebra is normalised to 1/2
+    // minus sign coming from the antihermitian fields
+    return -(TensorRemove(sum(trace(action)))).real() * N / g;
   };
 
-}  // namespace Grid
-#endif // SCALAR_INT_ACTION_H
+  virtual void deriv(const Field &p, Field &force)
+  {
+    double t0 = usecond();
+    assert(p._grid->Nd() == Ndim);
+    force = (2. * Ndim + mass_square) * p - 2. * lambda * p * p * p;
+    double interm_t = usecond();
+
+    // move this outside
+    static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+
+    phiStencil.HaloExchange(p, compressor);
+    double halo_t = usecond();
+    int chunk = 128;
+    //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
+
+    // inverting the order of the loops slows down the code(! g++ 7)
+    // cannot try to reduce the number of force writes by factor npoint...
+    // use cache blocking
+    for (int point = 0; point < npoint; point++)
+    {
+
+#pragma omp parallel
+{
+      int permute_type;
+      StencilEntry *SE;
+      const vobj *temp;
+
+#pragma omp for schedule(static, chunk)
+      for (int i = 0; i < p._grid->oSites(); i++)
+      {
+        SE = phiStencil.GetEntry(permute_type, point, i);
+        // prefetch next p?
+
+        if (SE->_is_local)
+        {
+          temp = &p._odata[SE->_offset];
+
+          if (SE->_permute)
+          {
+            vobj temp2;
+            permute(temp2, *temp, permute_type);
+            force._odata[i] -= temp2;
+          }
+          else
+          {
+            force._odata[i] -= *temp; // slow part. Dominated by this read/write (BW)
+          }
+        }
+        else
+        {
+          force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
+        }
+      }
+
+}
+    }
+    force *= N / g;
+
+    double t1 = usecond();
+    double total_time = (t1 - t0) / 1e6;
+    double interm_time = (interm_t - t0) / 1e6;
+    double halo_time = (halo_t - interm_t) / 1e6;
+    double stencil_time = (t1 - halo_t) / 1e6;
+    std::cout << GridLogIntegrator << "Total time for force computation (s) : " << total_time << std::endl;
+    std::cout << GridLogIntegrator << "Intermediate time for force computation (s): " << interm_time << std::endl;
+    std::cout << GridLogIntegrator << "Halo time in force computation (s) : " << halo_time << std::endl;
+    std::cout << GridLogIntegrator << "Stencil time in force computation (s) : " << stencil_time << std::endl;
+    double flops = p._grid->gSites() * (14 * N * N * N + 18 * N * N + 2);
+    double flops_no_stencil = p._grid->gSites() * (14 * N * N * N + 6 * N * N + 2);
+    double Gflops = flops / (total_time * 1e9);
+    double Gflops_no_stencil = flops_no_stencil / (interm_time * 1e9);
+    std::cout << GridLogIntegrator << "Flops: " << flops << " - Gflop/s : " << Gflops << std::endl;
+    std::cout << GridLogIntegrator << "Flops NS: " << flops_no_stencil << " - Gflop/s NS: " << Gflops_no_stencil << std::endl;
+  }
+};
+
+} // namespace Grid
+
+#endif // SCALAR_INT_ACTION_H
diff --git a/lib/qcd/hmc/GenericHMCrunner.h b/lib/qcd/hmc/GenericHMCrunner.h
index 4f6c1af0..26fec3d5 100644
--- a/lib/qcd/hmc/GenericHMCrunner.h
+++ b/lib/qcd/hmc/GenericHMCrunner.h
@@ -211,7 +211,7 @@ typedef HMCWrapperTemplate ScalarAdjGenericHMCRunner;
 
 template <int Colours>
-using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, MinimumNorm2, ScalarNxNMatrixFields<Colours> >;
+using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, ForceGradient, ScalarNxNMatrixFields<Colours> >;
 
 }  // namespace QCD
 }  // namespace Grid
diff --git a/tests/hmc/Test_hmc_ScalarActionNxN.cc b/tests/hmc/Test_hmc_ScalarActionNxN.cc
index a4dad1a3..9e40beac 100644
--- a/tests/hmc/Test_hmc_ScalarActionNxN.cc
+++ b/tests/hmc/Test_hmc_ScalarActionNxN.cc
@@ -31,7 +31,8 @@ class ScalarActionParameters : Serializable {
  public:
   GRID_SERIALIZABLE_CLASS_MEMBERS(ScalarActionParameters,
     double, mass_squared,
-    double, lambda);
+    double, lambda,
+    double, g);
 
   template <class ReaderClass>
   ScalarActionParameters(Reader<ReaderClass>& Reader){
@@ -140,7 +141,7 @@ int main(int argc, char **argv) {
 
   // Scalar action in adjoint representation
   ScalarActionParameters SPar(Reader);
-  ScalarAction Saction(SPar.mass_squared, SPar.lambda);
+  ScalarAction Saction(SPar.mass_squared, SPar.lambda, SPar.g);
 
   // Collect actions
   ActionLevel> Level1(1);