diff --git a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h new file mode 100644 index 00000000..3a166134 --- /dev/null +++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h @@ -0,0 +1,240 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h + + Copyright (C) 2020 - 2022 + + Author: Daniel Richtmann + Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + +#pragma once + +#include +#include + +NAMESPACE_BEGIN(Grid); + +// see Grid/qcd/action/fermion/WilsonCloverFermion.h for description +// +// Modifications done here: +// +// Original: clover term = 12x12 matrix per site +// +// But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running) +// Sufficient to store/transfer only the real parts of the diagonal and one triangular part +// 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transfered +// +// Here: Above but diagonal as complex numbers, i.e., need to store/transfer +// 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words +// +// Words per site and improvement compared to original (combined with the input and output spinors): +// +// - Original: 2*12 + 12*12 = 168 words -> 1.00 x less +// - Minimal: 2*12 + 36 = 60 words -> 2.80 x less +// - Here: 2*12 + 42 = 66 words -> 2.55 x less +// +// These improvements directly translate to wall-clock time +// +// Data layout: +// +// - diagonal and triangle part as separate lattice fields, +// this was faster than as 1 combined field on all tested machines +// - diagonal: as expected +// - triangle: store upper right triangle in row major order +// - graphical: +// 0 1 2 3 4 +// 5 6 7 8 +// 9 10 11 = upper right triangle indices +// 12 13 +// 14 +// 0 +// 1 +// 2 +// 3 = diagonal indices +// 4 +// 5 +// 0 +// 1 5 +// 2 6 9 = lower left triangle indices +// 3 7 10 12 +// 4 8 11 13 14 +// +// Impact on total memory consumption: +// - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site +// - Here: (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts = 24 complex words per site +// + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts = 60 complex words per site +// = 84 complex words per site + +template +class CompactWilsonCloverFermion : public WilsonFermion, + public WilsonCloverHelpers, + public CompactWilsonCloverHelpers { + ///////////////////////////////////////////// + // Sizes + ///////////////////////////////////////////// + +public: + + INHERIT_COMPACT_CLOVER_SIZES(Impl); + + ///////////////////////////////////////////// + // Type definitions + ///////////////////////////////////////////// + +public: + + INHERIT_IMPL_TYPES(Impl); + INHERIT_CLOVER_TYPES(Impl); + INHERIT_COMPACT_CLOVER_TYPES(Impl); + + typedef WilsonFermion WilsonBase; + typedef WilsonCloverHelpers Helpers; + typedef CompactWilsonCloverHelpers CompactHelpers; + + ///////////////////////////////////////////// + // Constructors + ///////////////////////////////////////////// + +public: + + CompactWilsonCloverFermion(GaugeField& _Umu, + GridCartesian& Fgrid, + GridRedBlackCartesian& Hgrid, + const RealD _mass, + const RealD _csw_r = 0.0, + const RealD _csw_t = 0.0, + const RealD _cF = 1.0, + const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(), + const ImplParams& impl_p = ImplParams()); + + ///////////////////////////////////////////// + // Member functions (implementing interface) + ///////////////////////////////////////////// + +public: + + virtual void Instantiatable() {}; + int ConstEE() override { return 0; }; + int isTrivialEE() override { return 0; }; + + void Dhop(const FermionField& in, FermionField& out, int dag) override; + + void DhopOE(const FermionField& in, FermionField& out, int dag) override; + + void DhopEO(const FermionField& in, FermionField& out, int dag) override; + + void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override; + + void DhopDirAll(const FermionField& in, std::vector& out) /* override */; + + void M(const FermionField& in, FermionField& out) override; + + void Mdag(const FermionField& in, FermionField& out) override; + + void Meooe(const FermionField& in, FermionField& out) override; + + void MeooeDag(const FermionField& in, FermionField& out) override; + + void Mooee(const FermionField& in, FermionField& out) override; + + void MooeeDag(const FermionField& in, FermionField& out) override; + + void MooeeInv(const FermionField& in, FermionField& out) override; + + void MooeeInvDag(const FermionField& in, FermionField& out) override; + + void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override; + + void MdirAll(const FermionField& in, std::vector& out) override; + + void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override; + + void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override; + + void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override; + + ///////////////////////////////////////////// + // Member functions (internals) + ///////////////////////////////////////////// + + void MooeeInternal(const FermionField& in, + FermionField& out, + const CloverDiagonalField& diagonal, + const CloverTriangleField& triangle); + + ///////////////////////////////////////////// + // Helpers + ///////////////////////////////////////////// + + void ImportGauge(const GaugeField& _Umu) override; + + ///////////////////////////////////////////// + // Helpers + ///////////////////////////////////////////// + +private: + + template + const MaskField* getCorrectMaskField(const Field &in) const { + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + return &this->BoundaryMaskOdd; + } else { + return &this->BoundaryMaskEven; + } + } else { + return &this->BoundaryMask; + } + } + + template + void ApplyBoundaryMask(Field& f) { + const MaskField* m = getCorrectMaskField(f); assert(m != nullptr); + assert(m != nullptr); + CompactHelpers::ApplyBoundaryMask(f, *m); + } + + ///////////////////////////////////////////// + // Member Data + ///////////////////////////////////////////// + +public: + + RealD csw_r; + RealD csw_t; + RealD cF; + + bool open_boundaries; + + CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd; + CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd; + + CloverTriangleField Triangle, TriangleEven, TriangleOdd; + CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd; + + FermionField Tmp; + + MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd; +}; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/Fermion.h b/Grid/qcd/action/fermion/Fermion.h index 59fc17d5..78cf7851 100644 --- a/Grid/qcd/action/fermion/Fermion.h +++ b/Grid/qcd/action/fermion/Fermion.h @@ -53,6 +53,7 @@ NAMESPACE_CHECK(Wilson); #include // 4d wilson like NAMESPACE_CHECK(WilsonTM); #include // 4d wilson clover fermions +#include // 4d compact wilson clover fermions NAMESPACE_CHECK(WilsonClover); #include // 5d base used by all 5d overlap types NAMESPACE_CHECK(Wilson5D); @@ -153,6 +154,23 @@ typedef WilsonCloverFermion WilsonCloverTwoInd typedef WilsonCloverFermion WilsonCloverTwoIndexAntiSymmetricFermionF; typedef WilsonCloverFermion WilsonCloverTwoIndexAntiSymmetricFermionD; +// Compact Clover fermions +typedef CompactWilsonCloverFermion CompactWilsonCloverFermionR; +typedef CompactWilsonCloverFermion CompactWilsonCloverFermionF; +typedef CompactWilsonCloverFermion CompactWilsonCloverFermionD; + +typedef CompactWilsonCloverFermion CompactWilsonCloverAdjFermionR; +typedef CompactWilsonCloverFermion CompactWilsonCloverAdjFermionF; +typedef CompactWilsonCloverFermion CompactWilsonCloverAdjFermionD; + +typedef CompactWilsonCloverFermion CompactWilsonCloverTwoIndexSymmetricFermionR; +typedef CompactWilsonCloverFermion CompactWilsonCloverTwoIndexSymmetricFermionF; +typedef CompactWilsonCloverFermion CompactWilsonCloverTwoIndexSymmetricFermionD; + +typedef CompactWilsonCloverFermion CompactWilsonCloverTwoIndexAntiSymmetricFermionR; +typedef CompactWilsonCloverFermion CompactWilsonCloverTwoIndexAntiSymmetricFermionF; +typedef CompactWilsonCloverFermion CompactWilsonCloverTwoIndexAntiSymmetricFermionD; + // Domain Wall fermions typedef DomainWallFermion DomainWallFermionR; typedef DomainWallFermion DomainWallFermionF; diff --git a/Grid/qcd/action/fermion/WilsonCloverFermion.h b/Grid/qcd/action/fermion/WilsonCloverFermion.h index 92af7111..cd19fc10 100644 --- a/Grid/qcd/action/fermion/WilsonCloverFermion.h +++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h @@ -4,10 +4,11 @@ Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h - Copyright (C) 2017 + Copyright (C) 2017 - 2022 Author: Guido Cossu Author: David Preti <> + Author: Daniel Richtmann This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -29,7 +30,8 @@ #pragma once -#include +#include +#include NAMESPACE_BEGIN(Grid); @@ -50,18 +52,15 @@ NAMESPACE_BEGIN(Grid); ////////////////////////////////////////////////////////////////// template -class WilsonCloverFermion : public WilsonFermion +class WilsonCloverFermion : public WilsonFermion, + public WilsonCloverHelpers { public: - // Types definitions INHERIT_IMPL_TYPES(Impl); - template - using iImplClover = iScalar, Ns>>; - typedef iImplClover SiteCloverType; - typedef Lattice CloverFieldType; + INHERIT_CLOVER_TYPES(Impl); -public: - typedef WilsonFermion WilsonBase; + typedef WilsonFermion WilsonBase; + typedef WilsonCloverHelpers Helpers; virtual int ConstEE(void) { return 0; }; virtual void Instantiatable(void){}; @@ -72,42 +71,7 @@ public: const RealD _csw_r = 0.0, const RealD _csw_t = 0.0, const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(), - const ImplParams &impl_p = ImplParams()) : WilsonFermion(_Umu, - Fgrid, - Hgrid, - _mass, impl_p, clover_anisotropy), - CloverTerm(&Fgrid), - CloverTermInv(&Fgrid), - CloverTermEven(&Hgrid), - CloverTermOdd(&Hgrid), - CloverTermInvEven(&Hgrid), - CloverTermInvOdd(&Hgrid), - CloverTermDagEven(&Hgrid), - CloverTermDagOdd(&Hgrid), - CloverTermInvDagEven(&Hgrid), - CloverTermInvDagOdd(&Hgrid) - { - assert(Nd == 4); // require 4 dimensions - - if (clover_anisotropy.isAnisotropic) - { - csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0; - diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0); - } - else - { - csw_r = _csw_r * 0.5; - diag_mass = 4.0 + _mass; - } - csw_t = _csw_t * 0.5; - - if (csw_r == 0) - std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl; - if (csw_t == 0) - std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl; - - ImportGauge(_Umu); - } + const ImplParams &impl_p = ImplParams()); virtual void M(const FermionField &in, FermionField &out); virtual void Mdag(const FermionField &in, FermionField &out); @@ -124,250 +88,21 @@ public: void ImportGauge(const GaugeField &_Umu); // Derivative parts unpreconditioned pseudofermions - void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag) - { - conformable(X.Grid(), Y.Grid()); - conformable(X.Grid(), force.Grid()); - GaugeLinkField force_mu(force.Grid()), lambda(force.Grid()); - GaugeField clover_force(force.Grid()); - PropagatorField Lambda(force.Grid()); + void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag); - // Guido: Here we are hitting some performance issues: - // need to extract the components of the DoubledGaugeField - // for each call - // Possible solution - // Create a vector object to store them? (cons: wasting space) - std::vector U(Nd, this->Umu.Grid()); - - Impl::extractLinkField(U, this->Umu); - - force = Zero(); - // Derivative of the Wilson hopping term - this->DhopDeriv(force, X, Y, dag); - - /////////////////////////////////////////////////////////// - // Clover term derivative - /////////////////////////////////////////////////////////// - Impl::outerProductImpl(Lambda, X, Y); - //std::cout << "Lambda:" << Lambda << std::endl; - - Gamma::Algebra sigma[] = { - Gamma::Algebra::SigmaXY, - Gamma::Algebra::SigmaXZ, - Gamma::Algebra::SigmaXT, - Gamma::Algebra::MinusSigmaXY, - Gamma::Algebra::SigmaYZ, - Gamma::Algebra::SigmaYT, - Gamma::Algebra::MinusSigmaXZ, - Gamma::Algebra::MinusSigmaYZ, - Gamma::Algebra::SigmaZT, - Gamma::Algebra::MinusSigmaXT, - Gamma::Algebra::MinusSigmaYT, - Gamma::Algebra::MinusSigmaZT}; - - /* - sigma_{\mu \nu}= - | 0 sigma[0] sigma[1] sigma[2] | - | sigma[3] 0 sigma[4] sigma[5] | - | sigma[6] sigma[7] 0 sigma[8] | - | sigma[9] sigma[10] sigma[11] 0 | - */ - - int count = 0; - clover_force = Zero(); - for (int mu = 0; mu < 4; mu++) - { - force_mu = Zero(); - for (int nu = 0; nu < 4; nu++) - { - if (mu == nu) - continue; - - RealD factor; - if (nu == 4 || mu == 4) - { - factor = 2.0 * csw_t; - } - else - { - factor = 2.0 * csw_r; - } - PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked - Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok - force_mu -= factor*Cmunu(U, lambda, mu, nu); // checked - count++; - } - - pokeLorentz(clover_force, U[mu] * force_mu, mu); - } - //clover_force *= csw; - force += clover_force; - } - - // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis - GaugeLinkField Cmunu(std::vector &U, GaugeLinkField &lambda, int mu, int nu) - { - conformable(lambda.Grid(), U[0].Grid()); - GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid()); - // insertion in upper staple - // please check redundancy of shift operations - - // C1+ - tmp = lambda * U[nu]; - out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu); - - // C2+ - tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu); - out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu); - - // C3+ - tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu); - out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu); - - // C4+ - out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda; - - // insertion in lower staple - // C1- - out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu); - - // C2- - tmp = adj(lambda) * U[nu]; - out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu); - - // C3- - tmp = lambda * U[nu]; - out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu); - - // C4- - out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda; - - return out; - } - -protected: +public: // here fixing the 4 dimensions, make it more general? RealD csw_r; // Clover coefficient - spatial RealD csw_t; // Clover coefficient - temporal RealD diag_mass; // Mass term - CloverFieldType CloverTerm, CloverTermInv; // Clover term - CloverFieldType CloverTermEven, CloverTermOdd; // Clover term EO - CloverFieldType CloverTermInvEven, CloverTermInvOdd; // Clover term Inv EO - CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO - CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO - - public: - // eventually these can be compressed into 6x6 blocks instead of the 12x12 - // using the DeGrand-Rossi basis for the gamma matrices - CloverFieldType fillCloverYZ(const GaugeLinkField &F) - { - CloverFieldType T(F.Grid()); - T = Zero(); - autoView(T_v,T,AcceleratorWrite); - autoView(F_v,F,AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, - { - T_v[i]()(0, 1) = timesMinusI(F_v[i]()()); - T_v[i]()(1, 0) = timesMinusI(F_v[i]()()); - T_v[i]()(2, 3) = timesMinusI(F_v[i]()()); - T_v[i]()(3, 2) = timesMinusI(F_v[i]()()); - }); - - return T; - } - - CloverFieldType fillCloverXZ(const GaugeLinkField &F) - { - CloverFieldType T(F.Grid()); - T = Zero(); - - autoView(T_v, T,AcceleratorWrite); - autoView(F_v, F,AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, - { - T_v[i]()(0, 1) = -F_v[i]()(); - T_v[i]()(1, 0) = F_v[i]()(); - T_v[i]()(2, 3) = -F_v[i]()(); - T_v[i]()(3, 2) = F_v[i]()(); - }); - - return T; - } - - CloverFieldType fillCloverXY(const GaugeLinkField &F) - { - CloverFieldType T(F.Grid()); - T = Zero(); - - autoView(T_v,T,AcceleratorWrite); - autoView(F_v,F,AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, - { - T_v[i]()(0, 0) = timesMinusI(F_v[i]()()); - T_v[i]()(1, 1) = timesI(F_v[i]()()); - T_v[i]()(2, 2) = timesMinusI(F_v[i]()()); - T_v[i]()(3, 3) = timesI(F_v[i]()()); - }); - - return T; - } - - CloverFieldType fillCloverXT(const GaugeLinkField &F) - { - CloverFieldType T(F.Grid()); - T = Zero(); - - autoView( T_v , T, AcceleratorWrite); - autoView( F_v , F, AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, - { - T_v[i]()(0, 1) = timesI(F_v[i]()()); - T_v[i]()(1, 0) = timesI(F_v[i]()()); - T_v[i]()(2, 3) = timesMinusI(F_v[i]()()); - T_v[i]()(3, 2) = timesMinusI(F_v[i]()()); - }); - - return T; - } - - CloverFieldType fillCloverYT(const GaugeLinkField &F) - { - CloverFieldType T(F.Grid()); - T = Zero(); - - autoView( T_v ,T,AcceleratorWrite); - autoView( F_v ,F,AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, - { - T_v[i]()(0, 1) = -(F_v[i]()()); - T_v[i]()(1, 0) = (F_v[i]()()); - T_v[i]()(2, 3) = (F_v[i]()()); - T_v[i]()(3, 2) = -(F_v[i]()()); - }); - - return T; - } - - CloverFieldType fillCloverZT(const GaugeLinkField &F) - { - CloverFieldType T(F.Grid()); - - T = Zero(); - - autoView( T_v , T,AcceleratorWrite); - autoView( F_v , F,AcceleratorRead); - accelerator_for(i, CloverTerm.Grid()->oSites(),1, - { - T_v[i]()(0, 0) = timesI(F_v[i]()()); - T_v[i]()(1, 1) = timesMinusI(F_v[i]()()); - T_v[i]()(2, 2) = timesMinusI(F_v[i]()()); - T_v[i]()(3, 3) = timesI(F_v[i]()()); - }); - - return T; - } + CloverField CloverTerm, CloverTermInv; // Clover term + CloverField CloverTermEven, CloverTermOdd; // Clover term EO + CloverField CloverTermInvEven, CloverTermInvOdd; // Clover term Inv EO + CloverField CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO + CloverField CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO }; + NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/WilsonCloverHelpers.h b/Grid/qcd/action/fermion/WilsonCloverHelpers.h new file mode 100644 index 00000000..588525cc --- /dev/null +++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h @@ -0,0 +1,761 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/fermion/WilsonCloverHelpers.h + + Copyright (C) 2021 - 2022 + + Author: Daniel Richtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#pragma once + +// Helper routines that implement common clover functionality + +NAMESPACE_BEGIN(Grid); + +template class WilsonCloverHelpers { +public: + + INHERIT_IMPL_TYPES(Impl); + INHERIT_CLOVER_TYPES(Impl); + + // Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis + static GaugeLinkField Cmunu(std::vector &U, GaugeLinkField &lambda, int mu, int nu) + { + conformable(lambda.Grid(), U[0].Grid()); + GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid()); + // insertion in upper staple + // please check redundancy of shift operations + + // C1+ + tmp = lambda * U[nu]; + out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu); + + // C2+ + tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu); + out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu); + + // C3+ + tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu); + out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu); + + // C4+ + out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda; + + // insertion in lower staple + // C1- + out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu); + + // C2- + tmp = adj(lambda) * U[nu]; + out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu); + + // C3- + tmp = lambda * U[nu]; + out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu); + + // C4- + out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda; + + return out; + } + + static CloverField fillCloverYZ(const GaugeLinkField &F) + { + CloverField T(F.Grid()); + T = Zero(); + autoView(T_v,T,AcceleratorWrite); + autoView(F_v,F,AcceleratorRead); + accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(), + { + coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesMinusI(F_v[i]()()))); + coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesMinusI(F_v[i]()()))); + coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()()))); + coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()()))); + }); + + return T; + } + + static CloverField fillCloverXZ(const GaugeLinkField &F) + { + CloverField T(F.Grid()); + T = Zero(); + + autoView(T_v, T,AcceleratorWrite); + autoView(F_v, F,AcceleratorRead); + accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(), + { + coalescedWrite(T_v[i]()(0, 1), coalescedRead(-F_v[i]()())); + coalescedWrite(T_v[i]()(1, 0), coalescedRead(F_v[i]()())); + coalescedWrite(T_v[i]()(2, 3), coalescedRead(-F_v[i]()())); + coalescedWrite(T_v[i]()(3, 2), coalescedRead(F_v[i]()())); + }); + + return T; + } + + static CloverField fillCloverXY(const GaugeLinkField &F) + { + CloverField T(F.Grid()); + T = Zero(); + + autoView(T_v,T,AcceleratorWrite); + autoView(F_v,F,AcceleratorRead); + accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(), + { + coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesMinusI(F_v[i]()()))); + coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesI(F_v[i]()()))); + coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()()))); + coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()()))); + }); + + return T; + } + + static CloverField fillCloverXT(const GaugeLinkField &F) + { + CloverField T(F.Grid()); + T = Zero(); + + autoView( T_v , T, AcceleratorWrite); + autoView( F_v , F, AcceleratorRead); + accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(), + { + coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesI(F_v[i]()()))); + coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesI(F_v[i]()()))); + coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()()))); + coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()()))); + }); + + return T; + } + + static CloverField fillCloverYT(const GaugeLinkField &F) + { + CloverField T(F.Grid()); + T = Zero(); + + autoView( T_v ,T,AcceleratorWrite); + autoView( F_v ,F,AcceleratorRead); + accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(), + { + coalescedWrite(T_v[i]()(0, 1), coalescedRead(-(F_v[i]()()))); + coalescedWrite(T_v[i]()(1, 0), coalescedRead((F_v[i]()()))); + coalescedWrite(T_v[i]()(2, 3), coalescedRead((F_v[i]()()))); + coalescedWrite(T_v[i]()(3, 2), coalescedRead(-(F_v[i]()()))); + }); + + return T; + } + + static CloverField fillCloverZT(const GaugeLinkField &F) + { + CloverField T(F.Grid()); + + T = Zero(); + + autoView( T_v , T,AcceleratorWrite); + autoView( F_v , F,AcceleratorRead); + accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(), + { + coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesI(F_v[i]()()))); + coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesMinusI(F_v[i]()()))); + coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()()))); + coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()()))); + }); + + return T; + } + + template + static accelerator_inline void multClover(_Spinor& phi, const SiteClover& C, const _Spinor& chi) { + auto CC = coalescedRead(C); + mult(&phi, &CC, &chi); + } + + template + inline void multCloverField(_SpinorField& out, const CloverField& C, const _SpinorField& phi) { + const int Nsimd = SiteSpinor::Nsimd(); + autoView(out_v, out, AcceleratorWrite); + autoView(phi_v, phi, AcceleratorRead); + autoView(C_v, C, AcceleratorRead); + typedef decltype(coalescedRead(out_v[0])) calcSpinor; + accelerator_for(sss,out.Grid()->oSites(),Nsimd,{ + calcSpinor tmp; + multClover(tmp,C_v[sss],phi_v(sss)); + coalescedWrite(out_v[sss],tmp); + }); + } +}; + + +template class CompactWilsonCloverHelpers { +public: + + INHERIT_COMPACT_CLOVER_SIZES(Impl); + + INHERIT_IMPL_TYPES(Impl); + INHERIT_CLOVER_TYPES(Impl); + INHERIT_COMPACT_CLOVER_TYPES(Impl); + + #if 0 + static accelerator_inline typename SiteCloverTriangle::vector_type triangle_elem(const SiteCloverTriangle& triangle, int block, int i, int j) { + assert(i != j); + if(i < j) { + return triangle()(block)(triangle_index(i, j)); + } else { // i > j + return conjugate(triangle()(block)(triangle_index(i, j))); + } + } + #else + template + static accelerator_inline vobj triangle_elem(const iImplCloverTriangle& triangle, int block, int i, int j) { + assert(i != j); + if(i < j) { + return triangle()(block)(triangle_index(i, j)); + } else { // i > j + return conjugate(triangle()(block)(triangle_index(i, j))); + } + } + #endif + + static accelerator_inline int triangle_index(int i, int j) { + if(i == j) + return 0; + else if(i < j) + return Nred * (Nred - 1) / 2 - (Nred - i) * (Nred - i - 1) / 2 + j - i - 1; + else // i > j + return Nred * (Nred - 1) / 2 - (Nred - j) * (Nred - j - 1) / 2 + i - j - 1; + } + + static void MooeeKernel_gpu(int Nsite, + int Ls, + const FermionField& in, + FermionField& out, + const CloverDiagonalField& diagonal, + const CloverTriangleField& triangle) { + autoView(diagonal_v, diagonal, AcceleratorRead); + autoView(triangle_v, triangle, AcceleratorRead); + autoView(in_v, in, AcceleratorRead); + autoView(out_v, out, AcceleratorWrite); + + typedef decltype(coalescedRead(out_v[0])) CalcSpinor; + + const uint64_t NN = Nsite * Ls; + + accelerator_for(ss, NN, Simd::Nsimd(), { + int sF = ss; + int sU = ss/Ls; + CalcSpinor res; + CalcSpinor in_t = in_v(sF); + auto diagonal_t = diagonal_v(sU); + auto triangle_t = triangle_v(sU); + for(int block=0; block penalty -> disable */ \ + \ + if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) { \ + base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0); \ + svprfd(svptrue_b64(), (int64_t*)(base + 0), SV_PLDL1STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 256), SV_PLDL1STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 512), SV_PLDL1STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 768), SV_PLDL1STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \ + } \ + \ + if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) { \ + base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0); \ + svprfd(svptrue_b64(), (int64_t*)(base + 0), SV_PLDL2STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 256), SV_PLDL2STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 512), SV_PLDL2STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 768), SV_PLDL2STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \ + svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \ + } \ + } +// TODO: Implement/generalize this for other architectures +// I played around a bit on KNL (see below) but didn't bring anything +// #elif defined(AVX512) +// #define PREFETCH_CLOVER(BASE) { \ +// uint64_t base; \ +// int pf_dist_L1 = 1; \ +// int pf_dist_L2 = +4; \ +// \ +// if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) { \ +// base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0); \ +// _mm_prefetch((const char*)(base + 0), _MM_HINT_T0); \ +// _mm_prefetch((const char*)(base + 64), _MM_HINT_T0); \ +// _mm_prefetch((const char*)(base + 128), _MM_HINT_T0); \ +// _mm_prefetch((const char*)(base + 192), _MM_HINT_T0); \ +// _mm_prefetch((const char*)(base + 256), _MM_HINT_T0); \ +// _mm_prefetch((const char*)(base + 320), _MM_HINT_T0); \ +// } \ +// \ +// if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) { \ +// base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0); \ +// _mm_prefetch((const char*)(base + 0), _MM_HINT_T1); \ +// _mm_prefetch((const char*)(base + 64), _MM_HINT_T1); \ +// _mm_prefetch((const char*)(base + 128), _MM_HINT_T1); \ +// _mm_prefetch((const char*)(base + 192), _MM_HINT_T1); \ +// _mm_prefetch((const char*)(base + 256), _MM_HINT_T1); \ +// _mm_prefetch((const char*)(base + 320), _MM_HINT_T1); \ +// } \ +// } +#else +#define PREFETCH_CLOVER(BASE) +#endif + + const uint64_t NN = Nsite * Ls; + + thread_for(ss, NN, { + int sF = ss; + int sU = ss/Ls; + CalcSpinor res; + CalcSpinor in_t = in_v[sF]; + auto diag_t = diagonal_v[sU]; // "diag" instead of "diagonal" here to make code below easier to read + auto triangle_t = triangle_v[sU]; + + // upper half + PREFETCH_CLOVER(0); + + auto in_cc_0_0 = conjugate(in_t()(0)(0)); // Nils: reduces number + auto in_cc_0_1 = conjugate(in_t()(0)(1)); // of conjugates from + auto in_cc_0_2 = conjugate(in_t()(0)(2)); // 30 to 20 + auto in_cc_1_0 = conjugate(in_t()(1)(0)); + auto in_cc_1_1 = conjugate(in_t()(1)(1)); + + res()(0)(0) = diag_t()(0)( 0) * in_t()(0)(0) + + triangle_t()(0)( 0) * in_t()(0)(1) + + triangle_t()(0)( 1) * in_t()(0)(2) + + triangle_t()(0)( 2) * in_t()(1)(0) + + triangle_t()(0)( 3) * in_t()(1)(1) + + triangle_t()(0)( 4) * in_t()(1)(2); + + res()(0)(1) = triangle_t()(0)( 0) * in_cc_0_0; + res()(0)(1) = diag_t()(0)( 1) * in_t()(0)(1) + + triangle_t()(0)( 5) * in_t()(0)(2) + + triangle_t()(0)( 6) * in_t()(1)(0) + + triangle_t()(0)( 7) * in_t()(1)(1) + + triangle_t()(0)( 8) * in_t()(1)(2) + + conjugate( res()(0)( 1)); + + res()(0)(2) = triangle_t()(0)( 1) * in_cc_0_0 + + triangle_t()(0)( 5) * in_cc_0_1; + res()(0)(2) = diag_t()(0)( 2) * in_t()(0)(2) + + triangle_t()(0)( 9) * in_t()(1)(0) + + triangle_t()(0)(10) * in_t()(1)(1) + + triangle_t()(0)(11) * in_t()(1)(2) + + conjugate( res()(0)( 2)); + + res()(1)(0) = triangle_t()(0)( 2) * in_cc_0_0 + + triangle_t()(0)( 6) * in_cc_0_1 + + triangle_t()(0)( 9) * in_cc_0_2; + res()(1)(0) = diag_t()(0)( 3) * in_t()(1)(0) + + triangle_t()(0)(12) * in_t()(1)(1) + + triangle_t()(0)(13) * in_t()(1)(2) + + conjugate( res()(1)( 0)); + + res()(1)(1) = triangle_t()(0)( 3) * in_cc_0_0 + + triangle_t()(0)( 7) * in_cc_0_1 + + triangle_t()(0)(10) * in_cc_0_2 + + triangle_t()(0)(12) * in_cc_1_0; + res()(1)(1) = diag_t()(0)( 4) * in_t()(1)(1) + + triangle_t()(0)(14) * in_t()(1)(2) + + conjugate( res()(1)( 1)); + + res()(1)(2) = triangle_t()(0)( 4) * in_cc_0_0 + + triangle_t()(0)( 8) * in_cc_0_1 + + triangle_t()(0)(11) * in_cc_0_2 + + triangle_t()(0)(13) * in_cc_1_0 + + triangle_t()(0)(14) * in_cc_1_1; + res()(1)(2) = diag_t()(0)( 5) * in_t()(1)(2) + + conjugate( res()(1)( 2)); + + vstream(out_v[sF]()(0)(0), res()(0)(0)); + vstream(out_v[sF]()(0)(1), res()(0)(1)); + vstream(out_v[sF]()(0)(2), res()(0)(2)); + vstream(out_v[sF]()(1)(0), res()(1)(0)); + vstream(out_v[sF]()(1)(1), res()(1)(1)); + vstream(out_v[sF]()(1)(2), res()(1)(2)); + + // lower half + PREFETCH_CLOVER(1); + + auto in_cc_2_0 = conjugate(in_t()(2)(0)); + auto in_cc_2_1 = conjugate(in_t()(2)(1)); + auto in_cc_2_2 = conjugate(in_t()(2)(2)); + auto in_cc_3_0 = conjugate(in_t()(3)(0)); + auto in_cc_3_1 = conjugate(in_t()(3)(1)); + + res()(2)(0) = diag_t()(1)( 0) * in_t()(2)(0) + + triangle_t()(1)( 0) * in_t()(2)(1) + + triangle_t()(1)( 1) * in_t()(2)(2) + + triangle_t()(1)( 2) * in_t()(3)(0) + + triangle_t()(1)( 3) * in_t()(3)(1) + + triangle_t()(1)( 4) * in_t()(3)(2); + + res()(2)(1) = triangle_t()(1)( 0) * in_cc_2_0; + res()(2)(1) = diag_t()(1)( 1) * in_t()(2)(1) + + triangle_t()(1)( 5) * in_t()(2)(2) + + triangle_t()(1)( 6) * in_t()(3)(0) + + triangle_t()(1)( 7) * in_t()(3)(1) + + triangle_t()(1)( 8) * in_t()(3)(2) + + conjugate( res()(2)( 1)); + + res()(2)(2) = triangle_t()(1)( 1) * in_cc_2_0 + + triangle_t()(1)( 5) * in_cc_2_1; + res()(2)(2) = diag_t()(1)( 2) * in_t()(2)(2) + + triangle_t()(1)( 9) * in_t()(3)(0) + + triangle_t()(1)(10) * in_t()(3)(1) + + triangle_t()(1)(11) * in_t()(3)(2) + + conjugate( res()(2)( 2)); + + res()(3)(0) = triangle_t()(1)( 2) * in_cc_2_0 + + triangle_t()(1)( 6) * in_cc_2_1 + + triangle_t()(1)( 9) * in_cc_2_2; + res()(3)(0) = diag_t()(1)( 3) * in_t()(3)(0) + + triangle_t()(1)(12) * in_t()(3)(1) + + triangle_t()(1)(13) * in_t()(3)(2) + + conjugate( res()(3)( 0)); + + res()(3)(1) = triangle_t()(1)( 3) * in_cc_2_0 + + triangle_t()(1)( 7) * in_cc_2_1 + + triangle_t()(1)(10) * in_cc_2_2 + + triangle_t()(1)(12) * in_cc_3_0; + res()(3)(1) = diag_t()(1)( 4) * in_t()(3)(1) + + triangle_t()(1)(14) * in_t()(3)(2) + + conjugate( res()(3)( 1)); + + res()(3)(2) = triangle_t()(1)( 4) * in_cc_2_0 + + triangle_t()(1)( 8) * in_cc_2_1 + + triangle_t()(1)(11) * in_cc_2_2 + + triangle_t()(1)(13) * in_cc_3_0 + + triangle_t()(1)(14) * in_cc_3_1; + res()(3)(2) = diag_t()(1)( 5) * in_t()(3)(2) + + conjugate( res()(3)( 2)); + + vstream(out_v[sF]()(2)(0), res()(2)(0)); + vstream(out_v[sF]()(2)(1), res()(2)(1)); + vstream(out_v[sF]()(2)(2), res()(2)(2)); + vstream(out_v[sF]()(3)(0), res()(3)(0)); + vstream(out_v[sF]()(3)(1), res()(3)(1)); + vstream(out_v[sF]()(3)(2), res()(3)(2)); + }); + } + + static void MooeeKernel(int Nsite, + int Ls, + const FermionField& in, + FermionField& out, + const CloverDiagonalField& diagonal, + const CloverTriangleField& triangle) { +#if defined(GRID_CUDA) || defined(GRID_HIP) + MooeeKernel_gpu(Nsite, Ls, in, out, diagonal, triangle); +#else + MooeeKernel_cpu(Nsite, Ls, in, out, diagonal, triangle); +#endif + } + + static void Invert(const CloverDiagonalField& diagonal, + const CloverTriangleField& triangle, + CloverDiagonalField& diagonalInv, + CloverTriangleField& triangleInv) { + conformable(diagonal, diagonalInv); + conformable(triangle, triangleInv); + conformable(diagonal, triangle); + + diagonalInv.Checkerboard() = diagonal.Checkerboard(); + triangleInv.Checkerboard() = triangle.Checkerboard(); + + GridBase* grid = diagonal.Grid(); + + long lsites = grid->lSites(); + + typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal; + typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle; + + autoView(diagonal_v, diagonal, CpuRead); + autoView(triangle_v, triangle, CpuRead); + autoView(diagonalInv_v, diagonalInv, CpuWrite); + autoView(triangleInv_v, triangleInv, CpuWrite); + + thread_for(site, lsites, { // NOTE: Not on GPU because of Eigen & (peek/poke)LocalSite + Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc); + Eigen::MatrixXcd clover_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc); + + scalar_object_diagonal diagonal_tmp = Zero(); + scalar_object_diagonal diagonal_inv_tmp = Zero(); + scalar_object_triangle triangle_tmp = Zero(); + scalar_object_triangle triangle_inv_tmp = Zero(); + + Coordinate lcoor; + grid->LocalIndexToLocalCoor(site, lcoor); + + peekLocalSite(diagonal_tmp, diagonal_v, lcoor); + peekLocalSite(triangle_tmp, triangle_v, lcoor); + + // TODO: can we save time here by inverting the two 6x6 hermitian matrices separately? + for (long s_row=0;s_row 1 || s_row + s_col == 3) continue; + int block = s_row / Nhs; + int s_row_block = s_row % Nhs; + int s_col_block = s_col % Nhs; + for (long c_row=0;c_row(TensorRemove(diagonal_tmp()(block)(i))); + else + clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast(TensorRemove(triangle_elem(triangle_tmp, block, i, j))); + } + } + } + } + + clover_inv_eigen = clover_eigen.inverse(); + + for (long s_row=0;s_row 1 || s_row + s_col == 3) continue; + int block = s_row / Nhs; + int s_row_block = s_row % Nhs; + int s_col_block = s_col % Nhs; + for (long c_row=0;c_rowoSites(), 1, { + for(int s_row = 0; s_row < Ns; s_row++) { + for(int s_col = 0; s_col < Ns; s_col++) { + if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue; + int block = s_row / Nhs; + int s_row_block = s_row % Nhs; + int s_col_block = s_col % Nhs; + for(int c_row = 0; c_row < Nc; c_row++) { + for(int c_col = 0; c_col < Nc; c_col++) { + int i = s_row_block * Nc + c_row; + int j = s_col_block * Nc + c_col; + if(i == j) + diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col); + else if(i < j) + triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col); + else + continue; + } + } + } + } + }); + } + + + static void ConvertLayout(const CloverDiagonalField& diagonal, + const CloverTriangleField& triangle, + CloverField& full) { + conformable(full, diagonal); + conformable(full, triangle); + + full.Checkerboard() = diagonal.Checkerboard(); + + full = Zero(); + + autoView(diagonal_v, diagonal, AcceleratorRead); + autoView(triangle_v, triangle, AcceleratorRead); + autoView(full_v, full, AcceleratorWrite); + + // NOTE: this function cannot be 'private' since nvcc forbids this for kernels + accelerator_for(ss, full.Grid()->oSites(), 1, { + for(int s_row = 0; s_row < Ns; s_row++) { + for(int s_col = 0; s_col < Ns; s_col++) { + if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue; + int block = s_row / Nhs; + int s_row_block = s_row % Nhs; + int s_col_block = s_col % Nhs; + for(int c_row = 0; c_row < Nc; c_row++) { + for(int c_col = 0; c_col < Nc; c_col++) { + int i = s_row_block * Nc + c_row; + int j = s_col_block * Nc + c_col; + if(i == j) + full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i); + else + full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j); + } + } + } + } + }); + } + + static void ModifyBoundaries(CloverDiagonalField& diagonal, CloverTriangleField& triangle, RealD csw_t, RealD cF, RealD diag_mass) { + // Checks/grid + double t0 = usecond(); + conformable(diagonal, triangle); + GridBase* grid = diagonal.Grid(); + + // Determine the boundary coordinates/sites + double t1 = usecond(); + int t_dir = Nd - 1; + Lattice> t_coor(grid); + LatticeCoordinate(t_coor, t_dir); + int T = grid->GlobalDimensions()[t_dir]; + + // Set off-diagonal parts at boundary to zero -- OK + double t2 = usecond(); + CloverTriangleField zeroTriangle(grid); + zeroTriangle.Checkerboard() = triangle.Checkerboard(); + zeroTriangle = Zero(); + triangle = where(t_coor == 0, zeroTriangle, triangle); + triangle = where(t_coor == T-1, zeroTriangle, triangle); + + // Set diagonal to unity (scaled correctly) -- OK + double t3 = usecond(); + CloverDiagonalField tmp(grid); + tmp.Checkerboard() = diagonal.Checkerboard(); + tmp = -1.0 * csw_t + diag_mass; + diagonal = where(t_coor == 0, tmp, diagonal); + diagonal = where(t_coor == T-1, tmp, diagonal); + + // Correct values next to boundary + double t4 = usecond(); + if(cF != 1.0) { + tmp = cF - 1.0; + tmp += diagonal; + diagonal = where(t_coor == 1, tmp, diagonal); + diagonal = where(t_coor == T-2, tmp, diagonal); + } + + // Report timings + double t5 = usecond(); +#if 0 + std::cout << GridLogMessage << "CompactWilsonCloverHelpers::ModifyBoundaries timings:" + << " checks = " << (t1 - t0) / 1e6 + << ", coordinate = " << (t2 - t1) / 1e6 + << ", off-diag zero = " << (t3 - t2) / 1e6 + << ", diagonal unity = " << (t4 - t3) / 1e6 + << ", near-boundary = " << (t5 - t4) / 1e6 + << ", total = " << (t5 - t0) / 1e6 + << std::endl; +#endif + } + + template + static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) { + conformable(f, m); + auto grid = f.Grid(); + const int Nsite = grid->oSites(); + const int Nsimd = grid->Nsimd(); + autoView(f_v, f, AcceleratorWrite); + autoView(m_v, m, AcceleratorRead); + // NOTE: this function cannot be 'private' since nvcc forbids this for kernels + accelerator_for(ss, Nsite, Nsimd, { + coalescedWrite(f_v[ss], m_v(ss) * f_v(ss)); + }); + } + + template + static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) { + assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even); + assert(odd.Grid()->_isCheckerBoarded && odd.Checkerboard() == Odd); + assert(!full.Grid()->_isCheckerBoarded); + + GridBase* grid = full.Grid(); + int t_dir = Nd-1; + Lattice> t_coor(grid); + LatticeCoordinate(t_coor, t_dir); + int T = grid->GlobalDimensions()[t_dir]; + + MaskField zeroMask(grid); zeroMask = Zero(); + full = 1.0; + full = where(t_coor == 0, zeroMask, full); + full = where(t_coor == T-1, zeroMask, full); + + pickCheckerboard(Even, even, full); + pickCheckerboard(Odd, odd, full); + } +}; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/WilsonCloverTypes.h b/Grid/qcd/action/fermion/WilsonCloverTypes.h new file mode 100644 index 00000000..a5a95d93 --- /dev/null +++ b/Grid/qcd/action/fermion/WilsonCloverTypes.h @@ -0,0 +1,92 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/fermion/WilsonCloverTypes.h + + Copyright (C) 2021 - 2022 + + Author: Daniel Richtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#pragma once + +NAMESPACE_BEGIN(Grid); + +template +class WilsonCloverTypes { +public: + INHERIT_IMPL_TYPES(Impl); + + template using iImplClover = iScalar, Ns>>; + + typedef iImplClover SiteClover; + + typedef Lattice CloverField; +}; + +template +class CompactWilsonCloverTypes { +public: + INHERIT_IMPL_TYPES(Impl); + + static_assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3, "Wrong dimensions"); + + static constexpr int Nred = Nc * Nhs; // 6 + static constexpr int Nblock = Nhs; // 2 + static constexpr int Ndiagonal = Nred; // 6 + static constexpr int Ntriangle = (Nred - 1) * Nc; // 15 + + template using iImplCloverDiagonal = iScalar, Nblock>>; + template using iImplCloverTriangle = iScalar, Nblock>>; + + typedef iImplCloverDiagonal SiteCloverDiagonal; + typedef iImplCloverTriangle SiteCloverTriangle; + typedef iSinglet SiteMask; + + typedef Lattice CloverDiagonalField; + typedef Lattice CloverTriangleField; + typedef Lattice MaskField; +}; + +#define INHERIT_CLOVER_TYPES(Impl) \ + typedef typename WilsonCloverTypes::SiteClover SiteClover; \ + typedef typename WilsonCloverTypes::CloverField CloverField; + +#define INHERIT_COMPACT_CLOVER_TYPES(Impl) \ + typedef typename CompactWilsonCloverTypes::SiteCloverDiagonal SiteCloverDiagonal; \ + typedef typename CompactWilsonCloverTypes::SiteCloverTriangle SiteCloverTriangle; \ + typedef typename CompactWilsonCloverTypes::SiteMask SiteMask; \ + typedef typename CompactWilsonCloverTypes::CloverDiagonalField CloverDiagonalField; \ + typedef typename CompactWilsonCloverTypes::CloverTriangleField CloverTriangleField; \ + typedef typename CompactWilsonCloverTypes::MaskField MaskField; \ + /* ugly duplication but needed inside functionality classes */ \ + template using iImplCloverDiagonal = \ + iScalar::Ndiagonal>, CompactWilsonCloverTypes::Nblock>>; \ + template using iImplCloverTriangle = \ + iScalar::Ntriangle>, CompactWilsonCloverTypes::Nblock>>; + +#define INHERIT_COMPACT_CLOVER_SIZES(Impl) \ + static constexpr int Nred = CompactWilsonCloverTypes::Nred; \ + static constexpr int Nblock = CompactWilsonCloverTypes::Nblock; \ + static constexpr int Ndiagonal = CompactWilsonCloverTypes::Ndiagonal; \ + static constexpr int Ntriangle = CompactWilsonCloverTypes::Ntriangle; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h new file mode 100644 index 00000000..3dfcb133 --- /dev/null +++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h @@ -0,0 +1,363 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermionImplementation.h + + Copyright (C) 2017 - 2022 + + Author: paboyle + Author: Guido Cossu + Author: Daniel Richtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + +#include +#include +#include + +NAMESPACE_BEGIN(Grid); +template +CompactWilsonCloverFermion::CompactWilsonCloverFermion(GaugeField& _Umu, + GridCartesian& Fgrid, + GridRedBlackCartesian& Hgrid, + const RealD _mass, + const RealD _csw_r, + const RealD _csw_t, + const RealD _cF, + const WilsonAnisotropyCoefficients& clover_anisotropy, + const ImplParams& impl_p) + : WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy) + , csw_r(_csw_r) + , csw_t(_csw_t) + , cF(_cF) + , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0) + , Diagonal(&Fgrid), Triangle(&Fgrid) + , DiagonalEven(&Hgrid), TriangleEven(&Hgrid) + , DiagonalOdd(&Hgrid), TriangleOdd(&Hgrid) + , DiagonalInv(&Fgrid), TriangleInv(&Fgrid) + , DiagonalInvEven(&Hgrid), TriangleInvEven(&Hgrid) + , DiagonalInvOdd(&Hgrid), TriangleInvOdd(&Hgrid) + , Tmp(&Fgrid) + , BoundaryMask(&Fgrid) + , BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid) +{ + csw_r *= 0.5; + csw_t *= 0.5; + if (clover_anisotropy.isAnisotropic) + csw_r /= clover_anisotropy.xi_0; + + ImportGauge(_Umu); + if (open_boundaries) + CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd); +} + +template +void CompactWilsonCloverFermion::Dhop(const FermionField& in, FermionField& out, int dag) { + WilsonBase::Dhop(in, out, dag); + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::DhopOE(const FermionField& in, FermionField& out, int dag) { + WilsonBase::DhopOE(in, out, dag); + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::DhopEO(const FermionField& in, FermionField& out, int dag) { + WilsonBase::DhopEO(in, out, dag); + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) { + WilsonBase::DhopDir(in, out, dir, disp); + if(this->open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::DhopDirAll(const FermionField& in, std::vector& out) { + WilsonBase::DhopDirAll(in, out); + if(this->open_boundaries) { + for(auto& o : out) ApplyBoundaryMask(o); + } +} + +template +void CompactWilsonCloverFermion::M(const FermionField& in, FermionField& out) { + out.Checkerboard() = in.Checkerboard(); + WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc + Mooee(in, Tmp); + axpy(out, 1.0, out, Tmp); + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::Mdag(const FermionField& in, FermionField& out) { + out.Checkerboard() = in.Checkerboard(); + WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc + MooeeDag(in, Tmp); + axpy(out, 1.0, out, Tmp); + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::Meooe(const FermionField& in, FermionField& out) { + WilsonBase::Meooe(in, out); + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::MeooeDag(const FermionField& in, FermionField& out) { + WilsonBase::MeooeDag(in, out); + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::Mooee(const FermionField& in, FermionField& out) { + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + MooeeInternal(in, out, DiagonalOdd, TriangleOdd); + } else { + MooeeInternal(in, out, DiagonalEven, TriangleEven); + } + } else { + MooeeInternal(in, out, Diagonal, Triangle); + } + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::MooeeDag(const FermionField& in, FermionField& out) { + Mooee(in, out); // blocks are hermitian +} + +template +void CompactWilsonCloverFermion::MooeeInv(const FermionField& in, FermionField& out) { + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd); + } else { + MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven); + } + } else { + MooeeInternal(in, out, DiagonalInv, TriangleInv); + } + if(open_boundaries) ApplyBoundaryMask(out); +} + +template +void CompactWilsonCloverFermion::MooeeInvDag(const FermionField& in, FermionField& out) { + MooeeInv(in, out); // blocks are hermitian +} + +template +void CompactWilsonCloverFermion::Mdir(const FermionField& in, FermionField& out, int dir, int disp) { + DhopDir(in, out, dir, disp); +} + +template +void CompactWilsonCloverFermion::MdirAll(const FermionField& in, std::vector& out) { + DhopDirAll(in, out); +} + +template +void CompactWilsonCloverFermion::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) { + assert(!open_boundaries); // TODO check for changes required for open bc + + // NOTE: code copied from original clover term + conformable(X.Grid(), Y.Grid()); + conformable(X.Grid(), force.Grid()); + GaugeLinkField force_mu(force.Grid()), lambda(force.Grid()); + GaugeField clover_force(force.Grid()); + PropagatorField Lambda(force.Grid()); + + // Guido: Here we are hitting some performance issues: + // need to extract the components of the DoubledGaugeField + // for each call + // Possible solution + // Create a vector object to store them? (cons: wasting space) + std::vector U(Nd, this->Umu.Grid()); + + Impl::extractLinkField(U, this->Umu); + + force = Zero(); + // Derivative of the Wilson hopping term + this->DhopDeriv(force, X, Y, dag); + + /////////////////////////////////////////////////////////// + // Clover term derivative + /////////////////////////////////////////////////////////// + Impl::outerProductImpl(Lambda, X, Y); + //std::cout << "Lambda:" << Lambda << std::endl; + + Gamma::Algebra sigma[] = { + Gamma::Algebra::SigmaXY, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::MinusSigmaZT}; + + /* + sigma_{\mu \nu}= + | 0 sigma[0] sigma[1] sigma[2] | + | sigma[3] 0 sigma[4] sigma[5] | + | sigma[6] sigma[7] 0 sigma[8] | + | sigma[9] sigma[10] sigma[11] 0 | + */ + + int count = 0; + clover_force = Zero(); + for (int mu = 0; mu < 4; mu++) + { + force_mu = Zero(); + for (int nu = 0; nu < 4; nu++) + { + if (mu == nu) + continue; + + RealD factor; + if (nu == 4 || mu == 4) + { + factor = 2.0 * csw_t; + } + else + { + factor = 2.0 * csw_r; + } + PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked + Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok + force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu); // checked + count++; + } + + pokeLorentz(clover_force, U[mu] * force_mu, mu); + } + //clover_force *= csw; + force += clover_force; +} + +template +void CompactWilsonCloverFermion::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) { + assert(0); +} + +template +void CompactWilsonCloverFermion::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) { + assert(0); +} + +template +void CompactWilsonCloverFermion::MooeeInternal(const FermionField& in, + FermionField& out, + const CloverDiagonalField& diagonal, + const CloverTriangleField& triangle) { + assert(in.Checkerboard() == Odd || in.Checkerboard() == Even); + out.Checkerboard() = in.Checkerboard(); + conformable(in, out); + conformable(in, diagonal); + conformable(in, triangle); + + CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle); +} + +template +void CompactWilsonCloverFermion::ImportGauge(const GaugeField& _Umu) { + // NOTE: parts copied from original implementation + + // Import gauge into base class + double t0 = usecond(); + WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that + + // Initialize temporary variables + double t1 = usecond(); + conformable(_Umu.Grid(), this->GaugeGrid()); + GridBase* grid = _Umu.Grid(); + typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid); + CloverField TmpOriginal(grid); + + // Compute the field strength terms mu>nu + double t2 = usecond(); + WilsonLoops::FieldStrength(Bx, _Umu, Zdir, Ydir); + WilsonLoops::FieldStrength(By, _Umu, Zdir, Xdir); + WilsonLoops::FieldStrength(Bz, _Umu, Ydir, Xdir); + WilsonLoops::FieldStrength(Ex, _Umu, Tdir, Xdir); + WilsonLoops::FieldStrength(Ey, _Umu, Tdir, Ydir); + WilsonLoops::FieldStrength(Ez, _Umu, Tdir, Zdir); + + // Compute the Clover Operator acting on Colour and Spin + // multiply here by the clover coefficients for the anisotropy + double t3 = usecond(); + TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r; + TmpOriginal += Helpers::fillCloverXZ(By) * csw_r; + TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r; + TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t; + TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t; + TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t; + TmpOriginal += this->diag_mass; + + // Convert the data layout of the clover term + double t4 = usecond(); + CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle); + + // Possible modify the boundary values + double t5 = usecond(); + if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass); + + // Invert the clover term in the improved layout + double t6 = usecond(); + CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv); + + // Fill the remaining clover fields + double t7 = usecond(); + pickCheckerboard(Even, DiagonalEven, Diagonal); + pickCheckerboard(Even, TriangleEven, Triangle); + pickCheckerboard(Odd, DiagonalOdd, Diagonal); + pickCheckerboard(Odd, TriangleOdd, Triangle); + pickCheckerboard(Even, DiagonalInvEven, DiagonalInv); + pickCheckerboard(Even, TriangleInvEven, TriangleInv); + pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv); + pickCheckerboard(Odd, TriangleInvOdd, TriangleInv); + + // Report timings + double t8 = usecond(); +#if 0 + std::cout << GridLogMessage << "CompactWilsonCloverFermion::ImportGauge timings:" + << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 + << ", allocations = " << (t2 - t1) / 1e6 + << ", field strength = " << (t3 - t2) / 1e6 + << ", fill clover = " << (t4 - t3) / 1e6 + << ", convert = " << (t5 - t4) / 1e6 + << ", boundaries = " << (t6 - t5) / 1e6 + << ", inversions = " << (t7 - t6) / 1e6 + << ", pick cbs = " << (t8 - t7) / 1e6 + << ", total = " << (t8 - t0) / 1e6 + << std::endl; +#endif +} + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h index 3032a80c..e4d2e736 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h @@ -2,12 +2,13 @@ Grid physics library, www.github.com/paboyle/Grid - Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc + Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h - Copyright (C) 2017 + Copyright (C) 2017 - 2022 Author: paboyle Author: Guido Cossu + Author: Daniel Richtmann This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -33,6 +34,45 @@ NAMESPACE_BEGIN(Grid); +template +WilsonCloverFermion::WilsonCloverFermion(GaugeField& _Umu, + GridCartesian& Fgrid, + GridRedBlackCartesian& Hgrid, + const RealD _mass, + const RealD _csw_r, + const RealD _csw_t, + const WilsonAnisotropyCoefficients& clover_anisotropy, + const ImplParams& impl_p) + : WilsonFermion(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy) + , CloverTerm(&Fgrid) + , CloverTermInv(&Fgrid) + , CloverTermEven(&Hgrid) + , CloverTermOdd(&Hgrid) + , CloverTermInvEven(&Hgrid) + , CloverTermInvOdd(&Hgrid) + , CloverTermDagEven(&Hgrid) + , CloverTermDagOdd(&Hgrid) + , CloverTermInvDagEven(&Hgrid) + , CloverTermInvDagOdd(&Hgrid) { + assert(Nd == 4); // require 4 dimensions + + if(clover_anisotropy.isAnisotropic) { + csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0; + diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0); + } else { + csw_r = _csw_r * 0.5; + diag_mass = 4.0 + _mass; + } + csw_t = _csw_t * 0.5; + + if(csw_r == 0) + std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl; + if(csw_t == 0) + std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl; + + ImportGauge(_Umu); +} + // *NOT* EO template void WilsonCloverFermion::M(const FermionField &in, FermionField &out) @@ -67,10 +107,13 @@ void WilsonCloverFermion::Mdag(const FermionField &in, FermionField &out) template void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) { + double t0 = usecond(); WilsonFermion::ImportGauge(_Umu); + double t1 = usecond(); GridBase *grid = _Umu.Grid(); typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid); + double t2 = usecond(); // Compute the field strength terms mu>nu WilsonLoops::FieldStrength(Bx, _Umu, Zdir, Ydir); WilsonLoops::FieldStrength(By, _Umu, Zdir, Xdir); @@ -79,19 +122,22 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) WilsonLoops::FieldStrength(Ey, _Umu, Tdir, Ydir); WilsonLoops::FieldStrength(Ez, _Umu, Tdir, Zdir); + double t3 = usecond(); // Compute the Clover Operator acting on Colour and Spin // multiply here by the clover coefficients for the anisotropy - CloverTerm = fillCloverYZ(Bx) * csw_r; - CloverTerm += fillCloverXZ(By) * csw_r; - CloverTerm += fillCloverXY(Bz) * csw_r; - CloverTerm += fillCloverXT(Ex) * csw_t; - CloverTerm += fillCloverYT(Ey) * csw_t; - CloverTerm += fillCloverZT(Ez) * csw_t; + CloverTerm = Helpers::fillCloverYZ(Bx) * csw_r; + CloverTerm += Helpers::fillCloverXZ(By) * csw_r; + CloverTerm += Helpers::fillCloverXY(Bz) * csw_r; + CloverTerm += Helpers::fillCloverXT(Ex) * csw_t; + CloverTerm += Helpers::fillCloverYT(Ey) * csw_t; + CloverTerm += Helpers::fillCloverZT(Ez) * csw_t; CloverTerm += diag_mass; + double t4 = usecond(); int lvol = _Umu.Grid()->lSites(); int DimRep = Impl::Dimension; + double t5 = usecond(); { autoView(CTv,CloverTerm,CpuRead); autoView(CTIv,CloverTermInv,CpuWrite); @@ -100,7 +146,7 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) grid->LocalIndexToLocalCoor(site, lcoor); Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); - typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); + typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero(); peekLocalSite(Qx, CTv, lcoor); //if (csw!=0){ for (int j = 0; j < Ns; j++) @@ -125,6 +171,7 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) }); } + double t6 = usecond(); // Separate the even and odd parts pickCheckerboard(Even, CloverTermEven, CloverTerm); pickCheckerboard(Odd, CloverTermOdd, CloverTerm); @@ -137,6 +184,20 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv)); pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv)); + double t7 = usecond(); + +#if 0 + std::cout << GridLogMessage << "WilsonCloverFermion::ImportGauge timings:" + << " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 + << ", allocations = " << (t2 - t1) / 1e6 + << ", field strength = " << (t3 - t2) / 1e6 + << ", fill clover = " << (t4 - t3) / 1e6 + << ", misc = " << (t5 - t4) / 1e6 + << ", inversions = " << (t6 - t5) / 1e6 + << ", pick cbs = " << (t7 - t6) / 1e6 + << ", total = " << (t7 - t0) / 1e6 + << std::endl; +#endif } template @@ -167,7 +228,7 @@ template void WilsonCloverFermion::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv) { out.Checkerboard() = in.Checkerboard(); - CloverFieldType *Clover; + CloverField *Clover; assert(in.Checkerboard() == Odd || in.Checkerboard() == Even); if (dag) @@ -182,12 +243,12 @@ void WilsonCloverFermion::MooeeInternal(const FermionField &in, FermionFie { Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven; } - out = *Clover * in; + Helpers::multCloverField(out, *Clover, in); } else { Clover = (inv) ? &CloverTermInv : &CloverTerm; - out = adj(*Clover) * in; + Helpers::multCloverField(out, *Clover, in); // don't bother with adj, hermitian anyway } } else @@ -205,18 +266,98 @@ void WilsonCloverFermion::MooeeInternal(const FermionField &in, FermionFie // std::cout << "Calling clover term Even" << std::endl; Clover = (inv) ? &CloverTermInvEven : &CloverTermEven; } - out = *Clover * in; + Helpers::multCloverField(out, *Clover, in); // std::cout << GridLogMessage << "*Clover.Checkerboard() " << (*Clover).Checkerboard() << std::endl; } else { Clover = (inv) ? &CloverTermInv : &CloverTerm; - out = *Clover * in; + Helpers::multCloverField(out, *Clover, in); } } - } // MooeeInternal +// Derivative parts unpreconditioned pseudofermions +template +void WilsonCloverFermion::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag) +{ + conformable(X.Grid(), Y.Grid()); + conformable(X.Grid(), force.Grid()); + GaugeLinkField force_mu(force.Grid()), lambda(force.Grid()); + GaugeField clover_force(force.Grid()); + PropagatorField Lambda(force.Grid()); + + // Guido: Here we are hitting some performance issues: + // need to extract the components of the DoubledGaugeField + // for each call + // Possible solution + // Create a vector object to store them? (cons: wasting space) + std::vector U(Nd, this->Umu.Grid()); + + Impl::extractLinkField(U, this->Umu); + + force = Zero(); + // Derivative of the Wilson hopping term + this->DhopDeriv(force, X, Y, dag); + + /////////////////////////////////////////////////////////// + // Clover term derivative + /////////////////////////////////////////////////////////// + Impl::outerProductImpl(Lambda, X, Y); + //std::cout << "Lambda:" << Lambda << std::endl; + + Gamma::Algebra sigma[] = { + Gamma::Algebra::SigmaXY, + Gamma::Algebra::SigmaXZ, + Gamma::Algebra::SigmaXT, + Gamma::Algebra::MinusSigmaXY, + Gamma::Algebra::SigmaYZ, + Gamma::Algebra::SigmaYT, + Gamma::Algebra::MinusSigmaXZ, + Gamma::Algebra::MinusSigmaYZ, + Gamma::Algebra::SigmaZT, + Gamma::Algebra::MinusSigmaXT, + Gamma::Algebra::MinusSigmaYT, + Gamma::Algebra::MinusSigmaZT}; + + /* + sigma_{\mu \nu}= + | 0 sigma[0] sigma[1] sigma[2] | + | sigma[3] 0 sigma[4] sigma[5] | + | sigma[6] sigma[7] 0 sigma[8] | + | sigma[9] sigma[10] sigma[11] 0 | + */ + + int count = 0; + clover_force = Zero(); + for (int mu = 0; mu < 4; mu++) + { + force_mu = Zero(); + for (int nu = 0; nu < 4; nu++) + { + if (mu == nu) + continue; + + RealD factor; + if (nu == 4 || mu == 4) + { + factor = 2.0 * csw_t; + } + else + { + factor = 2.0 * csw_r; + } + PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked + Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok + force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu); // checked + count++; + } + + pokeLorentz(clover_force, U[mu] * force_mu, mu); + } + //clover_force *= csw; + force += clover_force; +} // Derivative parts template diff --git a/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master b/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master new file mode 100644 index 00000000..7c91c252 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master @@ -0,0 +1,41 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/ qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation.cc.master + + Copyright (C) 2017 - 2022 + + Author: paboyle + Author: Guido Cossu + Author: Daniel Richtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + +#include +#include +#include +#include + +NAMESPACE_BEGIN(Grid); + +#include "impl.h" +template class CompactWilsonCloverFermion; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplD/CompactWilsonCloverFermionInstantiationGparityWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplD/CompactWilsonCloverFermionInstantiationGparityWilsonImplD.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplD/CompactWilsonCloverFermionInstantiationGparityWilsonImplD.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplF/CompactWilsonCloverFermionInstantiationGparityWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplF/CompactWilsonCloverFermionInstantiationGparityWilsonImplF.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplF/CompactWilsonCloverFermionInstantiationGparityWilsonImplF.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/CompactWilsonCloverFermionInstantiationWilsonAdjImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/CompactWilsonCloverFermionInstantiationWilsonAdjImplD.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplD/CompactWilsonCloverFermionInstantiationWilsonAdjImplD.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/CompactWilsonCloverFermionInstantiationWilsonAdjImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/CompactWilsonCloverFermionInstantiationWilsonAdjImplF.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonAdjImplF/CompactWilsonCloverFermionInstantiationWilsonAdjImplF.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermionInstantiationWilsonImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermionInstantiationWilsonImplD.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplD/CompactWilsonCloverFermionInstantiationWilsonImplD.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermionInstantiationWilsonImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermionInstantiationWilsonImplF.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonImplF/CompactWilsonCloverFermionInstantiationWilsonImplF.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplD.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplD.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplF.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexAntiSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexAntiSymmetricImplF.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplD.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplD.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplD/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplD.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplF.cc b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplF.cc new file mode 120000 index 00000000..a739bb99 --- /dev/null +++ b/Grid/qcd/action/fermion/instantiation/WilsonTwoIndexSymmetricImplF/CompactWilsonCloverFermionInstantiationWilsonTwoIndexSymmetricImplF.cc @@ -0,0 +1 @@ +../CompactWilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh index d7553cdb..a09fd420 100755 --- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh +++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh @@ -40,7 +40,7 @@ EOF done -CC_LIST="WilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation" +CC_LIST="WilsonCloverFermionInstantiation CompactWilsonCloverFermionInstantiation WilsonFermionInstantiation WilsonKernelsInstantiation WilsonTMFermionInstantiation" for impl in $WILSON_IMPL_LIST do diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 697f3ac1..6992129e 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -167,6 +167,13 @@ void GridCmdOptionInt(std::string &str,int & val) return; } +void GridCmdOptionFloat(std::string &str,float & val) +{ + std::stringstream ss(str); + ss>>val; + return; +} + void GridParseLayout(char **argv,int argc, Coordinate &latt_c, diff --git a/Grid/util/Init.h b/Grid/util/Init.h index 4eb8f06c..585660a1 100644 --- a/Grid/util/Init.h +++ b/Grid/util/Init.h @@ -57,6 +57,7 @@ void GridCmdOptionCSL(std::string str,std::vector & vec); template void GridCmdOptionIntVector(const std::string &str,VectorInt & vec); void GridCmdOptionInt(std::string &str,int & val); +void GridCmdOptionFloat(std::string &str,float & val); void GridParseLayout(char **argv,int argc, diff --git a/tests/core/Test_compact_wilson_clover_speedup.cc b/tests/core/Test_compact_wilson_clover_speedup.cc new file mode 100644 index 00000000..83e3da1f --- /dev/null +++ b/tests/core/Test_compact_wilson_clover_speedup.cc @@ -0,0 +1,226 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/core/Test_compact_wilson_clover_speedup.cc + + Copyright (C) 2020 - 2022 + + Author: Daniel Richtmann + Author: Nils Meyer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#include + +using namespace Grid; + +NAMESPACE_BEGIN(CommandlineHelpers); + +static bool checkPresent(int* argc, char*** argv, const std::string& option) { + return GridCmdOptionExists(*argv, *argv + *argc, option); +} + +static std::string getContent(int* argc, char*** argv, const std::string& option) { + return GridCmdOptionPayload(*argv, *argv + *argc, option); +} + +static int readInt(int* argc, char*** argv, std::string&& option, int defaultValue) { + std::string arg; + int ret = defaultValue; + if(checkPresent(argc, argv, option)) { + arg = getContent(argc, argv, option); + GridCmdOptionInt(arg, ret); + } + return ret; +} + +static float readFloat(int* argc, char*** argv, std::string&& option, float defaultValue) { + std::string arg; + float ret = defaultValue; + if(checkPresent(argc, argv, option)) { + arg = getContent(argc, argv, option); + GridCmdOptionFloat(arg, ret); + } + return ret; +} + +NAMESPACE_END(CommandlineHelpers); + + +#define _grid_printf(LOGGER, ...) \ + { \ + if((LOGGER).isActive()) { /* this makes it safe to put, e.g., norm2 in the calling code w.r.t. performance */ \ + char _printf_buf[1024]; \ + std::sprintf(_printf_buf, __VA_ARGS__); \ + std::cout << (LOGGER) << _printf_buf; \ + fflush(stdout); \ + } \ + } +#define grid_printf_msg(...) _grid_printf(GridLogMessage, __VA_ARGS__) + + +template +bool resultsAgree(const Field& ref, const Field& res, const std::string& name) { + RealD checkTolerance = (getPrecision::value == 2) ? 1e-15 : 1e-7; + Field diff(ref.Grid()); + diff = ref - res; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage + << "norm2(reference), norm2(" << name << "), abs. deviation, rel. deviation: " << norm2(ref) << " " + << norm2(res) << " " << absDev << " " << relDev << " -> check " + << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + + return relDev <= checkTolerance; +} + + +template +void runBenchmark(int* argc, char*** argv) { + // read from command line + const int nIter = CommandlineHelpers::readInt( argc, argv, "--niter", 1000); + const RealD mass = CommandlineHelpers::readFloat( argc, argv, "--mass", 0.5); + const RealD csw = CommandlineHelpers::readFloat( argc, argv, "--csw", 1.0); + const RealD cF = CommandlineHelpers::readFloat( argc, argv, "--cF", 1.0); + const bool antiPeriodic = CommandlineHelpers::checkPresent(argc, argv, "--antiperiodic"); + + // precision + static_assert(getPrecision::value == 2 || getPrecision::value == 1, "Incorrect precision"); // double or single + std::string precision = (getPrecision::value == 2 ? "double" : "single"); + + // setup grids + GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vCoeff_t::Nsimd()), GridDefaultMpi()); + GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + // clang-format on + + // setup rng + std::vector seeds({1, 2, 3, 4}); + GridParallelRNG pRNG(UGrid); + pRNG.SeedFixedIntegers(seeds); + + // type definitions + typedef WilsonImpl WImpl; + typedef WilsonCloverFermion WilsonCloverOperator; + typedef CompactWilsonCloverFermion CompactWilsonCloverOperator; + typedef typename WilsonCloverOperator::FermionField Fermion; + typedef typename WilsonCloverOperator::GaugeField Gauge; + + // setup fields + Fermion src(UGrid); random(pRNG, src); + Fermion ref(UGrid); ref = Zero(); + Fermion res(UGrid); res = Zero(); + Fermion hop(UGrid); hop = Zero(); + Fermion diff(UGrid); diff = Zero(); + Gauge Umu(UGrid); SU3::HotConfiguration(pRNG, Umu); + + // setup boundary phases + typename WilsonCloverOperator::ImplParams implParams; + std::vector boundary_phases(Nd, 1.); + if(antiPeriodic) boundary_phases[Nd-1] = -1.; + implParams.boundary_phases = boundary_phases; + WilsonAnisotropyCoefficients anisParams; + + // misc stuff needed for benchmarks + double volume=1.0; for(int mu=0; mu_fdimensions[mu]; + + // setup fermion operators + WilsonCloverOperator Dwc( Umu, *UGrid, *UrbGrid, mass, csw, csw, anisParams, implParams); + CompactWilsonCloverOperator Dwc_compact(Umu, *UGrid, *UrbGrid, mass, csw, csw, cF, anisParams, implParams); + + // now test the conversions + typename CompactWilsonCloverOperator::CloverField tmp_ref(UGrid); tmp_ref = Dwc.CloverTerm; + typename CompactWilsonCloverOperator::CloverField tmp_res(UGrid); tmp_res = Zero(); + typename CompactWilsonCloverOperator::CloverField tmp_diff(UGrid); tmp_diff = Zero(); + typename CompactWilsonCloverOperator::CloverDiagonalField diagonal(UGrid); diagonal = Zero(); + typename CompactWilsonCloverOperator::CloverTriangleField triangle(UGrid); diagonal = Zero(); + CompactWilsonCloverOperator::CompactHelpers::ConvertLayout(tmp_ref, diagonal, triangle); + CompactWilsonCloverOperator::CompactHelpers::ConvertLayout(diagonal, triangle, tmp_res); + tmp_diff = tmp_ref - tmp_res; + std::cout << GridLogMessage << "conversion: ref, res, diff, eps" + << " " << norm2(tmp_ref) + << " " << norm2(tmp_res) + << " " << norm2(tmp_diff) + << " " << norm2(tmp_diff) / norm2(tmp_ref) + << std::endl; + + // performance per site (use minimal values necessary) + double hop_flop_per_site = 1320; // Rich's Talk + what Peter uses + double hop_byte_per_site = (8 * 9 + 9 * 12) * 2 * getPrecision::value * 4; + double clov_flop_per_site = 504; // Rich's Talk and 1412.2629 + double clov_byte_per_site = (2 * 18 + 12 + 12) * 2 * getPrecision::value * 4; + double clov_flop_per_site_performed = 1128; + double clov_byte_per_site_performed = (12 * 12 + 12 + 12) * 2 * getPrecision::value * 4; + + // total performance numbers + double hop_gflop_total = volume * nIter * hop_flop_per_site / 1e9; + double hop_gbyte_total = volume * nIter * hop_byte_per_site / 1e9; + double clov_gflop_total = volume * nIter * clov_flop_per_site / 1e9; + double clov_gbyte_total = volume * nIter * clov_byte_per_site / 1e9; + double clov_gflop_performed_total = volume * nIter * clov_flop_per_site_performed / 1e9; + double clov_gbyte_performed_total = volume * nIter * clov_byte_per_site_performed / 1e9; + + // warmup + measure dhop + for(auto n : {1, 2, 3, 4, 5}) Dwc.Dhop(src, hop, 0); + double t0 = usecond(); + for(int n = 0; n < nIter; n++) Dwc.Dhop(src, hop, 0); + double t1 = usecond(); + double secs_hop = (t1-t0)/1e6; + grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n", + "hop", precision.c_str(), secs_hop, hop_gflop_total/secs_hop, hop_gbyte_total/secs_hop, 0.0, secs_hop/secs_hop); + +#define BENCH_CLOVER_KERNEL(KERNEL) { \ + /* warmup + measure reference clover */ \ + for(auto n : {1, 2, 3, 4, 5}) Dwc.KERNEL(src, ref); \ + double t2 = usecond(); \ + for(int n = 0; n < nIter; n++) Dwc.KERNEL(src, ref); \ + double t3 = usecond(); \ + double secs_ref = (t3-t2)/1e6; \ + grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n", \ + "reference_"#KERNEL, precision.c_str(), secs_ref, clov_gflop_total/secs_ref, clov_gbyte_total/secs_ref, secs_ref/secs_ref, secs_ref/secs_hop); \ + grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n", /* to see how well the ET performs */ \ + "reference_"#KERNEL"_performed", precision.c_str(), secs_ref, clov_gflop_performed_total/secs_ref, clov_gbyte_performed_total/secs_ref, secs_ref/secs_ref, secs_ref/secs_hop); \ +\ + /* warmup + measure compact clover */ \ + for(auto n : {1, 2, 3, 4, 5}) Dwc_compact.KERNEL(src, res); \ + double t4 = usecond(); \ + for(int n = 0; n < nIter; n++) Dwc_compact.KERNEL(src, res); \ + double t5 = usecond(); \ + double secs_res = (t5-t4)/1e6; \ + grid_printf_msg("Performance(%35s, %s): %2.4f s, %6.0f GFlop/s, %6.0f GByte/s, speedup vs ref = %.2f, fraction of hop = %.2f\n", \ + "compact_"#KERNEL, precision.c_str(), secs_res, clov_gflop_total/secs_res, clov_gbyte_total/secs_res, secs_ref/secs_res, secs_res/secs_hop); \ + assert(resultsAgree(ref, res, #KERNEL)); \ +} + + BENCH_CLOVER_KERNEL(Mooee); + BENCH_CLOVER_KERNEL(MooeeDag); + BENCH_CLOVER_KERNEL(MooeeInv); + BENCH_CLOVER_KERNEL(MooeeInvDag); + + grid_printf_msg("finalize %s\n", precision.c_str()); +} + +int main(int argc, char** argv) { + Grid_init(&argc, &argv); + + runBenchmark(&argc, &argv); + runBenchmark(&argc, &argv); + + Grid_finalize(); +}