From 5f75735dab3fcad328aeb0d92f9e5dcffd0aec53 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Thu, 6 Apr 2023 18:25:05 +0200 Subject: [PATCH 001/114] Add M and Mdag to WilsonTMFermion --- Grid/qcd/action/fermion/WilsonTMFermion.h | 4 +++- .../WilsonTMFermionImplementation.h | 20 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/Grid/qcd/action/fermion/WilsonTMFermion.h b/Grid/qcd/action/fermion/WilsonTMFermion.h index 12c4b71a..cca716a0 100644 --- a/Grid/qcd/action/fermion/WilsonTMFermion.h +++ b/Grid/qcd/action/fermion/WilsonTMFermion.h @@ -63,7 +63,9 @@ public: virtual void MooeeDag(const FermionField &in, FermionField &out) ; virtual void MooeeInv(const FermionField &in, FermionField &out) ; virtual void MooeeInvDag(const FermionField &in, FermionField &out) ; - + virtual void M(const FermionField &in, FermionField &out) ; + virtual void Mdag(const FermionField &in, FermionField &out) ; + private: RealD mu; // TwistedMass parameter diff --git a/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h index 9a1a152c..12771c29 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h @@ -93,5 +93,25 @@ void WilsonTMFermion::MooeeInvDag(const FermionField &in, FermionField &ou RealD b = tm /sq; axpibg5x(out,in,a,b); } +template +void WilsonTMFermion::M(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + this->Dhop(in, out, DaggerNo); + FermionField tmp(out.Grid()); + RealD a = 4.0+this->mass; + RealD b = this->mu; + axpibg5x(tmp,in,a,b); + axpy(out, 1.0, tmp, out); +} +template +void WilsonTMFermion::Mdag(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + this->Dhop(in, out, DaggerYes); + FermionField tmp(out.Grid()); + RealD a = 4.0+this->mass; + RealD b = -this->mu; + axpibg5x(tmp,in,a,b); + axpy(out, 1.0, tmp, out); +} NAMESPACE_END(Grid); From bf91778550218008230e56aff7c4c2f6b9574589 Mon Sep 17 00:00:00 2001 From: david clarke Date: Wed, 17 May 2023 15:15:54 -0600 Subject: [PATCH 002/114] verbose plaquette example; fat link test frame --- examples/Example_plaquette.cc | 180 ++++++++++++++++++++++++++++++++ tests/smearing/Test_fatLinks.cc | 40 +++++++ 2 files changed, 220 insertions(+) create mode 100644 examples/Example_plaquette.cc create mode 100644 tests/smearing/Test_fatLinks.cc diff --git a/examples/Example_plaquette.cc b/examples/Example_plaquette.cc new file mode 100644 index 00000000..2aec72ff --- /dev/null +++ b/examples/Example_plaquette.cc @@ -0,0 +1,180 @@ +/* + * Example_plaquette.cc + * + * D. Clarke + * + * Here I just want to create an incredibly simple main to get started with GRID and get used + * to its syntax. If the reader is like me, they vaguely understand something about lattice coding, + * they don't know a ton of C++, don't know much of the fine details, and certainly know nothing about GRID. + * + * Once you've made a new executable, like this one, you can bootstrap.sh again. At this point, + * the code should be able to find your new executable. You can tell that bootstrap.sh worked by + * having a look at Make.inc. You should see your executable inside there. + * + * Warning: This code illustrative only, not well tested, and not meant for production use. The best + * way to read this code is to start at the main. 
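+ * A hypothetical invocation once this is built (illustrative only; the binary name
+ * and flags follow the command-line options documented in main() below):
+ * ./Example_plaquette --grid 8.8.8.4 --mpi 1.1.1.1 --threads 4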
+ * + */ + + +// All your mains should have this +#include +using namespace Grid; + + +// This copies what already exists in WilsonLoops.h. The point here is to be pedagogical and explain in +// detail what everything does so we can see how GRID works. +template class WLoops : public Gimpl { +public: + // Gimpl seems to be an arbitrary class. Within this class, it is expected that certain types are + // already defined, things like Scalar and Field. This macro includes a bunch of #typedefs that + // implement this equivalence at compile time. + // WARNING: The first time you include this or take it out, the compile time will increase a lot. + INHERIT_GIMPL_TYPES(Gimpl); + + // Some example Gimpls can be found in GaugeImplementations.h, at the bottom. These are in turn built + // out of GaugeImplTypes, which can be found in GaugeImplTypes.h. The GaugeImplTypes contain the base + // field/vector/link/whatever types. These inherit from iScalar, iVector, and iMatrix objects, which + // are sort of the building blocks for gerenal math objects. The "i" at the beginning of these names + // indicates that they should be for internal use only. It seems like these base types have the + // acceleration, e.g. SIMD or GPU or what-have-you, abstracted away. How you accelerate these things + // appears to be controlled through a template parameter called vtype. + + // The general math/physics objects, such as a color matrix, are built up by nesting these objects. + // For instance a general color matrix has two color indices, so it's built up like + // iScalar &U, const int mu, const int nu) { + // These CovShift calls seem to carry out the multiplication already. A positive shift moves the lattice + // site x_mu = 1 in the RHS to x_mu = 0 in the result. + plaq = Gimpl::CovShiftForward(U[mu],mu, + Gimpl::CovShiftForward(U[nu],nu, + Gimpl::CovShiftBackward(U[mu],mu, + Gimpl::CovShiftIdentityBackward(U[nu], nu)))); + } + + // tr U_mu_nu(x) + static void traceDirPlaquette(ComplexField &plaq, const std::vector &U, const int mu, const int nu) { + // This .Grid() syntax seems to get the pointer to the GridBase. Apparently this is needed as argument + // to instantiate a Lattice object. + GaugeMat sp(U[0].Grid()); + dirPlaquette(sp, U, mu, nu); + plaq = trace(sp); + } + + // sum_mu_nu tr U_mu_nu(x) + static void sitePlaquette(ComplexField &Plaq, const std::vector &U) { + ComplexField sitePlaq(U[0].Grid()); + Plaq = Zero(); + // Nd=4 and Nc=3 are set as global constants in QCD.h + for (int mu = 1; mu < Nd; mu++) { + for (int nu = 0; nu < mu; nu++) { + traceDirPlaquette(sitePlaq, U, mu, nu); + Plaq = Plaq + sitePlaq; + } + } + } + + // sum_mu_nu_x Re tr U_mu_nu(x) + static RealD sumPlaquette(const GaugeLorentz &Umu) { + std::vector U(Nd, Umu.Grid()); + for (int mu = 0; mu < Nd; mu++) { + // Umu is a GaugeLorentz object, and as such has a non-trivial Lorentz index. We can + // access the element in the mu Lorentz index with this PeekIndex syntax. + U[mu] = PeekIndex(Umu, mu); + } + ComplexField Plaq(Umu.Grid()); + sitePlaquette(Plaq, U); + // I guess this should be the line that sums over all space-time sites. + auto Tp = sum(Plaq); + // Until now, we have been working with objects inside the tensor nest. This TensorRemove gets + // rid of the tensor nest to return whatever is inside. 
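+ // Illustration (my sketch, not part of the original code): for a LatticeComplex
+ // the summed object is a nested singlet, so schematically
+ // iScalar<iScalar<iScalar<ComplexD>>> Tp = sum(Plaq);
+ // ComplexD c = TensorRemove(Tp); // strips the wrappers, value unchanged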
+ auto p = TensorRemove(Tp); + return p.real(); + } + + // < Re tr U_mu_nu(x) > + static RealD avgPlaquette(const GaugeLorentz &Umu) { + // Real double type + RealD sumplaq = sumPlaquette(Umu); + // gSites() is the number of global sites. there is also lSites() for local sites. + double vol = Umu.Grid()->gSites(); + // The number of orientations. 4*3/2=6 for Nd=4, as known. + double faces = (1.0 * Nd * (Nd - 1)) / 2.0; + return sumplaq / vol / faces / Nc; + } +}; + + +// Next we show an example of how to construct an input parameter class. We first inherit +// from Serializable. Then all class data members have to be defined using the +// GRID_SERIALIZABLE_CLASS_MEMBERS macro. This variadic macro allows for arbitrarily many +// class data members. In the below case, we make a parameter file holding the configuration +// name. Here, it expects the name to be labeled with "conf_name" in the configuration file. +struct ConfParameters: Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS( + ConfParameters, + std::string, conf_name); + + template + ConfParameters(Reader& Reader){ + // If we are reading an XML file, it should be structured like: + // + // + // l20t20b06498a_nersc.302500 + // + // + read(Reader, "parameters", *this); + } +}; + + + +// This syntax lets you pass command line arguments to main. An asterisk means that what follows is +// a pointer. Two asterisks means what follows is a pointer to an array. +int main (int argc, char **argv) +{ + // This initializes Grid. Some command line options include + // --mpi n.n.n.n + // --threads n + // --grid n.n.n.n + Grid_init(&argc, &argv); + + // This is where you would specify a custom lattice size, if not from the command line. + Coordinate simd_layout = GridDefaultSimd(4,vComplex::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + Coordinate latt_size = GridDefaultLatt(); + + // Instantiate the Grid on which everything will be built. + GridCartesian spacetime(latt_size,simd_layout,mpi_layout); + + // The PeriodicGimplD type is what you want for gauge matrices. There is also a LatticeGaugeFieldD + // type that you can use, which will work perfectly with what follows. + PeriodicGimplD::Field U(&spacetime); + + // Here we read in the parameter file params.json to get conf_name. The last argument is what the + // top organizational level is called in the param file. + XmlReader Reader("params.xml",false, "grid"); + ConfParameters param(Reader); + + // Load a lattice from SIMULATeQCD into U. SIMULATeQCD finds plaquette = 0.6381995717 + FieldMetaData header; + NerscIO::readConfiguration(U, header, param.conf_name); + + // Let's see what we find. + RealD plaq = WLoops::avgPlaquette(U); + + // This is how you make log messages. + std::cout << GridLogMessage << std::setprecision(std::numeric_limits::digits10 + 1) << "Plaquette = " << plaq << std::endl; + + // To wrap things up. + Grid_finalize(); +} \ No newline at end of file diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc new file mode 100644 index 00000000..4f9d608d --- /dev/null +++ b/tests/smearing/Test_fatLinks.cc @@ -0,0 +1,40 @@ +/* + * Test_fatLinks.cc + * + * D. Clarke + * + * Test the various constructs used to make fat links. 
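+ * (An aside of mine, not in the original: "fat" here means each thin link U_mu(x)
+ * gets replaced by a weighted combination of itself and staple-shaped paths,
+ * which is the smearing the later HISQ code implements.)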
+ * + */ + + +#include +using namespace Grid; + +template class : public Gimpl { +public: + + INHERIT_GIMPL_TYPES(Gimpl); + + typedef typename Gimpl::GaugeLinkField GaugeMat; + typedef typename Gimpl::GaugeField GaugeLorentz; + + static void staple(GaugeMat &plaq, const std::vector &U, const int mu, const int nu) { + } + +} + +int main (int argc, char **argv) +{ + Grid_init(&argc, &argv); + + Coordinate simd_layout = GridDefaultSimd(4,vComplex::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + Coordinate latt_size = GridDefaultLatt(); + + GridCartesian spacetime(latt_size,simd_layout,mpi_layout); + + PeriodicGimplD::Field U(&spacetime); + + Grid_finalize(); +} \ No newline at end of file From da9cbfc7cc4c3954e3064210e4d801a9c48f90fe Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Fri, 19 May 2023 20:22:20 +0200 Subject: [PATCH 003/114] Suppress BuildSurfaceList verbosity in Stencil.h --- Grid/stencil/Stencil.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 40f224e6..760657ae 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -705,7 +705,7 @@ public: } } } - std::cout << "BuildSurfaceList size is "< Date: Sun, 21 May 2023 04:33:20 -0600 Subject: [PATCH 004/114] 3-link test at least gives an answer --- examples/Example_plaquette.cc | 13 +-- tests/debug/Test_padded_cell.cc | 104 +++++++++++++--------- tests/smearing/Test_fatLinks.cc | 151 ++++++++++++++++++++++++++++---- 3 files changed, 202 insertions(+), 66 deletions(-) diff --git a/examples/Example_plaquette.cc b/examples/Example_plaquette.cc index 2aec72ff..17de4762 100644 --- a/examples/Example_plaquette.cc +++ b/examples/Example_plaquette.cc @@ -148,21 +148,22 @@ int main (int argc, char **argv) // --grid n.n.n.n Grid_init(&argc, &argv); - // This is where you would specify a custom lattice size, if not from the command line. - Coordinate simd_layout = GridDefaultSimd(4,vComplex::Nsimd()); + // This is where you would specify a custom lattice size, if not from the command line. Here + // Nd is a global quantity that is currently set to 4. + Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); Coordinate latt_size = GridDefaultLatt(); - // Instantiate the Grid on which everything will be built. - GridCartesian spacetime(latt_size,simd_layout,mpi_layout); + // Instantiate the spacetime Grid on which everything will be built. + GridCartesian GRID(latt_size,simd_layout,mpi_layout); // The PeriodicGimplD type is what you want for gauge matrices. There is also a LatticeGaugeFieldD // type that you can use, which will work perfectly with what follows. - PeriodicGimplD::Field U(&spacetime); + PeriodicGimplD::Field U(&GRID); // Here we read in the parameter file params.json to get conf_name. The last argument is what the // top organizational level is called in the param file. - XmlReader Reader("params.xml",false, "grid"); + XmlReader Reader("Example_plaquette.xml",false, "grid"); ConfParameters param(Reader); // Load a lattice from SIMULATeQCD into U. 
SIMULATeQCD finds plaquette = 0.6381995717 diff --git a/tests/debug/Test_padded_cell.cc b/tests/debug/Test_padded_cell.cc index 4fb461fe..f110df46 100644 --- a/tests/debug/Test_padded_cell.cc +++ b/tests/debug/Test_padded_cell.cc @@ -32,6 +32,7 @@ Author: Peter Boyle using namespace std; using namespace Grid; +// This is to optimize the SIMD template void gpermute(vobj & inout,int perm){ vobj tmp=inout; if (perm & 0x1 ) { permute(inout,tmp,0); tmp=inout;} @@ -39,7 +40,8 @@ template void gpermute(vobj & inout,int perm){ if (perm & 0x4 ) { permute(inout,tmp,2); tmp=inout;} if (perm & 0x8 ) { permute(inout,tmp,3); tmp=inout;} } - + + int main (int argc, char ** argv) { Grid_init(&argc,&argv); @@ -47,20 +49,21 @@ int main (int argc, char ** argv) Coordinate latt_size = GridDefaultLatt(); Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); - std::cout << " mpi "<({45,12,81,9})); LatticeGaugeField Umu(&GRID); - + pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); SU::HotConfiguration(pRNG,Umu); Real plaq=WilsonLoops::avgPlaquette(Umu); LatticeComplex trplaq(&GRID); + // Store Umu in U. Peek/Poke mean respectively getElement/setElement. std::vector U(Nd, Umu.Grid()); for (int mu = 0; mu < Nd; mu++) { U[mu] = PeekIndex(Umu, mu); @@ -70,9 +73,7 @@ int main (int argc, char ** argv) LatticeComplex cplaq(&GRID); cplaq=Zero(); - ///////////////////////////////////////////////// // Create a padded cell of extra padding depth=1 - ///////////////////////////////////////////////// int depth = 1; PaddedCell Ghost(depth,&GRID); LatticeGaugeField Ughost = Ghost.Exchange(Umu); @@ -114,18 +115,25 @@ int main (int argc, char ** argv) } #endif - ///// Array for the site plaquette + // Array for the site plaquette GridBase *GhostGrid = Ughost.Grid(); LatticeComplex gplaq(GhostGrid); - + + // Now we're going to put together the "stencil" that will be useful to us when + // calculating the plaquette. Our eventual goal is to make the product + // Umu(x) Unu(x+mu) Umu^dag(x+nu) Unu^dag(x), + // which requires, in order, the sites x, x+mu, x+nu, and x. We arrive at these + // sites relative to x through "shifts", which is represented here by a 4-d + // vector of 0s (no movement) and 1s (shift one unit) at each site. The + // "stencil" is the set of all these shifts. std::vector shifts; for(int mu=0;mu_offset; - int o1 = SE1->_offset; - int o2 = SE2->_offset; - int o3 = SE3->_offset; - - auto U0 = U_v[o0](mu); - auto U1 = U_v[o1](nu); - auto U2 = adj(U_v[o2](mu)); - auto U3 = adj(U_v[o3](nu)); + // Before doing accelerator stuff, there is an opening and closing of "Views". I guess the + // "Views" are stored in *_v variables listed below. + autoView( gp_v , gplaq, CpuWrite); + autoView( t_v , trplaq, CpuRead); + autoView( U_v , Ughost, CpuRead); - gpermute(U0,SE0->_permute); - gpermute(U1,SE1->_permute); - gpermute(U2,SE2->_permute); - gpermute(U3,SE3->_permute); - - gp_v[ss]() =gp_v[ss]() + trace( U0*U1*U2*U3 ); - s=s+4; - } - } + // This is now a loop over stencil shift elements. That is, s increases as we make our + // way through the spacetimes sites, but also as we make our way around the plaquette. 
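+ // Counting check (my note, assuming Nd=4): the stencil was built with 4 shifts
+ // for each of the Nd*(Nd-1)/2 = 6 unordered (mu,nu) pairs, so every site owns
+ // 24 consecutive entries, and they must be consumed here in the same order in
+ // which they were pushed back above.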
+ for(int ss=0;ss_offset; + int o1 = SE1->_offset; + int o2 = SE2->_offset; + int o3 = SE3->_offset; + + auto U0 = U_v[o0](mu); + auto U1 = U_v[o1](nu); + auto U2 = adj(U_v[o2](mu)); + auto U3 = adj(U_v[o3](nu)); + + gpermute(U0,SE0->_permute); + gpermute(U1,SE1->_permute); + gpermute(U2,SE2->_permute); + gpermute(U3,SE3->_permute); + + gp_v[ss]() =gp_v[ss]() + trace( U0*U1*U2*U3 ); + s=s+4; + } } } + + // Here is my understanding of this part: The padded cell has its own periodic BCs, so + // if I take a step to the right at the right-most side of the cell, I end up on the + // left-most side. This means that the plaquettes in the padding are wrong. Luckily + // all we care about are the plaquettes in the cell, which we obtain from Extract. cplaq = Ghost.Extract(gplaq); RealD vol = cplaq.Grid()->gSites(); RealD faces = (Nd * (Nd-1))/2; diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index 4f9d608d..85a60ee3 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -7,34 +7,151 @@ * */ - #include +#include +#include using namespace Grid; -template class : public Gimpl { -public: - - INHERIT_GIMPL_TYPES(Gimpl); - - typedef typename Gimpl::GaugeLinkField GaugeMat; - typedef typename Gimpl::GaugeField GaugeLorentz; - - static void staple(GaugeMat &plaq, const std::vector &U, const int mu, const int nu) { - } - +// This is to optimize the SIMD +template void gpermute(vobj & inout,int perm) { + vobj tmp=inout; + if (perm & 0x1) {permute(inout,tmp,0); tmp=inout;} + if (perm & 0x2) {permute(inout,tmp,1); tmp=inout;} + if (perm & 0x4) {permute(inout,tmp,2); tmp=inout;} + if (perm & 0x8) {permute(inout,tmp,3); tmp=inout;} } +// Make the logger work like Python print() +template +inline std::string sjoin(Args&&... args) noexcept { + std::ostringstream msg; + (msg << ... << args); + return msg.str(); +} +template +inline void Grid_log(Args&&... 
args) { + std::string msg = sjoin(std::forward(args)...); + std::cout << GridLogMessage << msg << std::endl; +} + +struct fatParams: Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS( + fatParams, + std::string, conf_in, + std::string, conf_out); + + template + fatParams(Reader& Reader){ + read(Reader, "parameters", *this); + } +}; + + + int main (int argc, char **argv) { - Grid_init(&argc, &argv); + Grid_init(&argc,&argv); - Coordinate simd_layout = GridDefaultSimd(4,vComplex::Nsimd()); - Coordinate mpi_layout = GridDefaultMpi(); Coordinate latt_size = GridDefaultLatt(); + Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); - GridCartesian spacetime(latt_size,simd_layout,mpi_layout); + Grid_log("mpi = ",mpi_layout); + Grid_log("simd = ",simd_layout); + Grid_log("latt = ",latt_size); - PeriodicGimplD::Field U(&spacetime); + GridCartesian GRID(latt_size,simd_layout,mpi_layout); + + XmlReader Reader("fatParams.xml",false, "grid"); + fatParams param(Reader); + + LatticeGaugeField Umu(&GRID); + FieldMetaData header; + NerscIO::readConfiguration(Umu, header, param.conf_in); + + // Create a padded cell of extra padding depth=1 + int depth = 1; + PaddedCell Ghost(depth,&GRID); + LatticeGaugeField Ughost = Ghost.Exchange(Umu); + + // Array for (x) + GridBase *GhostGrid = Ughost.Grid(); + LatticeComplex gplaq(GhostGrid); + + // This is where the 3-link constructs will be stored + LatticeGaugeField Ughost_3link(Ughost.Grid()); + + // Create 3-link stencil + std::vector shifts; + for(int mu=0;mu_offset; + int o1 = SE1->_offset; + int o2 = SE2->_offset; + int o3 = SE3->_offset; + + auto U0 = U_v[o0](mu); + auto U1 = U_v[o1](nu); + auto U2 = adj(U_v[o2](mu)); + auto U3 = adj(U_v[o3](nu)); + + gpermute(U0,SE0->_permute); + gpermute(U1,SE1->_permute); + gpermute(U2,SE2->_permute); + gpermute(U3,SE3->_permute); + + auto W = U1*U2*U3; + + // We add together contributions coming from each orientation. + U_3link_v[ss](mu) = U_3link_v[ss](mu) + W; + + s=s+4; + } + } + } + + // Here is my understanding of this part: The padded cell has its own periodic BCs, so + // if I take a step to the right at the right-most side of the cell, I end up on the + // left-most side. This means that the plaquettes in the padding are wrong. Luckily + // all we care about are the plaquettes in the cell, which we obtain from Extract. 
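+ // Shape summary (my reading, with depth=1 padding): Ughost_3link lives on the
+ // padded grid while Umu lives on the original one, so Extract copies back only
+ // the interior sites and discards whatever was accumulated in the halo.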
+ Umu = Ghost.Extract(Ughost_3link); + + NerscIO::writeConfiguration(Umu,param.conf_out,"HISQ"); Grid_finalize(); } \ No newline at end of file From ab56ad8d7a23a5662e1045dbedad4d06e57453a8 Mon Sep 17 00:00:00 2001 From: david clarke Date: Wed, 7 Jun 2023 21:14:58 -0600 Subject: [PATCH 005/114] fix 3-link stencil --- tests/smearing/Test_fatLinks.cc | 51 +++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index 85a60ee3..99081fea 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -12,7 +12,7 @@ #include using namespace Grid; -// This is to optimize the SIMD +// This is to optimize the SIMD (will also need to be in the class, at least for now) template void gpermute(vobj & inout,int perm) { vobj tmp=inout; if (perm & 0x1) {permute(inout,tmp,0); tmp=inout;} @@ -81,19 +81,27 @@ int main (int argc, char **argv) // This is where the 3-link constructs will be stored LatticeGaugeField Ughost_3link(Ughost.Grid()); - // Create 3-link stencil + // Create 3-link stencil (class will build its own stencils) + // writing your own stencil, you're hard-coding the periodic BCs, so you don't need + // the policy-based stuff, at least for now std::vector shifts; for(int mu=0;mu_offset; int o1 = SE1->_offset; int o2 = SE2->_offset; int o3 = SE3->_offset; + int o4 = SE4->_offset; + int o5 = SE5->_offset; - auto U0 = U_v[o0](mu); - auto U1 = U_v[o1](nu); - auto U2 = adj(U_v[o2](mu)); - auto U3 = adj(U_v[o3](nu)); + auto U0 = U_v[o0](nu); + auto U1 = adj(U_v[o1](mu)); + auto U2 = adj(U_v[o2](nu)); gpermute(U0,SE0->_permute); gpermute(U1,SE1->_permute); gpermute(U2,SE2->_permute); + + auto U3 = adj(U_v[o3](nu)); + auto U4 = adj(U_v[o4](mu)); + auto U5 = U_v[o5](nu); + gpermute(U3,SE3->_permute); + gpermute(U4,SE4->_permute); + gpermute(U5,SE5->_permute); - auto W = U1*U2*U3; - - // We add together contributions coming from each orientation. + // Forward contribution from this orientation + auto W = U0*U1*U2; U_3link_v[ss](mu) = U_3link_v[ss](mu) + W; - s=s+4; + // Backward contribution from this orientation + W = U3*U4*U5; + U_3link_v[ss](mu) = U_3link_v[ss](mu) + W; + + s=s+6; } } } From 4b994a1bc75901f124c886523c278c7ec771c899 Mon Sep 17 00:00:00 2001 From: david clarke Date: Thu, 8 Jun 2023 17:37:25 -0600 Subject: [PATCH 006/114] trouble with compilation --- Grid/qcd/smearing/HISQSmearing.h | 190 +++++++++++++++++++++++++++++++ tests/smearing/Test_fatLinks.cc | 127 ++------------------- 2 files changed, 200 insertions(+), 117 deletions(-) create mode 100644 Grid/qcd/smearing/HISQSmearing.h diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h new file mode 100644 index 00000000..53480f25 --- /dev/null +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -0,0 +1,190 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/smearing/StoutSmearing.h + +Copyright (C) 2019 + +Author: D. A. Clarke + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* + @file HISQSmearing.h + @brief Declares classes related to HISQ smearing +*/ + +// things like @brief are seen by things like doxygen and javadocs + +#pragma once + +#include +#include +#include + +NAMESPACE_BEGIN(Grid); + + +// This is to optimize the SIMD (will also need to be in the class, at least for now) +template void gpermute(vobj & inout,int perm) { + vobj tmp=inout; + if (perm & 0x1) {permute(inout,tmp,0); tmp=inout;} + if (perm & 0x2) {permute(inout,tmp,1); tmp=inout;} + if (perm & 0x4) {permute(inout,tmp,2); tmp=inout;} + if (perm & 0x8) {permute(inout,tmp,3); tmp=inout;} +} + + +/*! @brief 3-link smearing of link variable. */ +//template +//class Smear_HISQ_3link : public Smear { +class Smear_HISQ_3link { +// TODO: I'm not using Gimpl so I don't know how to inherit + +private: +// std::vector _linkTreatment; + GridBase* const _grid; + +public: +// INHERIT_GIMPL_TYPES(Gimpl) + + // Eventually this will take, e.g., coefficients as argument + Smear_HISQ_3link(GridBase* grid) : _grid(grid) { + assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); + } + + ~Smear_HISQ_3link() {} + + void smear(LatticeGaugeField& u_smr, const LatticeGaugeField& U) const { + + // Create a padded cell of extra padding depth=1 + int depth = 1; + PaddedCell Ghost(depth,this->_grid); + LatticeGaugeField Ughost = Ghost.Exchange(u_smr); + + // Array for (x) + GridBase *GhostGrid = Ughost.Grid(); + LatticeComplex gplaq(GhostGrid); + + // This is where the 3-link constructs will be stored + LatticeGaugeField Ughost_3link(Ughost.Grid()); + + // Create 3-link stencil (class will build its own stencils) + // writing your own stencil, you're hard-coding the periodic BCs, so you don't need + // the policy-based stuff, at least for now + std::vector shifts; + for(int mu=0;mu_offset; + int o1 = SE1->_offset; + int o2 = SE2->_offset; + int o3 = SE3->_offset; + int o4 = SE4->_offset; + int o5 = SE5->_offset; + + auto U0 = U_v[o0](nu); + auto U1 = adj(U_v[o1](mu)); + auto U2 = adj(U_v[o2](nu)); + + gpermute(U0,SE0->_permute); + gpermute(U1,SE1->_permute); + gpermute(U2,SE2->_permute); + + auto U3 = adj(U_v[o3](nu)); + auto U4 = adj(U_v[o4](mu)); + auto U5 = U_v[o5](nu); + + gpermute(U3,SE3->_permute); + gpermute(U4,SE4->_permute); + gpermute(U5,SE5->_permute); + + // Forward contribution from this orientation + auto W = U0*U1*U2; + U_3link_v[ss](mu) = U_3link_v[ss](mu) + W; + + // Backward contribution from this orientation + W = U3*U4*U5; + U_3link_v[ss](mu) = U_3link_v[ss](mu) + W; + + s=s+6; + } + } + } + + // Here is my understanding of this part: The padded cell has its own periodic BCs, so + // if I take a step to the right at the right-most side of the cell, I end up on the + // left-most side. This means that the plaquettes in the padding are wrong. Luckily + // all we care about are the plaquettes in the cell, which we obtain from Extract. 
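+ // A forward-looking note (mine, not in the original): depth=1 suffices for the
+ // 3-link staple, which reaches at most one site into the halo, but longer
+ // constructs (5-link, 7-link, Lepage, Naik) reach further, so the exchange
+ // depth would have to grow with them.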
+ u_smr = Ghost.Extract(Ughost_3link); + }; + +// void derivative(const GaugeField& Gauge) const { +// }; +}; + +NAMESPACE_END(Grid); diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index 99081fea..124cfac3 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -10,16 +10,9 @@ #include #include #include +#include using namespace Grid; -// This is to optimize the SIMD (will also need to be in the class, at least for now) -template void gpermute(vobj & inout,int perm) { - vobj tmp=inout; - if (perm & 0x1) {permute(inout,tmp,0); tmp=inout;} - if (perm & 0x2) {permute(inout,tmp,1); tmp=inout;} - if (perm & 0x4) {permute(inout,tmp,2); tmp=inout;} - if (perm & 0x8) {permute(inout,tmp,3); tmp=inout;} -} // Make the logger work like Python print() template @@ -46,7 +39,11 @@ struct fatParams: Serializable { } }; - +// +// one method: input --> fat +// another : input --> long (naik) +// another : input --> unitarize +// int main (int argc, char **argv) { @@ -66,119 +63,15 @@ int main (int argc, char **argv) fatParams param(Reader); LatticeGaugeField Umu(&GRID); + LatticeGaugeField U_smr(&GRID); FieldMetaData header; NerscIO::readConfiguration(Umu, header, param.conf_in); - // Create a padded cell of extra padding depth=1 - int depth = 1; - PaddedCell Ghost(depth,&GRID); - LatticeGaugeField Ughost = Ghost.Exchange(Umu); + Smear_HISQ_3link hisq_3link(&GRID); - // Array for (x) - GridBase *GhostGrid = Ughost.Grid(); - LatticeComplex gplaq(GhostGrid); + hisq_3link.smear(U_smr,Umu); - // This is where the 3-link constructs will be stored - LatticeGaugeField Ughost_3link(Ughost.Grid()); - - // Create 3-link stencil (class will build its own stencils) - // writing your own stencil, you're hard-coding the periodic BCs, so you don't need - // the policy-based stuff, at least for now - std::vector shifts; - for(int mu=0;mu_offset; - int o1 = SE1->_offset; - int o2 = SE2->_offset; - int o3 = SE3->_offset; - int o4 = SE4->_offset; - int o5 = SE5->_offset; - - auto U0 = U_v[o0](nu); - auto U1 = adj(U_v[o1](mu)); - auto U2 = adj(U_v[o2](nu)); - - gpermute(U0,SE0->_permute); - gpermute(U1,SE1->_permute); - gpermute(U2,SE2->_permute); - - auto U3 = adj(U_v[o3](nu)); - auto U4 = adj(U_v[o4](mu)); - auto U5 = U_v[o5](nu); - - gpermute(U3,SE3->_permute); - gpermute(U4,SE4->_permute); - gpermute(U5,SE5->_permute); - - // Forward contribution from this orientation - auto W = U0*U1*U2; - U_3link_v[ss](mu) = U_3link_v[ss](mu) + W; - - // Backward contribution from this orientation - W = U3*U4*U5; - U_3link_v[ss](mu) = U_3link_v[ss](mu) + W; - - s=s+6; - } - } - } - - // Here is my understanding of this part: The padded cell has its own periodic BCs, so - // if I take a step to the right at the right-most side of the cell, I end up on the - // left-most side. This means that the plaquettes in the padding are wrong. Luckily - // all we care about are the plaquettes in the cell, which we obtain from Extract. 
- Umu = Ghost.Extract(Ughost_3link); - - NerscIO::writeConfiguration(Umu,param.conf_out,"HISQ"); + NerscIO::writeConfiguration(U_smr,param.conf_out,"HISQ"); Grid_finalize(); } \ No newline at end of file From 1cf9ec1cce35748763df382532a611bc5386e1f5 Mon Sep 17 00:00:00 2001 From: david clarke Date: Fri, 9 Jun 2023 16:27:45 -0600 Subject: [PATCH 007/114] now compiles --- Grid/qcd/smearing/HISQSmearing.h | 164 +++++++++++++++++++++++++++---- tests/smearing/Test_fatLinks.cc | 4 +- 2 files changed, 149 insertions(+), 19 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 53480f25..95187d48 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -51,25 +51,21 @@ template void gpermute(vobj & inout,int perm) { } -/*! @brief 3-link smearing of link variable. */ -//template -//class Smear_HISQ_3link : public Smear { -class Smear_HISQ_3link { -// TODO: I'm not using Gimpl so I don't know how to inherit + +/*! @brief create fat links from link variables. */ +class Smear_HISQ_fat { private: -// std::vector _linkTreatment; - GridBase* const _grid; + GridCartesian* const _grid; public: -// INHERIT_GIMPL_TYPES(Gimpl) // Eventually this will take, e.g., coefficients as argument - Smear_HISQ_3link(GridBase* grid) : _grid(grid) { + Smear_HISQ_fat(GridCartesian* grid) : _grid(grid) { assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); } - ~Smear_HISQ_3link() {} + ~Smear_HISQ_fat() {} void smear(LatticeGaugeField& u_smr, const LatticeGaugeField& U) const { @@ -83,7 +79,7 @@ public: LatticeComplex gplaq(GhostGrid); // This is where the 3-link constructs will be stored - LatticeGaugeField Ughost_3link(Ughost.Grid()); + LatticeGaugeField Ughost_fat(Ughost.Grid()); // Create 3-link stencil (class will build its own stencils) // writing your own stencil, you're hard-coding the periodic BCs, so you don't need @@ -110,11 +106,11 @@ public: } GeneralLocalStencil gStencil(GhostGrid,shifts); - Ughost_3link=Zero(); + Ughost_fat=Zero(); - // Create the accessors, here U_v and U_3link_v + // Create the accessors, here U_v and U_fat_v autoView(U_v , Ughost , CpuRead); - autoView(U_3link_v, Ughost_3link, CpuWrite); + autoView(U_fat_v, Ughost_fat, CpuWrite); // This is a loop over local sites. for(int ss=0;ss_grid); + LatticeGaugeField Ughost = Ghost.Exchange(u_smr); + + GridBase *GhostGrid = Ughost.Grid(); + LatticeComplex gplaq(GhostGrid); + + LatticeGaugeField Ughost_naik(Ughost.Grid()); + + std::vector shifts; + for(int mu=0;mu_offset; + int o1 = SE1->_offset; + int o2 = SE2->_offset; + int o3 = SE3->_offset; + int o4 = SE4->_offset; + int o5 = SE5->_offset; + + auto U0 = U_v[o0](nu); + auto U1 = adj(U_v[o1](mu)); + auto U2 = adj(U_v[o2](nu)); + + gpermute(U0,SE0->_permute); + gpermute(U1,SE1->_permute); + gpermute(U2,SE2->_permute); + + auto U3 = adj(U_v[o3](nu)); + auto U4 = adj(U_v[o4](mu)); + auto U5 = U_v[o5](nu); + + gpermute(U3,SE3->_permute); + gpermute(U4,SE4->_permute); + gpermute(U5,SE5->_permute); + + // Forward contribution from this orientation + auto W = U0*U1*U2; + U_naik_v[ss](mu) = U_naik_v[ss](mu) + W; + + // Backward contribution from this orientation + W = U3*U4*U5; + U_naik_v[ss](mu) = U_naik_v[ss](mu) + W; + + s=s+6; + } + } + } + + // Here is my understanding of this part: The padded cell has its own periodic BCs, so + // if I take a step to the right at the right-most side of the cell, I end up on the + // left-most side. This means that the plaquettes in the padding are wrong. 
Luckily + // all we care about are the plaquettes in the cell, which we obtain from Extract. + u_smr = Ghost.Extract(Ughost_naik); }; // void derivative(const GaugeField& Gauge) const { // }; }; + NAMESPACE_END(Grid); diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index 124cfac3..e87c48e0 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -67,9 +67,9 @@ int main (int argc, char **argv) FieldMetaData header; NerscIO::readConfiguration(Umu, header, param.conf_in); - Smear_HISQ_3link hisq_3link(&GRID); + Smear_HISQ_fat hisq_fat(&GRID); - hisq_3link.smear(U_smr,Umu); + hisq_fat.smear(U_smr,Umu); NerscIO::writeConfiguration(U_smr,param.conf_out,"HISQ"); From 14d352ea4f5a5d513fd21165a31f599e59680012 Mon Sep 17 00:00:00 2001 From: david clarke Date: Mon, 12 Jun 2023 16:55:44 -0600 Subject: [PATCH 008/114] added smearParams struct --- Grid/qcd/smearing/HISQSmearing.h | 33 +++++++++++++++++++++++++++++--- tests/smearing/Test_fatLinks.cc | 21 +++++++++++++------- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 95187d48..9fae32fd 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -51,17 +51,44 @@ template void gpermute(vobj & inout,int perm) { } +/*! @brief structure holding the link treatment */ +struct SmearingParameters{ + SmearingParameters(){} + Real c_1; // 1 link + Real c_naik; // Naik term + Real c_3; // 3 link + Real c_5; // 5 link + Real c_7; // 7 link + Real c_lp; // 5 link Lepage + SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) : + c_1(c1), + c_naik(cnaik), + c_3(c3), + c_5(c5), + c_7(c7), + c_lp(clp){} +}; -/*! @brief create fat links from link variables. */ + +/*! @brief create fat links from link variables */ class Smear_HISQ_fat { private: GridCartesian* const _grid; + SmearingParameters _LVL1; public: - // Eventually this will take, e.g., coefficients as argument - Smear_HISQ_fat(GridCartesian* grid) : _grid(grid) { + Smear_HISQ_fat(GridCartesian* grid, Real c1=1/8., Real cnaik=0., Real c3=1/16., Real c5=1/64., Real c7=1/384., Real clp=0.) 
+ : _grid(grid), + _LVL1(c1,cnaik,c3,c5,c7,clp) { + assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); + } + + // Allow to pass a pointer to a C-style, double array for MILC convenience + Smear_HISQ_fat(GridCartesian* grid, double* coeff) + : _grid(grid), + _LVL1(coeff[0],coeff[1],coeff[2],coeff[3],coeff[4],coeff[5]) { assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); } diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index e87c48e0..1854ea71 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -47,31 +47,38 @@ struct fatParams: Serializable { int main (int argc, char **argv) { + // Initialize the Grid Grid_init(&argc,&argv); - Coordinate latt_size = GridDefaultLatt(); Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); - Grid_log("mpi = ",mpi_layout); Grid_log("simd = ",simd_layout); Grid_log("latt = ",latt_size); - GridCartesian GRID(latt_size,simd_layout,mpi_layout); - XmlReader Reader("fatParams.xml",false, "grid"); - fatParams param(Reader); - + // Instantiate the LatticeGaugeField objects holding thin (Umu) and fat (U_smr) links LatticeGaugeField Umu(&GRID); LatticeGaugeField U_smr(&GRID); + + // Read in the parameter file + XmlReader Reader("fatParams.xml",false, "grid"); + fatParams param(Reader); FieldMetaData header; + + // Read the configuration into Umu NerscIO::readConfiguration(Umu, header, param.conf_in); + // Smear Umu and store result in U_smr Smear_HISQ_fat hisq_fat(&GRID); - hisq_fat.smear(U_smr,Umu); NerscIO::writeConfiguration(U_smr,param.conf_out,"HISQ"); + // In the following, we test instantiation of Smear_HISQ_fat in different ways: + Smear_HISQ_fat hisq_fat_typical(&GRID,1,2,3,4,5,6); + double path_coeff[6] = {1, 2, 3, 4, 5, 6}; + Smear_HISQ_fat hisq_fat_Cstyle(&GRID,path_coeff); + Grid_finalize(); } \ No newline at end of file From 452bf2e9076e852cd8d3286863b0523f6eeb142a Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Tue, 20 Jun 2023 20:36:24 +0300 Subject: [PATCH 009/114] Accelerator basisRotate also on HIP --- Grid/lattice/Lattice_basis.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h index 0928cbd7..9415bd4f 100644 --- a/Grid/lattice/Lattice_basis.h +++ b/Grid/lattice/Lattice_basis.h @@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) basis_v.push_back(basis[k].View(AcceleratorWrite)); } -#if ( (!defined(GRID_CUDA)) ) +#if ( !(defined(GRID_CUDA) || defined(GRID_HIP)) ) int max_threads = thread_max(); Vector < vobj > Bt(Nm * max_threads); thread_region From 26b2caf5706b482c0461d52fb03a60e0c20b1127 Mon Sep 17 00:00:00 2001 From: david clarke Date: Tue, 20 Jun 2023 15:37:54 -0600 Subject: [PATCH 010/114] add template parameter to Smear_HISQ_fat for MILC interfacing --- Grid/qcd/smearing/HISQSmearing.h | 38 ++++++++++++++------------------ tests/smearing/Test_fatLinks.cc | 7 +++--- 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 9fae32fd..b8fc5aef 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -71,6 +71,7 @@ struct SmearingParameters{ /*! @brief create fat links from link variables */ +template // TODO: change to Gimpl? 
class Smear_HISQ_fat { private: @@ -79,7 +80,8 @@ private: public: - Smear_HISQ_fat(GridCartesian* grid, Real c1=1/8., Real cnaik=0., Real c3=1/16., Real c5=1/64., Real c7=1/384., Real clp=0.) + // Don't allow default values here. + Smear_HISQ_fat(GridCartesian* grid, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) : _grid(grid), _LVL1(c1,cnaik,c3,c5,c7,clp) { assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); @@ -94,19 +96,21 @@ public: ~Smear_HISQ_fat() {} - void smear(LatticeGaugeField& u_smr, const LatticeGaugeField& U) const { - + void smear(LGF& u_smr, LGF& u_thin) const { + + SmearingParameters lvl1 = this->_LVL1; + // Create a padded cell of extra padding depth=1 int depth = 1; PaddedCell Ghost(depth,this->_grid); - LatticeGaugeField Ughost = Ghost.Exchange(u_smr); + LGF Ughost = Ghost.Exchange(u_thin); // Array for (x) GridBase *GhostGrid = Ughost.Grid(); LatticeComplex gplaq(GhostGrid); // This is where the 3-link constructs will be stored - LatticeGaugeField Ughost_fat(Ughost.Grid()); + LGF Ughost_fat(Ughost.Grid()); // Create 3-link stencil (class will build its own stencils) // writing your own stencil, you're hard-coding the periodic BCs, so you don't need @@ -136,7 +140,7 @@ public: Ughost_fat=Zero(); // Create the accessors, here U_v and U_fat_v - autoView(U_v , Ughost , CpuRead); + autoView(U_v , Ughost , CpuRead); autoView(U_fat_v, Ughost_fat, CpuWrite); // This is a loop over local sites. @@ -186,12 +190,8 @@ public: gpermute(U4,SE4->_permute); gpermute(U5,SE5->_permute); - // Forward contribution from this orientation - auto W = U0*U1*U2; - U_fat_v[ss](mu) = U_fat_v[ss](mu) + W; - - // Backward contribution from this orientation - W = U3*U4*U5; + // forward backward + auto W = U0*U1*U2 + U3*U4*U5; U_fat_v[ss](mu) = U_fat_v[ss](mu) + W; s=s+6; @@ -199,14 +199,9 @@ public: } } - // Here is my understanding of this part: The padded cell has its own periodic BCs, so - // if I take a step to the right at the right-most side of the cell, I end up on the - // left-most side. This means that the plaquettes in the padding are wrong. Luckily - // all we care about are the plaquettes in the cell, which we obtain from Extract. - u_smr = Ghost.Extract(Ughost_fat); + u_smr = lvl1.c_3*Ghost.Extract(Ughost_fat) + lvl1.c_1*u_thin; }; - // I guess the way this will go is: // 1. 3-link smear // 2. exchange @@ -219,6 +214,7 @@ public: /*! @brief create long links from link variables. 
*/ +template class Smear_HISQ_Naik { private: @@ -233,16 +229,16 @@ public: ~Smear_HISQ_Naik() {} - void smear(LatticeGaugeField& u_smr, const LatticeGaugeField& U) const { + void smear(LGF& u_smr, const LGF& U) const { int depth = 1; PaddedCell Ghost(depth,this->_grid); - LatticeGaugeField Ughost = Ghost.Exchange(u_smr); + LGF Ughost = Ghost.Exchange(u_smr); GridBase *GhostGrid = Ughost.Grid(); LatticeComplex gplaq(GhostGrid); - LatticeGaugeField Ughost_naik(Ughost.Grid()); + LGF Ughost_naik(Ughost.Grid()); std::vector shifts; for(int mu=0;mu hisq_fat(&GRID,1/8.,0.,1/16.,1/64.,1/384.,0.); hisq_fat.smear(U_smr,Umu); NerscIO::writeConfiguration(U_smr,param.conf_out,"HISQ"); - // In the following, we test instantiation of Smear_HISQ_fat in different ways: - Smear_HISQ_fat hisq_fat_typical(&GRID,1,2,3,4,5,6); + // Test a C-style instantiation double path_coeff[6] = {1, 2, 3, 4, 5, 6}; - Smear_HISQ_fat hisq_fat_Cstyle(&GRID,path_coeff); + Smear_HISQ_fat hisq_fat_Cstyle(&GRID,path_coeff); Grid_finalize(); } \ No newline at end of file From f44f005dad99325f52e2390033f38fb6cbc30181 Mon Sep 17 00:00:00 2001 From: david clarke Date: Tue, 20 Jun 2023 15:48:27 -0600 Subject: [PATCH 011/114] rename _lvl1 --> _linkTreatment --- Grid/qcd/smearing/HISQSmearing.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index b8fc5aef..1e7e93ab 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -76,21 +76,21 @@ class Smear_HISQ_fat { private: GridCartesian* const _grid; - SmearingParameters _LVL1; + SmearingParameters _linkTreatment; public: // Don't allow default values here. Smear_HISQ_fat(GridCartesian* grid, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) : _grid(grid), - _LVL1(c1,cnaik,c3,c5,c7,clp) { + _linkTreatment(c1,cnaik,c3,c5,c7,clp) { assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); } // Allow to pass a pointer to a C-style, double array for MILC convenience Smear_HISQ_fat(GridCartesian* grid, double* coeff) : _grid(grid), - _LVL1(coeff[0],coeff[1],coeff[2],coeff[3],coeff[4],coeff[5]) { + _linkTreatment(coeff[0],coeff[1],coeff[2],coeff[3],coeff[4],coeff[5]) { assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); } @@ -98,7 +98,7 @@ public: void smear(LGF& u_smr, LGF& u_thin) const { - SmearingParameters lvl1 = this->_LVL1; + SmearingParameters lt = this->_linkTreatment; // Create a padded cell of extra padding depth=1 int depth = 1; @@ -199,7 +199,7 @@ public: } } - u_smr = lvl1.c_3*Ghost.Extract(Ughost_fat) + lvl1.c_1*u_thin; + u_smr = lt.c_3*Ghost.Extract(Ughost_fat) + lt.c_1*u_thin; }; // I guess the way this will go is: From d536c67b9d407617bf8d2f17c5b9efa95bad8d00 Mon Sep 17 00:00:00 2001 From: david clarke Date: Tue, 20 Jun 2023 16:04:48 -0600 Subject: [PATCH 012/114] add HISQSmearing to Smearing.h --- Grid/qcd/smearing/Smearing.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/qcd/smearing/Smearing.h b/Grid/qcd/smearing/Smearing.h index da5ede72..41a305ae 100644 --- a/Grid/qcd/smearing/Smearing.h +++ b/Grid/qcd/smearing/Smearing.h @@ -5,4 +5,5 @@ #include #include #include +#include From df99f227c16a152b648ec3b86122c02443bb04e7 Mon Sep 17 00:00:00 2001 From: david clarke Date: Thu, 22 Jun 2023 14:57:10 -0600 Subject: [PATCH 013/114] include missing staple orientations; invert path direction, which was backwards --- Grid/qcd/smearing/HISQSmearing.h | 185 ++++++++++++++++--------------- 1 file changed, 95 
insertions(+), 90 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 1e7e93ab..94b47e52 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -104,101 +104,106 @@ public: int depth = 1; PaddedCell Ghost(depth,this->_grid); LGF Ughost = Ghost.Exchange(u_thin); - + // Array for (x) GridBase *GhostGrid = Ughost.Grid(); LatticeComplex gplaq(GhostGrid); - + // This is where the 3-link constructs will be stored LGF Ughost_fat(Ughost.Grid()); - // Create 3-link stencil (class will build its own stencils) - // writing your own stencil, you're hard-coding the periodic BCs, so you don't need - // the policy-based stuff, at least for now + // Create 3-link stencil. Writing your own stencil, you're hard-coding the + // periodic BCs, so you don't need the policy-based stuff, at least for now. + // Loop over all orientations, i.e. demand mu != nu. std::vector shifts; - for(int mu=0;mu_offset; - int o1 = SE1->_offset; - int o2 = SE2->_offset; - int o3 = SE3->_offset; - int o4 = SE4->_offset; - int o5 = SE5->_offset; - - auto U0 = U_v[o0](nu); - auto U1 = adj(U_v[o1](mu)); - auto U2 = adj(U_v[o2](nu)); - - gpermute(U0,SE0->_permute); - gpermute(U1,SE1->_permute); - gpermute(U2,SE2->_permute); - - auto U3 = adj(U_v[o3](nu)); - auto U4 = adj(U_v[o4](mu)); - auto U5 = U_v[o5](nu); - - gpermute(U3,SE3->_permute); - gpermute(U4,SE4->_permute); - gpermute(U5,SE5->_permute); - - // forward backward - auto W = U0*U1*U2 + U3*U4*U5; - U_fat_v[ss](mu) = U_fat_v[ss](mu) + W; - - s=s+6; - } + + for(int mu=0;mu_offset; + int o1 = SE1->_offset; + int o2 = SE2->_offset; + int o3 = SE3->_offset; + int o4 = SE4->_offset; + int o5 = SE5->_offset; + + // When you're deciding whether to take an adjoint, the question is: how is the + // stored link oriented compared to the one you want? If I imagine myself travelling + // with the to-be-updated link, I have two possible, alternative 3-link paths I can + // take, one starting by going to the left, the other starting by going to the right. + auto U0 = adj(U_v[o0](nu)); + auto U1 = U_v[o1](mu); + auto U2 = U_v[o2](nu); + + gpermute(U0,SE0->_permute); + gpermute(U1,SE1->_permute); + gpermute(U2,SE2->_permute); + + auto U3 = U_v[o3](nu); + auto U4 = U_v[o4](mu); + auto U5 = adj(U_v[o5](nu)); + + gpermute(U3,SE3->_permute); + gpermute(U4,SE4->_permute); + gpermute(U5,SE5->_permute); + + // "left" "right" + auto W = U2*U1*U0 + U5*U4*U3; + U_fat_v[ss](mu) = U_fat_v[ss](mu) + W; + + s=s+6; } } - + u_smr = lt.c_3*Ghost.Extract(Ughost_fat) + lt.c_1*u_thin; }; @@ -261,23 +266,23 @@ public: } } GeneralLocalStencil gStencil(GhostGrid,shifts); - + Ughost_naik=Zero(); - + // Create the accessors, here U_v and U_fat_v autoView(U_v , Ughost , CpuRead); autoView(U_naik_v, Ughost_naik, CpuWrite); - + // This is a loop over local sites. 
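// My reading of the bookkeeping (not in the original): the loop below visits every
// ordered (mu,nu) pair with mu != nu and consumes a fixed block of stencil entries
// per pair, so GetEntry must be called in exactly the order in which the shifts
// were pushed back when the stencil was built.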
for(int ss=0;ss_offset; int o1 = SE1->_offset; @@ -298,36 +303,36 @@ public: int o3 = SE3->_offset; int o4 = SE4->_offset; int o5 = SE5->_offset; - + auto U0 = U_v[o0](nu); auto U1 = adj(U_v[o1](mu)); auto U2 = adj(U_v[o2](nu)); - + gpermute(U0,SE0->_permute); gpermute(U1,SE1->_permute); gpermute(U2,SE2->_permute); - + auto U3 = adj(U_v[o3](nu)); auto U4 = adj(U_v[o4](mu)); auto U5 = U_v[o5](nu); - + gpermute(U3,SE3->_permute); gpermute(U4,SE4->_permute); gpermute(U5,SE5->_permute); - + // Forward contribution from this orientation auto W = U0*U1*U2; U_naik_v[ss](mu) = U_naik_v[ss](mu) + W; - + // Backward contribution from this orientation W = U3*U4*U5; U_naik_v[ss](mu) = U_naik_v[ss](mu) + W; - + s=s+6; } } } - + // Here is my understanding of this part: The padded cell has its own periodic BCs, so // if I take a step to the right at the right-most side of the cell, I end up on the // left-most side. This means that the plaquettes in the padding are wrong. Luckily From eeb4703b84b723722ad654e834ad0e7af8a637c3 Mon Sep 17 00:00:00 2001 From: david clarke Date: Mon, 26 Jun 2023 17:45:35 -0600 Subject: [PATCH 014/114] develop wrappers to make the stencils easier to construct --- Grid/qcd/smearing/HISQSmearing.h | 49 ++++++++++++++++---------------- tests/smearing/Test_fatLinks.cc | 7 +++++ 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 94b47e52..d36fd85f 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -51,8 +51,17 @@ template void gpermute(vobj & inout,int perm) { } +void appendShift(std::vector& shifts, int mu, int steps=1) { + Coordinate shift(Nd,0); + shift[mu]=steps; + // push_back creates an element at the end of shifts and + // assigns the data in the argument to it. + shifts.push_back(shift); +} + + /*! @brief structure holding the link treatment */ -struct SmearingParameters{ +struct SmearingParameters { SmearingParameters(){} Real c_1; // 1 link Real c_naik; // Naik term @@ -100,8 +109,10 @@ public: SmearingParameters lt = this->_linkTreatment; - // Create a padded cell of extra padding depth=1 - int depth = 1; + // We create a cell with extra padding 2. This allows us to capture the LePage + // term without needing to save intermediate gauge fields or extra halo exchanges. + // The tradeoff is that we compute extra constructs in the padding. + int depth = 2; PaddedCell Ghost(depth,this->_grid); LGF Ughost = Ghost.Exchange(u_thin); @@ -112,28 +123,20 @@ public: // This is where the 3-link constructs will be stored LGF Ughost_fat(Ughost.Grid()); - // Create 3-link stencil. Writing your own stencil, you're hard-coding the + // Next we make the stencils. Writing your own stencil, you're hard-coding the // periodic BCs, so you don't need the policy-based stuff, at least for now. // Loop over all orientations, i.e. demand mu != nu. std::vector shifts; for(int mu=0;mu hisq_fat_Cstyle(&GRID,path_coeff); + // Make sure result doesn't change w.r.t. 
a trusted lattice + NerscIO::readConfiguration(Umu, header, "nersc.l8t4b3360.3link.control"); + LatticeGaugeField diff(&GRID); + diff = Umu-U_smr; + auto absDiff = norm2(diff)/norm2(Umu); + Grid_log(" |Umu-U|/|Umu| =",absDiff); + Grid_finalize(); } \ No newline at end of file From a7eabaad566cd2b740ad98e370eb7800f9394039 Mon Sep 17 00:00:00 2001 From: david clarke Date: Mon, 26 Jun 2023 23:59:28 -0600 Subject: [PATCH 015/114] rudimentary appendShift convenience method, which allows the user to append an arbitrary shift in one line --- Grid/qcd/smearing/HISQSmearing.h | 76 +++++++++++++++++++++++--------- tests/smearing/Test_fatLinks.cc | 4 +- 2 files changed, 56 insertions(+), 24 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index d36fd85f..e8587c9b 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -30,7 +30,6 @@ directory @brief Declares classes related to HISQ smearing */ -// things like @brief are seen by things like doxygen and javadocs #pragma once @@ -38,6 +37,9 @@ directory #include #include +#define BACKWARD_CONST 16 +#define NO_SHIFT -1 + NAMESPACE_BEGIN(Grid); @@ -51,9 +53,47 @@ template void gpermute(vobj & inout,int perm) { } -void appendShift(std::vector& shifts, int mu, int steps=1) { - Coordinate shift(Nd,0); - shift[mu]=steps; +/*! @brief signals that you want to go backwards in direction dir */ +inline int Back(const int dir) { + // generalShift will use BACKWARD_CONST to determine whether we step forward or + // backward. Should work as long as BACKWARD_CONST > Nd. + return dir + BACKWARD_CONST; +} + + +/*! @brief shift one unit in direction dir */ +void generalShift(Coordinate& shift, int dir) { + if (dir >= BACKWARD_CONST) { + dir -= BACKWARD_CONST; + shift[dir]+=-1; + } else if (dir == NO_SHIFT) { + ; // do nothing + } else { + shift[dir]+=1; + } +} + + +/*! @brief follow a path of directions, shifting one unit in each direction */ +template +void generalShift(Coordinate& shift, int dir, Args... args) { + if (dir >= BACKWARD_CONST) { + dir -= BACKWARD_CONST; + shift[dir]+=-1; + } else if (dir == NO_SHIFT) { + ; // do nothing + } else { + shift[dir]+=1; + } + generalShift(shift, args...); +} + + +/*! @brief append arbitrary shift path to shifts */ +template +void appendShift(std::vector& shifts, int dir, Args... args) { + Coordinate shift(Nd,0); + generalShift(shift, dir, args...); // push_back creates an element at the end of shifts and // assigns the data in the argument to it. shifts.push_back(shift); @@ -123,22 +163,17 @@ public: // This is where the 3-link constructs will be stored LGF Ughost_fat(Ughost.Grid()); - // Next we make the stencils. Writing your own stencil, you're hard-coding the - // periodic BCs, so you don't need the policy-based stuff, at least for now. - // Loop over all orientations, i.e. demand mu != nu. + // Create a stencil, which is a collection of sites neighboring some initial site. std::vector shifts; for(int mu=0;mu_offset; @@ -177,7 +210,6 @@ public: int o2 = SE2->_offset; int o3 = SE3->_offset; int o4 = SE4->_offset; - int o5 = SE5->_offset; // When you're deciding whether to take an adjoint, the question is: how is the // stored link oriented compared to the one you want? 
If I imagine myself travelling @@ -193,17 +225,17 @@ public: auto U3 = U_v[o3](nu); auto U4 = U_v[o4](mu); - auto U5 = adj(U_v[o5](nu)); + auto U5 = adj(U_v[o4](nu)); gpermute(U3,SE3->_permute); gpermute(U4,SE4->_permute); - gpermute(U5,SE5->_permute); + gpermute(U4,SE4->_permute); // "left" "right" auto W = U2*U1*U0 + U5*U4*U3; U_fat_v[ss](mu) = U_fat_v[ss](mu) + W; - s=s+6; + s=s+5; } } diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index aa8e8f92..de1796b7 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -15,13 +15,13 @@ using namespace Grid; // Make the logger work like Python print() -template +template inline std::string sjoin(Args&&... args) noexcept { std::ostringstream msg; (msg << ... << args); return msg.str(); } -template +template inline void Grid_log(Args&&... args) { std::string msg = sjoin(std::forward(args)...); std::cout << GridLogMessage << msg << std::endl; From 9015c229dc194fdf2b6552e6137b7ffbe23c6e78 Mon Sep 17 00:00:00 2001 From: david clarke Date: Tue, 27 Jun 2023 21:28:26 -0600 Subject: [PATCH 016/114] add benchmark to see whether matrix multiplication is slower than read from object --- Grid/qcd/smearing/HISQSmearing.h | 149 +++------------------- benchmarks/Benchmark_su3mult_vs_lookup.cc | 125 ++++++++++++++++++ tests/smearing/Test_fatLinks.cc | 78 ++++++----- 3 files changed, 188 insertions(+), 164 deletions(-) create mode 100644 benchmarks/Benchmark_su3mult_vs_lookup.cc diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index e8587c9b..0c11bde4 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -2,9 +2,9 @@ Grid physics library, www.github.com/paboyle/Grid -Source file: ./lib/qcd/smearing/StoutSmearing.h +Source file: ./lib/qcd/smearing/HISQSmearing.h -Copyright (C) 2019 +Copyright (C) 2023 Author: D. A. Clarke @@ -56,7 +56,7 @@ template void gpermute(vobj & inout,int perm) { /*! @brief signals that you want to go backwards in direction dir */ inline int Back(const int dir) { // generalShift will use BACKWARD_CONST to determine whether we step forward or - // backward. Should work as long as BACKWARD_CONST > Nd. + // backward. Should work as long as BACKWARD_CONST > Nd. Trick inspired by SIMULATeQCD. return dir + BACKWARD_CONST; } @@ -191,45 +191,33 @@ public: for(int mu=0;mu_offset; - int o1 = SE1->_offset; - int o2 = SE2->_offset; - int o3 = SE3->_offset; - int o4 = SE4->_offset; + auto SE0 = gStencil.GetEntry(s+0,ss); int x_p_mu = SE0->_offset; + auto SE1 = gStencil.GetEntry(s+1,ss); int x_p_nu = SE1->_offset; + auto SE2 = gStencil.GetEntry(s+2,ss); int x = SE2->_offset; + auto SE3 = gStencil.GetEntry(s+3,ss); int x_p_mu_m_nu = SE3->_offset; + auto SE4 = gStencil.GetEntry(s+4,ss); int x_m_nu = SE4->_offset; // When you're deciding whether to take an adjoint, the question is: how is the // stored link oriented compared to the one you want? If I imagine myself travelling // with the to-be-updated link, I have two possible, alternative 3-link paths I can // take, one starting by going to the left, the other starting by going to the right. 
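// Pictorially (my sketch, not in the original): the two 3-link paths are
//   "left" : U_nu(x) U_mu(x+nu) U_nu^dag(x+mu)
//   "right": U_nu^dag(x-nu) U_mu(x-nu) U_nu(x+mu-nu)
// and both run from x to x+mu, the endpoints of the link being fattened.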
- auto U0 = adj(U_v[o0](nu)); - auto U1 = U_v[o1](mu); - auto U2 = U_v[o2](nu); + auto U0 = adj(U_v[x_p_mu](nu)); + auto U1 = U_v[x_p_nu](mu) ; + auto U2 = U_v[x ](nu) ; gpermute(U0,SE0->_permute); gpermute(U1,SE1->_permute); gpermute(U2,SE2->_permute); - auto U3 = U_v[o3](nu); - auto U4 = U_v[o4](mu); - auto U5 = adj(U_v[o4](nu)); + auto U3 = U_v[x_p_mu_m_nu](nu) ; + auto U4 = U_v[x_m_nu ](mu) ; + auto U5 = adj(U_v[x_m_nu ](nu)); gpermute(U3,SE3->_permute); gpermute(U4,SE4->_permute); - gpermute(U4,SE4->_permute); // "left" "right" auto W = U2*U1*U0 + U5*U4*U3; @@ -265,111 +253,8 @@ public: ~Smear_HISQ_Naik() {} - void smear(LGF& u_smr, const LGF& U) const { - - int depth = 1; - PaddedCell Ghost(depth,this->_grid); - LGF Ughost = Ghost.Exchange(u_smr); - - GridBase *GhostGrid = Ughost.Grid(); - LatticeComplex gplaq(GhostGrid); - - LGF Ughost_naik(Ughost.Grid()); - - std::vector shifts; - for(int mu=0;mu_offset; - int o1 = SE1->_offset; - int o2 = SE2->_offset; - int o3 = SE3->_offset; - int o4 = SE4->_offset; - int o5 = SE5->_offset; - - auto U0 = U_v[o0](nu); - auto U1 = adj(U_v[o1](mu)); - auto U2 = adj(U_v[o2](nu)); - - gpermute(U0,SE0->_permute); - gpermute(U1,SE1->_permute); - gpermute(U2,SE2->_permute); - - auto U3 = adj(U_v[o3](nu)); - auto U4 = adj(U_v[o4](mu)); - auto U5 = U_v[o5](nu); - - gpermute(U3,SE3->_permute); - gpermute(U4,SE4->_permute); - gpermute(U5,SE5->_permute); - - // Forward contribution from this orientation - auto W = U0*U1*U2; - U_naik_v[ss](mu) = U_naik_v[ss](mu) + W; - - // Backward contribution from this orientation - W = U3*U4*U5; - U_naik_v[ss](mu) = U_naik_v[ss](mu) + W; - - s=s+6; - } - } - } - - // Here is my understanding of this part: The padded cell has its own periodic BCs, so - // if I take a step to the right at the right-most side of the cell, I end up on the - // left-most side. This means that the plaquettes in the padding are wrong. Luckily - // all we care about are the plaquettes in the cell, which we obtain from Extract. - u_smr = Ghost.Extract(Ughost_naik); - }; +// void smear(LGF& u_smr, const LGF& U) const { +// }; // void derivative(const GaugeField& Gauge) const { // }; diff --git a/benchmarks/Benchmark_su3mult_vs_lookup.cc b/benchmarks/Benchmark_su3mult_vs_lookup.cc new file mode 100644 index 00000000..5f9c98ba --- /dev/null +++ b/benchmarks/Benchmark_su3mult_vs_lookup.cc @@ -0,0 +1,125 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_su3mult_vs_lookup.cc + + Copyright (C) 2023 + + Author: D. A. Clarke + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* + @file Benchmark_su3mult_vs_lookup.cc + @brief check to see whether su3 multiplication or lookup tables is faster +*/ + +#include +using namespace Grid; + +/*! @brief make the logger work like python print */ +template +inline std::string sjoin(Args&&... args) noexcept { + std::ostringstream msg; + (msg << ... << args); + return msg.str(); +} +template +inline void Grid_log(Args&&... args) { + std::string msg = sjoin(std::forward(args)...); + std::cout << GridLogMessage << msg << std::endl; +} + +/*! @brief parameter file to easily adjust Nloop */ +struct ConfParameters: Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS( + ConfParameters, + int, Nloop); + + template + ConfParameters(Reader& Reader){ + read(Reader, "parameters", *this); + } +}; + +int main (int argc, char** argv) { + + // Params for the test. + int Ns = 8; + int Nt = 4; + int threads = GridThread::GetThreads(); + std::string conf_in = "nersc.l8t4b3360"; + Coordinate latt_size(Nd,0); latt_size[0]=Ns; latt_size[1]=Ns; latt_size[2]=Ns; latt_size[3]=Nt; + + Grid_init(&argc,&argv); + + Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + GridCartesian GRID(latt_size,simd_layout,mpi_layout); + + Grid_log(" mpi = ",mpi_layout); + Grid_log(" simd = ",simd_layout); + Grid_log(" latt = ",latt_size); + Grid_log("threads = ",threads); + + XmlReader Reader("mult_vs_lookup.xml",false, "grid"); + ConfParameters param(Reader); + Grid_log(" Nloop = ",param.Nloop); + + // Gauge field and accessor + LatticeGaugeField Umu(&GRID); + autoView(U_v, Umu, CpuRead); + + // Read the configuration into Umu + FieldMetaData header; + NerscIO::readConfiguration(Umu, header, conf_in); + + // Read in lattice sequentially, Nloop times + double lookupTime = 0.; + for(int i=0;i + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* + @file Test_fatLinks.cc + @brief test of the HISQ smearing +*/ + #include #include @@ -14,7 +38,7 @@ using namespace Grid; -// Make the logger work like Python print() +/*! @brief make the logger work like python print */ template inline std::string sjoin(Args&&... args) noexcept { std::ostringstream msg; @@ -27,32 +51,26 @@ inline void Grid_log(Args&&... 
args) { std::cout << GridLogMessage << msg << std::endl; } -struct fatParams: Serializable { - GRID_SERIALIZABLE_CLASS_MEMBERS( - fatParams, - std::string, conf_in, - std::string, conf_out); - - template - fatParams(Reader& Reader){ - read(Reader, "parameters", *this); - } -}; - // // one method: input --> fat // another : input --> long (naik) // another : input --> unitarize // -int main (int argc, char **argv) -{ +int main (int argc, char** argv) { + + // Params for the test. + int Ns = 8; + int Nt = 4; + Coordinate latt_size(Nd,0); latt_size[0]=Ns; latt_size[1]=Ns; latt_size[2]=Ns; latt_size[3]=Nt; + std::string conf_in = "nersc.l8t4b3360"; + std::string conf_out = "nersc.l8t4b3360.3link"; + // Initialize the Grid Grid_init(&argc,&argv); - Coordinate latt_size = GridDefaultLatt(); Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); - Grid_log("mpi = ",mpi_layout); + Grid_log(" mpi = ",mpi_layout); Grid_log("simd = ",simd_layout); Grid_log("latt = ",latt_size); GridCartesian GRID(latt_size,simd_layout,mpi_layout); @@ -61,19 +79,15 @@ int main (int argc, char **argv) LatticeGaugeField Umu(&GRID); LatticeGaugeField U_smr(&GRID); - // Read in the parameter file - XmlReader Reader("fatParams.xml",false, "grid"); - fatParams param(Reader); - FieldMetaData header; - // Read the configuration into Umu - NerscIO::readConfiguration(Umu, header, param.conf_in); + FieldMetaData header; + NerscIO::readConfiguration(Umu, header, conf_in); // Smear Umu and store result in U_smr Smear_HISQ_fat hisq_fat(&GRID,1/8.,0.,1/16.,1/64.,1/384.,0.); hisq_fat.smear(U_smr,Umu); - NerscIO::writeConfiguration(U_smr,param.conf_out,"HISQ"); + NerscIO::writeConfiguration(U_smr,conf_out,"HISQ"); // Test a C-style instantiation double path_coeff[6] = {1, 2, 3, 4, 5, 6}; From 9d263d9a7d7268fd8e8e909e76cd19ead9a6fa0c Mon Sep 17 00:00:00 2001 From: david clarke Date: Wed, 28 Jun 2023 10:05:34 -0600 Subject: [PATCH 017/114] fix bug in HISQSmearing; move benchmark b/c i don't understand how makefiles work --- Grid/qcd/smearing/HISQSmearing.h | 42 ++++++++----------- .../smearing}/Benchmark_su3mult_vs_lookup.cc | 0 tests/smearing/Test_fatLinks.cc | 2 +- 3 files changed, 18 insertions(+), 26 deletions(-) rename {benchmarks => tests/smearing}/Benchmark_su3mult_vs_lookup.cc (100%) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 0c11bde4..cba761f4 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -62,6 +62,7 @@ inline int Back(const int dir) { /*! @brief shift one unit in direction dir */ +template void generalShift(Coordinate& shift, int dir) { if (dir >= BACKWARD_CONST) { dir -= BACKWARD_CONST; @@ -101,7 +102,7 @@ void appendShift(std::vector& shifts, int dir, Args... args) { /*! @brief structure holding the link treatment */ -struct SmearingParameters { +struct SmearingParameters{ SmearingParameters(){} Real c_1; // 1 link Real c_naik; // Naik term @@ -149,10 +150,8 @@ public: SmearingParameters lt = this->_linkTreatment; - // We create a cell with extra padding 2. This allows us to capture the LePage - // term without needing to save intermediate gauge fields or extra halo exchanges. - // The tradeoff is that we compute extra constructs in the padding. 
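// Aside: a small sketch, not part of the patch, of how the padding depth relates
// to the stencil being removed here: the ghost region must reach the farthest
// site any shift touches. It assumes only Grid's Coordinate (indexable ints),
// Nd, and a shift table built with appendShift() as above.
inline int requiredDepth(const std::vector<Coordinate>& shifts) {
    int depth = 0;
    for (const auto& s : shifts)
        for (int d = 0; d < Nd; d++)
            depth = std::max(depth, std::abs(s[d]));  // largest single-axis excursion
    return depth;  // 1 suffices for 3-link staples; the two-step LePage paths need 2
}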
- int depth = 2; + // Create a padded cell of extra padding depth=1 + int depth = 1; PaddedCell Ghost(depth,this->_grid); LGF Ughost = Ghost.Exchange(u_thin); @@ -163,7 +162,9 @@ public: // This is where the 3-link constructs will be stored LGF Ughost_fat(Ughost.Grid()); - // Create a stencil, which is a collection of sites neighboring some initial site. + // Create 3-link stencil. Writing your own stencil, you're hard-coding the + // periodic BCs, so you don't need the policy-based stuff, at least for now. + // Loop over all orientations, i.e. demand mu != nu. std::vector shifts; for(int mu=0;mu_permute); + auto U1 = U_v[x_p_nu ](mu); gpermute(U1,SE1->_permute); + auto U2 = U_v[x ](nu); gpermute(U2,SE2->_permute); + auto U3 = U_v[x_p_mu_m_nu](nu); gpermute(U3,SE3->_permute); + auto U4 = U_v[x_m_nu ](mu); gpermute(U4,SE4->_permute); + auto U5 = U_v[x_m_nu ](nu); gpermute(U5,SE4->_permute); - gpermute(U0,SE0->_permute); - gpermute(U1,SE1->_permute); - gpermute(U2,SE2->_permute); - - auto U3 = U_v[x_p_mu_m_nu](nu) ; - auto U4 = U_v[x_m_nu ](mu) ; - auto U5 = adj(U_v[x_m_nu ](nu)); - - gpermute(U3,SE3->_permute); - gpermute(U4,SE4->_permute); - - // "left" "right" - auto W = U2*U1*U0 + U5*U4*U3; + // "left" "right" + auto W = U2*U1*adj(U0) + adj(U5)*U4*U3; U_fat_v[ss](mu) = U_fat_v[ss](mu) + W; s=s+5; @@ -235,8 +229,6 @@ public: // }; }; - - /*! @brief create long links from link variables. */ template class Smear_HISQ_Naik { @@ -261,4 +253,4 @@ public: }; -NAMESPACE_END(Grid); +NAMESPACE_END(Grid); \ No newline at end of file diff --git a/benchmarks/Benchmark_su3mult_vs_lookup.cc b/tests/smearing/Benchmark_su3mult_vs_lookup.cc similarity index 100% rename from benchmarks/Benchmark_su3mult_vs_lookup.cc rename to tests/smearing/Benchmark_su3mult_vs_lookup.cc diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index 5bfdd891..5a19e835 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -98,7 +98,7 @@ int main (int argc, char** argv) { LatticeGaugeField diff(&GRID); diff = Umu-U_smr; auto absDiff = norm2(diff)/norm2(Umu); - Grid_log(" |Umu-U|/|Umu| =",absDiff); + Grid_log(" |Umu-U|/|Umu| = ",absDiff); Grid_finalize(); } \ No newline at end of file From 99d879ea7ff4dda0309311529b58e185cba3ac7c Mon Sep 17 00:00:00 2001 From: david clarke Date: Fri, 11 Aug 2023 22:56:30 -0600 Subject: [PATCH 018/114] 5-link first attempt --- Grid/qcd/smearing/HISQSmearing.h | 125 ++++++++++++------ tests/smearing/Benchmark_su3mult_vs_lookup.cc | 125 ------------------ tests/smearing/Test_fatLinks.cc | 65 ++++++++- 3 files changed, 145 insertions(+), 170 deletions(-) delete mode 100644 tests/smearing/Benchmark_su3mult_vs_lookup.cc diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index cba761f4..e6bab0f1 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -61,6 +61,14 @@ inline int Back(const int dir) { } +/*! @brief figure out the stencil index from mu and nu */ +inline int stencilIndex(int mu, int nu) { + // Nshifts depends on how you built the stencil + int Nshifts = 5; + return Nshifts*nu + Nd*Nshifts*mu; +} + + /*! 
@brief shift one unit in direction dir */ template void generalShift(Coordinate& shift, int dir) { @@ -135,6 +143,7 @@ public: : _grid(grid), _linkTreatment(c1,cnaik,c3,c5,c7,clp) { assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); + assert(Nd == 4 && "HISQ smearing only defined for Nd==4"); } // Allow to pass a pointer to a C-style, double array for MILC convenience @@ -142,6 +151,7 @@ public: : _grid(grid), _linkTreatment(coeff[0],coeff[1],coeff[2],coeff[3],coeff[4],coeff[5]) { assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); + assert(Nd == 4 && "HISQ smearing only defined for Nd==4"); } ~Smear_HISQ_fat() {} @@ -150,25 +160,20 @@ public: SmearingParameters lt = this->_linkTreatment; - // Create a padded cell of extra padding depth=1 + // Create a padded cell of extra padding depth=1 and fill the padding. int depth = 1; PaddedCell Ghost(depth,this->_grid); LGF Ughost = Ghost.Exchange(u_thin); - // Array for (x) - GridBase *GhostGrid = Ughost.Grid(); - LatticeComplex gplaq(GhostGrid); - - // This is where the 3-link constructs will be stored + // This is where auxiliary N-link fields and the final smear will be stored. LGF Ughost_fat(Ughost.Grid()); + LGF Ughost_3link(Ughost.Grid()); - // Create 3-link stencil. Writing your own stencil, you're hard-coding the - // periodic BCs, so you don't need the policy-based stuff, at least for now. - // Loop over all orientations, i.e. demand mu != nu. + // Create 3-link stencil. We allow mu==nu just to make the indexing easier. + // Shifts with mu==nu will not be used. std::vector shifts; for(int mu=0;mu_offset; + auto SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; + auto SE2 = gStencil.GetEntry(s+2,site); int x = SE2->_offset; + auto SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; + auto SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; - auto SE0 = gStencil.GetEntry(s+0,ss); int x_p_mu = SE0->_offset; - auto SE1 = gStencil.GetEntry(s+1,ss); int x_p_nu = SE1->_offset; - auto SE2 = gStencil.GetEntry(s+2,ss); int x = SE2->_offset; - auto SE3 = gStencil.GetEntry(s+3,ss); int x_p_mu_m_nu = SE3->_offset; - auto SE4 = gStencil.GetEntry(s+4,ss); int x_m_nu = SE4->_offset; + // When you're deciding whether to take an adjoint, the question is: how is the + // stored link oriented compared to the one you want? If I imagine myself travelling + // with the to-be-updated link, I have two possible, alternative 3-link paths I can + // take, one starting by going to the left, the other starting by going to the right. + auto U0 = U_v[x_p_mu ](nu); gpermute(U0,SE0->_permute); + auto U1 = U_v[x_p_nu ](mu); gpermute(U1,SE1->_permute); + auto U2 = U_v[x ](nu); gpermute(U2,SE2->_permute); + auto U3 = U_v[x_p_mu_m_nu](nu); gpermute(U3,SE3->_permute); + auto U4 = U_v[x_m_nu ](mu); gpermute(U4,SE4->_permute); + auto U5 = U_v[x_m_nu ](nu); gpermute(U5,SE4->_permute); - // When you're deciding whether to take an adjoint, the question is: how is the - // stored link oriented compared to the one you want? If I imagine myself travelling - // with the to-be-updated link, I have two possible, alternative 3-link paths I can - // take, one starting by going to the left, the other starting by going to the right. 
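// Aside: a sketch of the flat shift-table layout that stencilIndex() and the
// GetEntry offsets below rely on. It assumes the table holds five shifts per
// (mu,nu) pair, appended in this order (the construction loop itself was lost
// to formatting above):
//     appendShift(shifts, mu);            // entry 0: x + mu
//     appendShift(shifts, nu);            // entry 1: x + nu
//     appendShift(shifts, NO_SHIFT);      // entry 2: x
//     appendShift(shifts, mu, Back(nu));  // entry 3: x + mu - nu
//     appendShift(shifts, Back(nu));      // entry 4: x - nu
inline int shiftEntry(int mu, int nu, int k) {
    const int Nshifts = 5;                  // must match the table construction
    return Nd*Nshifts*mu + Nshifts*nu + k;  // i.e. stencilIndex(mu,nu) + k, k = 0..4
}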
- auto U0 = U_v[x_p_mu ](nu); gpermute(U0,SE0->_permute); - auto U1 = U_v[x_p_nu ](mu); gpermute(U1,SE1->_permute); - auto U2 = U_v[x ](nu); gpermute(U2,SE2->_permute); - auto U3 = U_v[x_p_mu_m_nu](nu); gpermute(U3,SE3->_permute); - auto U4 = U_v[x_m_nu ](mu); gpermute(U4,SE4->_permute); - auto U5 = U_v[x_m_nu ](nu); gpermute(U5,SE4->_permute); + // "left" "right" + auto W = U2*U1*adj(U0) + adj(U5)*U4*U3; - // "left" "right" - auto W = U2*U1*adj(U0) + adj(U5)*U4*U3; - U_fat_v[ss](mu) = U_fat_v[ss](mu) + W; + U_3link_v[site](nu) = W; - s=s+5; + U_fat_v[site](mu) = U_fat_v[site](mu) + lt.c_3*W; + } } + + // 5-link + for(int site=0;site_offset; + auto SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; + auto SE2 = gStencil.GetEntry(s+2,site); int x = SE2->_offset; + auto SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; + auto SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; + + auto U0 = U_v[x_p_mu ](nu) ; gpermute(U0,SE0->_permute); + auto U1 = U_3link_v[x_p_nu ](rho); gpermute(U1,SE1->_permute); + auto U2 = U_v[x ](nu) ; gpermute(U2,SE2->_permute); + auto U3 = U_v[x_p_mu_m_nu](nu) ; gpermute(U3,SE3->_permute); + auto U4 = U_3link_v[x_m_nu ](rho); gpermute(U4,SE4->_permute); + auto U5 = U_v[x_m_nu ](nu) ; gpermute(U5,SE4->_permute); + + auto W = U2*U1*adj(U0) + adj(U5)*U4*U3; + + U_fat_v[site](mu) = U_fat_v[site](mu) + lt.c_5*W; + } + } + } + } - u_smr = lt.c_3*Ghost.Extract(Ughost_fat) + lt.c_1*u_thin; + u_smr = Ghost.Extract(Ughost_fat) + lt.c_1*u_thin; }; @@ -229,6 +268,7 @@ public: // }; }; + /*! @brief create long links from link variables. */ template class Smear_HISQ_Naik { @@ -241,6 +281,7 @@ public: // Eventually this will take, e.g., coefficients as argument Smear_HISQ_Naik(GridCartesian* grid) : _grid(grid) { assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); + assert(Nd == 4 && "HISQ smearing only defined for Nd==4"); } ~Smear_HISQ_Naik() {} diff --git a/tests/smearing/Benchmark_su3mult_vs_lookup.cc b/tests/smearing/Benchmark_su3mult_vs_lookup.cc deleted file mode 100644 index 5f9c98ba..00000000 --- a/tests/smearing/Benchmark_su3mult_vs_lookup.cc +++ /dev/null @@ -1,125 +0,0 @@ - /************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: ./benchmarks/Benchmark_su3mult_vs_lookup.cc - - Copyright (C) 2023 - - Author: D. A. Clarke - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ -/* - @file Benchmark_su3mult_vs_lookup.cc - @brief check to see whether su3 multiplication or lookup tables is faster -*/ - -#include -using namespace Grid; - -/*! 
@brief make the logger work like python print */ -template -inline std::string sjoin(Args&&... args) noexcept { - std::ostringstream msg; - (msg << ... << args); - return msg.str(); -} -template -inline void Grid_log(Args&&... args) { - std::string msg = sjoin(std::forward(args)...); - std::cout << GridLogMessage << msg << std::endl; -} - -/*! @brief parameter file to easily adjust Nloop */ -struct ConfParameters: Serializable { - GRID_SERIALIZABLE_CLASS_MEMBERS( - ConfParameters, - int, Nloop); - - template - ConfParameters(Reader& Reader){ - read(Reader, "parameters", *this); - } -}; - -int main (int argc, char** argv) { - - // Params for the test. - int Ns = 8; - int Nt = 4; - int threads = GridThread::GetThreads(); - std::string conf_in = "nersc.l8t4b3360"; - Coordinate latt_size(Nd,0); latt_size[0]=Ns; latt_size[1]=Ns; latt_size[2]=Ns; latt_size[3]=Nt; - - Grid_init(&argc,&argv); - - Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); - Coordinate mpi_layout = GridDefaultMpi(); - GridCartesian GRID(latt_size,simd_layout,mpi_layout); - - Grid_log(" mpi = ",mpi_layout); - Grid_log(" simd = ",simd_layout); - Grid_log(" latt = ",latt_size); - Grid_log("threads = ",threads); - - XmlReader Reader("mult_vs_lookup.xml",false, "grid"); - ConfParameters param(Reader); - Grid_log(" Nloop = ",param.Nloop); - - // Gauge field and accessor - LatticeGaugeField Umu(&GRID); - autoView(U_v, Umu, CpuRead); - - // Read the configuration into Umu - FieldMetaData header; - NerscIO::readConfiguration(Umu, header, conf_in); - - // Read in lattice sequentially, Nloop times - double lookupTime = 0.; - for(int i=0;i + ConfParameters(Reader& Reader){ + read(Reader, "parameters", *this); + } +}; + // // one method: input --> fat // another : input --> long (naik) @@ -65,16 +79,22 @@ int main (int argc, char** argv) { Coordinate latt_size(Nd,0); latt_size[0]=Ns; latt_size[1]=Ns; latt_size[2]=Ns; latt_size[3]=Nt; std::string conf_in = "nersc.l8t4b3360"; std::string conf_out = "nersc.l8t4b3360.3link"; + int threads = GridThread::GetThreads(); // Initialize the Grid Grid_init(&argc,&argv); Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); - Grid_log(" mpi = ",mpi_layout); - Grid_log("simd = ",simd_layout); - Grid_log("latt = ",latt_size); + Grid_log("mpi = ",mpi_layout); + Grid_log("simd = ",simd_layout); + Grid_log("latt = ",latt_size); + Grid_log("threads = ",threads); GridCartesian GRID(latt_size,simd_layout,mpi_layout); + XmlReader Reader("fatParams.xml",false,"grid"); + ConfParameters param(Reader); + if(param.benchmark) Grid_log(" Nloop = ",param.Nloop); + // Instantiate the LatticeGaugeField objects holding thin (Umu) and fat (U_smr) links LatticeGaugeField Umu(&GRID); LatticeGaugeField U_smr(&GRID); @@ -85,6 +105,7 @@ int main (int argc, char** argv) { // Smear Umu and store result in U_smr Smear_HISQ_fat hisq_fat(&GRID,1/8.,0.,1/16.,1/64.,1/384.,0.); +// Smear_HISQ_fat hisq_fat(&GRID,1/8.,0.,1/16.,0.,1/384.,0.); hisq_fat.smear(U_smr,Umu); NerscIO::writeConfiguration(U_smr,conf_out,"HISQ"); @@ -100,5 +121,43 @@ int main (int argc, char** argv) { auto absDiff = norm2(diff)/norm2(Umu); Grid_log(" |Umu-U|/|Umu| = ",absDiff); + + if (param.benchmark) { + + autoView(U_v, Umu, CpuRead); // Gauge accessor + + // Read in lattice sequentially, Nloop times + double lookupTime = 0.; + for(int i=0;i Date: Sat, 16 Sep 2023 23:18:16 -0600 Subject: [PATCH 019/114] try 7-link --- Grid/qcd/smearing/HISQSmearing.h | 70 +++++++++++++++++++++++++++++--- 
tests/smearing/Test_fatLinks.cc | 5 +-- 2 files changed, 67 insertions(+), 8 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index e6bab0f1..9a69b660 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -129,7 +129,7 @@ struct SmearingParameters{ /*! @brief create fat links from link variables */ -template // TODO: change to Gimpl? +template class Smear_HISQ_fat { private: @@ -168,6 +168,8 @@ public: // This is where auxiliary N-link fields and the final smear will be stored. LGF Ughost_fat(Ughost.Grid()); LGF Ughost_3link(Ughost.Grid()); + LGF Ughost_5linkA(Ughost.Grid()); + LGF Ughost_5linkB(Ughost.Grid()); // Create 3-link stencil. We allow mu==nu just to make the indexing easier. // Shifts with mu==nu will not be used. @@ -188,13 +190,18 @@ public: Ughost_fat=Zero(); // Create the accessors - autoView(U_v , Ughost , CpuRead); - autoView(U_fat_v , Ughost_fat , CpuWrite); - autoView(U_3link_v, Ughost_3link, CpuWrite); + autoView(U_v , Ughost , CpuRead); + autoView(U_fat_v , Ughost_fat , CpuWrite); + autoView(U_3link_v , Ughost_3link , CpuWrite); + autoView(U_5linkA_v, Ughost_5linkA, CpuWrite); + autoView(U_5linkB_v, Ughost_5linkB, CpuWrite); for(int mu=0;mu_offset; @@ -253,7 +263,57 @@ public: auto W = U2*U1*adj(U0) + adj(U5)*U4*U3; + if(sigmaIndex<3) { + U_5linkA_v[site](rho) = W; + } else { + U_5linkB_v[site](rho) = W; + } + U_fat_v[site](mu) = U_fat_v[site](mu) + lt.c_5*W; + + sigmaIndex++; + } + } + } + + // 7-link + for(int site=0;site_offset; + auto SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; + auto SE2 = gStencil.GetEntry(s+2,site); int x = SE2->_offset; + auto SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; + auto SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; + + auto U0 = U_v[x_p_mu ](nu) ; gpermute(U0,SE0->_permute); + auto U1 = U0; + if(sigmaIndex<3) { + U1 = U_5linkB_v[x_p_nu](rho); gpermute(U1,SE1->_permute); + } else { + U1 = U_5linkA_v[x_p_nu](rho); gpermute(U1,SE1->_permute); + } + auto U2 = U_v[x ](nu) ; gpermute(U2,SE2->_permute); + auto U3 = U_v[x_p_mu_m_nu](nu) ; gpermute(U3,SE3->_permute); + auto U4 = U0; + if(sigmaIndex<3) { + U4 = U_5linkB_v[x_m_nu](rho); gpermute(U4,SE4->_permute); + } else { + U4 = U_5linkA_v[x_m_nu](rho); gpermute(U4,SE4->_permute); + } + auto U5 = U_v[x_m_nu ](nu) ; gpermute(U5,SE4->_permute); + + auto W = U2*U1*adj(U0) + adj(U5)*U4*U3; + + U_fat_v[site](mu) = U_fat_v[site](mu) + lt.c_7*W; + + sigmaIndex++; } } } diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index 97e02400..f7c422c1 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -78,7 +78,7 @@ int main (int argc, char** argv) { int Nt = 4; Coordinate latt_size(Nd,0); latt_size[0]=Ns; latt_size[1]=Ns; latt_size[2]=Ns; latt_size[3]=Nt; std::string conf_in = "nersc.l8t4b3360"; - std::string conf_out = "nersc.l8t4b3360.3link"; + std::string conf_out = "nersc.l8t4b3360.35link"; int threads = GridThread::GetThreads(); // Initialize the Grid @@ -105,7 +105,6 @@ int main (int argc, char** argv) { // Smear Umu and store result in U_smr Smear_HISQ_fat hisq_fat(&GRID,1/8.,0.,1/16.,1/64.,1/384.,0.); -// Smear_HISQ_fat hisq_fat(&GRID,1/8.,0.,1/16.,0.,1/384.,0.); hisq_fat.smear(U_smr,Umu); NerscIO::writeConfiguration(U_smr,conf_out,"HISQ"); @@ -115,7 +114,7 @@ int main (int argc, char** argv) { Smear_HISQ_fat hisq_fat_Cstyle(&GRID,path_coeff); // Make sure result doesn't change w.r.t. 
a trusted lattice - NerscIO::readConfiguration(Umu, header, "nersc.l8t4b3360.3link.control"); + NerscIO::readConfiguration(Umu, header, "nersc.l8t4b3360.35link.control"); LatticeGaugeField diff(&GRID); diff = Umu-U_smr; auto absDiff = norm2(diff)/norm2(Umu); From d93eac7b1c74555b0bad224bdd0db1ac4afc8421 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 3 Oct 2023 15:53:14 +0000 Subject: [PATCH 020/114] Performance regressed and is OK in icpx 2023.2 --- Grid/qcd/utils/WilsonLoops.h | 7 +- Grid/stencil/GeneralLocalStencil.h | 2 +- configure.ac | 2 +- m4/ax_cxx_compile_stdcxx.m4 | 1018 +++++++++++++++++ m4/ax_cxx_compile_stdcxx_14.m4 | 34 + systems/Sunspot/benchmarks/bench.pbs | 3 +- .../Sunspot/benchmarks/gpu_tile_compact.sh | 4 +- systems/Sunspot/config-command | 2 +- 8 files changed, 1063 insertions(+), 9 deletions(-) create mode 100644 m4/ax_cxx_compile_stdcxx.m4 create mode 100644 m4/ax_cxx_compile_stdcxx_14.m4 diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h index 78e25a8d..d6c0d621 100644 --- a/Grid/qcd/utils/WilsonLoops.h +++ b/Grid/qcd/utils/WilsonLoops.h @@ -464,7 +464,8 @@ public: //U_padded: the gauge link fields padded out using the PaddedCell class //Cell: the padded cell class //gStencil: the precomputed generalized local stencil for the staple - static void StaplePaddedAll(std::vector &staple, const std::vector &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) { + static void StaplePaddedAll(std::vector &staple, const std::vector &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) + { double t0 = usecond(); assert(U_padded.size() == Nd); assert(staple.size() == Nd); assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back()); @@ -489,7 +490,7 @@ public: autoView( gStaple_v , gStaple, AcceleratorWrite); auto gStencil_v = gStencil.View(); - accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { + accelerator_for(ss, ggrid->oSites(), (size_t)ggrid->Nsimd(), { decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; stencil_ss = Zero(); int off = outer_off; @@ -1201,7 +1202,7 @@ public: autoView( gStaple_v , gStaple, AcceleratorWrite); auto gStencil_v = gStencil.View(); - accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { + accelerator_for(ss, ggrid->oSites(), (size_t)ggrid->Nsimd(), { decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; stencil_ss = Zero(); int s=offset; diff --git a/Grid/stencil/GeneralLocalStencil.h b/Grid/stencil/GeneralLocalStencil.h index 333f7028..90fff953 100644 --- a/Grid/stencil/GeneralLocalStencil.h +++ b/Grid/stencil/GeneralLocalStencil.h @@ -43,7 +43,7 @@ class GeneralLocalStencilView { int _npoints; // Move to template param? 
GeneralStencilEntry* _entries_p; - accelerator_inline GeneralStencilEntry * GetEntry(int point,int osite) { + accelerator_inline GeneralStencilEntry * GetEntry(int point,int osite) const { return & this->_entries_p[point+this->_npoints*osite]; } diff --git a/configure.ac b/configure.ac index 15b794fc..c16d90f6 100644 --- a/configure.ac +++ b/configure.ac @@ -41,7 +41,7 @@ AC_PROG_RANLIB ############### Get compiler informations AC_LANG([C++]) -AX_CXX_COMPILE_STDCXX(14,noext,mandatory) +AX_CXX_COMPILE_STDCXX(17,noext,mandatory) AX_COMPILER_VENDOR AC_DEFINE_UNQUOTED([CXX_COMP_VENDOR],["$ax_cv_cxx_compiler_vendor"], [vendor of C++ compiler that will compile the code]) diff --git a/m4/ax_cxx_compile_stdcxx.m4 b/m4/ax_cxx_compile_stdcxx.m4 new file mode 100644 index 00000000..8edf5152 --- /dev/null +++ b/m4/ax_cxx_compile_stdcxx.m4 @@ -0,0 +1,1018 @@ +# =========================================================================== +# https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_CXX_COMPILE_STDCXX(VERSION, [ext|noext], [mandatory|optional]) +# +# DESCRIPTION +# +# Check for baseline language coverage in the compiler for the specified +# version of the C++ standard. If necessary, add switches to CXX and +# CXXCPP to enable support. VERSION may be '11', '14', '17', or '20' for +# the respective C++ standard version. +# +# The second argument, if specified, indicates whether you insist on an +# extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g. +# -std=c++11). If neither is specified, you get whatever works, with +# preference for no added switch, and then for an extended mode. +# +# The third argument, if specified 'mandatory' or if left unspecified, +# indicates that baseline support for the specified C++ standard is +# required and that the macro should error out if no mode with that +# support is found. If specified 'optional', then configuration proceeds +# regardless, after defining HAVE_CXX${VERSION} if and only if a +# supporting mode is found. +# +# LICENSE +# +# Copyright (c) 2008 Benjamin Kosnik +# Copyright (c) 2012 Zack Weinberg +# Copyright (c) 2013 Roy Stogner +# Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov +# Copyright (c) 2015 Paul Norman +# Copyright (c) 2015 Moritz Klammler +# Copyright (c) 2016, 2018 Krzesimir Nowak +# Copyright (c) 2019 Enji Cooper +# Copyright (c) 2020 Jason Merrill +# Copyright (c) 2021 Jörn Heusipp +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. + +#serial 18 + +dnl This macro is based on the code from the AX_CXX_COMPILE_STDCXX_11 macro +dnl (serial version number 13). 
+ +AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl + m4_if([$1], [11], [ax_cxx_compile_alternatives="11 0x"], + [$1], [14], [ax_cxx_compile_alternatives="14 1y"], + [$1], [17], [ax_cxx_compile_alternatives="17 1z"], + [$1], [20], [ax_cxx_compile_alternatives="20"], + [m4_fatal([invalid first argument `$1' to AX_CXX_COMPILE_STDCXX])])dnl + m4_if([$2], [], [], + [$2], [ext], [], + [$2], [noext], [], + [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX])])dnl + m4_if([$3], [], [ax_cxx_compile_cxx$1_required=true], + [$3], [mandatory], [ax_cxx_compile_cxx$1_required=true], + [$3], [optional], [ax_cxx_compile_cxx$1_required=false], + [m4_fatal([invalid third argument `$3' to AX_CXX_COMPILE_STDCXX])]) + AC_LANG_PUSH([C++])dnl + ac_success=no + + m4_if([$2], [], [dnl + AC_CACHE_CHECK(whether $CXX supports C++$1 features by default, + ax_cv_cxx_compile_cxx$1, + [AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])], + [ax_cv_cxx_compile_cxx$1=yes], + [ax_cv_cxx_compile_cxx$1=no])]) + if test x$ax_cv_cxx_compile_cxx$1 = xyes; then + ac_success=yes + fi]) + + m4_if([$2], [noext], [], [dnl + if test x$ac_success = xno; then + for alternative in ${ax_cxx_compile_alternatives}; do + switch="-std=gnu++${alternative}" + cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch]) + AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch, + $cachevar, + [ac_save_CXX="$CXX" + CXX="$CXX $switch" + AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])], + [eval $cachevar=yes], + [eval $cachevar=no]) + CXX="$ac_save_CXX"]) + if eval test x\$$cachevar = xyes; then + CXX="$CXX $switch" + if test -n "$CXXCPP" ; then + CXXCPP="$CXXCPP $switch" + fi + ac_success=yes + break + fi + done + fi]) + + m4_if([$2], [ext], [], [dnl + if test x$ac_success = xno; then + dnl HP's aCC needs +std=c++11 according to: + dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf + dnl Cray's crayCC needs "-h std=c++11" + dnl MSVC needs -std:c++NN for C++17 and later (default is C++14) + for alternative in ${ax_cxx_compile_alternatives}; do + for switch in -std=c++${alternative} +std=c++${alternative} "-h std=c++${alternative}" MSVC; do + if test x"$switch" = xMSVC; then + dnl AS_TR_SH maps both `:` and `=` to `_` so -std:c++17 would collide + dnl with -std=c++17. We suffix the cache variable name with _MSVC to + dnl avoid this. 
+ switch=-std:c++${alternative} + cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_${switch}_MSVC]) + else + cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch]) + fi + AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch, + $cachevar, + [ac_save_CXX="$CXX" + CXX="$CXX $switch" + AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])], + [eval $cachevar=yes], + [eval $cachevar=no]) + CXX="$ac_save_CXX"]) + if eval test x\$$cachevar = xyes; then + CXX="$CXX $switch" + if test -n "$CXXCPP" ; then + CXXCPP="$CXXCPP $switch" + fi + ac_success=yes + break + fi + done + if test x$ac_success = xyes; then + break + fi + done + fi]) + AC_LANG_POP([C++]) + if test x$ax_cxx_compile_cxx$1_required = xtrue; then + if test x$ac_success = xno; then + AC_MSG_ERROR([*** A compiler with support for C++$1 language features is required.]) + fi + fi + if test x$ac_success = xno; then + HAVE_CXX$1=0 + AC_MSG_NOTICE([No compiler with C++$1 support was found]) + else + HAVE_CXX$1=1 + AC_DEFINE(HAVE_CXX$1,1, + [define if the compiler supports basic C++$1 syntax]) + fi + AC_SUBST(HAVE_CXX$1) +]) + + +dnl Test body for checking C++11 support + +m4_define([_AX_CXX_COMPILE_STDCXX_testbody_11], + _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 +) + +dnl Test body for checking C++14 support + +m4_define([_AX_CXX_COMPILE_STDCXX_testbody_14], + _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 +) + +dnl Test body for checking C++17 support + +m4_define([_AX_CXX_COMPILE_STDCXX_testbody_17], + _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_17 +) + +dnl Test body for checking C++20 support + +m4_define([_AX_CXX_COMPILE_STDCXX_testbody_20], + _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_17 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_20 +) + + +dnl Tests for new features in C++11 + +m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[ + +// If the compiler admits that it is not ready for C++11, why torture it? +// Hopefully, this will speed up the test. 
+ +#ifndef __cplusplus + +#error "This is not a C++ compiler" + +// MSVC always sets __cplusplus to 199711L in older versions; newer versions +// only set it correctly if /Zc:__cplusplus is specified as well as a +// /std:c++NN switch: +// https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/ +#elif __cplusplus < 201103L && !defined _MSC_VER + +#error "This is not a C++11 compiler" + +#else + +namespace cxx11 +{ + + namespace test_static_assert + { + + template + struct check + { + static_assert(sizeof(int) <= sizeof(T), "not big enough"); + }; + + } + + namespace test_final_override + { + + struct Base + { + virtual ~Base() {} + virtual void f() {} + }; + + struct Derived : public Base + { + virtual ~Derived() override {} + virtual void f() override {} + }; + + } + + namespace test_double_right_angle_brackets + { + + template < typename T > + struct check {}; + + typedef check single_type; + typedef check> double_type; + typedef check>> triple_type; + typedef check>>> quadruple_type; + + } + + namespace test_decltype + { + + int + f() + { + int a = 1; + decltype(a) b = 2; + return a + b; + } + + } + + namespace test_type_deduction + { + + template < typename T1, typename T2 > + struct is_same + { + static const bool value = false; + }; + + template < typename T > + struct is_same + { + static const bool value = true; + }; + + template < typename T1, typename T2 > + auto + add(T1 a1, T2 a2) -> decltype(a1 + a2) + { + return a1 + a2; + } + + int + test(const int c, volatile int v) + { + static_assert(is_same::value == true, ""); + static_assert(is_same::value == false, ""); + static_assert(is_same::value == false, ""); + auto ac = c; + auto av = v; + auto sumi = ac + av + 'x'; + auto sumf = ac + av + 1.0; + static_assert(is_same::value == true, ""); + static_assert(is_same::value == true, ""); + static_assert(is_same::value == true, ""); + static_assert(is_same::value == false, ""); + static_assert(is_same::value == true, ""); + return (sumf > 0.0) ? sumi : add(c, v); + } + + } + + namespace test_noexcept + { + + int f() { return 0; } + int g() noexcept { return 0; } + + static_assert(noexcept(f()) == false, ""); + static_assert(noexcept(g()) == true, ""); + + } + + namespace test_constexpr + { + + template < typename CharT > + unsigned long constexpr + strlen_c_r(const CharT *const s, const unsigned long acc) noexcept + { + return *s ? 
strlen_c_r(s + 1, acc + 1) : acc; + } + + template < typename CharT > + unsigned long constexpr + strlen_c(const CharT *const s) noexcept + { + return strlen_c_r(s, 0UL); + } + + static_assert(strlen_c("") == 0UL, ""); + static_assert(strlen_c("1") == 1UL, ""); + static_assert(strlen_c("example") == 7UL, ""); + static_assert(strlen_c("another\0example") == 7UL, ""); + + } + + namespace test_rvalue_references + { + + template < int N > + struct answer + { + static constexpr int value = N; + }; + + answer<1> f(int&) { return answer<1>(); } + answer<2> f(const int&) { return answer<2>(); } + answer<3> f(int&&) { return answer<3>(); } + + void + test() + { + int i = 0; + const int c = 0; + static_assert(decltype(f(i))::value == 1, ""); + static_assert(decltype(f(c))::value == 2, ""); + static_assert(decltype(f(0))::value == 3, ""); + } + + } + + namespace test_uniform_initialization + { + + struct test + { + static const int zero {}; + static const int one {1}; + }; + + static_assert(test::zero == 0, ""); + static_assert(test::one == 1, ""); + + } + + namespace test_lambdas + { + + void + test1() + { + auto lambda1 = [](){}; + auto lambda2 = lambda1; + lambda1(); + lambda2(); + } + + int + test2() + { + auto a = [](int i, int j){ return i + j; }(1, 2); + auto b = []() -> int { return '0'; }(); + auto c = [=](){ return a + b; }(); + auto d = [&](){ return c; }(); + auto e = [a, &b](int x) mutable { + const auto identity = [](int y){ return y; }; + for (auto i = 0; i < a; ++i) + a += b--; + return x + identity(a + b); + }(0); + return a + b + c + d + e; + } + + int + test3() + { + const auto nullary = [](){ return 0; }; + const auto unary = [](int x){ return x; }; + using nullary_t = decltype(nullary); + using unary_t = decltype(unary); + const auto higher1st = [](nullary_t f){ return f(); }; + const auto higher2nd = [unary](nullary_t f1){ + return [unary, f1](unary_t f2){ return f2(unary(f1())); }; + }; + return higher1st(nullary) + higher2nd(nullary)(unary); + } + + } + + namespace test_variadic_templates + { + + template + struct sum; + + template + struct sum + { + static constexpr auto value = N0 + sum::value; + }; + + template <> + struct sum<> + { + static constexpr auto value = 0; + }; + + static_assert(sum<>::value == 0, ""); + static_assert(sum<1>::value == 1, ""); + static_assert(sum<23>::value == 23, ""); + static_assert(sum<1, 2>::value == 3, ""); + static_assert(sum<5, 5, 11>::value == 21, ""); + static_assert(sum<2, 3, 5, 7, 11, 13>::value == 41, ""); + + } + + // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae + // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function + // because of this. + namespace test_template_alias_sfinae + { + + struct foo {}; + + template + using member = typename T::member_type; + + template + void func(...) {} + + template + void func(member*) {} + + void test(); + + void test() { func(0); } + + } + +} // namespace cxx11 + +#endif // __cplusplus >= 201103L + +]]) + + +dnl Tests for new features in C++14 + +m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_14], [[ + +// If the compiler admits that it is not ready for C++14, why torture it? +// Hopefully, this will speed up the test. + +#ifndef __cplusplus + +#error "This is not a C++ compiler" + +#elif __cplusplus < 201402L && !defined _MSC_VER + +#error "This is not a C++14 compiler" + +#else + +namespace cxx14 +{ + + namespace test_polymorphic_lambdas + { + + int + test() + { + const auto lambda = [](auto&&... 
args){ + const auto istiny = [](auto x){ + return (sizeof(x) == 1UL) ? 1 : 0; + }; + const int aretiny[] = { istiny(args)... }; + return aretiny[0]; + }; + return lambda(1, 1L, 1.0f, '1'); + } + + } + + namespace test_binary_literals + { + + constexpr auto ivii = 0b0000000000101010; + static_assert(ivii == 42, "wrong value"); + + } + + namespace test_generalized_constexpr + { + + template < typename CharT > + constexpr unsigned long + strlen_c(const CharT *const s) noexcept + { + auto length = 0UL; + for (auto p = s; *p; ++p) + ++length; + return length; + } + + static_assert(strlen_c("") == 0UL, ""); + static_assert(strlen_c("x") == 1UL, ""); + static_assert(strlen_c("test") == 4UL, ""); + static_assert(strlen_c("another\0test") == 7UL, ""); + + } + + namespace test_lambda_init_capture + { + + int + test() + { + auto x = 0; + const auto lambda1 = [a = x](int b){ return a + b; }; + const auto lambda2 = [a = lambda1(x)](){ return a; }; + return lambda2(); + } + + } + + namespace test_digit_separators + { + + constexpr auto ten_million = 100'000'000; + static_assert(ten_million == 100000000, ""); + + } + + namespace test_return_type_deduction + { + + auto f(int& x) { return x; } + decltype(auto) g(int& x) { return x; } + + template < typename T1, typename T2 > + struct is_same + { + static constexpr auto value = false; + }; + + template < typename T > + struct is_same + { + static constexpr auto value = true; + }; + + int + test() + { + auto x = 0; + static_assert(is_same::value, ""); + static_assert(is_same::value, ""); + return x; + } + + } + +} // namespace cxx14 + +#endif // __cplusplus >= 201402L + +]]) + + +dnl Tests for new features in C++17 + +m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_17], [[ + +// If the compiler admits that it is not ready for C++17, why torture it? +// Hopefully, this will speed up the test. + +#ifndef __cplusplus + +#error "This is not a C++ compiler" + +#elif __cplusplus < 201703L && !defined _MSC_VER + +#error "This is not a C++17 compiler" + +#else + +#include +#include +#include + +namespace cxx17 +{ + + namespace test_constexpr_lambdas + { + + constexpr int foo = [](){return 42;}(); + + } + + namespace test::nested_namespace::definitions + { + + } + + namespace test_fold_expression + { + + template + int multiply(Args... args) + { + return (args * ... * 1); + } + + template + bool all(Args... 
args) + { + return (args && ...); + } + + } + + namespace test_extended_static_assert + { + + static_assert (true); + + } + + namespace test_auto_brace_init_list + { + + auto foo = {5}; + auto bar {5}; + + static_assert(std::is_same, decltype(foo)>::value); + static_assert(std::is_same::value); + } + + namespace test_typename_in_template_template_parameter + { + + template typename X> struct D; + + } + + namespace test_fallthrough_nodiscard_maybe_unused_attributes + { + + int f1() + { + return 42; + } + + [[nodiscard]] int f2() + { + [[maybe_unused]] auto unused = f1(); + + switch (f1()) + { + case 17: + f1(); + [[fallthrough]]; + case 42: + f1(); + } + return f1(); + } + + } + + namespace test_extended_aggregate_initialization + { + + struct base1 + { + int b1, b2 = 42; + }; + + struct base2 + { + base2() { + b3 = 42; + } + int b3; + }; + + struct derived : base1, base2 + { + int d; + }; + + derived d1 {{1, 2}, {}, 4}; // full initialization + derived d2 {{}, {}, 4}; // value-initialized bases + + } + + namespace test_general_range_based_for_loop + { + + struct iter + { + int i; + + int& operator* () + { + return i; + } + + const int& operator* () const + { + return i; + } + + iter& operator++() + { + ++i; + return *this; + } + }; + + struct sentinel + { + int i; + }; + + bool operator== (const iter& i, const sentinel& s) + { + return i.i == s.i; + } + + bool operator!= (const iter& i, const sentinel& s) + { + return !(i == s); + } + + struct range + { + iter begin() const + { + return {0}; + } + + sentinel end() const + { + return {5}; + } + }; + + void f() + { + range r {}; + + for (auto i : r) + { + [[maybe_unused]] auto v = i; + } + } + + } + + namespace test_lambda_capture_asterisk_this_by_value + { + + struct t + { + int i; + int foo() + { + return [*this]() + { + return i; + }(); + } + }; + + } + + namespace test_enum_class_construction + { + + enum class byte : unsigned char + {}; + + byte foo {42}; + + } + + namespace test_constexpr_if + { + + template + int f () + { + if constexpr(cond) + { + return 13; + } + else + { + return 42; + } + } + + } + + namespace test_selection_statement_with_initializer + { + + int f() + { + return 13; + } + + int f2() + { + if (auto i = f(); i > 0) + { + return 3; + } + + switch (auto i = f(); i + 4) + { + case 17: + return 2; + + default: + return 1; + } + } + + } + + namespace test_template_argument_deduction_for_class_templates + { + + template + struct pair + { + pair (T1 p1, T2 p2) + : m1 {p1}, + m2 {p2} + {} + + T1 m1; + T2 m2; + }; + + void f() + { + [[maybe_unused]] auto p = pair{13, 42u}; + } + + } + + namespace test_non_type_auto_template_parameters + { + + template + struct B + {}; + + B<5> b1; + B<'a'> b2; + + } + + namespace test_structured_bindings + { + + int arr[2] = { 1, 2 }; + std::pair pr = { 1, 2 }; + + auto f1() -> int(&)[2] + { + return arr; + } + + auto f2() -> std::pair& + { + return pr; + } + + struct S + { + int x1 : 2; + volatile double y1; + }; + + S f3() + { + return {}; + } + + auto [ x1, y1 ] = f1(); + auto& [ xr1, yr1 ] = f1(); + auto [ x2, y2 ] = f2(); + auto& [ xr2, yr2 ] = f2(); + const auto [ x3, y3 ] = f3(); + + } + + namespace test_exception_spec_type_system + { + + struct Good {}; + struct Bad {}; + + void g1() noexcept; + void g2(); + + template + Bad + f(T*, T*); + + template + Good + f(T1*, T2*); + + static_assert (std::is_same_v); + + } + + namespace test_inline_variables + { + + template void f(T) + {} + + template inline T g(T) + { + return T{}; + } + + template<> inline void f<>(int) + {} + + template<> 
int g<>(int) + { + return 5; + } + + } + +} // namespace cxx17 + +#endif // __cplusplus < 201703L && !defined _MSC_VER + +]]) + + +dnl Tests for new features in C++20 + +m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_20], [[ + +#ifndef __cplusplus + +#error "This is not a C++ compiler" + +#elif __cplusplus < 202002L && !defined _MSC_VER + +#error "This is not a C++20 compiler" + +#else + +#include + +namespace cxx20 +{ + +// As C++20 supports feature test macros in the standard, there is no +// immediate need to actually test for feature availability on the +// Autoconf side. + +} // namespace cxx20 + +#endif // __cplusplus < 202002L && !defined _MSC_VER + +]]) diff --git a/m4/ax_cxx_compile_stdcxx_14.m4 b/m4/ax_cxx_compile_stdcxx_14.m4 new file mode 100644 index 00000000..094db0d0 --- /dev/null +++ b/m4/ax_cxx_compile_stdcxx_14.m4 @@ -0,0 +1,34 @@ +# ============================================================================= +# https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_14.html +# ============================================================================= +# +# SYNOPSIS +# +# AX_CXX_COMPILE_STDCXX_14([ext|noext], [mandatory|optional]) +# +# DESCRIPTION +# +# Check for baseline language coverage in the compiler for the C++14 +# standard; if necessary, add switches to CXX and CXXCPP to enable +# support. +# +# This macro is a convenience alias for calling the AX_CXX_COMPILE_STDCXX +# macro with the version set to C++14. The two optional arguments are +# forwarded literally as the second and third argument respectively. +# Please see the documentation for the AX_CXX_COMPILE_STDCXX macro for +# more information. If you want to use this macro, you also need to +# download the ax_cxx_compile_stdcxx.m4 file. +# +# LICENSE +# +# Copyright (c) 2015 Moritz Klammler +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. 
+ +#serial 5 + +AX_REQUIRE_DEFINED([AX_CXX_COMPILE_STDCXX]) +AC_DEFUN([AX_CXX_COMPILE_STDCXX_14], [AX_CXX_COMPILE_STDCXX([14], [$1], [$2])]) diff --git a/systems/Sunspot/benchmarks/bench.pbs b/systems/Sunspot/benchmarks/bench.pbs index f1a26aa4..dc07ca2f 100644 --- a/systems/Sunspot/benchmarks/bench.pbs +++ b/systems/Sunspot/benchmarks/bench.pbs @@ -20,7 +20,7 @@ unset OMP_PLACES cd $PBS_O_WORKDIR -qsub jobscript.pbs +#qsub jobscript.pbs echo Jobid: $PBS_JOBID echo Running on host `hostname` @@ -44,3 +44,4 @@ CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -enva ./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \ --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" +$CMD diff --git a/systems/Sunspot/benchmarks/gpu_tile_compact.sh b/systems/Sunspot/benchmarks/gpu_tile_compact.sh index ec532b1b..11ed83aa 100755 --- a/systems/Sunspot/benchmarks/gpu_tile_compact.sh +++ b/systems/Sunspot/benchmarks/gpu_tile_compact.sh @@ -45,8 +45,8 @@ echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_A if [ $PALS_LOCAL_RANKID = 0 ] then - onetrace --chrome-device-timeline "$@" -# "$@" +# onetrace --chrome-device-timeline "$@" + "$@" else "$@" fi diff --git a/systems/Sunspot/config-command b/systems/Sunspot/config-command index 7adf7117..e59ef515 100644 --- a/systems/Sunspot/config-command +++ b/systems/Sunspot/config-command @@ -11,6 +11,6 @@ TOOLS=$HOME/tools --enable-unified=no \ MPICXX=mpicxx \ CXX=icpx \ - LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -lapmidg -L$TOOLS/lib64/" \ + LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \ CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include" From 7786ea9921170498c12b6dacec189dd2ee72df87 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 3 Oct 2023 09:58:44 -0700 Subject: [PATCH 021/114] Bug fix in script --- systems/OEM/benchmarks/select_gpu.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/systems/OEM/benchmarks/select_gpu.sh b/systems/OEM/benchmarks/select_gpu.sh index 2ef1f82d..db1cfc85 100755 --- a/systems/OEM/benchmarks/select_gpu.sh +++ b/systems/OEM/benchmarks/select_gpu.sh @@ -1,9 +1,8 @@ #!/bin/bash num_tile=2 - -gpu_id=$(( (MPI_LOCAL_RANKID % num_tile ) )) -tile_id=$((MPI_LOCAL_RANKID / num_tile)) +gpu_id=$(( (MPI_LOCALRANKID / num_tile ) )) +tile_id=$((MPI_LOCALRANKID % num_tile)) export ZE_AFFINITY_MASK=$gpu_id.$tile_id From 6d0c2de3996062be45a0911a4431e013321b1bbe Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 3 Oct 2023 17:04:20 +0000 Subject: [PATCH 022/114] Deprecate teh PVC directory and make a PVC-OEM generic PVC target with no queueing system dependency -- just interactive scripts --- systems/{OEM => PVC-OEM}/README | 0 systems/{OEM => PVC-OEM}/benchmarks/bench.sh | 0 .../{OEM => PVC-OEM}/benchmarks/select_gpu.sh | 0 systems/{OEM => PVC-OEM}/config-command | 0 systems/{OEM => PVC-OEM}/setup.sh | 0 systems/PVC/benchmarks/run-1tile.sh | 62 ------------------- systems/PVC/benchmarks/run-2tile-mpi.sh | 33 ---------- systems/PVC/benchmarks/wrap.sh | 9 --- systems/PVC/config-command | 16 ----- systems/PVC/setup.sh | 18 ------ 10 files changed, 138 deletions(-) rename systems/{OEM => PVC-OEM}/README (100%) rename systems/{OEM => PVC-OEM}/benchmarks/bench.sh (100%) rename systems/{OEM => PVC-OEM}/benchmarks/select_gpu.sh (100%) rename systems/{OEM => 
PVC-OEM}/config-command (100%) rename systems/{OEM => PVC-OEM}/setup.sh (100%) delete mode 100755 systems/PVC/benchmarks/run-1tile.sh delete mode 100755 systems/PVC/benchmarks/run-2tile-mpi.sh delete mode 100755 systems/PVC/benchmarks/wrap.sh delete mode 100644 systems/PVC/config-command delete mode 100644 systems/PVC/setup.sh diff --git a/systems/OEM/README b/systems/PVC-OEM/README similarity index 100% rename from systems/OEM/README rename to systems/PVC-OEM/README diff --git a/systems/OEM/benchmarks/bench.sh b/systems/PVC-OEM/benchmarks/bench.sh similarity index 100% rename from systems/OEM/benchmarks/bench.sh rename to systems/PVC-OEM/benchmarks/bench.sh diff --git a/systems/OEM/benchmarks/select_gpu.sh b/systems/PVC-OEM/benchmarks/select_gpu.sh similarity index 100% rename from systems/OEM/benchmarks/select_gpu.sh rename to systems/PVC-OEM/benchmarks/select_gpu.sh diff --git a/systems/OEM/config-command b/systems/PVC-OEM/config-command similarity index 100% rename from systems/OEM/config-command rename to systems/PVC-OEM/config-command diff --git a/systems/OEM/setup.sh b/systems/PVC-OEM/setup.sh similarity index 100% rename from systems/OEM/setup.sh rename to systems/PVC-OEM/setup.sh diff --git a/systems/PVC/benchmarks/run-1tile.sh b/systems/PVC/benchmarks/run-1tile.sh deleted file mode 100755 index 9a29b773..00000000 --- a/systems/PVC/benchmarks/run-1tile.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/sh -##SBATCH -p PVC-SPR-QZEH -##SBATCH -p PVC-ICX-QZNW -#SBATCH -p QZ1J-ICX-PVC -##SBATCH -p QZ1J-SPR-PVC-2C - -#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh - -export NT=8 - -export I_MPI_OFFLOAD=1 -export I_MPI_OFFLOAD_TOPOLIB=level_zero -export I_MPI_OFFLOAD_DOMAIN_SIZE=-1 - -# export IGC_EnableLSCFenceUGMBeforeEOT=0 -# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False" -export SYCL_DEVICE_FILTER=gpu,level_zero -#export IGC_ShaderDumpEnable=1 -#export IGC_DumpToCurrentDir=1 -export I_MPI_OFFLOAD_CELL=tile -export EnableImplicitScaling=0 -export EnableWalkerPartition=0 -export ZE_AFFINITY_MASK=0.0 -mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --device-mem 32768 - -export ZE_AFFINITY_MASK=0 -export I_MPI_OFFLOAD_CELL=device -export EnableImplicitScaling=1 -export EnableWalkerPartition=1 - - - - - - - - - - - - - - - - - - - - -#mpiexec -launcher ssh -n 2 -host localhost vtune -collect gpu-hotspots -knob gpu-sampling-interval=1 -data-limit=0 -r ./vtune_run4 -- ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1 - -#mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1 - -#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 - -#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1 - -#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 - -#mpirun -np 2 ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 16.32.32.64 --accelerator-threads $NT --comms-sequential --shm-mpi 0 -#mpirun -np 2 ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 
32.32.32.64 --accelerator-threads $NT --comms-sequential --shm-mpi 1 - diff --git a/systems/PVC/benchmarks/run-2tile-mpi.sh b/systems/PVC/benchmarks/run-2tile-mpi.sh deleted file mode 100755 index 1db67508..00000000 --- a/systems/PVC/benchmarks/run-2tile-mpi.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -##SBATCH -p PVC-SPR-QZEH -##SBATCH -p PVC-ICX-QZNW - -#SBATCH -p QZ1J-ICX-PVC - -#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh - -export NT=16 - -# export IGC_EnableLSCFenceUGMBeforeEOT=0 -# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False" -#export IGC_ShaderDumpEnable=1 -#export IGC_DumpToCurrentDir=1 -export I_MPI_OFFLOAD=1 -export I_MPI_OFFLOAD_TOPOLIB=level_zero -export I_MPI_OFFLOAD_DOMAIN_SIZE=-1 -export SYCL_DEVICE_FILTER=gpu,level_zero -export I_MPI_OFFLOAD_CELL=tile -export EnableImplicitScaling=0 -export EnableWalkerPartition=0 -#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 -#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 -export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0 - -for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 -do -mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 1.1.1.2.log$i -mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 2.1.1.1.log$i -done - -mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 - diff --git a/systems/PVC/benchmarks/wrap.sh b/systems/PVC/benchmarks/wrap.sh deleted file mode 100755 index b8806b30..00000000 --- a/systems/PVC/benchmarks/wrap.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/sh - -export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID - -echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK - - - $@ - diff --git a/systems/PVC/config-command b/systems/PVC/config-command deleted file mode 100644 index c3523c2d..00000000 --- a/systems/PVC/config-command +++ /dev/null @@ -1,16 +0,0 @@ -INSTALL=/nfs/site/home/paboylx/prereqs/ -../../configure \ - --enable-simd=GPU \ - --enable-gen-simd-width=64 \ - --enable-comms=mpi-auto \ - --disable-accelerator-cshift \ - --disable-gparity \ - --disable-fermion-reps \ - --enable-shm=nvlink \ - --enable-accelerator=sycl \ - --enable-unified=no \ - MPICXX=mpicxx \ - CXX=dpcpp \ - LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \ - CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wno-tautological-compare" - diff --git a/systems/PVC/setup.sh b/systems/PVC/setup.sh deleted file mode 100644 index c3b97ce0..00000000 --- a/systems/PVC/setup.sh +++ /dev/null @@ -1,18 +0,0 @@ -export https_proxy=http://proxy-chain.intel.com:911 -#export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH -export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH - -module load intel-release -module load intel-comp-rt/embargo-ci-neo - -#source /opt/intel/oneapi/PVC_setup.sh -#source /opt/intel/oneapi/ATS_setup.sh -#module load intel-nightly/20230331 -#module load intel-comp-rt/ci-neo-master/026093 - -#module load intel/mpich -module load intel/mpich/pvc45.3 -export PATH=~/ATS/pti-gpu/tools/onetrace/:$PATH - -#clsh embargo-ci-neo-022845 -#source /opt/intel/vtune_amplifier/amplxe-vars.sh From e2a3dae1f25d93b3a50649ed119b394deedc800d Mon Sep 17 
00:00:00 2001 From: Christoph Lehner Date: Sun, 8 Oct 2023 08:58:44 +0200 Subject: [PATCH 023/114] Option for multiple simultaneous CartesianStencils --- Grid/communicator/SharedMemoryMPI.cc | 20 ++++++++++---------- Grid/stencil/Stencil.h | 7 +++++-- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 335404c2..affc3ba2 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -27,7 +27,7 @@ Author: Christoph Lehner *************************************************************************************/ /* END LEGAL */ -#define header "SharedMemoryMpi: " +#define Grid_header "SharedMemoryMpi: " #include #include @@ -174,8 +174,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) MPI_Comm_size(WorldShmComm ,&WorldShmSize); if ( WorldRank == 0) { - std::cout << header " World communicator of size " < &directions, const std::vector &distances, - Parameters p=Parameters()) + Parameters p=Parameters(), + bool preserve_shm=false) { face_table_computed=0; _grid = grid; @@ -854,7 +855,9 @@ public: ///////////////////////////////////////////////////////////////////////////////// const int Nsimd = grid->Nsimd(); - _grid->ShmBufferFreeAll(); + // Allow for multiple stencils to exist simultaneously + if (!preserve_shm) + _grid->ShmBufferFreeAll(); int maxl=2; u_simd_send_buf.resize(maxl); From 0cfd13d18b41978afbdd8ba4b5ccbbce30e9dc09 Mon Sep 17 00:00:00 2001 From: david clarke Date: Tue, 10 Oct 2023 22:41:52 -0600 Subject: [PATCH 024/114] 7-link working --- Grid/qcd/smearing/HISQSmearing.h | 29 ++++++++++++++++------------- tests/smearing/Test_fatLinks.cc | 4 ++-- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 9a69b660..8c60d874 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -82,7 +82,7 @@ void generalShift(Coordinate& shift, int dir) { } } - +// Move into general stencil header, beneath definition of general stencil /*! @brief follow a path of directions, shifting one unit in each direction */ template void generalShift(Coordinate& shift, int dir, Args... args) { @@ -118,13 +118,13 @@ struct SmearingParameters{ Real c_5; // 5 link Real c_7; // 7 link Real c_lp; // 5 link Lepage - SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) : - c_1(c1), - c_naik(cnaik), - c_3(c3), - c_5(c5), - c_7(c7), - c_lp(clp){} + SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) + : c_1(c1), + c_naik(cnaik), + c_3(c3), + c_5(c5), + c_7(c7), + c_lp(clp){} }; @@ -198,11 +198,12 @@ public: for(int mu=0;mu_offset; auto SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; @@ -254,6 +254,8 @@ public: auto SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; auto SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; + // gpermutes will be replaced with single line of code, combines load and permute + // into one step. 
still in pull request stage auto U0 = U_v[x_p_mu ](nu) ; gpermute(U0,SE0->_permute); auto U1 = U_3link_v[x_p_nu ](rho); gpermute(U1,SE1->_permute); auto U2 = U_v[x ](nu) ; gpermute(U2,SE2->_permute); @@ -283,8 +285,7 @@ public: if(nu==mu) continue; int s = stencilIndex(mu,nu); for(int rho=0;rho_offset; auto SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; @@ -293,6 +294,7 @@ public: auto SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; auto U0 = U_v[x_p_mu ](nu) ; gpermute(U0,SE0->_permute); + // decltype, or auto U1 = { ? ... } auto U1 = U0; if(sigmaIndex<3) { U1 = U_5linkB_v[x_p_nu](rho); gpermute(U1,SE1->_permute); @@ -311,6 +313,7 @@ public: auto W = U2*U1*adj(U0) + adj(U5)*U4*U3; + // std::vector(3) ? U_fat_v[site](mu) = U_fat_v[site](mu) + lt.c_7*W; sigmaIndex++; diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index f7c422c1..200e2af6 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -78,7 +78,7 @@ int main (int argc, char** argv) { int Nt = 4; Coordinate latt_size(Nd,0); latt_size[0]=Ns; latt_size[1]=Ns; latt_size[2]=Ns; latt_size[3]=Nt; std::string conf_in = "nersc.l8t4b3360"; - std::string conf_out = "nersc.l8t4b3360.35link"; + std::string conf_out = "nersc.l8t4b3360.357link"; int threads = GridThread::GetThreads(); // Initialize the Grid @@ -114,7 +114,7 @@ int main (int argc, char** argv) { Smear_HISQ_fat hisq_fat_Cstyle(&GRID,path_coeff); // Make sure result doesn't change w.r.t. a trusted lattice - NerscIO::readConfiguration(Umu, header, "nersc.l8t4b3360.35link.control"); + NerscIO::readConfiguration(Umu, header, "nersc.l8t4b3360.357link.control"); LatticeGaugeField diff(&GRID); diff = Umu-U_smr; auto absDiff = norm2(diff)/norm2(Umu); From 36600899e21094e130ec21b7aad74deab6550a62 Mon Sep 17 00:00:00 2001 From: david clarke Date: Thu, 12 Oct 2023 11:11:39 -0600 Subject: [PATCH 025/114] working 7-link; Grid_log; generalShift --- Grid/log/Log.h | 21 ++++++++ Grid/qcd/smearing/HISQSmearing.h | 81 ++++++++---------------------- Grid/stencil/GeneralLocalStencil.h | 44 ++++++++++++++++ tests/smearing/Test_fatLinks.cc | 14 ------ 4 files changed, 85 insertions(+), 75 deletions(-) diff --git a/Grid/log/Log.h b/Grid/log/Log.h index 2d663a3c..b88bf61f 100644 --- a/Grid/log/Log.h +++ b/Grid/log/Log.h @@ -191,6 +191,27 @@ extern Colours GridLogColours; std::string demangle(const char* name) ; +template +inline std::string sjoin(Args&&... args) noexcept { + std::ostringstream msg; + (msg << ... << args); + return msg.str(); +} + +/*! @brief make log messages work like python print */ +template +inline void Grid_log(Args&&... args) { + std::string msg = sjoin(std::forward(args)...); + std::cout << GridLogMessage << msg << std::endl; +} + +/*! @brief make warning messages work like python print */ +template +inline void Grid_warn(Args&&... args) { + std::string msg = sjoin(std::forward(args)...); + std::cout << GridLogWarning << msg << std::endl; +} + #define _NBACKTRACE (256) extern void * Grid_backtrace_buffer[_NBACKTRACE]; diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 8c60d874..1ea1b7b9 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -32,17 +32,25 @@ directory #pragma once - #include #include #include -#define BACKWARD_CONST 16 -#define NO_SHIFT -1 NAMESPACE_BEGIN(Grid); +/*! @brief append arbitrary shift path to shifts */ +template +void appendShift(std::vector& shifts, int dir, Args... 
args) { + Coordinate shift(Nd,0); + generalShift(shift, dir, args...); + // push_back creates an element at the end of shifts and + // assigns the data in the argument to it. + shifts.push_back(shift); +} + + // This is to optimize the SIMD (will also need to be in the class, at least for now) template void gpermute(vobj & inout,int perm) { vobj tmp=inout; @@ -53,14 +61,6 @@ template void gpermute(vobj & inout,int perm) { } -/*! @brief signals that you want to go backwards in direction dir */ -inline int Back(const int dir) { - // generalShift will use BACKWARD_CONST to determine whether we step forward or - // backward. Should work as long as BACKWARD_CONST > Nd. Trick inspired by SIMULATeQCD. - return dir + BACKWARD_CONST; -} - - /*! @brief figure out the stencil index from mu and nu */ inline int stencilIndex(int mu, int nu) { // Nshifts depends on how you built the stencil @@ -69,46 +69,6 @@ inline int stencilIndex(int mu, int nu) { } -/*! @brief shift one unit in direction dir */ -template -void generalShift(Coordinate& shift, int dir) { - if (dir >= BACKWARD_CONST) { - dir -= BACKWARD_CONST; - shift[dir]+=-1; - } else if (dir == NO_SHIFT) { - ; // do nothing - } else { - shift[dir]+=1; - } -} - -// Move into general stencil header, beneath definition of general stencil -/*! @brief follow a path of directions, shifting one unit in each direction */ -template -void generalShift(Coordinate& shift, int dir, Args... args) { - if (dir >= BACKWARD_CONST) { - dir -= BACKWARD_CONST; - shift[dir]+=-1; - } else if (dir == NO_SHIFT) { - ; // do nothing - } else { - shift[dir]+=1; - } - generalShift(shift, args...); -} - - -/*! @brief append arbitrary shift path to shifts */ -template -void appendShift(std::vector& shifts, int dir, Args... args) { - Coordinate shift(Nd,0); - generalShift(shift, dir, args...); - // push_back creates an element at the end of shifts and - // assigns the data in the argument to it. - shifts.push_back(shift); -} - - /*! @brief structure holding the link treatment */ struct SmearingParameters{ SmearingParameters(){} @@ -189,17 +149,16 @@ public: // This is where contributions from the smearing get added together Ughost_fat=Zero(); - // Create the accessors - autoView(U_v , Ughost , CpuRead); - autoView(U_fat_v , Ughost_fat , CpuWrite); - autoView(U_3link_v , Ughost_3link , CpuWrite); - autoView(U_5linkA_v, Ughost_5linkA, CpuWrite); - autoView(U_5linkB_v, Ughost_5linkB, CpuWrite); - for(int mu=0;mu Nd! + +/*! @brief signals that you want to go backwards in direction dir */ +inline int Back(const int dir) { + // generalShift will use BACKWARD_CONST to determine whether we step forward or + // backward. Trick inspired by SIMULATeQCD. + return dir + BACKWARD_CONST; +} + +/*! @brief shift one unit in direction dir */ +template +void generalShift(Coordinate& shift, int dir) { + if (dir >= BACKWARD_CONST) { + dir -= BACKWARD_CONST; + shift[dir]+=-1; + } else if (dir == NO_SHIFT) { + ; // do nothing + } else { + shift[dir]+=1; + } +} + +/*! @brief follow a path of directions, shifting one unit in each direction */ +template +void generalShift(Coordinate& shift, int dir, Args... 
args) { + if (dir >= BACKWARD_CONST) { + dir -= BACKWARD_CONST; + shift[dir]+=-1; + } else if (dir == NO_SHIFT) { + ; // do nothing + } else { + shift[dir]+=1; + } + generalShift(shift, args...); +} + + NAMESPACE_END(Grid); diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index 200e2af6..f5c7b5ca 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -38,20 +38,6 @@ directory using namespace Grid; -/*! @brief make the logger work like python print */ -template -inline std::string sjoin(Args&&... args) noexcept { - std::ostringstream msg; - (msg << ... << args); - return msg.str(); -} -template -inline void Grid_log(Args&&... args) { - std::string msg = sjoin(std::forward(args)...); - std::cout << GridLogMessage << msg << std::endl; -} - - /*! @brief parameter file to easily adjust Nloop */ struct ConfParameters: Serializable { GRID_SERIALIZABLE_CLASS_MEMBERS( From bf4369f72de9bd241a0bde9e5382c3dc57e38f22 Mon Sep 17 00:00:00 2001 From: david clarke Date: Thu, 12 Oct 2023 12:41:06 -0600 Subject: [PATCH 026/114] clean up HISQSmear with decltypes --- Grid/qcd/smearing/HISQSmearing.h | 101 ++++++++++++++----------------- 1 file changed, 47 insertions(+), 54 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 1ea1b7b9..432184e0 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -40,6 +40,8 @@ directory NAMESPACE_BEGIN(Grid); +// TODO: find a way to fold this into the stencil header. need to access grid to get +// Nd, since you don't want to inherit from QCD.h /*! @brief append arbitrary shift path to shifts */ template void appendShift(std::vector& shifts, int dir, Args... args) { @@ -51,16 +53,6 @@ void appendShift(std::vector& shifts, int dir, Args... args) { } -// This is to optimize the SIMD (will also need to be in the class, at least for now) -template void gpermute(vobj & inout,int perm) { - vobj tmp=inout; - if (perm & 0x1) {permute(inout,tmp,0); tmp=inout;} - if (perm & 0x2) {permute(inout,tmp,1); tmp=inout;} - if (perm & 0x4) {permute(inout,tmp,2); tmp=inout;} - if (perm & 0x8) {permute(inout,tmp,3); tmp=inout;} -} - - /*! @brief figure out the stencil index from mu and nu */ inline int stencilIndex(int mu, int nu) { // Nshifts depends on how you built the stencil @@ -163,6 +155,12 @@ public: Ughost_5linkA=Zero(); Ughost_5linkB=Zero(); + // We infer some types that will be needed in the calculation. + typedef decltype(gStencil.GetEntry(0,0)) stencilElement; + typedef decltype(coalescedReadGeneralPermute(U_v[0](0),gStencil.GetEntry(0,0)->_permute,Nd)) U3matrix; + stencilElement SE0, SE1, SE2, SE3, SE4; + U3matrix U0, U1, U2, U3, U4, U5, W; + // 3-link for(int site=0;site_offset; - auto SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; - auto SE2 = gStencil.GetEntry(s+2,site); int x = SE2->_offset; - auto SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; - auto SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; + SE0 = gStencil.GetEntry(s+0,site); int x_p_mu = SE0->_offset; + SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; + SE2 = gStencil.GetEntry(s+2,site); int x = SE2->_offset; + SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; + SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; // When you're deciding whether to take an adjoint, the question is: how is the // stored link oriented compared to the one you want? 
If I imagine myself travelling // with the to-be-updated link, I have two possible, alternative 3-link paths I can // take, one starting by going to the left, the other starting by going to the right. - auto U0 = U_v[x_p_mu ](nu); gpermute(U0,SE0->_permute); - auto U1 = U_v[x_p_nu ](mu); gpermute(U1,SE1->_permute); - auto U2 = U_v[x ](nu); gpermute(U2,SE2->_permute); - auto U3 = U_v[x_p_mu_m_nu](nu); gpermute(U3,SE3->_permute); - auto U4 = U_v[x_m_nu ](mu); gpermute(U4,SE4->_permute); - auto U5 = U_v[x_m_nu ](nu); gpermute(U5,SE4->_permute); + U0 = coalescedReadGeneralPermute(U_v[x_p_mu ](nu),SE0->_permute,Nd); + U1 = coalescedReadGeneralPermute(U_v[x_p_nu ](mu),SE1->_permute,Nd); + U2 = coalescedReadGeneralPermute(U_v[x ](nu),SE2->_permute,Nd); + U3 = coalescedReadGeneralPermute(U_v[x_p_mu_m_nu](nu),SE3->_permute,Nd); + U4 = coalescedReadGeneralPermute(U_v[x_m_nu ](mu),SE4->_permute,Nd); + U5 = coalescedReadGeneralPermute(U_v[x_m_nu ](nu),SE4->_permute,Nd); - // "left" "right" - auto W = U2*U1*adj(U0) + adj(U5)*U4*U3; + // "left" "right" + W = U2*U1*adj(U0) + adj(U5)*U4*U3; U_3link_v[site](nu) = W; @@ -197,7 +195,6 @@ public: } } - // 5-link for(int site=0;site_offset; - auto SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; - auto SE2 = gStencil.GetEntry(s+2,site); int x = SE2->_offset; - auto SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; - auto SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; + SE0 = gStencil.GetEntry(s+0,site); int x_p_mu = SE0->_offset; + SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; + SE2 = gStencil.GetEntry(s+2,site); int x = SE2->_offset; + SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; + SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; // gpermutes will be replaced with single line of code, combines load and permute // into one step. 
still in pull request stage - auto U0 = U_v[x_p_mu ](nu) ; gpermute(U0,SE0->_permute); - auto U1 = U_3link_v[x_p_nu ](rho); gpermute(U1,SE1->_permute); - auto U2 = U_v[x ](nu) ; gpermute(U2,SE2->_permute); - auto U3 = U_v[x_p_mu_m_nu](nu) ; gpermute(U3,SE3->_permute); - auto U4 = U_3link_v[x_m_nu ](rho); gpermute(U4,SE4->_permute); - auto U5 = U_v[x_m_nu ](nu) ; gpermute(U5,SE4->_permute); + U0 = coalescedReadGeneralPermute( U_v[x_p_mu ](nu ),SE0->_permute,Nd); + U1 = coalescedReadGeneralPermute(U_3link_v[x_p_nu ](rho),SE1->_permute,Nd); + U2 = coalescedReadGeneralPermute( U_v[x ](nu ),SE2->_permute,Nd); + U3 = coalescedReadGeneralPermute( U_v[x_p_mu_m_nu](nu ),SE3->_permute,Nd); + U4 = coalescedReadGeneralPermute(U_3link_v[x_m_nu ](rho),SE4->_permute,Nd); + U5 = coalescedReadGeneralPermute( U_v[x_m_nu ](nu ),SE4->_permute,Nd); - auto W = U2*U1*adj(U0) + adj(U5)*U4*U3; + W = U2*U1*adj(U0) + adj(U5)*U4*U3; if(sigmaIndex<3) { U_5linkA_v[site](rho) = W; @@ -246,33 +243,29 @@ public: for(int rho=0;rho_offset; - auto SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; - auto SE2 = gStencil.GetEntry(s+2,site); int x = SE2->_offset; - auto SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; - auto SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; + SE0 = gStencil.GetEntry(s+0,site); int x_p_mu = SE0->_offset; + SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; + SE2 = gStencil.GetEntry(s+2,site); int x = SE2->_offset; + SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; + SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; - auto U0 = U_v[x_p_mu ](nu) ; gpermute(U0,SE0->_permute); - // decltype, or auto U1 = { ? ... } - auto U1 = U0; + U0 = coalescedReadGeneralPermute(U_v[x_p_mu](nu),SE0->_permute,Nd); if(sigmaIndex<3) { - U1 = U_5linkB_v[x_p_nu](rho); gpermute(U1,SE1->_permute); + U1 = coalescedReadGeneralPermute(U_5linkB_v[x_p_nu](rho),SE1->_permute,Nd); } else { - U1 = U_5linkA_v[x_p_nu](rho); gpermute(U1,SE1->_permute); + U1 = coalescedReadGeneralPermute(U_5linkA_v[x_p_nu](rho),SE1->_permute,Nd); } - auto U2 = U_v[x ](nu) ; gpermute(U2,SE2->_permute); - auto U3 = U_v[x_p_mu_m_nu](nu) ; gpermute(U3,SE3->_permute); - auto U4 = U0; + U2 = coalescedReadGeneralPermute(U_v[x](nu),SE2->_permute,Nd); + U3 = coalescedReadGeneralPermute(U_v[x_p_mu_m_nu](nu),SE3->_permute,Nd); if(sigmaIndex<3) { - U4 = U_5linkB_v[x_m_nu](rho); gpermute(U4,SE4->_permute); + U4 = coalescedReadGeneralPermute(U_5linkB_v[x_m_nu](rho),SE4->_permute,Nd); } else { - U4 = U_5linkA_v[x_m_nu](rho); gpermute(U4,SE4->_permute); + U4 = coalescedReadGeneralPermute(U_5linkA_v[x_m_nu](rho),SE4->_permute,Nd); } - auto U5 = U_v[x_m_nu ](nu) ; gpermute(U5,SE4->_permute); + U5 = coalescedReadGeneralPermute(U_v[x_m_nu](nu),SE4->_permute,Nd); - auto W = U2*U1*adj(U0) + adj(U5)*U4*U3; + W = U2*U1*adj(U0) + adj(U5)*U4*U3; - // std::vector(3) ? 
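          // Schematically, with U' the 5-link field built one level down, the two
          // staple orientations just combined in W are
          //   W = U_nu(x) U'_rho(x+nu) U_nu^dag(x+mu)            ("left")
          //     + U_nu^dag(x-nu) U'_rho(x-nu) U_nu(x+mu-nu)      ("right")
          // and W is folded into the fat link below with the 7-link weight c_7.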
U_fat_v[site](mu) = U_fat_v[site](mu) + lt.c_7*W; sigmaIndex++; From c9c45762375135539ac3da49758c885f8dd02826 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Oct 2023 17:46:07 +0300 Subject: [PATCH 027/114] Improved frontier cshift --- Grid/cshift/Cshift_common.h | 71 ++++++++--------------- Grid/cshift/Cshift_mpi.h | 111 ++++++++++++++++++++++++++++++------ Grid/cshift/Cshift_table.cc | 3 +- 3 files changed, 122 insertions(+), 63 deletions(-) diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index 742c99da..309517b2 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -29,8 +29,27 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); -extern Vector > Cshift_table; +extern std::vector > Cshift_table; +extern commVector > Cshift_table_device; +inline std::pair *MapCshiftTable(void) +{ + // GPU version +#ifdef ACCELERATOR_CSHIFT + uint64_t sz=Cshift_table.size(); + if (Cshift_table_device.size()!=sz ) { + Cshift_table_device.resize(sz); + } + acceleratorCopyToDevice((void *)&Cshift_table[0], + (void *)&Cshift_table_device[0], + sizeof(Cshift_table[0])*sz); + + return &Cshift_table_device[0]; +#else + return &Cshift_table[0]; +#endif + // CPU version use identify map +} /////////////////////////////////////////////////////////////////// // Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// @@ -74,8 +93,8 @@ Gather_plane_simple (const Lattice &rhs,cshiftVector &buffer,int dim } { auto buffer_p = & buffer[0]; - auto table = &Cshift_table[0]; -#ifdef ACCELERATOR_CSHIFT + auto table = MapCshiftTable(); +#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); @@ -225,7 +244,7 @@ template void Scatter_plane_simple (Lattice &rhs,cshiftVector< { auto buffer_p = & buffer[0]; - auto table = &Cshift_table[0]; + auto table = MapCshiftTable(); #ifdef ACCELERATOR_CSHIFT autoView( rhs_v, rhs, AcceleratorWrite); accelerator_for(i,ent,vobj::Nsimd(),{ @@ -297,30 +316,6 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA } } -#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) - -template -T iDivUp(T a, T b) // Round a / b to nearest higher integer value -{ return (a % b != 0) ? 
(a / b + 1) : (a / b); } - -template -__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride) -{ - int idx = blockIdx.x*blockDim.x + threadIdx.x; - if (idx >= e1*e2) return; - - int n, b, o; - - n = idx / e2; - b = idx % e2; - o = n*stride + b; - - vector[2*idx + 0] = lo + o; - vector[2*idx + 1] = ro + o; -} - -#endif - ////////////////////////////////////////////////////// // local to node block strided copies ////////////////////////////////////////////////////// @@ -345,20 +340,12 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int ent=0; if(cbmask == 0x3 ){ -#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) - ent = e1*e2; - dim3 blockSize(acceleratorThreads()); - dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x)); - populate_Cshift_table<<>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); - accelerator_barrier(); -#else for(int n=0;n(lo+o,ro+o); } } -#endif } else { for(int n=0;n void Copy_plane(Lattice& lhs,const Lattice &rhs } { - auto table = &Cshift_table[0]; + auto table = MapCshiftTable(); #ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); autoView(lhs_v , lhs, AcceleratorWrite); @@ -409,19 +396,11 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); - accelerator_barrier(); -#else for(int n=0;n(lo+o+b,ro+o+b); }} -#endif } else { for(int n=0;n void Copy_plane_permute(Lattice& lhs,const Lattice Lattice Cshift(const Lattice &rhs,int dimension int comm_dim = rhs.Grid()->_processors[dimension] >1 ; int splice_dim = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim); - + RealD t1,t0; + t0=usecond(); if ( !comm_dim ) { //std::cout << "CSHIFT: Cshift_local" < Lattice Cshift(const Lattice &rhs,int dimension //std::cout << "CSHIFT: Cshift_comms" < void Cshift_comms(Lattice &ret,const Lattice &r int cb= (cbmask==0x2)? 
Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - + RealD tcopy=0.0; + RealD tgather=0.0; + RealD tscatter=0.0; + RealD tcomms=0.0; + uint64_t xbytes=0; for(int x=0;x void Cshift_comms(Lattice &ret,const Lattice &r int bytes = words * sizeof(vobj); + tgather-=usecond(); Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask); + tgather+=usecond(); // int rank = grid->_processor; int recv_from_rank; int xmit_to_rank; grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - - grid->Barrier(); + + tcomms-=usecond(); + // grid->Barrier(); grid->SendToRecvFrom((void *)&send_buf[0], xmit_to_rank, (void *)&recv_buf[0], recv_from_rank, bytes); + xbytes+=bytes; + // grid->Barrier(); + tcomms+=usecond(); - grid->Barrier(); - + tscatter-=usecond(); Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); + tscatter+=usecond(); } } + /* + std::cout << GridLogPerformance << " Cshift copy "< void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) @@ -190,6 +210,12 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice=0); assert(shiftPermuteType(dimension); /////////////////////////////////////////////// @@ -227,7 +253,9 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice void Cshift_comms_simd(Lattice &ret,const LatticeShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - grid->Barrier(); + tcomms-=usecond(); + // grid->Barrier(); send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; recv_buf_extract_mpi = &recv_buf_extract[i][0]; @@ -262,7 +291,9 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeBarrier(); + xbytes+=bytes; + // grid->Barrier(); + tcomms+=usecond(); rpointers[i] = &recv_buf_extract[i][0]; } else { @@ -270,9 +301,17 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) @@ -292,6 +331,11 @@ template void Cshift_comms(Lattice &ret,const Lattice &r assert(comm_dim==1); assert(shift>=0); assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; static cshiftVector send_buf_v; send_buf_v.resize(buffer_size); @@ -315,7 +359,9 @@ template void Cshift_comms(Lattice &ret,const Lattice &r if (comm_proc==0) { + tcopy-=usecond(); Copy_plane(ret,rhs,dimension,x,sx,cbmask); + tcopy+=usecond(); } else { @@ -324,7 +370,9 @@ template void Cshift_comms(Lattice &ret,const Lattice &r int bytes = words * sizeof(vobj); + tgather-=usecond(); Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); + tgather+=usecond(); // int rank = grid->_processor; int recv_from_rank; @@ -332,7 +380,8 @@ template void Cshift_comms(Lattice &ret,const Lattice &r grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - grid->Barrier(); + tcomms-=usecond(); + // grid->Barrier(); acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); grid->SendToRecvFrom((void *)&send_buf[0], @@ -340,13 +389,24 @@ template void Cshift_comms(Lattice &ret,const Lattice &r (void *)&recv_buf[0], recv_from_rank, bytes); + xbytes+=bytes; acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); - grid->Barrier(); + // grid->Barrier(); + tcomms+=usecond(); + tscatter-=usecond(); Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); + tscatter+=usecond(); } } + /* + std::cout << GridLogPerformance << " Cshift copy "< void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) @@ -372,6 +432,11 @@ template void 
Cshift_comms_simd(Lattice &ret,const Lattice=0); assert(shiftPermuteType(dimension); @@ -414,8 +479,10 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice void Cshift_comms_simd(Lattice &ret,const LatticeShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - grid->Barrier(); + tcomms-=usecond(); + // grid->Barrier(); acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); grid->SendToRecvFrom((void *)send_buf_extract_mpi, @@ -449,17 +517,28 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeBarrier(); + // grid->Barrier(); + tcomms+=usecond(); rpointers[i] = &recv_buf_extract[i][0]; } else { rpointers[i] = &send_buf_extract[nbr_lane][0]; } } + tscatter-=usecond(); Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); - } + tscatter+=usecond(); + } + /* + std::cout << GridLogPerformance << " Cshift (s) copy "< NAMESPACE_BEGIN(Grid); -Vector > Cshift_table; +std::vector > Cshift_table; +commVector > Cshift_table_device; NAMESPACE_END(Grid); From c5b43b322c7b1d4087aa477003e555a9cdd54493 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Oct 2023 17:53:58 +0300 Subject: [PATCH 028/114] traceProduct eliminates non-contributing intermediate terms --- Grid/lattice/Lattice_arith.h | 37 ++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h index aebc093a..5b37532f 100644 --- a/Grid/lattice/Lattice_arith.h +++ b/Grid/lattice/Lattice_arith.h @@ -270,5 +270,42 @@ RealD axpby_norm(Lattice &ret,sobj a,sobj b,const Lattice &x,const L return axpby_norm_fast(ret,a,b,x,y); } +/// Trace product +template auto traceProduct(const Lattice &rhs_1,const Lattice &rhs_2) + -> Lattice +{ + typedef decltype(trace(obj())) robj; + Lattice ret_i(rhs_1.Grid()); + autoView( rhs1 , rhs_1, AcceleratorRead); + autoView( rhs2 , rhs_2, AcceleratorRead); + autoView( ret , ret_i, AcceleratorWrite); + ret.Checkerboard() = rhs_1.Checkerboard(); + accelerator_for(ss,rhs1.size(),obj::Nsimd(),{ + coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2(ss))); + }); + return ret_i; +} + +template auto traceProduct(const Lattice &rhs_1,const obj2 &rhs2) + -> Lattice +{ + typedef decltype(trace(obj1())) robj; + Lattice ret_i(rhs_1.Grid()); + autoView( rhs1 , rhs_1, AcceleratorRead); + autoView( ret , ret_i, AcceleratorWrite); + ret.Checkerboard() = rhs_1.Checkerboard(); + accelerator_for(ss,rhs1.size(),obj1::Nsimd(),{ + coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2)); + }); + return ret_i; +} +template auto traceProduct(const obj2 &rhs_2,const Lattice &rhs_1) + -> Lattice +{ + return traceProduct(rhs_1,rhs_2); +} + + + NAMESPACE_END(Grid); #endif From ffc0639cb95a3f8a4ff35d1cdacd727324118415 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Oct 2023 17:55:27 +0300 Subject: [PATCH 029/114] Running in HMC tests --- Grid/qcd/smearing/GaugeConfigurationMasked.h | 238 +++++++++++++++++-- 1 file changed, 217 insertions(+), 21 deletions(-) diff --git a/Grid/qcd/smearing/GaugeConfigurationMasked.h b/Grid/qcd/smearing/GaugeConfigurationMasked.h index 4309c470..78846263 100644 --- a/Grid/qcd/smearing/GaugeConfigurationMasked.h +++ b/Grid/qcd/smearing/GaugeConfigurationMasked.h @@ -1,3 +1,4 @@ + /*! 
@file GaugeConfiguration.h @brief Declares the GaugeConfiguration class @@ -6,6 +7,15 @@ NAMESPACE_BEGIN(Grid); + +template void Dump(const Lattice & lat, + std::string s, + Coordinate site = Coordinate({0,0,0,0})) +{ + typename T::scalar_object tmp; + peekSite(tmp,lat,site); + std::cout << " Dump "< WL; + GaugeLinkField staple(grid), u_tmp(grid); + GaugeLinkField iLambda_mu(grid), iLambda_nu(grid); + GaugeLinkField U_mu(grid), U_nu(grid); + GaugeLinkField sh_field(grid), temp_Sigma(grid); + Real rho_munu, rho_numu; + + rho_munu = rho; + rho_numu = rho; + for(int mu = 0; mu < Nd; ++mu){ + U_mu = peekLorentz( U, mu); + iLambda_mu = peekLorentz(iLambda, mu); + + for(int nu = 0; nu < Nd; ++nu){ + if(nu==mu) continue; + + U_nu = peekLorentz( U, nu); + + // Nd(nd-1) = 12 staples normally. + // We must compute 6 of these + // in FTHMC case + if ( (mu==mmu)||(nu==mmu) ) + WL.StapleUpper(staple, U, mu, nu); + + if(nu==mmu) { + iLambda_nu = peekLorentz(iLambda, nu); + + temp_Sigma = -rho_numu*staple*iLambda_nu; //ok + //-r_numu*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)*Lambda_nu(x) + Gimpl::AddLink(SigmaTerm, temp_Sigma, mu); + + sh_field = Cshift(iLambda_nu, mu, 1);// general also for Gparity? + + temp_Sigma = rho_numu*sh_field*staple; //ok + //r_numu*Lambda_nu(mu)*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x) + Gimpl::AddLink(SigmaTerm, temp_Sigma, mu); + } + + if ( mu == mmu ) { + sh_field = Cshift(iLambda_mu, nu, 1); + + temp_Sigma = -rho_munu*staple*U_nu*sh_field*adj(U_nu); //ok + //-r_munu*U_nu(x+mu)*Udag_mu(x+nu)*Lambda_mu(x+nu)*Udag_nu(x) + Gimpl::AddLink(SigmaTerm, temp_Sigma, mu); + } + + // staple = Zero(); + sh_field = Cshift(U_nu, mu, 1); + + temp_Sigma = Zero(); + + if ( mu == mmu ) + temp_Sigma = -rho_munu*adj(sh_field)*adj(U_mu)*iLambda_mu*U_nu; + + if ( nu == mmu ) { + temp_Sigma += rho_numu*adj(sh_field)*adj(U_mu)*iLambda_nu*U_nu; + + u_tmp = adj(U_nu)*iLambda_nu; + sh_field = Cshift(u_tmp, mu, 1); + temp_Sigma += -rho_numu*sh_field*adj(U_mu)*U_nu; + } + + sh_field = Cshift(temp_Sigma, nu, -1); + Gimpl::AddLink(SigmaTerm, sh_field, mu); + + } + } + } + + void BaseSmear(GaugeLinkField& Cup, const GaugeField& U,int mu,RealD rho) { + GridBase *grid = U.Grid(); + GaugeLinkField tmp_stpl(grid); + WilsonLoops WL; + Cup = Zero(); + for(int nu=0; nu(Dbc,tmp,b,c); // Adjoint rep } +#endif + tp+=usecond(); } - tmp = trace(MpInvJx * Dbc); + // Dump(Dbc_opt,"Dbc_opt"); + // Dump(Dbc,"Dbc"); + tpk-=usecond(); + tmp = trace(MpInvJx * Dbc_opt); PokeIndex(Fdet2,tmp,a); + tpk+=usecond(); } + t+=usecond(); + std::cout << GridLogPerformance << " Compute_MpInvJx_dNxxdSy " << t/1e3 << " ms proj "<(NxAd,tmp,c,b); } +#endif } } void ApplyMask(GaugeField &U,int smr) @@ -164,8 +301,7 @@ public: // Computes ALL the staples -- could compute one only and do it here RealD time; time=-usecond(); - this->StoutSmearing->BaseSmear(C, U); - Cmu = peekLorentz(C, mu); + BaseSmear(Cmu, U,mu,rho); ////////////////////////////////////////////////////////////////// // Assemble Luscher exp diff map J matrix @@ -209,6 +345,36 @@ public: // dJ(x)/dxe ////////////////////////////////////// time=-usecond(); +#if 1 + std::vector dJdX; dJdX.resize(8,grid); + std::vector TRb_s; TRb_s.resize(8); + AdjMatrixField tbXn(grid); + AdjMatrixField sumXtbX(grid); + AdjMatrixField t2(grid); + AdjMatrixField dt2(grid); + AdjMatrixField t3(grid); + AdjMatrixField dt3(grid); + AdjMatrixField aunit(grid); + + for(int b=0;b<8;b++){ + SU3Adjoint::generator(b, TRb_s[b]); + dJdX[b] = TRb_s[b]; + } + aunit = ComplexD(1.0); + // Could put into an accelerator_for 
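+  // A sketch of the fused loop below: the Horner-style recursion
+  //   t3 = t2*(1/(j+1)) + 1 ;   t2 = X*t3 ;   j = 12,...,2
+  // accumulates the truncated Taylor series behind the exp-map Jacobian,
+  // J ~ sum_n X^n/(n+1)! with X = -Z^ad, while
+  //   dJdX[b] = T_b*t3 + X*dJdX[b]*(1/(j+1))
+  // carries dt2/dX_b along by the product rule, one adjoint generator
+  // T_b (= TRb_s[b]) per colour direction, so all eight derivatives share
+  // a single recursion instead of one recursion per generator.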
+ X = (-1.0)*ZxAd; + t2 = X; + for (int j = 12; j > 1; --j) { + t3 = t2*(1.0 / (j + 1)) + aunit; + t2 = X * t3; + for(int b=0;b<8;b++){ + dJdX[b]= TRb_s[b] * t3 + X * dJdX[b]*(1.0 / (j + 1)); + } + } + for(int b=0;b<8;b++){ + dJdX[b] = -dJdX[b]; + } +#else std::vector dJdX; dJdX.resize(8,grid); AdjMatrixField tbXn(grid); AdjMatrixField sumXtbX(grid); @@ -224,14 +390,15 @@ public: X = (-1.0)*ZxAd; t2 = X; dt2 = TRb; - for (int j = 20; j > 1; --j) { - t3 = t2*(1.0 / (j + 1)) + aunit; + for (int j = 12; j > 1; --j) { + t3 = t2*(1.0 / (j + 1)) + aunit; dt3 = dt2*(1.0 / (j + 1)); t2 = X * t3; dt2 = TRb * t3 + X * dt3; } dJdX[b] = -dt2; } +#endif time+=usecond(); std::cout << GridLogMessage << "dJx took "<StoutSmearing->BaseSmear(C, U); - Cmu = peekLorentz(C, mu); + double rho=this->StoutSmearing->SmearRho[1]; + BaseSmear(Cmu, U,mu,rho); + Umu = peekLorentz(U, mu); Complex ci(0,1); for(int b=0;b(Ncb,tmp,c,b); } +#endif } ////////////////////////////////////////////////////////////////// @@ -693,15 +865,19 @@ private: const GaugeField& GaugeK,int level) { GridBase* grid = GaugeK.Grid(); - GaugeField C(grid), SigmaK(grid), iLambda(grid); + GaugeField SigmaK(grid), iLambda(grid); GaugeField SigmaKPrimeA(grid); GaugeField SigmaKPrimeB(grid); GaugeLinkField iLambda_mu(grid); GaugeLinkField iQ(grid), e_iQ(grid); GaugeLinkField SigmaKPrime_mu(grid); GaugeLinkField GaugeKmu(grid), Cmu(grid); - - this->StoutSmearing->BaseSmear(C, GaugeK); + + int mmu= (level/2) %Nd; + int cb= (level%2); + double rho=this->StoutSmearing->SmearRho[1]; + + // Can override this to do one direction only. SigmaK = Zero(); iLambda = Zero(); @@ -712,18 +888,38 @@ private: // Could get away with computing only one polarisation here // int mu= (smr/2) %Nd; // SigmaKprime_A has only one component - for (int mu = 0; mu < Nd; mu++) +#if 0 + BaseSmear(Cmu, GaugeK,mu,rho); + GaugeKmu = peekLorentz(GaugeK, mu); + SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu); + iQ = Ta(Cmu * adj(GaugeKmu)); + this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu); + pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu); + pokeLorentz(iLambda, iLambda_mu, mu); + BaseSmearDerivative(SigmaK, iLambda,GaugeK,mu,rho); // derivative of SmearBase +#else + // GaugeField C(grid); + // this->StoutSmearing->BaseSmear(C, GaugeK); + // for (int mu = 0; mu < Nd; mu++) + int mu =mmu; + BaseSmear(Cmu, GaugeK,mu,rho); { - Cmu = peekLorentz(C, mu); + // Cmu = peekLorentz(C, mu); GaugeKmu = peekLorentz(GaugeK, mu); SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu); iQ = Ta(Cmu * adj(GaugeKmu)); this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu); pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu); pokeLorentz(iLambda, iLambda_mu, mu); + std::cout << " mu "<StoutSmearing->derivative(SigmaK, iLambda,GaugeK); // derivative of SmearBase - + // GaugeField SigmaKcopy(grid); + // SigmaKcopy = SigmaK; + BaseSmearDerivative(SigmaK, iLambda,GaugeK,mu,rho); // derivative of SmearBase + // this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK); // derivative of SmearBase + // SigmaKcopy = SigmaKcopy - SigmaK; + // std::cout << " BaseSmearDerivative fast path error" < Date: Fri, 13 Oct 2023 17:56:47 +0300 Subject: [PATCH 030/114] IfGridTensor shorthand --- Grid/tensors/Tensor_trace.h | 29 +++++++++++++++++++++++++++++ Grid/tensors/Tensor_traits.h | 9 ++++++--- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/Grid/tensors/Tensor_trace.h b/Grid/tensors/Tensor_trace.h index 6aa398f9..c73d0949 100644 --- 
a/Grid/tensors/Tensor_trace.h +++ b/Grid/tensors/Tensor_trace.h @@ -69,6 +69,35 @@ accelerator_inline auto trace(const iVector &arg) -> iVector = 0, IfNotGridTensor = 0> +accelerator_inline auto traceProduct( const S1 &arg1,const S2 &arg2) + -> decltype(arg1*arg2) +{ + return arg1*arg2; +} + +template +accelerator_inline auto traceProduct(const iMatrix &arg1,const iMatrix &arg2) -> iScalar +{ + iScalar ret; + zeroit(ret._internal); + for(int i=0;i +accelerator_inline auto traceProduct(const iScalar &arg1,const iScalar &arg2) -> iScalar +{ + iScalar ret; + ret._internal=traceProduct(arg1._internal,arg2._internal); + return ret; +} NAMESPACE_END(Grid); diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h index 58fdc6ce..98bc3986 100644 --- a/Grid/tensors/Tensor_traits.h +++ b/Grid/tensors/Tensor_traits.h @@ -34,9 +34,12 @@ NAMESPACE_BEGIN(Grid); // These are the Grid tensors template struct isGridTensor : public std::false_type { static constexpr bool notvalue = true; }; - template struct isGridTensor> : public std::true_type { static constexpr bool notvalue = false; }; - template struct isGridTensor> : public std::true_type { static constexpr bool notvalue = false; }; - template struct isGridTensor> : public std::true_type { static constexpr bool notvalue = false; }; + template struct isGridTensor > : public std::true_type { static constexpr bool notvalue = false; }; + template struct isGridTensor >: public std::true_type { static constexpr bool notvalue = false; }; + template struct isGridTensor >: public std::true_type { static constexpr bool notvalue = false; }; + + template using IfGridTensor = Invoke::value, int> >; + template using IfNotGridTensor = Invoke::value, int> >; // Traits to identify scalars template struct isGridScalar : public std::false_type { static constexpr bool notvalue = true; }; From 9626a2c7c052b1f6cd6985911675f3ef38552cf4 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Oct 2023 17:57:20 +0300 Subject: [PATCH 031/114] Asynch handling --- Grid/threads/Accelerator.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 2aeb9fa7..f362a077 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -575,4 +575,11 @@ accelerator_inline void acceleratorFence(void) return; } +inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) +{ + acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); + acceleratorCopySynchronise(); +} + + NAMESPACE_END(Grid); From e19171523be4e66e59baa3408e6098a6695fb0f1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Oct 2023 17:57:56 +0300 Subject: [PATCH 032/114] FTHMC Status at lattice conference commit --- HMC/FTHMC2p1f.cc | 18 ++++++----- HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc | 2 +- HMC/Mobius2p1f_EOFA_96I_hmc.cc | 46 +++++++++++++++-------------- 3 files changed, 35 insertions(+), 31 deletions(-) diff --git a/HMC/FTHMC2p1f.cc b/HMC/FTHMC2p1f.cc index dd824138..7d93d168 100644 --- a/HMC/FTHMC2p1f.cc +++ b/HMC/FTHMC2p1f.cc @@ -54,15 +54,16 @@ int main(int argc, char **argv) // MD.name = std::string("Force Gradient"); typedef GenericHMCRunner HMCWrapper; MD.name = std::string("MinimumNorm2"); - MD.MDsteps = 12; + MD.MDsteps = 24; MD.trajL = 1.0; HMCparameters HMCparams; - HMCparams.StartTrajectory = 0; + HMCparams.StartTrajectory = 104; HMCparams.Trajectories = 200; HMCparams.NoMetropolisUntil= 20; // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; - HMCparams.StartingType =std::string("HotStart"); + // 
HMCparams.StartingType =std::string("HotStart"); + HMCparams.StartingType =std::string("CheckpointStart"); HMCparams.MD = MD; HMCWrapper TheHMC(HMCparams); @@ -87,6 +88,7 @@ int main(int argc, char **argv) // here there is too much indirection typedef PlaquetteMod PlaqObs; TheHMC.Resources.AddObservable(); + ////////////////////////////////////////////// const int Ls = 16; @@ -134,7 +136,6 @@ int main(int argc, char **argv) //////////////////////////////////// ActionLevel Level1(1); ActionLevel Level2(2); - ActionLevel Level3(4); //////////////////////////////////// // Strange action @@ -191,7 +192,7 @@ int main(int argc, char **argv) Smear_Stout Stout(rho); SmearedConfigurationMasked SmearingPolicy(GridPtr, Nstep, Stout); JacobianAction Jacobian(&SmearingPolicy); - if( ApplySmearing ) Level2.push_back(&Jacobian); + if( ApplySmearing ) Level1.push_back(&Jacobian); std::cout << GridLogMessage << " Built the Jacobian "<< std::endl; @@ -200,7 +201,7 @@ int main(int argc, char **argv) ///////////////////////////////////////////////////////////// // GaugeAction.is_smeared = ApplySmearing; GaugeAction.is_smeared = true; - Level3.push_back(&GaugeAction); + Level2.push_back(&GaugeAction); std::cout << GridLogMessage << " ************************************************"<< std::endl; std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl; @@ -210,10 +211,11 @@ int main(int argc, char **argv) std::cout << GridLogMessage << " Running the FT HMC "<< std::endl; - TheHMC.TheAction.push_back(Level1); TheHMC.TheAction.push_back(Level2); - TheHMC.TheAction.push_back(Level3); + + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); TheHMC.Run(SmearingPolicy); // for smearing diff --git a/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc b/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc index 24d8951e..83f20b92 100644 --- a/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc +++ b/HMC/Mobius2p1f_DD_EOFA_96I_mixed.cc @@ -343,7 +343,7 @@ int main(int argc, char **argv) { // Probably dominates the force - back to EOFA. OneFlavourRationalParams SFRp; SFRp.lo = 0.1; - SFRp.hi = 25.0; + SFRp.hi = 30.0; SFRp.MaxIter = 10000; SFRp.tolerance= 1.0e-5; SFRp.mdtolerance= 2.0e-4; diff --git a/HMC/Mobius2p1f_EOFA_96I_hmc.cc b/HMC/Mobius2p1f_EOFA_96I_hmc.cc index 3d674db4..91f0bd95 100644 --- a/HMC/Mobius2p1f_EOFA_96I_hmc.cc +++ b/HMC/Mobius2p1f_EOFA_96I_hmc.cc @@ -128,7 +128,7 @@ template MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD); @@ -180,7 +180,7 @@ int main(int argc, char **argv) { // 4/2 => 0.6 dH // 3/3 => 0.8 dH .. 
depth 3, slower //MD.MDsteps = 4; - MD.MDsteps = 14; + MD.MDsteps = 12; MD.trajL = 0.5; HMCparameters HMCparams; @@ -204,7 +204,7 @@ int main(int argc, char **argv) { TheHMC.Resources.LoadNerscCheckpointer(CPparams); std::cout << "loaded NERSC checpointer"< hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass }); - // std::vector hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass }); - std::vector hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 }); // Updated - // std::vector hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass }); + std::vector hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated + //std::vector hasenbusch({ 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated auto GridPtr = TheHMC.Resources.GetCartesian(); auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); @@ -277,20 +276,20 @@ int main(int argc, char **argv) { // double StoppingCondition = 1e-14; // double MDStoppingCondition = 1e-9; - double StoppingCondition = 1e-9; - double MDStoppingCondition = 1e-8; - double MDStoppingConditionLoose = 1e-8; - double MDStoppingConditionStrange = 1e-8; - double MaxCGIterations = 300000; + double StoppingCondition = 1e-14; + double MDStoppingCondition = 1e-9; + double MDStoppingConditionLoose = 1e-9; + double MDStoppingConditionStrange = 1e-9; + double MaxCGIterations = 50000; ConjugateGradient CG(StoppingCondition,MaxCGIterations); ConjugateGradient MDCG(MDStoppingCondition,MaxCGIterations); //////////////////////////////////// // Collect actions //////////////////////////////////// - // ActionLevel Level1(1); - ActionLevel Level2(1); - ActionLevel Level3(15); + ActionLevel Level1(1); + ActionLevel Level2(2); + ActionLevel Level3(4); //////////////////////////////////// // Strange action @@ -300,11 +299,11 @@ int main(int argc, char **argv) { // Probably dominates the force - back to EOFA. OneFlavourRationalParams SFRp; - SFRp.lo = 0.1; + SFRp.lo = 0.8; SFRp.hi = 30.0; SFRp.MaxIter = 10000; - SFRp.tolerance= 1.0e-8; - SFRp.mdtolerance= 2.0e-6; + SFRp.tolerance= 1.0e-12; + SFRp.mdtolerance= 1.0e-9; SFRp.degree = 10; SFRp.precision= 50; @@ -355,8 +354,10 @@ int main(int argc, char **argv) { ExactOneFlavourRatioPseudoFermionAction EOFA(Strange_Op_L, Strange_Op_R, ActionCG, - ActionCGL, ActionCGR, - DerivativeCGL, DerivativeCGR, + // ActionCGL, ActionCGR, + // DerivativeCGL, DerivativeCGR, + ActionCG, ActionCG, + DerivativeCG, DerivativeCG, SFRp, true); Level2.push_back(&EOFA); @@ -443,13 +444,14 @@ int main(int argc, char **argv) { } int nquo=Quotients.size(); for(int h=0;h Date: Fri, 13 Oct 2023 17:58:48 +0300 Subject: [PATCH 033/114] FTHMC 3 Gev --- HMC/FTHMC2p1f_3GeV.cc | 226 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 HMC/FTHMC2p1f_3GeV.cc diff --git a/HMC/FTHMC2p1f_3GeV.cc b/HMC/FTHMC2p1f_3GeV.cc new file mode 100644 index 00000000..7d93d168 --- /dev/null +++ b/HMC/FTHMC2p1f_3GeV.cc @@ -0,0 +1,226 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Copyright (C) 2023 + +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. 
+ +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +using namespace Grid; + +int main(int argc, char **argv) +{ + std::cout << std::setprecision(12); + + Grid_init(&argc, &argv); + int threads = GridThread::GetThreads(); + // here make a routine to print all the relevant information on the run + std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl; + + // Typedefs to simplify notation + typedef WilsonImplR FermionImplPolicy; + typedef MobiusFermionD FermionAction; + typedef typename FermionAction::FermionField FermionField; + + typedef Grid::XmlReader Serialiser; + + //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + IntegratorParameters MD; + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Leap Frog"); + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Force Gradient"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("MinimumNorm2"); + MD.MDsteps = 24; + MD.trajL = 1.0; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = 104; + HMCparams.Trajectories = 200; + HMCparams.NoMetropolisUntil= 20; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + // HMCparams.StartingType =std::string("HotStart"); + HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_EODWF_lat"; + CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr"; + CPparams.rng_prefix = "ckpoint_EODWF_rng"; + CPparams.saveInterval = 1; + CPparams.saveSmeared = true; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + + RNGModuleParameters RNGpar; + RNGpar.serial_seeds = "1 2 3 4 5"; + RNGpar.parallel_seeds = "6 7 8 9 10"; + TheHMC.Resources.SetRNGSeeds(RNGpar); + + // Construct observables + // here there is too much indirection + typedef PlaquetteMod PlaqObs; + TheHMC.Resources.AddObservable(); + + ////////////////////////////////////////////// + + const int Ls = 16; + Real beta = 2.13; + Real light_mass = 0.01; + Real strange_mass = 0.04; + Real pv_mass = 1.0; + RealD M5 = 1.8; + RealD b = 1.0; // Scale factor two + RealD c = 0.0; + + OneFlavourRationalParams OFRp; + OFRp.lo = 1.0e-2; + OFRp.hi = 64; + OFRp.MaxIter = 10000; + OFRp.tolerance= 1.0e-10; + OFRp.degree = 14; + OFRp.precision= 40; + + std::vector hasenbusch({ 0.1 }); + + auto GridPtr = TheHMC.Resources.GetCartesian(); + auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeField U(GridPtr); + LatticeGaugeField 
Uhot(GridPtr); + + // These lines are unecessary if BC are all periodic + std::vector boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + + double StoppingCondition = 1e-10; + double MaxCGIterations = 30000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + + bool ApplySmearing = true; + + //////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(2); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + + MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + ExactOneFlavourRatioPseudoFermionAction + EOFA(Strange_Op_L, Strange_Op_R, + CG, + CG, CG, + CG, CG, + OFRp, false); + + EOFA.is_smeared = ApplySmearing; + Level1.push_back(&EOFA); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + + int n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); + for(int h=0;h Numerators; + std::vector Denominators; + std::vector *> Quotients; + + for(int h=0;h(*Numerators[h],*Denominators[h],CG,CG)); + } + + for(int h=0;his_smeared = ApplySmearing; + Level1.push_back(Quotients[h]); + } + + ///////////////////////////////////////////////////////////// + // lnDetJacobianAction + ///////////////////////////////////////////////////////////// + double rho = 0.1; // smearing parameter + int Nsmear = 1; // number of smearing levels - must be multiple of 2Nd + int Nstep = 8*Nsmear; // number of smearing levels - must be multiple of 2Nd + Smear_Stout Stout(rho); + SmearedConfigurationMasked SmearingPolicy(GridPtr, Nstep, Stout); + JacobianAction Jacobian(&SmearingPolicy); + if( ApplySmearing ) Level1.push_back(&Jacobian); + std::cout << GridLogMessage << " Built the Jacobian "<< std::endl; + + + ///////////////////////////////////////////////////////////// + // Gauge action + ///////////////////////////////////////////////////////////// + // GaugeAction.is_smeared = ApplySmearing; + GaugeAction.is_smeared = true; + Level2.push_back(&GaugeAction); + + std::cout << GridLogMessage << " ************************************************"<< std::endl; + std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl; + std::cout << GridLogMessage << " ************************************************"<< std::endl; + std::cout << GridLogMessage << std::endl; + std::cout << GridLogMessage << std::endl; + + + std::cout << GridLogMessage << " Running the FT HMC "<< std::endl; + TheHMC.TheAction.push_back(Level1); + TheHMC.TheAction.push_back(Level2); + + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); + + TheHMC.Run(SmearingPolicy); // for smearing + + Grid_finalize(); +} // main + + + From 07e4900218c8f7bc7923cc3c9457ee3e0ddb4f3b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Oct 2023 18:20:43 +0300 Subject: [PATCH 034/114] FTHMC commit --- HMC/Mobius2p1f_DD_EOFA_96I_double.cc | 350 ++++++++++++++++++++++++++ HMC/Mobius2p1f_EOFA_96I_hmc_double.cc | 268 ++++++++++++++++++++ 2 files changed, 618 insertions(+) create mode 100644 HMC/Mobius2p1f_DD_EOFA_96I_double.cc create mode 100644 HMC/Mobius2p1f_EOFA_96I_hmc_double.cc diff --git 
a/HMC/Mobius2p1f_DD_EOFA_96I_double.cc b/HMC/Mobius2p1f_DD_EOFA_96I_double.cc new file mode 100644 index 00000000..a2af38f7 --- /dev/null +++ b/HMC/Mobius2p1f_DD_EOFA_96I_double.cc @@ -0,0 +1,350 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/Test_hmc_EODWFRatio.cc + +Copyright (C) 2015-2016 + +Author: Peter Boyle +Author: Guido Cossu + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +int main(int argc, char **argv) { + using namespace Grid; + + Grid_init(&argc, &argv); + + CartesianCommunicator::BarrierWorld(); + std::cout << GridLogMessage << " Clock skew check" < HMCWrapper; + // MD.name = std::string("Leap Frog"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("Force Gradient"); + //typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("MinimumNorm2"); + // TrajL = 2 + // 4/2 => 0.6 dH + // 3/3 => 0.8 dH .. 
depth 3, slower +  //MD.MDsteps = 4; + MD.MDsteps = 3; + MD.trajL = 0.5; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = 1077; + HMCparams.Trajectories = 1; + HMCparams.NoMetropolisUntil= 0; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + // HMCparams.StartingType =std::string("ColdStart"); + HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_DDHMC_lat"; + CPparams.rng_prefix = "ckpoint_DDHMC_rng"; + CPparams.saveInterval = 1; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + std::cout << "loaded NERSC checkpointer"< PlaqObs; + TheHMC.Resources.AddObservable(); + ////////////////////////////////////////////// + + const int Ls = 12; + RealD M5 = 1.8; + RealD b = 1.5; + RealD c = 0.5; + Real beta = 2.13; + // Real light_mass = 5.4e-4; + Real light_mass = 7.8e-4; + Real light_mass_dir = 0.01; + Real strange_mass = 0.0362; + Real pv_mass = 1.0; + std::vector hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + // std::vector hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + // std::vector hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated + // std::vector hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass }); + + int SP_iters=9000; + + RationalActionParams OFRp; // Up/down + OFRp.lo = 6.0e-5; + OFRp.hi = 90.0; + OFRp.inv_pow = 2; + OFRp.MaxIter = SP_iters; // get most shifts by 2000, stop sharing space + OFRp.action_tolerance= 1.0e-8; + OFRp.action_degree = 18; + OFRp.md_tolerance= 1.0e-7; + OFRp.md_degree = 14; + // OFRp.degree = 20; converges + // OFRp.degree = 16; + OFRp.precision= 80; + OFRp.BoundsCheckFreq=0; + std::vector ActionTolByPole({ + // 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 3.0e-7,1.0e-7,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8 + }); + std::vector MDTolByPole({ + // 1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence even more + // 1.0e-6,3.0e-7,1.0e-7,1.0e-7, + 1.0e-5,1.0e-6,1.0e-7,1.0e-7, // soften convergence + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8 + }); + + auto GridPtr = TheHMC.Resources.GetCartesian(); + auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); + + typedef SchurDiagMooeeOperator LinearOperatorD; + typedef SchurDiagMooeeOperator LinearOperatorEOFAD; + + //////////////////////////////////////////////////////////////// + // Domain decomposed + //////////////////////////////////////////////////////////////// + Coordinate latt4 = GridPtr->GlobalDimensions(); + Coordinate mpi = GridPtr->ProcessorGrid(); + Coordinate shm; + + GlobalSharedMemory::GetShmDims(mpi,shm); + + Coordinate CommDim(Nd); + for(int d=0;d1 ? 
1 : 0; + + Coordinate NonDirichlet(Nd+1,0); + Coordinate Dirichlet(Nd+1,0); + Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; + Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; + Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; + Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; + //Dirichlet[1] = 0; + //Dirichlet[2] = 0; + //Dirichlet[3] = 0; + + // + Coordinate Block4(Nd); + Block4[0] = Dirichlet[1]; + Block4[1] = Dirichlet[2]; + Block4[2] = Dirichlet[3]; + Block4[3] = Dirichlet[4]; + + int Width=4; + TheHMC.Resources.SetMomentumFilter(new DDHMCFilter(Block4,Width)); + + ////////////////////////// + // Fermion Grids + ////////////////////////// + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeFieldD U(GridPtr); U=Zero(); + + std::cout << GridLogMessage << " Running the HMC "<< std::endl; + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); + std::cout << "loaded NERSC gauge field"< boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + FermionAction::ImplParams ParamsDir(boundary); + + Params.dirichlet=NonDirichlet; + ParamsDir.dirichlet=Dirichlet; + ParamsDir.partialDirichlet=0; + std::cout << GridLogMessage<< "Partial Dirichlet depth is "< CG(StoppingCondition,MaxCGIterations); + ConjugateGradient MDCG(MDStoppingCondition,MaxCGIterations); + + //////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(3); + ActionLevel Level3(15); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); + FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params); + + // Probably dominates the force - back to EOFA. 
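// ----------------------------------------------------------------------------
// Aside: a minimal standalone sketch (illustrative only, not part of this
// patch) of the Dirichlet block geometry computed earlier in this file.
// Dirichlet[d+1] (index shifted by one for the 5d coordinate) is the local
// extent of one shared-memory domain in direction d, and is non-zero only
// when that direction actually communicates off-node. All numbers below are
// hypothetical, chosen only to make the arithmetic visible.
#include <cstdio>
int main(void) {
  const int Nd = 4;
  int latt4[Nd] = {48, 48, 48, 96};  // hypothetical global lattice
  int mpi[Nd]   = {2, 2, 2, 2};      // hypothetical MPI decomposition
  int shm[Nd]   = {2, 2, 1, 1};      // hypothetical intra-node (shm) split
  for (int d = 0; d < Nd; d++) {
    int CommDim   = (mpi[d] / shm[d]) > 1 ? 1 : 0;        // off-node comms in d?
    int Dirichlet = CommDim * latt4[d] / mpi[d] * shm[d]; // block extent, 0 = no wall
    std::printf("dir %d : Dirichlet block extent %d\n", d, Dirichlet);
  }
  return 0;
}
// ----------------------------------------------------------------------------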
+ OneFlavourRationalParams SFRp; + SFRp.lo = 0.1; + SFRp.hi = 25.0; + SFRp.MaxIter = 10000; + SFRp.tolerance= 1.0e-8; + SFRp.mdtolerance= 2.0e-6; + SFRp.degree = 12; + SFRp.precision= 50; + + MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + ConjugateGradient ActionCG(StoppingCondition,MaxCGIterations); + ConjugateGradient DerivativeCG(MDStoppingCondition,MaxCGIterations); + LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L); + LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R); + + ExactOneFlavourRatioPseudoFermionAction + EOFA(Strange_Op_L, Strange_Op_R, + ActionCG, + ActionCG, ActionCG, + DerivativeCG, DerivativeCG, + SFRp, true); + Level2.push_back(&EOFA); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + std::vector dirichlet_den; + std::vector dirichlet_num; + + int n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); dirichlet_den.push_back(0); + for(int h=0;h Numerators; + std::vector Denominators; + std::vector *> Quotients; + + std::vector *> Bdys; + + typedef SchurDiagMooeeOperator LinearOperatorD; + std::vector LinOpD; + + for(int h=0;h(*Numerators[h],*Denominators[h],MDCG,CG)); + } else { + Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); + Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); + } + } + for(int h=0;hSetTolerances(ActionTolByPole,MDTolByPole); + } + int nquo=Quotients.size(); + Level1.push_back(Bdys[0]); + Level1.push_back(Bdys[1]); + Level2.push_back(Quotients[0]); + for(int h=1;h +Author: Guido Cossu + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + + + +int main(int argc, char **argv) { + using namespace Grid; + + std::cout << " Grid Initialise "< HMCWrapper; + // MD.name = std::string("Leap Frog"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("Force Gradient"); + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("MinimumNorm2"); + // TrajL = 2 + // 4/2 => 0.6 dH + // 3/3 => 0.8 dH .. 
depth 3, slower +  //MD.MDsteps = 4; + MD.MDsteps = 8; + MD.trajL = 0.5; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = 1077; + HMCparams.Trajectories = 20; + HMCparams.NoMetropolisUntil= 0; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + HMCparams.StartingType =std::string("ColdStart"); + // HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_HMC_lat"; + CPparams.rng_prefix = "ckpoint_HMC_rng"; + CPparams.saveInterval = 1; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + std::cout << "loaded NERSC checkpointer"< PlaqObs; + TheHMC.Resources.AddObservable(); + ////////////////////////////////////////////// + + const int Ls = 12; + RealD M5 = 1.8; + RealD b = 1.5; + RealD c = 0.5; + RealD beta = 2.13; + // Real light_mass = 5.4e-4; + Real light_mass = 7.8e-4; + // Real light_mass = 7.8e-3; + Real strange_mass = 0.0362; + Real pv_mass = 1.0; + std::vector hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated + //std::vector hasenbusch({ 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated + + auto GridPtr = TheHMC.Resources.GetCartesian(); + auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); + + typedef SchurDiagMooeeOperator LinearOperatorD; + typedef SchurDiagMooeeOperator LinearOperatorEOFAD; + + //////////////////////////////////////////////////////////////// + // Domain decomposed + //////////////////////////////////////////////////////////////// + Coordinate latt4 = GridPtr->GlobalDimensions(); + Coordinate mpi = GridPtr->ProcessorGrid(); + Coordinate shm; + + GlobalSharedMemory::GetShmDims(mpi,shm); + + ////////////////////////// + // Fermion Grids + ////////////////////////// + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeFieldD U(GridPtr); U=Zero(); + + std::cout << GridLogMessage << " Running the HMC "<< std::endl; + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); + std::cout << "loaded NERSC gauge field"< boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + + // double StoppingCondition = 1e-14; + // double MDStoppingCondition = 1e-9; + double StoppingCondition = 1e-14; + double MDStoppingCondition = 1e-9; + double MDStoppingConditionLoose = 1e-9; + double MDStoppingConditionStrange = 1e-9; + double MaxCGIterations = 50000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + ConjugateGradient MDCG(MDStoppingCondition,MaxCGIterations); + + //////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(2); + ActionLevel Level3(4); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); + FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params); + + // Probably dominates the force - back to EOFA. 
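// ----------------------------------------------------------------------------
// Aside: a minimal standalone sketch (not part of this patch) of how the
// up/down Hasenbusch chain set up later in this file telescopes the light
// determinant. With den = {m_l, h_0, ..., h_{n-1}} and
// num = {h_0, ..., h_{n-1}, m_pv}, each pseudofermion carries a ratio of
// determinants at the paired masses (up to the even-odd preconditioning
// convention), and the intermediate masses cancel in the product, leaving
// det D(m_l) / det D(m_pv). Masses below are the ones set in this file.
#include <cstdio>
#include <vector>
int main(void) {
  double light_mass = 7.8e-4, pv_mass = 1.0;
  std::vector<double> hasenbusch = {0.005, 0.0145, 0.045, 0.108, 0.25, 0.35, 0.51, 0.6, 0.8};
  std::vector<double> den, num;
  den.push_back(light_mass);
  for (std::size_t h = 0; h < hasenbusch.size(); h++) {
    den.push_back(hasenbusch[h]);  // mirrors light_den.push_back(hasenbusch[h])
    num.push_back(hasenbusch[h]);  // mirrors light_num.push_back(hasenbusch[h])
  }
  num.push_back(pv_mass);
  for (std::size_t h = 0; h < num.size(); h++)
    std::printf("pseudofermion %zu : det D(%g) / det D(%g)\n", h, den[h], num[h]);
  return 0;
}
// ----------------------------------------------------------------------------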
+ OneFlavourRationalParams SFRp; + SFRp.lo = 0.8; + SFRp.hi = 30.0; + SFRp.MaxIter = 10000; + SFRp.tolerance= 1.0e-12; + SFRp.mdtolerance= 1.0e-9; + SFRp.degree = 10; + SFRp.precision= 50; + + MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + ConjugateGradient ActionCG(StoppingCondition,MaxCGIterations); + ConjugateGradient DerivativeCG(MDStoppingCondition,MaxCGIterations); + LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L); + LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R); + + ExactOneFlavourRatioPseudoFermionAction + EOFA(Strange_Op_L, Strange_Op_R, + ActionCG, + ActionCG, ActionCG, + DerivativeCG, DerivativeCG, + SFRp, true); + Level2.push_back(&EOFA); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + + int n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); + for(int h=0;h Numerators; + std::vector Denominators; + std::vector *> Quotients; + + std::vector *> Bdys; + + typedef SchurDiagMooeeOperator LinearOperatorD; + std::vector LinOpD; + + for(int h=0;h(*Numerators[h],*Denominators[h],MDCG,CG,CG)); + } + int nquo=Quotients.size(); + for(int h=0;h Date: Sat, 14 Oct 2023 00:42:55 +0300 Subject: [PATCH 035/114] FTHMC compiled and merged to develop --- Grid/lattice/Lattice_reduction_gpu.h | 2 +- Grid/qcd/utils/GaugeGroup.h | 62 +++++++++++++++++++++++++++- Grid/qcd/utils/SUn.impl.h | 2 + Grid/threads/Accelerator.cc | 2 +- Grid/threads/Accelerator.h | 16 +++---- systems/Lumi/config-command | 2 +- 6 files changed, 73 insertions(+), 13 deletions(-) diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index ecf90d19..e82494f5 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -30,7 +30,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator & cudaGetDevice(&device); #endif #ifdef GRID_HIP - hipGetDevice(&device); + auto r=hipGetDevice(&device); #endif Iterator warpSize = gpu_props[device].warpSize; diff --git a/Grid/qcd/utils/GaugeGroup.h b/Grid/qcd/utils/GaugeGroup.h index f92064f4..6811d247 100644 --- a/Grid/qcd/utils/GaugeGroup.h +++ b/Grid/qcd/utils/GaugeGroup.h @@ -100,6 +100,9 @@ class GaugeGroup { using iGroupMatrix = iScalar > >; template using iAlgebraVector = iScalar > >; + template + using iSUnAlgebraMatrix = + iScalar > >; static int su2subgroups(void) { return su2subgroups(group_name()); } ////////////////////////////////////////////////////////////////////////////////////////////////// @@ -128,10 +131,19 @@ class GaugeGroup { typedef Lattice LatticeMatrix; typedef Lattice LatticeMatrixF; typedef Lattice LatticeMatrixD; - + typedef Lattice LatticeAlgebraVector; typedef Lattice LatticeAlgebraVectorF; typedef Lattice LatticeAlgebraVectorD; + + typedef iSUnAlgebraMatrix vAlgebraMatrix; + typedef iSUnAlgebraMatrix vAlgebraMatrixF; + typedef iSUnAlgebraMatrix vAlgebraMatrixD; + + typedef Lattice LatticeAlgebraMatrix; + typedef Lattice LatticeAlgebraMatrixF; + typedef Lattice LatticeAlgebraMatrixD; + typedef iSU2Matrix SU2Matrix; typedef iSU2Matrix SU2MatrixF; @@ -160,7 +172,7 @@ class GaugeGroup { return generator(lieIndex, ta, group_name()); } - static void su2SubGroupIndex(int &i1, int &i2, int su2_index) { + static 
accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index) { return su2SubGroupIndex(i1, i2, su2_index, group_name()); } @@ -389,6 +401,52 @@ class GaugeGroup { } } +// Ta are hermitian (?) +// Anti herm is i Ta basis +static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in, int b) +{ + conformable(in, out); + GridBase *grid = out.Grid(); + LatticeComplex tmp(grid); + Matrix ta; + // Using Luchang's projection convention + // 2 Tr{Ta Tb} A_b= 2/2 delta ab A_b = A_a + autoView(out_v,out,AcceleratorWrite); + autoView(in_v,in,AcceleratorRead); + int N = ncolour; + int NNm1 = N * (N - 1); + int hNNm1= NNm1/2; + RealD sqrt_2 = sqrt(2.0); + Complex ci(0.0,1.0); + for(int su2Index=0;su2IndexoSites(),1,{ + // in is traceless ANTI-hermitian whereas Grid generators are Hermitian. + // trace( Ta x Ci in) + // Bet I need to move to real part with mult by -i + out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2))); + out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1))); + }); + } + for(int diagIndex=0;diagIndexoSites(),vComplex::Nsimd(),{ + auto tmp = in_v[ss]()()(0,0); + for(int i=1;i diff --git a/Grid/qcd/utils/SUn.impl.h b/Grid/qcd/utils/SUn.impl.h index e19f970c..02fa161b 100644 --- a/Grid/qcd/utils/SUn.impl.h +++ b/Grid/qcd/utils/SUn.impl.h @@ -10,6 +10,7 @@ // doesn't get found by the scripts/filelist during bootstrapping. private: + template static int su2subgroups(GroupName::SU) { return (ncolour * (ncolour - 1)) / 2; } //////////////////////////////////////////////////////////////////////// @@ -576,3 +577,4 @@ static void RandomGaugeTransform(GridParallelRNG &pRNG, typename Gimpl::GaugeFie LieRandomize(pRNG,g,1.0); GaugeTransform(Umu,g); } + diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 70f469b0..3769b2aa 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -147,7 +147,7 @@ void acceleratorInit(void) #define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); #define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); - hipGetDeviceProperties(&gpu_props[i], i); + auto r=hipGetDeviceProperties(&gpu_props[i], i); hipDeviceProp_t prop; prop = gpu_props[i]; totalDeviceMem = prop.totalGlobalMem; diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index f362a077..ff5ccd7a 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -405,7 +405,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) #define accelerator_barrier(dummy) \ { \ - hipStreamSynchronize(computeStream); \ + auto r=hipStreamSynchronize(computeStream); \ auto err = hipGetLastError(); \ if ( err != hipSuccess ) { \ printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \ @@ -438,19 +438,19 @@ inline void *acceleratorAllocDevice(size_t bytes) return ptr; }; -inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);}; -inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);}; -inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} -inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} +inline void acceleratorFreeShared(void *ptr){ auto r=hipFree(ptr);}; +inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);}; +inline void acceleratorCopyToDevice(void *from,void 
*to,size_t bytes) { auto r=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} +inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto r=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} //inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} //inline void acceleratorCopySynchronise(void) { } -inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);} +inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto r=hipMemset(base,value,bytes);} inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch { - hipMemcpyDtoDAsync(to,from,bytes, copyStream); + auto r=hipMemcpyDtoDAsync(to,from,bytes, copyStream); } -inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); }; +inline void acceleratorCopySynchronise(void) { auto r=hipStreamSynchronize(copyStream); }; #endif diff --git a/systems/Lumi/config-command b/systems/Lumi/config-command index 3f7877c8..5e596285 100644 --- a/systems/Lumi/config-command +++ b/systems/Lumi/config-command @@ -23,7 +23,7 @@ echo mpfr X$MPFR --disable-fermion-reps \ --disable-gparity \ CXX=hipcc MPICXX=mpicxx \ - CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++14 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \ + CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++17 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \ LDFLAGS="-L/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/lib -lmpi -L/opt/cray/pe/mpich/8.1.23/gtl/lib -lmpi_gtl_hsa -lamdhip64 -fopenmp" From 51051df62cb8f6875bafd6b54192aec513919567 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 16 Oct 2023 20:49:52 +0300 Subject: [PATCH 036/114] 3GeV run setup --- HMC/FTHMC2p1f_3GeV.cc | 20 ++++----- systems/Lumi/HMC/32cube/fthmc3gev.slurm | 57 +++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 10 deletions(-) create mode 100644 systems/Lumi/HMC/32cube/fthmc3gev.slurm diff --git a/HMC/FTHMC2p1f_3GeV.cc b/HMC/FTHMC2p1f_3GeV.cc index 7d93d168..a8aa67f8 100644 --- a/HMC/FTHMC2p1f_3GeV.cc +++ b/HMC/FTHMC2p1f_3GeV.cc @@ -58,12 +58,13 @@ int main(int argc, char **argv) MD.trajL = 1.0; HMCparameters HMCparams; - HMCparams.StartTrajectory = 104; + HMCparams.StartTrajectory = 0; HMCparams.Trajectories = 200; HMCparams.NoMetropolisUntil= 20; // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; // HMCparams.StartingType =std::string("HotStart"); - HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.StartingType =std::string("ColdStart"); + // HMCparams.StartingType =std::string("CheckpointStart"); HMCparams.MD = MD; HMCWrapper TheHMC(HMCparams); @@ -91,13 +92,13 @@ int main(int argc, char **argv) ////////////////////////////////////////////// - const int Ls = 16; - Real beta = 2.13; - Real light_mass = 0.01; - Real strange_mass = 0.04; + const int Ls = 12; + Real beta = 2.37; + Real light_mass = 0.0047; + Real strange_mass = 0.0186; Real pv_mass = 1.0; RealD M5 = 1.8; - RealD b = 1.0; // Scale factor two + RealD b = 1.0; // Scale factor one, Shamir RealD c = 0.0; OneFlavourRationalParams OFRp; @@ -108,7 +109,7 @@ int main(int argc, char **argv) OFRp.degree = 14; OFRp.precision= 40; - std::vector hasenbusch({ 0.1 }); + std::vector hasenbusch({ 0.05, 0.1, 0.25, 0.5 }); auto GridPtr = TheHMC.Resources.GetCartesian(); auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); @@ -199,8 +200,7 @@ int main(int argc, char **argv) 
///////////////////////////////////////////////////////////// // Gauge action ///////////////////////////////////////////////////////////// - // GaugeAction.is_smeared = ApplySmearing; - GaugeAction.is_smeared = true; + GaugeAction.is_smeared = ApplySmearing; Level2.push_back(&GaugeAction); std::cout << GridLogMessage << " ************************************************"<< std::endl; diff --git a/systems/Lumi/HMC/32cube/fthmc3gev.slurm b/systems/Lumi/HMC/32cube/fthmc3gev.slurm new file mode 100644 index 00000000..4cdc5136 --- /dev/null +++ b/systems/Lumi/HMC/32cube/fthmc3gev.slurm @@ -0,0 +1,57 @@ +#!/bin/bash -l +#SBATCH --job-name=fthmc3ge +#SBATCH --partition=small-g +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +##SBATCH --cpus-per-task=8 +#SBATCH --gpus-per-node=8 +#SBATCH --time=2:00:00 +#SBATCH --account=project_465000546 +#SBATCH --gpu-bind=none +#SBATCH --exclusive +#SBATCH --mem=0 + + +#sbatch --dependency=afterany:$SLURM_JOBID fthmc3gev.slurm + +CPU_BIND="map_ldom:3,3,1,1,0,0,2,2" +MEM_BIND="map_mem:3,3,1,1,0,0,2,2" +echo $CPU_BIND + +cat << EOF > ./select_gpu +#!/bin/bash +export GPU_MAP=(0 1 2 3 4 5 6 7) +export NUMA_MAP=(3 3 1 1 0 0 2 2) +export GPU=\${GPU_MAP[\$SLURM_LOCALID]} +export NUM=\${NUMA_MAP[\$SLURM_LOCALID]} +#export HIP_VISIBLE_DEVICES=\$GPU +export ROCR_VISIBLE_DEVICES=\$GPU +echo RANK \$SLURM_LOCALID using GPU \$GPU +echo NUMA \$SLURM_LOCALID using NUMA \${NUM} +echo numactl -m \$NUM -N \$NUM \$* +exec numactl -m \$NUM -N \$NUM \$* +EOF +cat ./select_gpu + +chmod +x ./select_gpu + +root=/scratch/project_465000546/boylepet/Grid/systems/Lumi +source ${root}/sourceme.sh + +export OMP_NUM_THREADS=7 +export MPICH_SMP_SINGLE_COPY_MODE=CMA +export MPICH_GPU_SUPPORT_ENABLED=1 + +#cfg=`ls -rt ckpoint_*lat* | tail -n 1 ` +#traj="${cfg#*.}" +#cfg=`ls -rt ckpoint_*lat* | tail -n 1 ` +traj=0 + +vol=32.32.32.64 +mpi=1.2.2.2 +PARAMS="--mpi $mpi --accelerator-threads 16 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol" +#HMCPARAMS="--StartingType CheckpointStart --StartingTrajectory $traj --Trajectories 200" +HMCPARAMS="--StartingType ColdStart --StartingTrajectory $traj --Trajectories 20" + +srun ./select_gpu ../FTHMC2p1f_3GeV $HMCPARAMS $PARAMS + From 391fd9cc6a4c4d826eb71c2d8d761afaea3b7a69 Mon Sep 17 00:00:00 2001 From: david clarke Date: Tue, 17 Oct 2023 14:57:15 -0600 Subject: [PATCH 037/114] try lepage term --- Grid/qcd/smearing/HISQSmearing.h | 52 +++++++++++++++++++++++++------- examples/Example_plaquette.cc | 14 +++++---- tests/smearing/Test_fatLinks.cc | 4 +-- 3 files changed, 51 insertions(+), 19 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 432184e0..44e14e85 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -81,8 +81,8 @@ struct SmearingParameters{ /*! @brief create fat links from link variables */ -template -class Smear_HISQ_fat { +template +class Smear_HISQ_fat : public Gimpl { private: GridCartesian* const _grid; @@ -90,6 +90,8 @@ private: public: + INHERIT_GIMPL_TYPES(Gimpl); + // Don't allow default values here. Smear_HISQ_fat(GridCartesian* grid, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) : _grid(grid), @@ -141,6 +143,7 @@ public: // This is where contributions from the smearing get added together Ughost_fat=Zero(); + // This loop handles 3-, 5-, and 7-link constructs, minus Lepage and Naik. 
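// ----------------------------------------------------------------------------
// Aside: an illustrative, standalone sketch (not part of this patch) of the
// flat indexing behind stencilIndex(mu,nu) as defined in this header. The
// shift list is appended in blocks of Nshifts entries per (mu,nu) pair -- 5
// per pair at this point in the series, grown to 6 by a later patch -- so
// entry i of pair (mu,nu) lives at stencilIndex(mu,nu)+i. mu==nu blocks are
// allocated to keep the arithmetic simple but never read.
#include <cstdio>
int main(void) {
  const int Nd = 4, Nshifts = 5;
  for (int mu = 0; mu < Nd; mu++)
    for (int nu = 0; nu < Nd; nu++)
      std::printf("(mu=%d, nu=%d) -> base offset %d%s\n",
                  mu, nu, Nshifts * nu + Nd * Nshifts * mu,
                  mu == nu ? "  (allocated but unused)" : "");
  return 0;
}
// The 3/5/7-link loop that follows consumes these entries in order.
// ----------------------------------------------------------------------------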
for(int mu=0;mu_offset; SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; - // gpermutes will be replaced with single line of code, combines load and permute - // into one step. still in pull request stage U0 = coalescedReadGeneralPermute( U_v[x_p_mu ](nu ),SE0->_permute,Nd); U1 = coalescedReadGeneralPermute(U_3link_v[x_p_nu ](rho),SE1->_permute,Nd); U2 = coalescedReadGeneralPermute( U_v[x ](nu ),SE2->_permute,Nd); @@ -234,8 +233,7 @@ public: } } - // 7-link - for(int site=0;site U(Nd, u_thin.Grid()); + std::vector V(Nd, u_smr.Grid()); + for (int mu = 0; mu < Nd; mu++) { + U[mu] = PeekIndex(u_thin, mu); + V[mu] = PeekIndex(u_smr, mu); + } + + // Compute LePage term from U_thin: + for(int mu=0;mu(u_smr, V[mu], mu); + } + }; // void derivative(const GaugeField& Gauge) const { // }; diff --git a/examples/Example_plaquette.cc b/examples/Example_plaquette.cc index 17de4762..faf17d82 100644 --- a/examples/Example_plaquette.cc +++ b/examples/Example_plaquette.cc @@ -29,7 +29,6 @@ public: // Gimpl seems to be an arbitrary class. Within this class, it is expected that certain types are // already defined, things like Scalar and Field. This macro includes a bunch of #typedefs that // implement this equivalence at compile time. - // WARNING: The first time you include this or take it out, the compile time will increase a lot. INHERIT_GIMPL_TYPES(Gimpl); // Some example Gimpls can be found in GaugeImplementations.h, at the bottom. These are in turn built @@ -53,12 +52,15 @@ public: // U_mu_nu(x) static void dirPlaquette(GaugeMat &plaq, const std::vector &U, const int mu, const int nu) { - // These CovShift calls seem to carry out the multiplication already. A positive shift moves the lattice - // site x_mu = 1 in the RHS to x_mu = 0 in the result. + // Calls like CovShiftForward and CovShiftBackward have 3 arguments, and they multiply together + // the first and last argument. (Second arg gives the shift direction.) The CovShiftIdentityBackward + // has meanwhile only two arguments; it just returns the shifted (adjoint since backward) link. plaq = Gimpl::CovShiftForward(U[mu],mu, - Gimpl::CovShiftForward(U[nu],nu, - Gimpl::CovShiftBackward(U[mu],mu, - Gimpl::CovShiftIdentityBackward(U[nu], nu)))); + // Means Link*Cshift(field,mu,1), arguments are Link, mu, field in that order. + Gimpl::CovShiftForward(U[nu],nu, + Gimpl::CovShiftBackward(U[mu],mu, + // This means Cshift(adj(Link), mu, -1) + Gimpl::CovShiftIdentityBackward(U[nu], nu)))); } // tr U_mu_nu(x) diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index f5c7b5ca..1b24c7ca 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -90,14 +90,14 @@ int main (int argc, char** argv) { NerscIO::readConfiguration(Umu, header, conf_in); // Smear Umu and store result in U_smr - Smear_HISQ_fat hisq_fat(&GRID,1/8.,0.,1/16.,1/64.,1/384.,0.); + Smear_HISQ_fat hisq_fat(&GRID,1/8.,0.,1/16.,1/64.,1/384.,-1/8.); hisq_fat.smear(U_smr,Umu); NerscIO::writeConfiguration(U_smr,conf_out,"HISQ"); // Test a C-style instantiation double path_coeff[6] = {1, 2, 3, 4, 5, 6}; - Smear_HISQ_fat hisq_fat_Cstyle(&GRID,path_coeff); + Smear_HISQ_fat hisq_fat_Cstyle(&GRID,path_coeff); // Make sure result doesn't change w.r.t. 
a trusted lattice NerscIO::readConfiguration(Umu, header, "nersc.l8t4b3360.357link.control"); From 7bb8ab7000962dd9da9e56254c4550be4cbe0c44 Mon Sep 17 00:00:00 2001 From: david clarke Date: Fri, 20 Oct 2023 08:41:02 -0600 Subject: [PATCH 038/114] improve smearing templating --- Grid/qcd/smearing/HISQSmearing.h | 4 +++- tests/smearing/Test_fatLinks.cc | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 44e14e85..82eedc1d 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -81,7 +81,8 @@ struct SmearingParameters{ /*! @brief create fat links from link variables */ -template +//template +template class Smear_HISQ_fat : public Gimpl { private: @@ -91,6 +92,7 @@ private: public: INHERIT_GIMPL_TYPES(Gimpl); + typedef typename Gimpl::GaugeField LGF; // Don't allow default values here. Smear_HISQ_fat(GridCartesian* grid, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index 1b24c7ca..2ff3c116 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -90,14 +90,14 @@ int main (int argc, char** argv) { NerscIO::readConfiguration(Umu, header, conf_in); // Smear Umu and store result in U_smr - Smear_HISQ_fat hisq_fat(&GRID,1/8.,0.,1/16.,1/64.,1/384.,-1/8.); + Smear_HISQ_fat hisq_fat(&GRID,1/8.,0.,1/16.,1/64.,1/384.,-1/8.); hisq_fat.smear(U_smr,Umu); NerscIO::writeConfiguration(U_smr,conf_out,"HISQ"); // Test a C-style instantiation double path_coeff[6] = {1, 2, 3, 4, 5, 6}; - Smear_HISQ_fat hisq_fat_Cstyle(&GRID,path_coeff); + Smear_HISQ_fat hisq_fat_Cstyle(&GRID,path_coeff); // Make sure result doesn't change w.r.t. a trusted lattice NerscIO::readConfiguration(Umu, header, "nersc.l8t4b3360.357link.control"); From 21ed6ac0f4c442564da142098952bf6b226c7d52 Mon Sep 17 00:00:00 2001 From: david clarke Date: Fri, 20 Oct 2023 13:54:26 -0600 Subject: [PATCH 039/114] added floating-point support --- Grid/qcd/smearing/HISQSmearing.h | 25 +++++++++++++------------ tests/smearing/Test_fatLinks.cc | 10 ++++++---- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 82eedc1d..9ac6eddd 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -81,7 +81,7 @@ struct SmearingParameters{ /*! @brief create fat links from link variables */ -//template +//template template class Smear_HISQ_fat : public Gimpl { @@ -92,7 +92,8 @@ private: public: INHERIT_GIMPL_TYPES(Gimpl); - typedef typename Gimpl::GaugeField LGF; + typedef typename Gimpl::GaugeField GF; + typedef typename Gimpl::GaugeLinkField LF; // Don't allow default values here. Smear_HISQ_fat(GridCartesian* grid, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) @@ -112,20 +113,20 @@ public: ~Smear_HISQ_fat() {} - void smear(LGF& u_smr, LGF& u_thin) const { + void smear(GF& u_smr, GF& u_thin) const { SmearingParameters lt = this->_linkTreatment; // Create a padded cell of extra padding depth=1 and fill the padding. int depth = 1; PaddedCell Ghost(depth,this->_grid); - LGF Ughost = Ghost.Exchange(u_thin); + GF Ughost = Ghost.Exchange(u_thin); // This is where auxiliary N-link fields and the final smear will be stored. 
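// ----------------------------------------------------------------------------
// Aside: a minimal sketch (not part of this patch, assuming the PaddedCell
// API used in this routine) of the halo pattern smear() follows: exchange
// once to build a depth-1 ghost field, do all stencil arithmetic locally on
// the padded grid, then extract back to the thin grid at the end.
template<class GF>
void paddedSmearSketch(GridCartesian* grid, GF& out, const GF& in) {
  int depth = 1;
  PaddedCell Ghost(depth, grid);      // halo-exchange helper, as used above
  GF Ughost = Ghost.Exchange(in);     // ghosted copy: neighbour sites now local
  GF Ughost_work(Ughost.Grid());      // workspace on the padded grid
  Ughost_work = Ughost;               // ...link products would be built here...
  out = Ghost.Extract(Ughost_work);   // strip the halo off the accumulated result
}
// The auxiliary fields declared next play the Ughost_work role, one per
// N-link stage of the fat-link construction.
// ----------------------------------------------------------------------------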
- LGF Ughost_fat(Ughost.Grid()); - LGF Ughost_3link(Ughost.Grid()); - LGF Ughost_5linkA(Ughost.Grid()); - LGF Ughost_5linkB(Ughost.Grid()); + GF Ughost_fat(Ughost.Grid()); + GF Ughost_3link(Ughost.Grid()); + GF Ughost_5linkA(Ughost.Grid()); + GF Ughost_5linkB(Ughost.Grid()); // Create 3-link stencil. We allow mu==nu just to make the indexing easier. // Shifts with mu==nu will not be used. @@ -279,8 +280,8 @@ public: u_smr = Ghost.Extract(Ughost_fat) + lt.c_1*u_thin; // Load up U and V std::vectors to access thin and smeared links. - std::vector U(Nd, u_thin.Grid()); - std::vector V(Nd, u_smr.Grid()); + std::vector U(Nd, u_thin.Grid()); + std::vector V(Nd, u_smr.Grid()); for (int mu = 0; mu < Nd; mu++) { U[mu] = PeekIndex(u_thin, mu); V[mu] = PeekIndex(u_smr, mu); @@ -317,7 +318,7 @@ public: /*! @brief create long links from link variables. */ -template +template class Smear_HISQ_Naik { private: @@ -333,7 +334,7 @@ public: ~Smear_HISQ_Naik() {} -// void smear(LGF& u_smr, const LGF& U) const { +// void smear(GF& u_smr, const GF& U) const { // }; // void derivative(const GaugeField& Gauge) const { diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index 2ff3c116..acfb626c 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -67,6 +67,8 @@ int main (int argc, char** argv) { std::string conf_out = "nersc.l8t4b3360.357link"; int threads = GridThread::GetThreads(); + typedef LatticeGaugeFieldD LGF; + // Initialize the Grid Grid_init(&argc,&argv); Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); @@ -81,9 +83,9 @@ int main (int argc, char** argv) { ConfParameters param(Reader); if(param.benchmark) Grid_log(" Nloop = ",param.Nloop); - // Instantiate the LatticeGaugeField objects holding thin (Umu) and fat (U_smr) links - LatticeGaugeField Umu(&GRID); - LatticeGaugeField U_smr(&GRID); + // Instantiate the LGF objects holding thin (Umu) and fat (U_smr) links + LGF Umu(&GRID); + LGF U_smr(&GRID); // Read the configuration into Umu FieldMetaData header; @@ -101,7 +103,7 @@ int main (int argc, char** argv) { // Make sure result doesn't change w.r.t. 
a trusted lattice NerscIO::readConfiguration(Umu, header, "nersc.l8t4b3360.357link.control"); - LatticeGaugeField diff(&GRID); + LGF diff(&GRID); diff = Umu-U_smr; auto absDiff = norm2(diff)/norm2(Umu); Grid_log(" |Umu-U|/|Umu| = ",absDiff); From f2648e94b92c9939e33f55e401ec2fab01d7f553 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Mon, 23 Oct 2023 13:47:41 +0200 Subject: [PATCH 040/114] getHostPointer added to Lattice --- Grid/lattice/Lattice_view.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h index cb568abd..064c10e6 100644 --- a/Grid/lattice/Lattice_view.h +++ b/Grid/lattice/Lattice_view.h @@ -45,6 +45,7 @@ public: }; // Host only GridBase * getGrid(void) const { return _grid; }; + vobj* getHostPointer(void) const { return _odata; }; }; ///////////////////////////////////////////////////////////////////////////////////////// From 3d3376d1a321b007daf52dcbfc8746da59611b96 Mon Sep 17 00:00:00 2001 From: david clarke Date: Fri, 27 Oct 2023 16:26:31 -0600 Subject: [PATCH 041/114] LePage works, trying Naik --- Grid/qcd/smearing/HISQSmearing.h | 46 +++++++++++--------------------- tests/smearing/Test_fatLinks.cc | 6 ++--- 2 files changed, 19 insertions(+), 33 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 9ac6eddd..4ec61141 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -56,7 +56,7 @@ void appendShift(std::vector& shifts, int dir, Args... args) { /*! @brief figure out the stencil index from mu and nu */ inline int stencilIndex(int mu, int nu) { // Nshifts depends on how you built the stencil - int Nshifts = 5; + int Nshifts = 6; return Nshifts*nu + Nd*Nshifts*mu; } @@ -128,8 +128,8 @@ public: GF Ughost_5linkA(Ughost.Grid()); GF Ughost_5linkB(Ughost.Grid()); - // Create 3-link stencil. We allow mu==nu just to make the indexing easier. - // Shifts with mu==nu will not be used. + // mu-nu plane stencil. We allow mu==nu to make indexing the stencil easier, + // but these entries will not be used. std::vector shifts; for(int mu=0;mu_permute,Nd)) U3matrix; - stencilElement SE0, SE1, SE2, SE3, SE4; + stencilElement SE0, SE1, SE2, SE3, SE4, SE5; U3matrix U0, U1, U2, U3, U4, U5, W; + for(int site=0;site_offset; SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; + SE5 = gStencil.GetEntry(s+5,site); int x_m_mu = SE5->_offset; // When you're deciding whether to take an adjoint, the question is: how is the // stored link oriented compared to the one you want? If I imagine myself travelling @@ -194,9 +197,17 @@ public: // "left" "right" W = U2*U1*adj(U0) + adj(U5)*U4*U3; + // Save 3-link construct for later and add to smeared field. U_3link_v[site](nu) = W; + U_fat_v[site](mu) = U_fat_v[site](mu) + lt.c_3*W; - U_fat_v[site](mu) = U_fat_v[site](mu) + lt.c_3*W; + U0 = coalescedReadGeneralPermute(U_v[x_m_mu](mu),SE5->_permute,Nd); + U1 = coalescedReadGeneralPermute(U_v[x ](mu),SE2->_permute,Nd); + U2 = coalescedReadGeneralPermute(U_v[x_p_mu](mu),SE0->_permute,Nd); + W = U0*U1*U2; + + // Add Naik term to smeared field. + U_fat_v[site](mu) = U_fat_v[site](mu) + lt.c_naik*W; } } @@ -317,29 +328,4 @@ public: }; -/*! @brief create long links from link variables. 
*/ -template -class Smear_HISQ_Naik { - -private: - GridCartesian* const _grid; - -public: - - // Eventually this will take, e.g., coefficients as argument - Smear_HISQ_Naik(GridCartesian* grid) : _grid(grid) { - assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); - assert(Nd == 4 && "HISQ smearing only defined for Nd==4"); - } - - ~Smear_HISQ_Naik() {} - -// void smear(GF& u_smr, const GF& U) const { -// }; - -// void derivative(const GaugeField& Gauge) const { -// }; -}; - - NAMESPACE_END(Grid); \ No newline at end of file diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index acfb626c..61668b66 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -64,7 +64,7 @@ int main (int argc, char** argv) { int Nt = 4; Coordinate latt_size(Nd,0); latt_size[0]=Ns; latt_size[1]=Ns; latt_size[2]=Ns; latt_size[3]=Nt; std::string conf_in = "nersc.l8t4b3360"; - std::string conf_out = "nersc.l8t4b3360.357link"; + std::string conf_out = "nersc.l8t4b3360.357lplink"; int threads = GridThread::GetThreads(); typedef LatticeGaugeFieldD LGF; @@ -92,7 +92,7 @@ int main (int argc, char** argv) { NerscIO::readConfiguration(Umu, header, conf_in); // Smear Umu and store result in U_smr - Smear_HISQ_fat hisq_fat(&GRID,1/8.,0.,1/16.,1/64.,1/384.,-1/8.); + Smear_HISQ_fat hisq_fat(&GRID,1/8.,-1/24.,1/16.,1/64.,1/384.,-1/8.); hisq_fat.smear(U_smr,Umu); NerscIO::writeConfiguration(U_smr,conf_out,"HISQ"); @@ -102,7 +102,7 @@ int main (int argc, char** argv) { Smear_HISQ_fat hisq_fat_Cstyle(&GRID,path_coeff); // Make sure result doesn't change w.r.t. a trusted lattice - NerscIO::readConfiguration(Umu, header, "nersc.l8t4b3360.357link.control"); + NerscIO::readConfiguration(Umu, header, "nersc.l8t4b3360.357lplink.control"); LGF diff(&GRID); diff = Umu-U_smr; auto absDiff = norm2(diff)/norm2(Umu); From df9b958c40e920ff8e3ae5c565be9be57b377735 Mon Sep 17 00:00:00 2001 From: david clarke Date: Mon, 30 Oct 2023 17:40:53 -0600 Subject: [PATCH 042/114] naik now returns separately --- Grid/log/Log.h | 22 +++++++++++++--- Grid/qcd/smearing/HISQSmearing.h | 27 ++++++++++++-------- tests/smearing/Test_fatLinks.cc | 44 ++++++++++++++++++++------------ 3 files changed, 63 insertions(+), 30 deletions(-) diff --git a/Grid/log/Log.h b/Grid/log/Log.h index b88bf61f..370b0428 100644 --- a/Grid/log/Log.h +++ b/Grid/log/Log.h @@ -179,11 +179,11 @@ extern GridLogger GridLogSolver; extern GridLogger GridLogError; extern GridLogger GridLogWarning; extern GridLogger GridLogMessage; -extern GridLogger GridLogDebug ; +extern GridLogger GridLogDebug; extern GridLogger GridLogPerformance; extern GridLogger GridLogDslash; -extern GridLogger GridLogIterative ; -extern GridLogger GridLogIntegrator ; +extern GridLogger GridLogIterative; +extern GridLogger GridLogIntegrator; extern GridLogger GridLogHMC; extern GridLogger GridLogMemory; extern GridLogger GridLogTracing; @@ -209,7 +209,21 @@ inline void Grid_log(Args&&... args) { template inline void Grid_warn(Args&&... args) { std::string msg = sjoin(std::forward(args)...); - std::cout << GridLogWarning << msg << std::endl; + std::cout << "\033[33m" << GridLogWarning << msg << "\033[0m" << std::endl; +} + +/*! @brief make error messages work like python print */ +template +inline void Grid_error(Args&&... args) { + std::string msg = sjoin(std::forward(args)...); + std::cout << "\033[31m" << GridLogError << msg << "\033[0m" << std::endl; +} + +/*! 
@brief make pass messages work like python print */ +template +inline void Grid_pass(Args&&... args) { + std::string msg = sjoin(std::forward(args)...); + std::cout << "\033[32m" << GridLogMessage << msg << "\033[0m" << std::endl; } #define _NBACKTRACE (256) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 4ec61141..a0b60dcd 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -81,9 +81,9 @@ struct SmearingParameters{ /*! @brief create fat links from link variables */ -//template template -class Smear_HISQ_fat : public Gimpl { +class Smear_HISQ : public Gimpl { +// TODO: this needs to be renamed, becaues the Naik guy is not part of the fat smear private: GridCartesian* const _grid; @@ -96,7 +96,7 @@ public: typedef typename Gimpl::GaugeLinkField LF; // Don't allow default values here. - Smear_HISQ_fat(GridCartesian* grid, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) + Smear_HISQ(GridCartesian* grid, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) : _grid(grid), _linkTreatment(c1,cnaik,c3,c5,c7,clp) { assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); @@ -104,16 +104,18 @@ public: } // Allow to pass a pointer to a C-style, double array for MILC convenience - Smear_HISQ_fat(GridCartesian* grid, double* coeff) + Smear_HISQ(GridCartesian* grid, double* coeff) : _grid(grid), _linkTreatment(coeff[0],coeff[1],coeff[2],coeff[3],coeff[4],coeff[5]) { assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); assert(Nd == 4 && "HISQ smearing only defined for Nd==4"); } - ~Smear_HISQ_fat() {} + ~Smear_HISQ() {} - void smear(GF& u_smr, GF& u_thin) const { + // Intent: OUT--u_smr, u_naik + // IN--u_thin + void smear(GF& u_smr, GF& u_naik, GF& u_thin) const { SmearingParameters lt = this->_linkTreatment; @@ -127,6 +129,7 @@ public: GF Ughost_3link(Ughost.Grid()); GF Ughost_5linkA(Ughost.Grid()); GF Ughost_5linkB(Ughost.Grid()); + GF Ughost_naik(Ughost.Grid()); // mu-nu plane stencil. We allow mu==nu to make indexing the stencil easier, // but these entries will not be used. @@ -146,6 +149,7 @@ public: // This is where contributions from the smearing get added together Ughost_fat=Zero(); + Ughost_naik=Zero(); // This loop handles 3-, 5-, and 7-link constructs, minus Lepage and Naik. for(int mu=0;mu_permute,Nd); U1 = coalescedReadGeneralPermute(U_v[x ](mu),SE2->_permute,Nd); U2 = coalescedReadGeneralPermute(U_v[x_p_mu](mu),SE0->_permute,Nd); W = U0*U1*U2; - - // Add Naik term to smeared field. - U_fat_v[site](mu) = U_fat_v[site](mu) + lt.c_naik*W; + U_naik_v[site](mu) = U_fat_v[site](mu) + lt.c_naik*W; } } @@ -290,6 +294,9 @@ public: // c1, c3, c5, c7 construct contributions u_smr = Ghost.Extract(Ughost_fat) + lt.c_1*u_thin; + // Naik contribution + u_naik = Ghost.Extract(Ughost_naik); + // Load up U and V std::vectors to access thin and smeared links. 
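// ----------------------------------------------------------------------------
// Aside: an illustrative sketch (not part of this patch) of the Peek/Poke
// pattern used just below, with the template arguments written out in full.
// PeekIndex<LorentzIndex> pulls one direction out of a LatticeGaugeField into
// a per-direction link field; PokeIndex writes it back.
void peekPokeSketch(LatticeGaugeFieldD& Umu) {
  std::vector<LatticeColourMatrixD> U(Nd, Umu.Grid());
  for (int mu = 0; mu < Nd; mu++)
    U[mu] = PeekIndex<LorentzIndex>(Umu, mu);   // read out direction mu
  for (int mu = 0; mu < Nd; mu++)
    PokeIndex<LorentzIndex>(Umu, U[mu], mu);    // write it back unchanged
}
// The member code continues with exactly this pattern for U, V and Vnaik.
// ----------------------------------------------------------------------------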
std::vector U(Nd, u_thin.Grid()); std::vector V(Nd, u_smr.Grid()); diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index 61668b66..05e80929 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -57,6 +57,22 @@ struct ConfParameters: Serializable { // another : input --> unitarize // + +void testSmear(GridCartesian& GRID, LatticeGaugeFieldD Umu, LatticeGaugeFieldD Usmr, LatticeGaugeFieldD Unaik, + LatticeGaugeFieldD Ucontrol, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) { + Smear_HISQ hisq_fat(&GRID,c1,cnaik,c3,c5,c7,clp); + hisq_fat.smear(Usmr, Unaik, Umu); + LatticeGaugeFieldD diff(&GRID); + diff = Ucontrol-Usmr; + auto absDiff = norm2(diff)/norm2(Ucontrol); + if (absDiff < 1e-30) { + Grid_pass(" |Umu-U|/|Umu| = ",absDiff); + } else { + Grid_error(" |Umu-U|/|Umu| = ",absDiff); + } +} + + int main (int argc, char** argv) { // Params for the test. @@ -83,30 +99,26 @@ int main (int argc, char** argv) { ConfParameters param(Reader); if(param.benchmark) Grid_log(" Nloop = ",param.Nloop); - // Instantiate the LGF objects holding thin (Umu) and fat (U_smr) links - LGF Umu(&GRID); - LGF U_smr(&GRID); + LGF Umu(&GRID), Usmr(&GRID), Unaik(&GRID), Ucontrol(&GRID); // Read the configuration into Umu FieldMetaData header; NerscIO::readConfiguration(Umu, header, conf_in); - // Smear Umu and store result in U_smr - Smear_HISQ_fat hisq_fat(&GRID,1/8.,-1/24.,1/16.,1/64.,1/384.,-1/8.); - hisq_fat.smear(U_smr,Umu); - - NerscIO::writeConfiguration(U_smr,conf_out,"HISQ"); + // Carry out various tests + NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.357lplink.control"); + testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,1/64.,1/384.,-1/8.); + NerscIO::writeConfiguration(Usmr,conf_out,"HISQ"); + NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.357link.control"); + testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,1/64.,1/384.,0.); + NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.35link.control"); + testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,1/64.,0.,0.); + NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.3link.control"); + testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,0.,0.,0.); // Test a C-style instantiation double path_coeff[6] = {1, 2, 3, 4, 5, 6}; - Smear_HISQ_fat hisq_fat_Cstyle(&GRID,path_coeff); - - // Make sure result doesn't change w.r.t. 
a trusted lattice - NerscIO::readConfiguration(Umu, header, "nersc.l8t4b3360.357lplink.control"); - LGF diff(&GRID); - diff = Umu-U_smr; - auto absDiff = norm2(diff)/norm2(Umu); - Grid_log(" |Umu-U|/|Umu| = ",absDiff); + Smear_HISQ hisq_fat_Cstyle(&GRID,path_coeff); if (param.benchmark) { From 69c869d345dc0b842138cab082cf01d84b5e8dac Mon Sep 17 00:00:00 2001 From: david clarke Date: Mon, 30 Oct 2023 17:41:52 -0600 Subject: [PATCH 043/114] fixed stupid typo --- Grid/qcd/smearing/HISQSmearing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index a0b60dcd..46cc527d 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -211,7 +211,7 @@ public: U1 = coalescedReadGeneralPermute(U_v[x ](mu),SE2->_permute,Nd); U2 = coalescedReadGeneralPermute(U_v[x_p_mu](mu),SE0->_permute,Nd); W = U0*U1*U2; - U_naik_v[site](mu) = U_fat_v[site](mu) + lt.c_naik*W; + U_naik_v[site](mu) = lt.c_naik*W; } } From 2ae2a81e859976996894d289f198b53dfe19c637 Mon Sep 17 00:00:00 2001 From: david clarke Date: Tue, 31 Oct 2023 13:54:55 -0600 Subject: [PATCH 044/114] attempt to fix Naik --- Grid/qcd/smearing/HISQSmearing.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 46cc527d..43fe0f77 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -203,15 +203,16 @@ public: W = U2*U1*adj(U0) + adj(U5)*U4*U3; // Save 3-link construct for later and add to smeared field. - U_3link_v[site](nu) = W; - U_fat_v[site](mu) = U_fat_v[site](mu) + lt.c_3*W; + U_3link_v[x](nu) = W; + U_fat_v[x](mu) = U_fat_v[x](mu) + lt.c_3*W; - // TODO: May need to be shifted by 1? + // Naik term starts at x-mu, save at x-mu. The idea will be to keep track + // of this shift, and then take into account when we use Naik later. U0 = coalescedReadGeneralPermute(U_v[x_m_mu](mu),SE5->_permute,Nd); U1 = coalescedReadGeneralPermute(U_v[x ](mu),SE2->_permute,Nd); U2 = coalescedReadGeneralPermute(U_v[x_p_mu](mu),SE0->_permute,Nd); W = U0*U1*U2; - U_naik_v[site](mu) = lt.c_naik*W; + U_naik_v[x_m_mu](mu) = lt.c_naik*W; } } @@ -239,12 +240,12 @@ public: W = U2*U1*adj(U0) + adj(U5)*U4*U3; if(sigmaIndex<3) { - U_5linkA_v[site](rho) = W; + U_5linkA_v[x](rho) = W; } else { - U_5linkB_v[site](rho) = W; + U_5linkB_v[x](rho) = W; } - U_fat_v[site](mu) = U_fat_v[site](mu) + lt.c_5*W; + U_fat_v[x](mu) = U_fat_v[x](mu) + lt.c_5*W; sigmaIndex++; } @@ -282,7 +283,7 @@ public: W = U2*U1*adj(U0) + adj(U5)*U4*U3; - U_fat_v[site](mu) = U_fat_v[site](mu) + lt.c_7*W; + U_fat_v[x](mu) = U_fat_v[x](mu) + lt.c_7*W; sigmaIndex++; } From c8b17c9526679541443141559604636fe0657e66 Mon Sep 17 00:00:00 2001 From: david clarke Date: Thu, 2 Nov 2023 12:43:22 -0600 Subject: [PATCH 045/114] Naik to CShift --- Grid/qcd/smearing/HISQSmearing.h | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 43fe0f77..36f66ee5 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -129,7 +129,6 @@ public: GF Ughost_3link(Ughost.Grid()); GF Ughost_5linkA(Ughost.Grid()); GF Ughost_5linkB(Ughost.Grid()); - GF Ughost_naik(Ughost.Grid()); // mu-nu plane stencil. We allow mu==nu to make indexing the stencil easier, // but these entries will not be used. 
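// ----------------------------------------------------------------------------
// Aside: a sketch (assumed form, not copied from the patch) of what "Naik to
// CShift" means here: the straight three-link Naik path
//   U_mu(x) U_mu(x+mu) U_mu(x+2mu)
// written with nested covariant shifts instead of a general stencil, using
// CovShiftForward(L, mu, f) = L * Cshift(f, mu, 1).
template<class Gimpl>
typename Gimpl::GaugeLinkField naikLinkSketch(const typename Gimpl::GaugeLinkField& Umu, int mu) {
  return Gimpl::CovShiftForward(Umu, mu,
           Gimpl::CovShiftForward(Umu, mu, Umu));
}
// Scaled by c_naik, this is the object the smearing routine now hands back
// separately from the fat link.
// ----------------------------------------------------------------------------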
@@ -149,7 +148,6 @@ public: // This is where contributions from the smearing get added together Ughost_fat=Zero(); - Ughost_naik=Zero(); // This loop handles 3-, 5-, and 7-link constructs, minus Lepage and Naik. for(int mu=0;mu_permute,Nd); - U1 = coalescedReadGeneralPermute(U_v[x ](mu),SE2->_permute,Nd); - U2 = coalescedReadGeneralPermute(U_v[x_p_mu](mu),SE0->_permute,Nd); - W = U0*U1*U2; - U_naik_v[x_m_mu](mu) = lt.c_naik*W; } } @@ -295,19 +284,24 @@ public: // c1, c3, c5, c7 construct contributions u_smr = Ghost.Extract(Ughost_fat) + lt.c_1*u_thin; - // Naik contribution - u_naik = Ghost.Extract(Ughost_naik); - // Load up U and V std::vectors to access thin and smeared links. std::vector U(Nd, u_thin.Grid()); std::vector V(Nd, u_smr.Grid()); + std::vector Vnaik(Nd, u_naik.Grid()); for (int mu = 0; mu < Nd; mu++) { - U[mu] = PeekIndex(u_thin, mu); - V[mu] = PeekIndex(u_smr, mu); + U[mu] = PeekIndex(u_thin, mu); + V[mu] = PeekIndex(u_smr, mu); + Vnaik[mu] = PeekIndex(u_naik, mu); } - // Compute LePage term from U_thin: for(int mu=0;mu Date: Fri, 3 Nov 2023 14:11:38 -0600 Subject: [PATCH 046/114] fix naik bug --- Grid/qcd/smearing/HISQSmearing.h | 10 +++++----- tests/smearing/Test_fatLinks.cc | 23 ++++++++++++++++------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 36f66ee5..959a6cf0 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -291,15 +291,14 @@ public: for (int mu = 0; mu < Nd; mu++) { U[mu] = PeekIndex(u_thin, mu); V[mu] = PeekIndex(u_smr, mu); - Vnaik[mu] = PeekIndex(u_naik, mu); } for(int mu=0;mu(u_smr, V[mu], mu); + PokeIndex(u_smr , V[mu] , mu); + PokeIndex(u_naik, Vnaik[mu], mu); } }; diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index 05e80929..e3aa0e33 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -62,13 +62,20 @@ void testSmear(GridCartesian& GRID, LatticeGaugeFieldD Umu, LatticeGaugeFieldD U LatticeGaugeFieldD Ucontrol, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) { Smear_HISQ hisq_fat(&GRID,c1,cnaik,c3,c5,c7,clp); hisq_fat.smear(Usmr, Unaik, Umu); - LatticeGaugeFieldD diff(&GRID); - diff = Ucontrol-Usmr; - auto absDiff = norm2(diff)/norm2(Ucontrol); - if (absDiff < 1e-30) { - Grid_pass(" |Umu-U|/|Umu| = ",absDiff); + LatticeGaugeFieldD diff1(&GRID), diff2(&GRID); + diff1 = Ucontrol-Usmr; + diff2 = Ucontrol-Unaik; + auto absDiff1 = norm2(diff1)/norm2(Ucontrol); + auto absDiff2 = norm2(diff2)/norm2(Ucontrol); + if (absDiff1 < 1e-30) { + Grid_pass(" |Umu-Usmr|/|Umu| = ",absDiff1); } else { - Grid_error(" |Umu-U|/|Umu| = ",absDiff); + Grid_error(" |Umu-Usmr|/|Umu| = ",absDiff1); + } + if (absDiff2 < 1e-30) { + Grid_pass(" |Umu-Unaik|/|Umu| = ",absDiff2); + } else { + Grid_error(" |Umu-Unaik|/|Umu| = ",absDiff2); } } @@ -114,7 +121,9 @@ int main (int argc, char** argv) { NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.35link.control"); testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,1/64.,0.,0.); NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.3link.control"); - testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,0.,0.,0.); + testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,1.,1/16.,0.,0.,0.); + NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.3link.control"); + testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,2.,1/16.,0.,0.,0.); // Test a C-style instantiation double path_coeff[6] = {1, 2, 3, 4, 5, 6}; From 
7d077fe4930a678d04a3f1330470246719a60735 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 9 Nov 2023 13:58:44 -0500 Subject: [PATCH 047/114] Frontier compile --- HMC/HMC2p1f_3GeV.cc | 226 ++++++++++++++++++++++++++++++++ systems/Frontier/config-command | 23 ++++ systems/Frontier/sourceme.sh | 13 ++ 3 files changed, 262 insertions(+) create mode 100644 HMC/HMC2p1f_3GeV.cc create mode 100644 systems/Frontier/config-command create mode 100644 systems/Frontier/sourceme.sh diff --git a/HMC/HMC2p1f_3GeV.cc b/HMC/HMC2p1f_3GeV.cc new file mode 100644 index 00000000..4bf088d7 --- /dev/null +++ b/HMC/HMC2p1f_3GeV.cc @@ -0,0 +1,226 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Copyright (C) 2023 + +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +using namespace Grid; + +int main(int argc, char **argv) +{ + std::cout << std::setprecision(12); + + Grid_init(&argc, &argv); + int threads = GridThread::GetThreads(); + // here make a routine to print all the relevant information on the run + std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl; + + // Typedefs to simplify notation + typedef WilsonImplR FermionImplPolicy; + typedef MobiusFermionD FermionAction; + typedef typename FermionAction::FermionField FermionField; + + typedef Grid::XmlReader Serialiser; + + //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + IntegratorParameters MD; + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Leap Frog"); + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Force Gradient"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("MinimumNorm2"); + MD.MDsteps = 24; + MD.trajL = 1.0; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = 0; + HMCparams.Trajectories = 200; + HMCparams.NoMetropolisUntil= 20; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + // HMCparams.StartingType =std::string("HotStart"); + HMCparams.StartingType =std::string("ColdStart"); + // HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_EODWF_lat"; + CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr"; + CPparams.rng_prefix = "ckpoint_EODWF_rng"; + CPparams.saveInterval = 1; + CPparams.saveSmeared = true; + CPparams.format = "IEEE64BIG"; + 
TheHMC.Resources.LoadNerscCheckpointer(CPparams); + + RNGModuleParameters RNGpar; + RNGpar.serial_seeds = "1 2 3 4 5"; + RNGpar.parallel_seeds = "6 7 8 9 10"; + TheHMC.Resources.SetRNGSeeds(RNGpar); + + // Construct observables + // here there is too much indirection + typedef PlaquetteMod PlaqObs; + TheHMC.Resources.AddObservable(); + + ////////////////////////////////////////////// + + const int Ls = 12; + Real beta = 2.37; + Real light_mass = 0.0047; + Real strange_mass = 0.0186; + Real pv_mass = 1.0; + RealD M5 = 1.8; + RealD b = 1.0; // Scale factor one, Shamir + RealD c = 0.0; + + OneFlavourRationalParams OFRp; + OFRp.lo = 1.0e-2; + OFRp.hi = 64; + OFRp.MaxIter = 10000; + OFRp.tolerance= 1.0e-10; + OFRp.degree = 14; + OFRp.precision= 40; + + std::vector hasenbusch({ 0.05, 0.1, 0.25, 0.5 }); + + auto GridPtr = TheHMC.Resources.GetCartesian(); + auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeField U(GridPtr); + LatticeGaugeField Uhot(GridPtr); + + // These lines are unecessary if BC are all periodic + std::vector boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + + double StoppingCondition = 1e-10; + double MaxCGIterations = 30000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + + bool ApplySmearing = false; + + //////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(2); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + + MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + ExactOneFlavourRatioPseudoFermionAction + EOFA(Strange_Op_L, Strange_Op_R, + CG, + CG, CG, + CG, CG, + OFRp, false); + + EOFA.is_smeared = ApplySmearing; + Level1.push_back(&EOFA); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + + int n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); + for(int h=0;h Numerators; + std::vector Denominators; + std::vector *> Quotients; + + for(int h=0;h(*Numerators[h],*Denominators[h],CG,CG)); + } + + for(int h=0;his_smeared = ApplySmearing; + Level1.push_back(Quotients[h]); + } + + ///////////////////////////////////////////////////////////// + // lnDetJacobianAction + ///////////////////////////////////////////////////////////// + double rho = 0.1; // smearing parameter + int Nsmear = 1; // number of smearing levels - must be multiple of 2Nd + int Nstep = 8*Nsmear; // number of smearing levels - must be multiple of 2Nd + Smear_Stout Stout(rho); + SmearedConfigurationMasked SmearingPolicy(GridPtr, Nstep, Stout); + JacobianAction Jacobian(&SmearingPolicy); + if( ApplySmearing ) Level1.push_back(&Jacobian); + std::cout << GridLogMessage << " Built the Jacobian "<< std::endl; + + + ///////////////////////////////////////////////////////////// + // Gauge action + ///////////////////////////////////////////////////////////// + GaugeAction.is_smeared = ApplySmearing; + Level2.push_back(&GaugeAction); + + std::cout << GridLogMessage << " 
************************************************"<< std::endl; + std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl; + std::cout << GridLogMessage << " ************************************************"<< std::endl; + std::cout << GridLogMessage << std::endl; + std::cout << GridLogMessage << std::endl; + + + std::cout << GridLogMessage << " Running the FT HMC "<< std::endl; + TheHMC.TheAction.push_back(Level1); + TheHMC.TheAction.push_back(Level2); + + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); + + TheHMC.Run(SmearingPolicy); // for smearing + + Grid_finalize(); +} // main + + + diff --git a/systems/Frontier/config-command b/systems/Frontier/config-command new file mode 100644 index 00000000..60ff464c --- /dev/null +++ b/systems/Frontier/config-command @@ -0,0 +1,23 @@ +CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-` +../../configure --enable-comms=mpi-auto \ +--with-lime=$CLIME \ +--enable-unified=no \ +--enable-shm=nvlink \ +--enable-tracing=timer \ +--enable-accelerator=hip \ +--enable-gen-simd-width=64 \ +--disable-gparity \ +--disable-fermion-reps \ +--enable-simd=GPU \ +--enable-accelerator-cshift \ +--with-gmp=$OLCF_GMP_ROOT \ +--with-fftw=$FFTW_DIR/.. \ +--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \ +--disable-fermion-reps \ +CXX=hipcc MPICXX=mpicxx \ +CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -fgpu-sanitize" \ + LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 " + + + + diff --git a/systems/Frontier/sourceme.sh b/systems/Frontier/sourceme.sh new file mode 100644 index 00000000..987241b4 --- /dev/null +++ b/systems/Frontier/sourceme.sh @@ -0,0 +1,13 @@ +. 
/autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh +spack load c-lime +#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib +module load emacs +module load PrgEnv-gnu +module load rocm +module load cray-mpich/8.1.23 +module load gmp +module load cray-fftw +module load craype-accel-amd-gfx90a +export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH +#Hack for lib +#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH From b77a9b8947a3f3d871a39f9c9fe836be18f9285d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 30 Nov 2023 14:31:51 -0500 Subject: [PATCH 048/114] SDDC compiles starting --- systems/SDCC-A100/config-command | 17 +++++++++++++++++ systems/SDCC-A100/sourceme.sh | 2 ++ systems/SDCC-ICE/config-command | 14 ++++++++++++++ systems/SDCC-ICE/sourceme.sh | 1 + 4 files changed, 34 insertions(+) create mode 100644 systems/SDCC-A100/config-command create mode 100644 systems/SDCC-A100/sourceme.sh create mode 100644 systems/SDCC-ICE/config-command create mode 100644 systems/SDCC-ICE/sourceme.sh diff --git a/systems/SDCC-A100/config-command b/systems/SDCC-A100/config-command new file mode 100644 index 00000000..cb773e7a --- /dev/null +++ b/systems/SDCC-A100/config-command @@ -0,0 +1,17 @@ +../../configure \ +--enable-comms=mpi-auto \ +--enable-unified=no \ +--enable-shm=nvlink \ +--enable-accelerator=cuda \ +--enable-gen-simd-width=64 \ +--enable-simd=GPU \ +--enable-accelerator-cshift \ +--disable-fermion-reps \ +--disable-gparity \ +CXX=nvcc \ +MPICXX=mpicxx \ +LDFLAGS="-cudart shared " \ +CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared" + + + diff --git a/systems/SDCC-A100/sourceme.sh b/systems/SDCC-A100/sourceme.sh new file mode 100644 index 00000000..2aa86b7e --- /dev/null +++ b/systems/SDCC-A100/sourceme.sh @@ -0,0 +1,2 @@ +module load cuda/12.2 +module load openmpi diff --git a/systems/SDCC-ICE/config-command b/systems/SDCC-ICE/config-command new file mode 100644 index 00000000..28e560e3 --- /dev/null +++ b/systems/SDCC-ICE/config-command @@ -0,0 +1,14 @@ +../../configure \ +--enable-comms=mpi \ +--enable-unified=yes \ +--enable-shm=shmopen \ +--enable-accelerator=none \ +--enable-simd=AVX2 \ +--disable-accelerator-cshift \ +--disable-fermion-reps \ +--disable-gparity \ +CXX=mpicxx \ +CXXFLAGS="-std=c++17" + + + diff --git a/systems/SDCC-ICE/sourceme.sh b/systems/SDCC-ICE/sourceme.sh new file mode 100644 index 00000000..a620dea5 --- /dev/null +++ b/systems/SDCC-ICE/sourceme.sh @@ -0,0 +1 @@ +module load openmpi From 14643c0aab28c0b78f2cff1718bb454ceacd95f6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 4 Dec 2023 15:45:57 -0500 Subject: [PATCH 049/114] SDCC benchmarking scripts for A100 nodes and IceLake nodes (AVX512) --- benchmarks/Benchmark_dwf_fp32.cc | 9 +++++++ systems/SDCC-A100/bench.slurm | 42 ++++++++++++++++++++++++++++++++ systems/SDCC-A100/config-command | 2 +- systems/SDCC-ICE/bench.slurm | 31 +++++++++++++++++++++++ systems/SDCC-ICE/config-command | 11 ++++++--- systems/SDCC-ICE/sourceme.sh | 1 + 6 files changed, 92 insertions(+), 4 deletions(-) create mode 100644 systems/SDCC-A100/bench.slurm create mode 100644 systems/SDCC-ICE/bench.slurm diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index ae7cabec..37287595 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -185,6 +185,7 @@ void Benchmark(int Ls, Coordinate Dirichlet) 
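 // HotConfiguration fills the links with random SU(3) matrices; the cold start
 // added (commented out) below sets every link to the identity, for which the
 // plaquette is exactly 1 -- a cheap known-answer check when debugging the
 // benchmark.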
GaugeField Umu(UGrid); GaugeField UmuCopy(UGrid); SU::HotConfiguration(RNG4,Umu); + // SU::ColdConfiguration(Umu); UmuCopy=Umu; std::cout << GridLogMessage << "Random gauge initialised " << std::endl; @@ -307,6 +308,14 @@ void Benchmark(int Ls, Coordinate Dirichlet) if(( n2e>1.0e-4) ) { std::cout<Barrier(); + std::cout<Barrier(); exit(-1); } assert (n2e< 1.0e-4 ); diff --git a/systems/SDCC-A100/bench.slurm b/systems/SDCC-A100/bench.slurm new file mode 100644 index 00000000..04d1e1e2 --- /dev/null +++ b/systems/SDCC-A100/bench.slurm @@ -0,0 +1,42 @@ +#!/bin/bash +#SBATCH --partition csi +#SBATCH --time=00:10:00 +#SBATCH -A csigeneral +#SBATCH --exclusive +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --qos csi +#SBATCH --gres=gpu:4 + +source sourceme.sh + +cat << EOF > select_gpu +#!/bin/bash +export GPU_MAP=(0 1 2 3) +export GPU=\${GPU_MAP[\$SLURM_LOCALID]} +export CUDA_VISIBLE_DEVICES=\$GPU +unset ROCR_VISIBLE_DEVICES +echo RANK \$SLURM_LOCALID using GPU \$GPU +exec \$* +EOF +chmod +x ./select_gpu + + +export OMP_NUM_THREADS=4 +export OMPI_MCA_btl=^uct,openib +export UCX_TLS=cuda,gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc +export UCX_RNDV_SCHEME=put_zcopy +export UCX_RNDV_THRESH=16384 +export UCX_IB_GPU_DIRECT_RDMA=no +export UCX_MEMTYPE_CACHE=n + +export OMP_NUM_THREAD=8 +#srun -N1 -n1 nvidia-smi +#srun -N1 -n1 numactl -H > numa.txt +srun -N1 -n1 lstopo A100-topo.pdf + +# 4.35 TF/s +#srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --shm 2048 --shm-mpi 0 --accelerator-threads 16 + +srun -N1 -n4 ./select_gpu ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 32.32.64.64 --shm 2048 --shm-mpi 0 --accelerator-threads 16 + diff --git a/systems/SDCC-A100/config-command b/systems/SDCC-A100/config-command index cb773e7a..26ad5377 100644 --- a/systems/SDCC-A100/config-command +++ b/systems/SDCC-A100/config-command @@ -5,7 +5,7 @@ --enable-accelerator=cuda \ --enable-gen-simd-width=64 \ --enable-simd=GPU \ ---enable-accelerator-cshift \ +--disable-accelerator-cshift \ --disable-fermion-reps \ --disable-gparity \ CXX=nvcc \ diff --git a/systems/SDCC-ICE/bench.slurm b/systems/SDCC-ICE/bench.slurm new file mode 100644 index 00000000..76beb828 --- /dev/null +++ b/systems/SDCC-ICE/bench.slurm @@ -0,0 +1,31 @@ +#!/bin/bash +#SBATCH --partition lqcd +#SBATCH --time=00:20:00 +#SBATCH -A lqcdtest +#SBATCH --exclusive +#SBATCH --nodes=1 +#SBATCH --ntasks=2 +#SBATCH --qos lqcd + +source sourceme.sh + +export OMP_NUM_THREAD=24 +#srun -N1 -n1 numactl -H > numa.txt +#srun -N1 -n1 lstopo ice-topo.pdf + +cat << EOF > select_socket +#!/bin/bash +export NUM_MAP=(0 1) +export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]} +exec \$* +EOF +chmod +x ./select_socket + +#for vol in 8.8.8.16 8.8.8.32 8.8.8.64 +#for vol in 8.8.16.16 8.8.16.32 8.8.16.64 +for vol in 8.16.16.16 8.16.16.32 8.16.16.64 16.16.16.32 16.16.16.64 24.24.24.64 32.32.32.32 +do +srun --cpu-bind=ldoms -N1 -n2 ./select_socket ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid $vol --dslash-asm > $vol.2socket.out +srun --cpu-bind=ldoms -N1 -n1 ./select_socket ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm > $vol.1socket.out +done + diff --git a/systems/SDCC-ICE/config-command b/systems/SDCC-ICE/config-command index 28e560e3..bc28c96d 100644 --- a/systems/SDCC-ICE/config-command +++ b/systems/SDCC-ICE/config-command @@ -1,13 +1,18 @@ ../../configure \ ---enable-comms=mpi \ +--enable-debug \ +--enable-comms=mpi-auto \ --enable-unified=yes \ --enable-shm=shmopen \ +--enable-shm-fast-path=shmopen \ 
--enable-accelerator=none \ ---enable-simd=AVX2 \ +--enable-simd=AVX512 \ --disable-accelerator-cshift \ --disable-fermion-reps \ --disable-gparity \ -CXX=mpicxx \ +CXX=clang++ \ +MPICXX=mpicxx \ +LDFLAGS=-L/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/hwloc-2.9.1-hgkscnt5pferhtde4ahctlupb6qf3vtl/lib/ \ +LIBS=-lhwloc \ CXXFLAGS="-std=c++17" diff --git a/systems/SDCC-ICE/sourceme.sh b/systems/SDCC-ICE/sourceme.sh index a620dea5..6263063c 100644 --- a/systems/SDCC-ICE/sourceme.sh +++ b/systems/SDCC-ICE/sourceme.sh @@ -1 +1,2 @@ +export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-12.0.1-agey6vtuw3e375rewhhobvkznjh5ltz4/lib/:$LD_LIBRARY_PATH module load openmpi From d1d98272638250f6ed579d760af0ca4f267004b0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 8 Dec 2023 12:11:03 -0500 Subject: [PATCH 050/114] Integrator logging update --- Grid/qcd/action/ActionBase.h | 16 +++++++++++ Grid/qcd/hmc/integrators/Integrator.h | 39 ++++++++++++++++++++++++--- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/Grid/qcd/action/ActionBase.h b/Grid/qcd/action/ActionBase.h index d34702c1..8acae81b 100644 --- a/Grid/qcd/action/ActionBase.h +++ b/Grid/qcd/action/ActionBase.h @@ -129,6 +129,22 @@ public: virtual ~Action(){} }; +template +class EmptyAction : public Action +{ + virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions + virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action + virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative + + /////////////////////////////// + // Logging + /////////////////////////////// + virtual std::string action_name() { return std::string("Level Force Log"); }; + virtual std::string LogParameters() { return std::string("No parameters");}; +}; + + + NAMESPACE_END(Grid); #endif // ACTION_BASE_H diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 4dd5a634..f3c728fc 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -86,7 +86,8 @@ public: MomentumFilterBase const* MomFilter; const ActionSet as; - + ActionSet LevelForces; + //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default static MomentumFilterBase const* getDefaultMomFilter(){ static MomentumFilterNone filter; @@ -123,7 +124,8 @@ public: void update_P(MomentaField& Mom, Field& U, int level, double ep) { // input U actually not used in the fundamental case // Fundamental updates, include smearing - + + Field level_force(U.Grid()); level_force =Zero(); for (int a = 0; a < as[level].actions.size(); ++a) { double start_full = usecond(); @@ -144,7 +146,10 @@ public: MomFilter->applyFilter(force); std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x]) Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR; @@ -167,6 +172,16 @@ public: } + { + // total force + Real force_abs = std::sqrt(norm2(level_force)/U.Grid()->gSites()); //average per-site norm. nb. 
norm2(latt) = \sum_x norm2(latt[x]) + Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR; + + Real force_max = std::sqrt(maxLocalNorm2(level_force)); + Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR; + LevelForces[level].actions.at(0)->deriv_log(force_abs,force_max,impulse_abs,impulse_max); + } + // Force from the other representations as[level].apply(update_P_hireps, Representations, Mom, U, ep); @@ -216,6 +231,12 @@ public: //Default the momentum filter to "do-nothing" MomFilter = getDefaultMomFilter(); + + for (int level = 0; level < as.size(); ++level) { + ActionLevel Level; + Level.push_back(new EmptyAction); + LevelForces.push_back(Level); // does it copy by value or reference?? + } }; virtual ~Integrator() {} @@ -237,6 +258,8 @@ public: for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { as[level].actions.at(actionID)->reset_timer(); } + int actionID=0; + LevelForces[level].actions.at(actionID)->reset_timer(); } } void print_timer(void) @@ -298,6 +321,16 @@ public: <<" calls " << as[level].actions.at(actionID)->deriv_num << std::endl; } + int actionID=0; + std::cout << GridLogMessage + << LevelForces[level].actions.at(actionID)->action_name() + <<"["<deriv_max_average() + <<" norm " << LevelForces[level].actions.at(actionID)->deriv_norm_average() + <<" Fdt max " << LevelForces[level].actions.at(actionID)->Fdt_max_average() + <<" Fdt norm " << LevelForces[level].actions.at(actionID)->Fdt_norm_average() + <<" calls " << LevelForces[level].actions.at(actionID)->deriv_num + << std::endl; } std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; } From 645e47c1ba526f5695d309dadfe089f68840fb34 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 8 Dec 2023 16:17:56 -0500 Subject: [PATCH 051/114] Config for Ampere Altra ARM --- systems/SDCC-ARM/config-command-mpi | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 systems/SDCC-ARM/config-command-mpi diff --git a/systems/SDCC-ARM/config-command-mpi b/systems/SDCC-ARM/config-command-mpi new file mode 100644 index 00000000..882cfe56 --- /dev/null +++ b/systems/SDCC-ARM/config-command-mpi @@ -0,0 +1,6 @@ +HDF=$HOME/paboyle/install + +LDFLAGS=-L$HDF/lib CXX=clang++ ../../configure --enable-simd=NEONv8 --enable-comms=none --enable-unified=yes --disable-fermion-reps --disable-gparity --disable-debug --with-hdf5=$HDF +#LDFLAGS=-L$HDF/lib CXX=clang++ ../../configure --enable-simd=GEN --enable-comms=none --enable-unified=yes --disable-fermion-reps --disable-gparity --disable-debug --with-hdf5=$HDF + + From f48298ad4e58386b6eb4edbf0fe045a353bca6c7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 11 Dec 2023 20:56:03 -0500 Subject: [PATCH 052/114] Bug fix --- Grid/qcd/hmc/integrators/Integrator.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index f3c728fc..385ff986 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -86,6 +86,7 @@ public: MomentumFilterBase const* MomFilter; const ActionSet as; + ActionSet LevelForces; //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default @@ -124,6 +125,8 @@ public: void update_P(MomentaField& Mom, Field& U, int level, double ep) { // input U actually not used in the fundamental case // Fundamental updates, include smearing + + assert(as.size()==LevelForces.size()); Field level_force(U.Grid()); level_force =Zero(); 
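    // level_force accumulates the filtered force summed over every action on
    // this integrator level; the aggregate is logged further down via the
    // EmptyAction bookkeeping entry held in LevelForces[level], so per-level
    // totals appear alongside the per-action force and impulse statistics.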
for (int a = 0; a < as[level].actions.size(); ++a) { @@ -233,9 +236,13 @@ public: MomFilter = getDefaultMomFilter(); for (int level = 0; level < as.size(); ++level) { - ActionLevel Level; - Level.push_back(new EmptyAction); - LevelForces.push_back(Level); // does it copy by value or reference?? + int multiplier = as.at(level).multiplier; + ActionLevel * Level = new ActionLevel(multiplier); + Level->push_back(new EmptyAction); + LevelForces.push_back(*Level); + // does it copy by value or reference?? + // - answer it copies by value, BUT the action level contains a reference that is NOT updated. + // Unsafe code in Guido's area } }; @@ -254,12 +261,14 @@ public: void reset_timer(void) { + assert(as.size()==LevelForces.size()); for (int level = 0; level < as.size(); ++level) { for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { as[level].actions.at(actionID)->reset_timer(); } int actionID=0; - LevelForces[level].actions.at(actionID)->reset_timer(); + assert(LevelForces.at(level).actions.size()==1); + LevelForces.at(level).actions.at(actionID)->reset_timer(); } } void print_timer(void) @@ -352,6 +361,13 @@ public: std::cout << as[level].actions.at(actionID)->LogParameters(); } } + std::cout << " [Integrator] Total Force loggers: "<< LevelForces.size() <action_name() << "] ID: " << actionID << std::endl; + } + } std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; } @@ -433,6 +449,7 @@ public: RealD S(Field& U) { // here also U not used + assert(as.size()==LevelForces.size()); std::cout << GridLogIntegrator << "Integrator action\n"; RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom From 2a0d75bac215d5b34e39ce638dc6b2933de13fb5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 21 Dec 2023 23:19:11 +0000 Subject: [PATCH 053/114] Aurora files --- systems/Aurora/benchmarks/bench.pbs | 54 +++++++++ systems/Aurora/benchmarks/bench2.pbs | 107 ++++++++++++++++++ systems/Aurora/benchmarks/gpu_tile_compact.sh | 65 +++++++++++ .../Aurora/benchmarks/gpu_tile_compact4.sh | 60 ++++++++++ systems/Aurora/config-command | 16 +++ systems/Aurora/proxies.sh | 9 ++ systems/Aurora/sourceme.sh | 12 ++ 7 files changed, 323 insertions(+) create mode 100644 systems/Aurora/benchmarks/bench.pbs create mode 100644 systems/Aurora/benchmarks/bench2.pbs create mode 100755 systems/Aurora/benchmarks/gpu_tile_compact.sh create mode 100755 systems/Aurora/benchmarks/gpu_tile_compact4.sh create mode 100644 systems/Aurora/config-command create mode 100644 systems/Aurora/proxies.sh create mode 100644 systems/Aurora/sourceme.sh diff --git a/systems/Aurora/benchmarks/bench.pbs b/systems/Aurora/benchmarks/bench.pbs new file mode 100644 index 00000000..a12cde07 --- /dev/null +++ b/systems/Aurora/benchmarks/bench.pbs @@ -0,0 +1,54 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=1 +#PBS -l walltime=01:00:00 +##PBS -A Aurora_Deployment +#PBS -A LatticeQCD_aesp + +HDIR=/home/paboyle/ +#module use /soft/testing/modulefiles/ +#module load intel-UMD23.05.25593.11/23.05.25593.11 +#module load tools/pti-gpu +#export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH +#export PATH=$HDIR/tools/bin:$PATH + +export TZ='/usr/share/zoneinfo/US/Central' +export OMP_PROC_BIND=spread +export OMP_NUM_THREADS=3 +unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +echo Jobid: $PBS_JOBID +echo Running on host `hostname` +echo Running on nodes 
`cat $PBS_NODEFILE` + +echo NODES +cat $PBS_NODEFILE +NNODES=`wc -l < $PBS_NODEFILE` +NRANKS=12 # Number of MPI ranks per node +NDEPTH=4 # Number of hardware threads per rank, spacing between MPI ranks on a node +NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS + +NTOTRANKS=$(( NNODES * NRANKS )) + +echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}" +echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES" + + +#CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \ +# ./gpu_tile_compact.sh \ +# ./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \ +# --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 1.1.2.6 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD diff --git a/systems/Aurora/benchmarks/bench2.pbs b/systems/Aurora/benchmarks/bench2.pbs new file mode 100644 index 00000000..6c3384dd --- /dev/null +++ b/systems/Aurora/benchmarks/bench2.pbs @@ -0,0 +1,107 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=2 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +HDIR=/home/paboyle/ +#module use /soft/testing/modulefiles/ +#module load intel-UMD23.05.25593.11/23.05.25593.11 +#module load tools/pti-gpu +#export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH +#export PATH=$HDIR/tools/bin:$PATH + +export TZ='/usr/share/zoneinfo/US/Central' +export OMP_PROC_BIND=spread +export OMP_NUM_THREADS=3 +unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + + +echo Jobid: $PBS_JOBID +echo Running on host `hostname` +echo Running on nodes `cat $PBS_NODEFILE` + +echo NODES +cat $PBS_NODEFILE +NNODES=`wc -l < $PBS_NODEFILE` +NRANKS=12 # Number of MPI ranks per node +NDEPTH=4 # Number of hardware threads per rank, spacing between MPI ranks on a node +NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS + +NTOTRANKS=$(( NNODES * NRANKS )) + +echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}" +echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES" + + +CMD="mpiexec -np 2 -ppn 1 -d ${NDEPTH} -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 1.1.1.2 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0 +#$CMD | tee 1-to-1.comms.hmem0 +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 +#$CMD | tee 1-to-1.comms.hmem1 + + +CMD="mpiexec -np 4 -ppn 2 -d ${NDEPTH} --cpu-bind=depth -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 2.2.1.1 --grid 32.24.32.96 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 +#$CMD | tee 2-to-2.comms.hmem1 + +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0 +#$CMD | tee 2-to-2.comms.hmem0 + +CMD="mpiexec -np 6 -ppn 3 -d ${NDEPTH} --cpu-bind=depth -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 3.2.1.1 --grid 32.24.32.96 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 +#$CMD | tee 3-to-3.comms.hmem1 + +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0 +#$CMD | tee 
3-to-3.comms.hmem0 + + +CMD="mpiexec -np 8 -ppn 4 -d ${NDEPTH} --cpu-bind=depth -envall \ + ./gpu_tile_compact4.sh \ + ./Benchmark_comms_host_device --mpi 2.2.2.1 --grid 32.24.32.96 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 +$CMD | tee 4-to-4.comms.hmem1.nic-affinity + +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0 +$CMD | tee 4-to-4.comms.hmem0.nic-affinity + + +CMD="mpiexec -np 12 -ppn 6 -d ${NDEPTH} --cpu-bind=depth -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 3.2.2.1 --grid 32.24.32.96 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 +#$CMD | tee 6-to-6.comms.hmem1 + +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0 +#$CMD | tee 6-to-6.comms.hmem0 + + +CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 3.2.2.2 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 +#$CMD | tee 12-to-12.comms.hmem1 + +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=0 +#$CMD | tee 12-to-12.comms.hmem0 diff --git a/systems/Aurora/benchmarks/gpu_tile_compact.sh b/systems/Aurora/benchmarks/gpu_tile_compact.sh new file mode 100755 index 00000000..4ea4b113 --- /dev/null +++ b/systems/Aurora/benchmarks/gpu_tile_compact.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +display_help() { + echo " Will map gpu tile to rank in compact and then round-robin fashion" + echo " Usage (only work for one node of ATS/PVC):" + echo " mpiexec --np N gpu_tile_compact.sh ./a.out" + echo + echo " Example 3 GPU of 2 Tiles with 7 Ranks:" + echo " 0 Rank 0.0" + echo " 1 Rank 0.1" + echo " 2 Rank 1.0" + echo " 3 Rank 1.1" + echo " 4 Rank 2.0" + echo " 5 Rank 2.1" + echo " 6 Rank 0.0" + echo + echo " Hacked together by apl@anl.gov, please contact if bug found" + exit 1 +} + +#This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence. +#works? 
num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices") +num_gpu=6 +num_tile=2 + +if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then + display_help +fi + + + +gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu )) +tile_id=$((PALS_LOCAL_RANKID % num_tile)) + +export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) +export NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 ) +export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) +export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) +export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]} +export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]} +export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} +export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]} + +export GRID_MPICH_NIC_BIND=$NIC + +unset EnableWalkerPartition +export EnableImplicitScaling=0 +export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1 +export ZE_AFFINITY_MASK=$gpu_id.$tile_id +#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id +export ONEAPI_DEVICE_FILTER=gpu,level_zero +export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 +#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 + +echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND" + +if [ $PALS_LOCAL_RANKID = 0 ] +then +numactl -m $NUMA -N $NUMA "$@" +else +numactl -m $NUMA -N $NUMA "$@" +fi diff --git a/systems/Aurora/benchmarks/gpu_tile_compact4.sh b/systems/Aurora/benchmarks/gpu_tile_compact4.sh new file mode 100755 index 00000000..c157b853 --- /dev/null +++ b/systems/Aurora/benchmarks/gpu_tile_compact4.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +display_help() { + echo " Will map gpu tile to rank in compact and then round-robin fashion" + echo " Usage (only work for one node of ATS/PVC):" + echo " mpiexec --np N gpu_tile_compact.sh ./a.out" + echo + echo " Example 3 GPU of 2 Tiles with 7 Ranks:" + echo " 0 Rank 0.0" + echo " 1 Rank 0.1" + echo " 2 Rank 1.0" + echo " 3 Rank 1.1" + echo " 4 Rank 2.0" + echo " 5 Rank 2.1" + echo " 6 Rank 0.0" + echo + echo " Hacked together by apl@anl.gov, please contact if bug found" + exit 1 +} + +#This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence. +#works? 
num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices") +num_gpu=6 +num_tile=2 + +if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then + display_help +fi + + + +gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu )) +tile_id=$((PALS_LOCAL_RANKID % num_tile)) + +export NUMA_MAP=(0 0 1 1 0 0 1 1 ) +export NIC_MAP=(0 1 4 5 0 1 4 5 ) +export GPU_MAP=(0 1 3 4 0 1 3 4 ) +export TILE_MAP=(0 0 0 0 1 1 1 1 ) +export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]} +export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]} +export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} +export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]} + +export GRID_MPICH_NIC_BIND=$NIC + +unset EnableWalkerPartition +export EnableImplicitScaling=0 +export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1 +export ZE_AFFINITY_MASK=$gpu_id.$tile_id +#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id +export ONEAPI_DEVICE_FILTER=gpu,level_zero +export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 +#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 + +echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND" + +numactl -m $NUMA -N $NUMA "$@" diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command new file mode 100644 index 00000000..e59ef515 --- /dev/null +++ b/systems/Aurora/config-command @@ -0,0 +1,16 @@ +TOOLS=$HOME/tools +../../configure \ + --enable-simd=GPU \ + --enable-gen-simd-width=64 \ + --enable-comms=mpi-auto \ + --enable-accelerator-cshift \ + --disable-gparity \ + --disable-fermion-reps \ + --enable-shm=nvlink \ + --enable-accelerator=sycl \ + --enable-unified=no \ + MPICXX=mpicxx \ + CXX=icpx \ + LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \ + CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include" + diff --git a/systems/Aurora/proxies.sh b/systems/Aurora/proxies.sh new file mode 100644 index 00000000..ff0d5a5b --- /dev/null +++ b/systems/Aurora/proxies.sh @@ -0,0 +1,9 @@ +export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 +export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 +export http_proxy=http://proxy.alcf.anl.gov:3128 +export https_proxy=http://proxy.alcf.anl.gov:3128 +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 +git config --global http.proxy http://proxy.alcf.anl.gov:3128 +module use /soft/modulefiles +module load intel_compute_runtime/release/agama-devel-682.22 + diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh new file mode 100644 index 00000000..0c4e0a6e --- /dev/null +++ b/systems/Aurora/sourceme.sh @@ -0,0 +1,12 @@ +#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 + +module use /soft/modulefiles +module load intel_compute_runtime/release/agama-devel-682.22 + +export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 +export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 +export http_proxy=http://proxy.alcf.anl.gov:3128 +export https_proxy=http://proxy.alcf.anl.gov:3128 +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 +git config --global http.proxy http://proxy.alcf.anl.gov:3128 + From 981c93d67ac2ca2fbe66bc238c4c538492db8ad6 Mon Sep 17 00:00:00 2001 From: david clarke Date: Sun, 21 Jan 2024 21:09:19 -0700 Subject: [PATCH 054/114] update Test_fatLinks to accept Naik --- tests/smearing/Test_fatLinks.cc | 42 
+++++++++++++++++---------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index e3aa0e33..d9d35e69 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -58,24 +58,28 @@ struct ConfParameters: Serializable { // -void testSmear(GridCartesian& GRID, LatticeGaugeFieldD Umu, LatticeGaugeFieldD Usmr, LatticeGaugeFieldD Unaik, +void testSmear(GridCartesian& GRID, LatticeGaugeFieldD Umu, LatticeGaugeFieldD Usmr, LatticeGaugeFieldD Unaik, LatticeGaugeFieldD Ucontrol, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) { Smear_HISQ hisq_fat(&GRID,c1,cnaik,c3,c5,c7,clp); + LatticeGaugeFieldD diff(&GRID); hisq_fat.smear(Usmr, Unaik, Umu); - LatticeGaugeFieldD diff1(&GRID), diff2(&GRID); - diff1 = Ucontrol-Usmr; - diff2 = Ucontrol-Unaik; - auto absDiff1 = norm2(diff1)/norm2(Ucontrol); - auto absDiff2 = norm2(diff2)/norm2(Ucontrol); - if (absDiff1 < 1e-30) { - Grid_pass(" |Umu-Usmr|/|Umu| = ",absDiff1); - } else { - Grid_error(" |Umu-Usmr|/|Umu| = ",absDiff1); - } - if (absDiff2 < 1e-30) { - Grid_pass(" |Umu-Unaik|/|Umu| = ",absDiff2); - } else { - Grid_error(" |Umu-Unaik|/|Umu| = ",absDiff2); + if (cnaik < 1e-30) { // Testing anything but Naik term + diff = Ucontrol-Usmr; + auto absDiff = norm2(diff)/norm2(Ucontrol); + if (absDiff < 1e-30) { + Grid_pass(" |Umu-Usmr|/|Umu| = ",absDiff); + } else { + Grid_error(" |Umu-Usmr|/|Umu| = ",absDiff); + } + } else { // Testing Naik specifically + diff = Ucontrol-Unaik; + auto absDiff = norm2(diff)/norm2(Ucontrol); + if (absDiff < 1e-30) { + Grid_pass(" |Umu-Unaik|/|Umu| = ",absDiff); + } else { + Grid_error(" |Umu-Unaik|/|Umu| = ",absDiff); + } +// NerscIO::writeConfiguration(Unaik,"nersc.l8t4b3360.naik"); } } @@ -87,7 +91,6 @@ int main (int argc, char** argv) { int Nt = 4; Coordinate latt_size(Nd,0); latt_size[0]=Ns; latt_size[1]=Ns; latt_size[2]=Ns; latt_size[3]=Nt; std::string conf_in = "nersc.l8t4b3360"; - std::string conf_out = "nersc.l8t4b3360.357lplink"; int threads = GridThread::GetThreads(); typedef LatticeGaugeFieldD LGF; @@ -115,15 +118,14 @@ int main (int argc, char** argv) { // Carry out various tests NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.357lplink.control"); testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,1/64.,1/384.,-1/8.); - NerscIO::writeConfiguration(Usmr,conf_out,"HISQ"); NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.357link.control"); testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,1/64.,1/384.,0.); NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.35link.control"); testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,1/64.,0.,0.); NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.3link.control"); - testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,1.,1/16.,0.,0.,0.); - NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.3link.control"); - testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,2.,1/16.,0.,0.,0.); + testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,0.,0.,0.); + NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.naik.control"); + testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,0.,0.8675309,0.,0.,0.,0.); // Test a C-style instantiation double path_coeff[6] = {1, 2, 3, 4, 5, 6}; From f5b3d582b044ea420256f2470b4efe934c7012bb Mon Sep 17 00:00:00 2001 From: david clarke Date: Mon, 22 Jan 2024 02:49:40 -0700 Subject: [PATCH 055/114] first attempt at U3 projection --- Grid/qcd/smearing/HISQSmearing.h | 46 +++++++++++++++++++++++++++----- 
tests/smearing/Test_fatLinks.cc | 3 ++- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 959a6cf0..43b06534 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -83,7 +83,6 @@ struct SmearingParameters{ /*! @brief create fat links from link variables */ template class Smear_HISQ : public Gimpl { -// TODO: this needs to be renamed, becaues the Naik guy is not part of the fat smear private: GridCartesian* const _grid; @@ -289,16 +288,16 @@ public: std::vector V(Nd, u_smr.Grid()); std::vector Vnaik(Nd, u_naik.Grid()); for (int mu = 0; mu < Nd; mu++) { - U[mu] = PeekIndex(u_thin, mu); - V[mu] = PeekIndex(u_smr, mu); + U[mu] = PeekIndex(u_thin, mu); + V[mu] = PeekIndex(u_smr, mu); } for(int mu=0;mu(u_mu, mu); + Q = adj(V[mu])*V[mu]; + c1 = trace(Q*Q)/2.; // SU(N) matrices are traceless, so c0=0. + c2 = trace(Q*Q*Q)/3.; + S = c1/3.; + R = c2/2.; + theta = std::acos(R/std::pow(S,1.5)); + g0 = 2.*std::sqrt(S)*std::cos(theta/3.-2*M_PI/3.); + g1 = 2.*std::sqrt(S)*std::cos(theta/3. ); + g2 = 2.*std::sqrt(S)*std::cos(theta/3.+2*M_PI/3.); +// if (fabs(Q.determinant()/(g0*g1*g2)-1.0) > 1e-5) {} + u = std::sqrt(g0) + std::sqrt(g1) + std::sqrt(g2); + v = std::sqrt(g0*g1) + std::sqrt(g0*g2) + std::sqrt(g1*g2); + w = std::sqrt(g0*g1*g2); + den = w*(u*v-w); + f0 = (-w*(u*u+v)+u*v*v)/den; + f1 = (-w-u*u*u+2*u*v)/den; + f2 = u/den; + + sqrtQinv = f0 + f1*Q + f2*Q*Q; + PokeIndex(u_proj, V*sqrtQinv, mu); + } + }; + + // void derivative(const GaugeField& Gauge) const { // }; }; diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index d9d35e69..e2dc5d6d 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -61,7 +61,7 @@ struct ConfParameters: Serializable { void testSmear(GridCartesian& GRID, LatticeGaugeFieldD Umu, LatticeGaugeFieldD Usmr, LatticeGaugeFieldD Unaik, LatticeGaugeFieldD Ucontrol, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) { Smear_HISQ hisq_fat(&GRID,c1,cnaik,c3,c5,c7,clp); - LatticeGaugeFieldD diff(&GRID); + LatticeGaugeFieldD diff(&GRID), Uproj(&GRID); hisq_fat.smear(Usmr, Unaik, Umu); if (cnaik < 1e-30) { // Testing anything but Naik term diff = Ucontrol-Usmr; @@ -79,6 +79,7 @@ void testSmear(GridCartesian& GRID, LatticeGaugeFieldD Umu, LatticeGaugeFieldD U } else { Grid_error(" |Umu-Unaik|/|Umu| = ",absDiff); } + hisq_fat.projectU3(Uproj,Usmr); // NerscIO::writeConfiguration(Unaik,"nersc.l8t4b3360.naik"); } } From 00f24f87653d92276f9ffeeaf403f78b5adef4cc Mon Sep 17 00:00:00 2001 From: david clarke Date: Mon, 22 Jan 2024 05:50:16 -0700 Subject: [PATCH 056/114] already found some bugs in projection, still needs testing --- Grid/qcd/smearing/HISQSmearing.h | 38 ++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 43b06534..17959495 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -93,6 +93,7 @@ public: INHERIT_GIMPL_TYPES(Gimpl); typedef typename Gimpl::GaugeField GF; typedef typename Gimpl::GaugeLinkField LF; + typedef typename Gimpl::ComplexField CF; // Don't allow default values here. 
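    // Coefficient conventions, read off from the code below: c1 multiplies the
    // thin link, c3/c5/c7 weight the 3-, 5-, and 7-link staples, clp the Lepage
    // term, and cnaik the straight 3-link Naik term. An illustrative
    // instantiation (sketch only), mirroring the level-1 values used in
    // Test_fatLinks.cc:
    //
    //   Smear_HISQ<PeriodicGimplD> hisq(&GRID, 1/8., 0., 1/16., 1/64., 1/384., -1/8.);
    //   hisq.smear(u_fat, u_naik, u_thin); // fat and Naik links out, thin links in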
Smear_HISQ(GridCartesian* grid, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) @@ -329,31 +330,36 @@ public: // IN--u_mu void projectU3(GF& u_proj, GF& u_mu) const { - LF V, Q, sqrtQinv; - Real c1, c2, g0, g1, g2, S, R, theta, u, v, w, den, f0, f1, f2; + LF V(u_mu.Grid()), Q(u_mu.Grid()), sqrtQinv(u_mu.Grid()), id_3(u_mu.Grid()); + CF c0(u_mu.Grid()), c1(u_mu.Grid()), c2(u_mu.Grid()), g0(u_mu.Grid()), g1(u_mu.Grid()), + g2(u_mu.Grid()), S(u_mu.Grid()), R(u_mu.Grid()), theta(u_mu.Grid()), u(u_mu.Grid()), + v(u_mu.Grid()), w(u_mu.Grid()), den(u_mu.Grid()), f0(u_mu.Grid()), f1(u_mu.Grid()), + f2(u_mu.Grid()); // Follow MILC 10.1103/PhysRevD.82.074501, eqs (B2-B3) and (C1-C8) for (int mu = 0; mu < Nd; mu++) { V = PeekIndex(u_mu, mu); - Q = adj(V[mu])*V[mu]; - c1 = trace(Q*Q)/2.; // SU(N) matrices are traceless, so c0=0. - c2 = trace(Q*Q*Q)/3.; - S = c1/3.; - R = c2/2.; - theta = std::acos(R/std::pow(S,1.5)); - g0 = 2.*std::sqrt(S)*std::cos(theta/3.-2*M_PI/3.); - g1 = 2.*std::sqrt(S)*std::cos(theta/3. ); - g2 = 2.*std::sqrt(S)*std::cos(theta/3.+2*M_PI/3.); + Q = adj(V)*V; + c0 = real(trace(Q)); + c1 = (1/2.)*real(trace(Q*Q)); + c2 = (1/3.)*real(trace(Q*Q*Q)); + S = (1/3.)*c1-(1/18.)*c0*c0; + R = (1/2.)*c2-(1/3. )*c0*c1+(1/27.)*c0*c0*c0; + theta = acos(R*pow(S,-1.5)); + g0 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta-2*M_PI/3.); + g1 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta ); + g2 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta+2*M_PI/3.); // if (fabs(Q.determinant()/(g0*g1*g2)-1.0) > 1e-5) {} - u = std::sqrt(g0) + std::sqrt(g1) + std::sqrt(g2); - v = std::sqrt(g0*g1) + std::sqrt(g0*g2) + std::sqrt(g1*g2); - w = std::sqrt(g0*g1*g2); + u = sqrt(g0) + sqrt(g1) + sqrt(g2); + v = sqrt(g0*g1) + sqrt(g0*g2) + sqrt(g1*g2); + w = sqrt(g0*g1*g2); den = w*(u*v-w); f0 = (-w*(u*u+v)+u*v*v)/den; - f1 = (-w-u*u*u+2*u*v)/den; + f1 = (-w-u*u*u+2.*u*v)/den; f2 = u/den; + id_3 = 1.; - sqrtQinv = f0 + f1*Q + f2*Q*Q; + sqrtQinv = f0*id_3 + f1*Q + f2*Q*Q; PokeIndex(u_proj, V*sqrtQinv, mu); } }; From 4924b3209e9bb5c358699ebfd8c3d929eaec87c2 Mon Sep 17 00:00:00 2001 From: david clarke Date: Tue, 23 Jan 2024 14:43:58 -0700 Subject: [PATCH 057/114] projectU3 yields a unitary matrix --- .gitignore | 4 ++++ Grid/qcd/smearing/HISQSmearing.h | 31 ++++++++++++++++++------------- tests/smearing/Test_fatLinks.cc | 8 +------- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index 40156f9d..94e866e2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# Doxygen stuff +html/* +latex/* + # Compiled Object files # ######################### *.slo diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 17959495..c8255acc 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -330,7 +330,7 @@ public: // IN--u_mu void projectU3(GF& u_proj, GF& u_mu) const { - LF V(u_mu.Grid()), Q(u_mu.Grid()), sqrtQinv(u_mu.Grid()), id_3(u_mu.Grid()); + LF V(u_mu.Grid()), Q(u_mu.Grid()), sqrtQinv(u_mu.Grid()), id_3(u_mu.Grid()), diff(u_mu.Grid()); CF c0(u_mu.Grid()), c1(u_mu.Grid()), c2(u_mu.Grid()), g0(u_mu.Grid()), g1(u_mu.Grid()), g2(u_mu.Grid()), S(u_mu.Grid()), R(u_mu.Grid()), theta(u_mu.Grid()), u(u_mu.Grid()), v(u_mu.Grid()), w(u_mu.Grid()), den(u_mu.Grid()), f0(u_mu.Grid()), f1(u_mu.Grid()), @@ -338,18 +338,22 @@ public: // Follow MILC 10.1103/PhysRevD.82.074501, eqs (B2-B3) and (C1-C8) for (int mu = 0; mu < Nd; mu++) { - V = PeekIndex(u_mu, mu); - Q = adj(V)*V; - c0 = real(trace(Q)); - c1 = (1/2.)*real(trace(Q*Q)); - c2 = 
(1/3.)*real(trace(Q*Q*Q)); - S = (1/3.)*c1-(1/18.)*c0*c0; - R = (1/2.)*c2-(1/3. )*c0*c1+(1/27.)*c0*c0*c0; - theta = acos(R*pow(S,-1.5)); - g0 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta-2*M_PI/3.); - g1 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta ); - g2 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta+2*M_PI/3.); -// if (fabs(Q.determinant()/(g0*g1*g2)-1.0) > 1e-5) {} + V = PeekIndex(u_mu, mu); + Q = adj(V)*V; + c0 = real(trace(Q)); + c1 = (1/2.)*real(trace(Q*Q)); + c2 = (1/3.)*real(trace(Q*Q*Q)); + S = (1/3.)*c1-(1/18.)*c0*c0; + if (norm2(S)<1e-28) { + g0 = (1/3.)*c0; g1 = g0; g2 = g1; + } else { + R = (1/2.)*c2-(1/3. )*c0*c1+(1/27.)*c0*c0*c0; + theta = acos(R*pow(S,-1.5)); + g0 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta-2*M_PI/3.); + g1 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta ); + g2 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta+2*M_PI/3.); + } +// if (fabs(Q.determinant()/(g0*g1*g2)-1.0) > 1e-5) { SVD } u = sqrt(g0) + sqrt(g1) + sqrt(g2); v = sqrt(g0*g1) + sqrt(g0*g2) + sqrt(g1*g2); w = sqrt(g0*g1*g2); @@ -360,6 +364,7 @@ public: id_3 = 1.; sqrtQinv = f0*id_3 + f1*Q + f2*Q*Q; + PokeIndex(u_proj, V*sqrtQinv, mu); } }; diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc index e2dc5d6d..742cb205 100644 --- a/tests/smearing/Test_fatLinks.cc +++ b/tests/smearing/Test_fatLinks.cc @@ -51,12 +51,6 @@ struct ConfParameters: Serializable { } }; -// -// one method: input --> fat -// another : input --> long (naik) -// another : input --> unitarize -// - void testSmear(GridCartesian& GRID, LatticeGaugeFieldD Umu, LatticeGaugeFieldD Usmr, LatticeGaugeFieldD Unaik, LatticeGaugeFieldD Ucontrol, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) { @@ -79,7 +73,7 @@ void testSmear(GridCartesian& GRID, LatticeGaugeFieldD Umu, LatticeGaugeFieldD U } else { Grid_error(" |Umu-Unaik|/|Umu| = ",absDiff); } - hisq_fat.projectU3(Uproj,Usmr); + hisq_fat.projectU3(Uproj,Ucontrol); // NerscIO::writeConfiguration(Unaik,"nersc.l8t4b3360.naik"); } } From caa5f9772339ea5124759e4dc4c47af4ef0b01cd Mon Sep 17 00:00:00 2001 From: dbollweg Date: Wed, 31 Jan 2024 16:50:06 -0500 Subject: [PATCH 058/114] Add sliceSum gpu using cub/hipcub --- Grid/lattice/Lattice_reduction.h | 1 + Grid/lattice/Lattice_slicesum_gpu.h | 144 ++++++++++++++++++++++++++++ tests/core/Test_sliceSum.cc | 56 +++++++++++ 3 files changed, 201 insertions(+) create mode 100644 Grid/lattice/Lattice_slicesum_gpu.h create mode 100644 tests/core/Test_sliceSum.cc diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index a6bbcf15..f85ed7e3 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -27,6 +27,7 @@ Author: Christoph Lehner #if defined(GRID_CUDA)||defined(GRID_HIP) #include +#include #endif #if defined(GRID_SYCL) #include diff --git a/Grid/lattice/Lattice_slicesum_gpu.h b/Grid/lattice/Lattice_slicesum_gpu.h new file mode 100644 index 00000000..53d0afae --- /dev/null +++ b/Grid/lattice/Lattice_slicesum_gpu.h @@ -0,0 +1,144 @@ +#pragma once +#if defined(GRID_CUDA) + +#include +#define gpucub cub +#define gpuMalloc cudaMalloc +#define gpuMemcpy cudaMemcpy +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuError_t cudaError_t +#define gpuSuccess cudaSuccess + +#elif defined(GRID_HIP) + +#include +#define gpucub hipcub +#define gpuMalloc hipMalloc +#define gpuMemcpy hipMemcpy +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuError_t hipError_t +#define gpuSuccess hipSuccess + +// extern hipStream_t computeStream; +#endif + + +NAMESPACE_BEGIN(Grid); + 
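+// Strategy: gather each orthogonal plane of the local volume into a contiguous
+// device buffer, then hand the planes to CUB/hipCUB for reduction on
+// computeStream. The two-phase CUB calling convention used throughout this
+// file, sketched here with hypothetical buffer names (the first call, with a
+// null scratch pointer, only queries the required temporary storage size):
+//
+//   size_t temp_bytes = 0;
+//   gpucub::DeviceReduce::Sum(nullptr, temp_bytes, d_in, d_out, n, computeStream);
+//   gpuMalloc(&d_temp, temp_bytes);
+//   gpucub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n, computeStream);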
+template inline void sliceSumGpu(const Lattice &Data,std::vector &result,int orthogdim) +{ + + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_object::scalar_type scalar_type; + GridBase *grid = Data.Grid(); + assert(grid!=NULL); + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + assert(orthogdim >= 0); + assert(orthogdim < Nd); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + int ostride=grid->_ostride[orthogdim]; + Vector lvSum(rd); + Vector lsSum(ld,Zero()); + commVector reduction_buffer(e1*e2); + ExtractBuffer extracted(Nsimd); + + result.resize(fd); + for(int r=0;rNsimd(),{ //use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream) + + int n = s / e2; + int b = s % e2; + int so=r*ostride; // base offset for start of plane + int ss= so+n*stride+b; + + coalescedWrite(rb_p[s], coalescedRead(Data_v[ss])); + + }); + + //issue reductions in computeStream + gpuErr =gpucub::DeviceReduce::Sum(helperArray, temp_storage_bytes, rb_p, &d_out[r], size, computeStream); + if (gpuErr!=gpuSuccess) { + std::cout << "Encountered error during cub::DeviceReduce::Sum(2)! Error: " << gpuErr <iCoorFromIindex(icoor,idx); + + int ldx =rt+icoor[orthogdim]*rd; + + lsSum[ldx]=lsSum[ldx]+extracted[idx]; + + } + } + + // sum over nodes. + for(int t=0;t_processor_coor[orthogdim] ) { + result[t]=lsSum[lt]; + } else { + result[t]=Zero(); + } + + } + scalar_type * ptr = (scalar_type *) &result[0]; + int words = fd*sizeof(sobj)/sizeof(scalar_type); + grid->GlobalSumVector(ptr, words); +} + +template inline +std::vector +sliceSumGpu(const Lattice &Data,int orthogdim) +{ + std::vector result; + sliceSumGpu(Data,result,orthogdim); + return result; +} + +NAMESPACE_END(Grid); \ No newline at end of file diff --git a/tests/core/Test_sliceSum.cc b/tests/core/Test_sliceSum.cc new file mode 100644 index 00000000..4a04b41c --- /dev/null +++ b/tests/core/Test_sliceSum.cc @@ -0,0 +1,56 @@ +#include + + +int main (int argc, char ** argv) { + + using namespace Grid; + + Grid_init(&argc,&argv); + + + Coordinate latt_size({64,64,64,16}); + auto simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); + auto mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size, simd_layout, mpi_layout); + + std::vector seeds({1, 2, 3, 4}); + + GridParallelRNG pRNG(&Grid); + pRNG.SeedFixedIntegers(seeds); + + LatticeComplexD test_data(&Grid); + gaussian(pRNG,test_data); + + std::vector reduction_reference; + std::vector reduction_result; + + //warmup + for (int sweeps = 0; sweeps < 5; sweeps++) { + sliceSumGpu(test_data,reduction_result,0); + } + + + for (int i = 0; i < Nd; i++) { + RealD t=-usecond(); + sliceSum(test_data,reduction_reference,i); + t+=usecond(); + std::cout << " sliceSum took "< Date: Thu, 1 Feb 2024 16:41:03 -0500 Subject: [PATCH 059/114] Use accelerator_for2d and DeviceSegmentedRecude to avoid kernel launch latencies --- Grid/lattice/Lattice_slicesum_gpu.h | 71 +++++++++++++++++++---------- Grid/perfmon/Tracing.h | 2 +- tests/core/Test_sliceSum.cc | 3 +- 3 files changed, 50 insertions(+), 26 deletions(-) diff --git a/Grid/lattice/Lattice_slicesum_gpu.h b/Grid/lattice/Lattice_slicesum_gpu.h index 53d0afae..d8927708 100644 --- a/Grid/lattice/Lattice_slicesum_gpu.h +++ b/Grid/lattice/Lattice_slicesum_gpu.h @@ -6,6 +6,7 @@ #define gpuMalloc 
cudaMalloc #define gpuMemcpy cudaMemcpy #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuError_t cudaError_t #define gpuSuccess cudaSuccess @@ -16,6 +17,7 @@ #define gpuMalloc hipMalloc #define gpuMemcpy hipMemcpy #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuError_t hipError_t #define gpuSuccess hipSuccess @@ -49,14 +51,15 @@ template inline void sliceSumGpu(const Lattice &Data,std::vect int ostride=grid->_ostride[orthogdim]; Vector lvSum(rd); Vector lsSum(ld,Zero()); - commVector reduction_buffer(e1*e2); + commVector reduction_buffer(rd*e1*e2); ExtractBuffer extracted(Nsimd); result.resize(fd); for(int r=0;r inline void sliceSumGpu(const Lattice &Data,std::vect vobj *d_out; size_t temp_storage_bytes = 0; size_t size = e1*e2; - gpuMalloc(&d_out,rd*sizeof(vobj)); - gpuError_t gpuErr =gpucub::DeviceReduce::Sum(helperArray, temp_storage_bytes, rb_p,d_out, size, computeStream); + std::vector offsets(rd+1,0); + for (int i = 0; i < offsets.size(); i++) { + offsets[i] = i*size; + } + int* d_offsets; + + gpuError_t gpuErr = gpuMalloc(&d_out,rd*sizeof(vobj)); + if (gpuErr != gpuSuccess) { + std::cout << "Lattice_slicesum_gpu.h: Encountered error during gpuMalloc(1) Error: " << gpuErr <Nsimd(),{ //use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream) - - int n = s / e2; - int b = s % e2; - int so=r*ostride; // base offset for start of plane - int ss= so+n*stride+b; - coalescedWrite(rb_p[s], coalescedRead(Data_v[ss])); + //prepare buffer for reduction + accelerator_for2dNB( s,e1*e2, r,rd, grid->Nsimd(),{ //use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream) + //use 2d accelerator_for to avoid launch latencies found when looping over rd + int n = s / e2; + int b = s % e2; + int so=r*ostride; // base offset for start of plane + int ss= so+n*stride+b; - }); - - //issue reductions in computeStream - gpuErr =gpucub::DeviceReduce::Sum(helperArray, temp_storage_bytes, rb_p, &d_out[r], size, computeStream); - if (gpuErr!=gpuSuccess) { - std::cout << "Encountered error during cub::DeviceReduce::Sum(2)! Error: " << gpuErr < offsets(rd+1,0); + + for (int i = 0; i < offsets.size(); i++) { + offsets[i] = i*subvol_size; } + //Allocate memory for output and offset arrays on device + gpuError_t gpuErr = gpuMalloc(&d_out,rd*sizeof(vobj)); + if (gpuErr != gpuSuccess) { + std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMalloc (d_out)! 
Error: " << gpuErr <Nsimd(),{ //use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream) - //use 2d accelerator_for to avoid launch latencies found when looping over rd + //use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream) + //use 2d accelerator_for to avoid launch latencies found when serially looping over rd + + accelerator_for2dNB( s,subvol_size, r,rd, grid->Nsimd(),{ + int n = s / e2; int b = s % e2; int so=r*ostride; // base offset for start of plane int ss= so+n*stride+b; - coalescedWrite(rb_p[r*e1*e2+s], coalescedRead(Data_v[ss])); + coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss])); }); - //issue reductions in computeStream - gpuErr =gpucub::DeviceSegmentedReduce::Reduce(helperArray, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), identity, computeStream); + //issue segmented reductions in computeStream + gpuErr = gpucub::DeviceSegmentedReduce::Reduce(helperArray, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), vobj_zero, computeStream); if (gpuErr!=gpuSuccess) { - std::cout << "Lattice_slicesum_gpu.h: Encountered error during cub::DeviceReduce::Sum(2)! Error: " << gpuErr < Date: Thu, 1 Feb 2024 18:02:30 -0500 Subject: [PATCH 061/114] Fix cuda compilation of Lattice_slicesum_gpu.h --- Grid/lattice/Lattice_slicesum_gpu.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Grid/lattice/Lattice_slicesum_gpu.h b/Grid/lattice/Lattice_slicesum_gpu.h index 196956d1..8e13808f 100644 --- a/Grid/lattice/Lattice_slicesum_gpu.h +++ b/Grid/lattice/Lattice_slicesum_gpu.h @@ -4,8 +4,9 @@ #include #define gpucub cub #define gpuMalloc cudaMalloc -#define gpuMemcpyDtoHAsync cudaMemcpyDtoHAsync -#define gpuMemcpyHtoDAsync cudaMemcpyHtoDAsync +#define gpuMemcpyAsync cudaMemcpyAsync +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuError_t cudaError_t #define gpuSuccess cudaSuccess @@ -14,8 +15,9 @@ #include #define gpucub hipcub #define gpuMalloc hipMalloc -#define gpuMemcpyDtoHAsync hipMemcpyDtoHAsync -#define gpuMemcpyHtoDAsync hipMemcpyHtoDAsync +#define gpuMemcpyAsync hipMemcpyAsync +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuError_t hipError_t #define gpuSuccess hipSuccess @@ -71,7 +73,7 @@ template inline void sliceSumGpu(const Lattice &Data,std::vect int* d_offsets; std::vector offsets(rd+1,0); - + for (int i = 0; i < offsets.size(); i++) { offsets[i] = i*subvol_size; } @@ -90,7 +92,7 @@ template inline void sliceSumGpu(const Lattice &Data,std::vect } //copy offsets to device - gpuErr = gpuMemcpyHtoDAsync(d_offsets,&offsets[0],sizeof(int)*(rd+1),computeStream); + gpuErr = gpuMemcpyAsync(d_offsets,&offsets[0],sizeof(int)*(rd+1),gpuMemcpyHostToDevice,computeStream); if (gpuErr != gpuSuccess) { std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMemcpy (d_offsets)! Error: " << gpuErr < inline void sliceSumGpu(const Lattice &Data,std::vect exit(EXIT_FAILURE); } - gpuErr = gpuMemcpyDtoHAsync(&lvSum[0],d_out,rd*sizeof(vobj),computeStream); + gpuErr = gpuMemcpyAsync(&lvSum[0],d_out,rd*sizeof(vobj),gpuMemcpyDeviceToHost,computeStream); if (gpuErr!=gpuSuccess) { std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMemcpy (d_out)! 
Error: " << gpuErr < Date: Tue, 6 Feb 2024 16:28:40 +0000 Subject: [PATCH 062/114] Aurora MPI standalone benchmake and options that work well --- MPI_benchmark/bench2.pbs | 22 ++ MPI_benchmark/compile-command | 1 + MPI_benchmark/gpu_tile_compact.sh | 30 ++ MPI_benchmark/halo_mpi.cc | 333 ++++++++++++++++++ systems/Aurora/benchmarks/bench.pbs | 7 +- systems/Aurora/benchmarks/bench2.pbs | 44 +-- systems/Aurora/benchmarks/gpu_tile_compact.sh | 62 +--- .../Aurora/benchmarks/gpu_tile_compact4.sh | 43 +-- systems/Aurora/sourceme.sh | 2 +- 9 files changed, 426 insertions(+), 118 deletions(-) create mode 100644 MPI_benchmark/bench2.pbs create mode 100644 MPI_benchmark/compile-command create mode 100755 MPI_benchmark/gpu_tile_compact.sh create mode 100644 MPI_benchmark/halo_mpi.cc diff --git a/MPI_benchmark/bench2.pbs b/MPI_benchmark/bench2.pbs new file mode 100644 index 00000000..2c069a20 --- /dev/null +++ b/MPI_benchmark/bench2.pbs @@ -0,0 +1,22 @@ +#!/bin/bash +#PBS -q EarlyAppAccess +#PBS -l select=2 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +export TZ='/usr/share/zoneinfo/US/Central' +export OMP_PROC_BIND=spread +export OMP_NUM_THREADS=3 +unset OMP_PLACES + +cd $PBS_O_WORKDIR + +NNODES=`wc -l < $PBS_NODEFILE` +NRANKS=12 # Number of MPI ranks per node +NDEPTH=4 # Number of hardware threads per rank, spacing between MPI ranks on a node +NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS + +NTOTRANKS=$(( NNODES * NRANKS )) + +CMD="mpiexec -np 2 -ppn 1 -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1" +$CMD diff --git a/MPI_benchmark/compile-command b/MPI_benchmark/compile-command new file mode 100644 index 00000000..20f26a3c --- /dev/null +++ b/MPI_benchmark/compile-command @@ -0,0 +1 @@ +mpicxx -fsycl halo_mpi.cc -o halo_mpi \ No newline at end of file diff --git a/MPI_benchmark/gpu_tile_compact.sh b/MPI_benchmark/gpu_tile_compact.sh new file mode 100755 index 00000000..28fdb341 --- /dev/null +++ b/MPI_benchmark/gpu_tile_compact.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +export NUMA_PMAP=(2 2 2 3 3 3 2 2 2 3 3 3 ) +export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) +export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) +export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) + +export PNUMA=${NUMA_PMAP[$PALS_LOCAL_RANKID]} +export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]} +export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} +export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]} + + +export ZE_AFFINITY_MASK=$gpu_id.$tile_id +export ONEAPI_DEVICE_FILTER=gpu,level_zero + +#unset EnableWalkerPartition +#export EnableImplicitScaling=0 +#export GRID_MPICH_NIC_BIND=$NIC +#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id +#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1 +#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 +#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 +#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 +#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 + +echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " + +numactl -m $PNUMA -N $NUMA "$@" diff --git a/MPI_benchmark/halo_mpi.cc b/MPI_benchmark/halo_mpi.cc new file mode 100644 index 00000000..9e11c473 --- /dev/null +++ b/MPI_benchmark/halo_mpi.cc @@ -0,0 +1,333 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/************************************************************** + * GPU - GPU memory cartesian halo exchange benchmark + * Config: what 
is the target + ************************************************************** + */ +#undef ACC_CUDA +#undef ACC_HIP +#define ACC_SYCL +#undef ACC_NONE + +/************************************************************** + * Some MPI globals + ************************************************************** + */ +MPI_Comm WorldComm; +MPI_Comm WorldShmComm; + +int WorldSize; +int WorldRank; + +int WorldShmSize; +int WorldShmRank; + +/************************************************************** + * Allocate buffers on the GPU, SYCL needs an init call and context + ************************************************************** + */ +#ifdef ACC_CUDA +#include +void acceleratorInit(void){} +void *acceleratorAllocDevice(size_t bytes) +{ + void *ptr=NULL; + auto err = cudaMalloc((void **)&ptr,bytes); + assert(err==cudaSuccess); + return ptr; +} +void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);} +#endif +#ifdef ACC_HIP +#include +void acceleratorInit(void){} +inline void *acceleratorAllocDevice(size_t bytes) +{ + void *ptr=NULL; + auto err = hipMalloc((void **)&ptr,bytes); + if( err != hipSuccess ) { + ptr = (void *) NULL; + printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); + } + return ptr; +}; +inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);}; +#endif +#ifdef ACC_SYCL +#include +#include +cl::sycl::queue *theAccelerator; +void acceleratorInit(void) +{ + int nDevices = 1; +#if 1 + cl::sycl::gpu_selector selector; + cl::sycl::device selectedDevice { selector }; + theAccelerator = new sycl::queue (selectedDevice); +#else + cl::sycl::device selectedDevice {cl::sycl::gpu_selector_v }; + theAccelerator = new sycl::queue (selectedDevice); +#endif + auto name = theAccelerator->get_device().get_info(); + printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); fflush(stdout); +} +inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theAccelerator);}; +inline void acceleratorFreeDevice(void *ptr){free(ptr,*theAccelerator);}; +#endif +#ifdef ACC_NONE +void acceleratorInit(void){} +inline void *acceleratorAllocDevice(size_t bytes){ return malloc(bytes);}; +inline void acceleratorFreeDevice(void *ptr){free(ptr);}; +#endif + + +/************************************************************** + * Microsecond timer + ************************************************************** + */ +inline double usecond(void) { + struct timeval tv; + gettimeofday(&tv,NULL); + return 1.0e6*tv.tv_sec + 1.0*tv.tv_usec; +} +/************************************************************** + * Main benchmark routine + ************************************************************** + */ +void Benchmark(int64_t L,std::vector cart_geom,bool use_device,int ncall) +{ + int64_t words = 3*4*2; + int64_t face,vol; + int Nd=cart_geom.size(); + + /************************************************************** + * L^Nd volume, L^(Nd-1) faces, 12 complex per site + * Allocate memory for these + ************************************************************** + */ + face=1; for( int d=0;d send_bufs; + std::vector recv_bufs; + size_t vw = face*words; + size_t bytes = face*words*sizeof(double); + + if ( use_device ) { + for(int d=0;d<2*Nd;d++){ + send_bufs.push_back(acceleratorAllocDevice(bytes)); + recv_bufs.push_back(acceleratorAllocDevice(bytes)); + } + } else { + for(int d=0;d<2*Nd;d++){ + send_bufs.push_back(malloc(bytes)); + recv_bufs.push_back(malloc(bytes)); + } + } + /********************************************************* + * Build cartesian communicator + 
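+ * (What the calls below do, for orientation: MPI_Cart_create builds an
+ *  Nd-dimensional torus, periodic in every direction, and each rank then
+ *  recovers its own grid coordinates with MPI_Cart_coords.)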
********************************************************* + */ + int ierr; + int rank; + std::vector coor(Nd); + MPI_Comm communicator; + std::vector periodic(Nd,1); + MPI_Cart_create(WorldComm,Nd,&cart_geom[0],&periodic[0],0,&communicator); + MPI_Comm_rank(communicator,&rank); + MPI_Cart_coords(communicator,rank,Nd,&coor[0]); + + static int reported; + if ( ! reported ) { + printf("World Rank %d Shm Rank %d CartCoor %d %d %d %d\n",WorldRank,WorldShmRank, + coor[0],coor[1],coor[2],coor[3]); fflush(stdout); + reported =1 ; + } + /********************************************************* + * Perform halo exchanges + ********************************************************* + */ + for(int d=0;d1 ) { + double t0=usecond(); + + int from,to; + + MPI_Barrier(communicator); + for(int n=0;n & vec) +{ + vec.resize(0); + std::stringstream ss(str); + int i; + while (ss >> i){ + vec.push_back(i); + if(std::ispunct(ss.peek())) + ss.ignore(); + } + return; +} +/************************************** + * Command line junk + **************************************/ +int main(int argc, char **argv) +{ + std::string arg; + + acceleratorInit(); + + MPI_Init(&argc,&argv); + + WorldComm = MPI_COMM_WORLD; + + MPI_Comm_split_type(WorldComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm); + + MPI_Comm_rank(WorldComm ,&WorldRank); + MPI_Comm_size(WorldComm ,&WorldSize); + + MPI_Comm_rank(WorldShmComm ,&WorldShmRank); + MPI_Comm_size(WorldShmComm ,&WorldShmSize); + + if ( WorldSize/WorldShmSize > 2) { + printf("This benchmark is meant to run on at most two nodes only\n"); + } + + auto mpi =std::vector({1,1,1,1}); + + if( CmdOptionExists(argv,argv+argc,"--mpi") ){ + arg = CmdOptionPayload(argv,argv+argc,"--mpi"); + CmdOptionIntVector(arg,mpi); + } else { + printf("Must specify --mpi command line argument\n"); + exit(0); + } + + if( !WorldRank ) { + printf("***********************************\n"); + printf("%d ranks\n",WorldSize); + printf("%d ranks-per-node\n",WorldShmSize); + printf("%d nodes\n",WorldSize/WorldShmSize);fflush(stdout); + printf("Cartesian layout: "); + for(int d=0;d Date: Tue, 6 Feb 2024 13:24:45 -0500 Subject: [PATCH 063/114] work towards sliceSum for sycl backend --- Grid/lattice/Lattice_reduction.h | 15 ++++ Grid/lattice/Lattice_slicesum_gpu.h | 9 --- Grid/lattice/Lattice_slicesum_sycl.h | 115 +++++++++++++++++++++++++++ Grid/threads/Accelerator.h | 4 +- tests/core/Test_sliceSum.cc | 4 +- 5 files changed, 134 insertions(+), 13 deletions(-) create mode 100644 Grid/lattice/Lattice_slicesum_sycl.h diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index f85ed7e3..bfd41b6c 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -31,6 +31,7 @@ Author: Christoph Lehner #endif #if defined(GRID_SYCL) #include +#include #endif NAMESPACE_BEGIN(Grid); @@ -505,6 +506,20 @@ sliceSum(const Lattice &Data,int orthogdim) return result; } +template inline +std::vector +sliceSumGpu(const Lattice &Data,int orthogdim) +{ + std::vector result; + #if defined(GRID_CUDA) || defined(GRID_HIP) + sliceSumGpu(Data,result,orthogdim); + #elif defined(GRID_SYCL) + sliceSum_sycl(Data,result,orthogdim); + #endif + return result; +} + + template static void sliceInnerProductVector( std::vector & result, const Lattice &lhs,const Lattice &rhs,int orthogdim) { diff --git a/Grid/lattice/Lattice_slicesum_gpu.h b/Grid/lattice/Lattice_slicesum_gpu.h index 8e13808f..5d2ad049 100644 --- a/Grid/lattice/Lattice_slicesum_gpu.h +++ b/Grid/lattice/Lattice_slicesum_gpu.h @@ 
-177,13 +177,4 @@ template inline void sliceSumGpu(const Lattice &Data,std::vect grid->GlobalSumVector(ptr, words); } -template inline -std::vector -sliceSumGpu(const Lattice &Data,int orthogdim) -{ - std::vector result; - sliceSumGpu(Data,result,orthogdim); - return result; -} - NAMESPACE_END(Grid); \ No newline at end of file diff --git a/Grid/lattice/Lattice_slicesum_sycl.h b/Grid/lattice/Lattice_slicesum_sycl.h new file mode 100644 index 00000000..034e9dd3 --- /dev/null +++ b/Grid/lattice/Lattice_slicesum_sycl.h @@ -0,0 +1,115 @@ +#pragma once + +NAMESPACE_BEGIN(Grid); + +template +inline void sliceSum_sycl(const Lattice &Data, std::vector &result, int orthogdim) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_object::scalar_type scalar_type; + + GridBase *grid = Data.Grid(); + assert(grid!=NULL); + + const int Nd = grid->_ndimension; + const size_t Nsimd = grid->Nsimd(); + + + assert(orthogdim >= 0); + assert(orthogdim < Nd); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + int e1= grid->_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + int ostride=grid->_ostride[orthogdim]; + size_t subvol_size = e1*e2; + + vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator); + vobj vobj_zero; + zeroit(vobj_zero); + + + result.resize(fd); + + Vector lvSum(rd); + Vector lsSum(ld,Zero()); + commVector reduction_buffer(rd*subvol_size); + ExtractBuffer extracted(Nsimd); + + for(int r=0;rsubmit([&](cl::sycl::handler &cgh) { + auto Reduction = cl::sycl::reduction(mysum,vobj_zero,std::plus<>()); + cgh.parallel_for(cl::sycl::range<1>{subvol_size}, + Reduction, + [=](cl::sycl::id<1> item, auto &sum) { + auto s = item[0]; + sum += rb_p[r*subvol_size+s]; + }); + }); + theGridAccelerator->wait(); + lvSum[r] = mysum[0]; + } + + Coordinate icoor(Nd); + + for(int rt=0;rtiCoorFromIindex(icoor,idx); + + int ldx =rt+icoor[orthogdim]*rd; + + lsSum[ldx]=lsSum[ldx]+extracted[idx]; + + } + } + + // sum over nodes. 
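+ // (Each rank fills only the orthogdim slices it owns from lsSum and zeroes
+ //  the rest, so the GlobalSumVector at the end assembles the full
+ //  fd-length result across nodes.)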
+ for(int t=0;t_processor_coor[orthogdim] ) { + result[t]=lsSum[lt]; + } else { + result[t]=Zero(); + } + + } + scalar_type * ptr = (scalar_type *) &result[0]; + int words = fd*sizeof(sobj)/sizeof(scalar_type); + grid->GlobalSumVector(ptr, words); + +} + +NAMESPACE_END(Grid); \ No newline at end of file diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index ff5ccd7a..eaafea5d 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -256,12 +256,12 @@ NAMESPACE_END(Grid); #if 0 #include #include -#include +#include #include #else #include #include -#include +#include #include #endif diff --git a/tests/core/Test_sliceSum.cc b/tests/core/Test_sliceSum.cc index 7dd69cdc..399ab899 100644 --- a/tests/core/Test_sliceSum.cc +++ b/tests/core/Test_sliceSum.cc @@ -26,7 +26,7 @@ int main (int argc, char ** argv) { //warmup for (int sweeps = 0; sweeps < 5; sweeps++) { - sliceSumGpu(test_data,reduction_result,0); + reduction_result = sliceSumGpu(test_data,0); } int trace_id = traceStart("sliceSum benchmark"); @@ -46,7 +46,7 @@ int main (int argc, char ** argv) { RealD tgpu=-usecond(); tracePush("sliceSumGpu"); - sliceSumGpu(test_data,reduction_result,i); + reduction_result = sliceSumGpu(test_data,i); tracePop("sliceSumGpu"); tgpu+=usecond(); From 0a6e2f42c5b8382bf59e24a37aa1adc8da0c7577 Mon Sep 17 00:00:00 2001 From: david clarke Date: Tue, 6 Feb 2024 16:32:07 -0700 Subject: [PATCH 064/114] small amount of cleanup --- Grid/qcd/smearing/HISQSmearing.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index c8255acc..f053afcf 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -118,10 +118,11 @@ public: void smear(GF& u_smr, GF& u_naik, GF& u_thin) const { SmearingParameters lt = this->_linkTreatment; + auto grid = this->_grid; // Create a padded cell of extra padding depth=1 and fill the padding. int depth = 1; - PaddedCell Ghost(depth,this->_grid); + PaddedCell Ghost(depth,grid); GF Ughost = Ghost.Exchange(u_thin); // This is where auxiliary N-link fields and the final smear will be stored. @@ -285,9 +286,9 @@ public: u_smr = Ghost.Extract(Ughost_fat) + lt.c_1*u_thin; // Load up U and V std::vectors to access thin and smeared links. 
- std::vector U(Nd, u_thin.Grid()); - std::vector V(Nd, u_smr.Grid()); - std::vector Vnaik(Nd, u_naik.Grid()); + std::vector U(Nd, grid); + std::vector V(Nd, grid); + std::vector Vnaik(Nd, grid); for (int mu = 0; mu < Nd; mu++) { U[mu] = PeekIndex(u_thin, mu); V[mu] = PeekIndex(u_smr, mu); @@ -330,11 +331,11 @@ public: // IN--u_mu void projectU3(GF& u_proj, GF& u_mu) const { - LF V(u_mu.Grid()), Q(u_mu.Grid()), sqrtQinv(u_mu.Grid()), id_3(u_mu.Grid()), diff(u_mu.Grid()); - CF c0(u_mu.Grid()), c1(u_mu.Grid()), c2(u_mu.Grid()), g0(u_mu.Grid()), g1(u_mu.Grid()), - g2(u_mu.Grid()), S(u_mu.Grid()), R(u_mu.Grid()), theta(u_mu.Grid()), u(u_mu.Grid()), - v(u_mu.Grid()), w(u_mu.Grid()), den(u_mu.Grid()), f0(u_mu.Grid()), f1(u_mu.Grid()), - f2(u_mu.Grid()); + auto grid = this->_grid; + + LF V(grid), Q(grid), sqrtQinv(grid), id_3(grid), diff(grid); + CF c0(grid), c1(grid), c2(grid), g0(grid), g1(grid), g2(grid), S(grid), R(grid), theta(grid), + u(grid), v(grid), w(grid), den(grid), f0(grid), f1(grid), f2(grid); // Follow MILC 10.1103/PhysRevD.82.074501, eqs (B2-B3) and (C1-C8) for (int mu = 0; mu < Nd; mu++) { From 91cf5ee312eb4650cdb17e9c23a5e29c40700d01 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 6 Feb 2024 23:45:10 +0000 Subject: [PATCH 065/114] Updated bench script --- systems/Aurora/benchmarks/bench12.pbs | 45 +++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 systems/Aurora/benchmarks/bench12.pbs diff --git a/systems/Aurora/benchmarks/bench12.pbs b/systems/Aurora/benchmarks/bench12.pbs new file mode 100644 index 00000000..96f6143f --- /dev/null +++ b/systems/Aurora/benchmarks/bench12.pbs @@ -0,0 +1,45 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=2 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +CMD="mpiexec -np 24 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 24 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD From 1514b4f13733f8952b232d1b95c00d7abb01d74a Mon Sep 17 00:00:00 2001 From: dbollweg Date: Tue, 6 Feb 2024 19:08:44 -0500 Subject: [PATCH 066/114] slicesum_sycl passes test --- Grid/lattice/Lattice_slicesum_sycl.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/Grid/lattice/Lattice_slicesum_sycl.h b/Grid/lattice/Lattice_slicesum_sycl.h index 034e9dd3..04ec8a6a 100644 --- a/Grid/lattice/Lattice_slicesum_sycl.h +++ 
b/Grid/lattice/Lattice_slicesum_sycl.h @@ -14,7 +14,6 @@ inline void sliceSum_sycl(const Lattice &Data, std::vector_ndimension; const size_t Nsimd = grid->Nsimd(); - assert(orthogdim >= 0); assert(orthogdim < Nd); @@ -29,9 +28,6 @@ inline void sliceSum_sycl(const Lattice &Data, std::vector &Data, std::vector lsSum(ld,Zero()); commVector reduction_buffer(rd*subvol_size); ExtractBuffer extracted(Nsimd); + vobj vobj_zero; + zeroit(vobj_zero); for(int r=0;r &Data, std::vector &Data, std::vectorsubmit([&](cl::sycl::handler &cgh) { - auto Reduction = cl::sycl::reduction(mysum,vobj_zero,std::plus<>()); + auto Reduction = cl::sycl::reduction(mysum,std::plus<>()); cgh.parallel_for(cl::sycl::range<1>{subvol_size}, Reduction, [=](cl::sycl::id<1> item, auto &sum) { From 701991629430c341e623b4d1a174067c1766a201 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 7 Feb 2024 00:56:39 +0000 Subject: [PATCH 067/114] RNG seed change safer for large volumes; this is a long term solution --- Grid/lattice/Lattice_rng.h | 10 ++- Grid/sitmo_rng/sitmo_prng_engine.hpp | 6 +- systems/Aurora/benchmarks/bench.pbs | 51 --------------- systems/Aurora/benchmarks/bench2.pbs | 95 ---------------------------- 4 files changed, 12 insertions(+), 150 deletions(-) delete mode 100644 systems/Aurora/benchmarks/bench.pbs delete mode 100644 systems/Aurora/benchmarks/bench2.pbs diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index b7ef0e82..2212abbe 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -152,6 +152,7 @@ public: #ifdef RNG_FAST_DISCARD static void Skip(RngEngine &eng,uint64_t site) { +#if 0 ///////////////////////////////////////////////////////////////////////////////////// // Skip by 2^40 elements between successive lattice sites // This goes by 10^12. @@ -162,9 +163,9 @@ public: // tens of seconds per trajectory so this is clean in all reasonable cases, // and margin of safety is orders of magnitude. // We could hack Sitmo to skip in the higher order words of state if necessary - // - // Replace with 2^30 ; avoid problem on large volumes - // + // + // Replace with 2^30 ; avoid problem on large volumes + // ///////////////////////////////////////////////////////////////////////////////////// // uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init const int shift = 30; @@ -179,6 +180,9 @@ public: assert((skip >> shift)==site); // check for overflow eng.discard(skip); +#else + eng.discardhi(site); +#endif // std::cout << " Engine " < Date: Tue, 6 Feb 2024 18:24:55 -0700 Subject: [PATCH 068/114] first effort toward accelerators --- Grid/qcd/smearing/HISQSmearing.h | 34 ++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index f053afcf..0deb080d 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -154,7 +154,7 @@ public: for(int mu=0;mu Date: Tue, 6 Feb 2024 18:40:13 -0700 Subject: [PATCH 069/114] acceleration compiles and doesn't break scalar mode --- Grid/qcd/smearing/HISQSmearing.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 0deb080d..2ae11bbb 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -153,18 +153,18 @@ public: // This loop handles 3-, 5-, and 7-link constructs, minus Lepage and Naik. 
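    // (One accelerator thread per lattice site: accelerator_for below replaces
    //  the commented-out serial site loop, and stencilIndex(mu,nu) locates the
    //  precomputed shifts used for each (mu,nu) staple.)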
for(int mu=0;mu_permute,Nd)) U3matrix; From 9514035b87b09e5f6d4a52a8724a144621456a20 Mon Sep 17 00:00:00 2001 From: dbollweg Date: Fri, 9 Feb 2024 13:02:28 -0500 Subject: [PATCH 070/114] refactor slicesum: slicesum uses GPU version by default now --- Grid/lattice/Lattice_reduction.h | 33 +---- Grid/lattice/Lattice_slicesum_core.h | 204 +++++++++++++++++++++++++++ Grid/lattice/Lattice_slicesum_gpu.h | 180 ----------------------- Grid/lattice/Lattice_slicesum_sycl.h | 110 --------------- tests/core/Test_sliceSum.cc | 86 ++++++++++- 5 files changed, 289 insertions(+), 324 deletions(-) create mode 100644 Grid/lattice/Lattice_slicesum_core.h delete mode 100644 Grid/lattice/Lattice_slicesum_gpu.h delete mode 100644 Grid/lattice/Lattice_slicesum_sycl.h diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index bfd41b6c..66788a4c 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -27,12 +27,11 @@ Author: Christoph Lehner #if defined(GRID_CUDA)||defined(GRID_HIP) #include -#include #endif #if defined(GRID_SYCL) #include -#include #endif +#include NAMESPACE_BEGIN(Grid); @@ -450,19 +449,10 @@ template inline void sliceSum(const Lattice &Data,std::vector< int e1= grid->_slice_nblock[orthogdim]; int e2= grid->_slice_block [orthogdim]; int stride=grid->_slice_stride[orthogdim]; - - // sum over reduced dimension planes, breaking out orthog dir - // Parallel over orthog direction - autoView( Data_v, Data, CpuRead); - thread_for( r,rd, { - int so=r*grid->_ostride[orthogdim]; // base offset for start of plane - for(int n=0;n_ostride[orthogdim]; + + //Reduce Data down to lvSum + sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd); // Sum across simd lanes in the plane, breaking out orthog dir. Coordinate icoor(Nd); @@ -506,19 +496,6 @@ sliceSum(const Lattice &Data,int orthogdim) return result; } -template inline -std::vector -sliceSumGpu(const Lattice &Data,int orthogdim) -{ - std::vector result; - #if defined(GRID_CUDA) || defined(GRID_HIP) - sliceSumGpu(Data,result,orthogdim); - #elif defined(GRID_SYCL) - sliceSum_sycl(Data,result,orthogdim); - #endif - return result; -} - template static void sliceInnerProductVector( std::vector & result, const Lattice &lhs,const Lattice &rhs,int orthogdim) diff --git a/Grid/lattice/Lattice_slicesum_core.h b/Grid/lattice/Lattice_slicesum_core.h new file mode 100644 index 00000000..2548884a --- /dev/null +++ b/Grid/lattice/Lattice_slicesum_core.h @@ -0,0 +1,204 @@ +#pragma once +#if defined(GRID_CUDA) + +#include +#define gpucub cub +#define gpuMalloc cudaMalloc +#define gpuMemcpyAsync cudaMemcpyAsync +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuError_t cudaError_t +#define gpuSuccess cudaSuccess + +#elif defined(GRID_HIP) + +#include +#define gpucub hipcub +#define gpuMalloc hipMalloc +#define gpuMemcpyAsync hipMemcpyAsync +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuError_t hipError_t +#define gpuSuccess hipSuccess + +#endif + + +NAMESPACE_BEGIN(Grid); + +#if defined(GRID_CUDA) || defined(GRID_HIP) +template inline void sliceSumReduction_cub(const Lattice &Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) +{ + typedef typename vobj::scalar_object sobj; + + size_t subvol_size = e1*e2; + + commVector reduction_buffer(rd*subvol_size); + auto rb_p = &reduction_buffer[0]; + + vobj vobj_zero; 
//Need to provide initial value for reduction operation + zeroit(vobj_zero); + + + void *temp_storage_array = NULL; + size_t temp_storage_bytes = 0; + vobj *d_out; + int* d_offsets; + + std::vector offsets(rd+1,0); + + for (int i = 0; i < offsets.size(); i++) { + offsets[i] = i*subvol_size; + } + + //Allocate memory for output and offset arrays on device + gpuError_t gpuErr = gpuMalloc(&d_out,rd*sizeof(vobj)); + if (gpuErr != gpuSuccess) { + std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMalloc (d_out)! Error: " << gpuErr < inline void sliceSumReduction_sycl(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +{ + typedef typename vobj::scalar_object sobj; + size_t subvol_size = e1*e2; + + vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator); + vobj vobj_zero; + zeroit(vobj_zero); + + commVector reduction_buffer(rd*subvol_size); + + auto rb_p = &reduction_buffer[0]; + + autoView(Data_v, Data, AcceleratorRead); + + //prepare reduction buffer + accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ + + int n = s / e2; + int b = s % e2; + int so=r*ostride; // base offset for start of plane + int ss= so+n*stride+b; + + coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss])); + + }); + + for (int r = 0; r < rd; r++) { + mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable + theGridAccelerator->submit([&](cl::sycl::handler &cgh) { + auto Reduction = cl::sycl::reduction(mysum,std::plus<>()); + cgh.parallel_for(cl::sycl::range<1>{subvol_size}, + Reduction, + [=](cl::sycl::id<1> item, auto &sum) { + auto s = item[0]; + sum += rb_p[r*subvol_size+s]; + }); + }); + theGridAccelerator->wait(); + lvSum[r] = mysum[0]; + } + +} +#endif + +template inline void sliceSumReduction_cpu(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +{ + // sum over reduced dimension planes, breaking out orthog dir + // Parallel over orthog direction + autoView( Data_v, Data, CpuRead); + thread_for( r,rd, { + int so=r*ostride; // base offset for start of plane + for(int n=0;n inline void sliceSumReduction(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +{ + #if defined(GRID_CUDA) || defined(GRID_HIP) + + sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); + + #elif defined(GRID_SYCL) + + sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); + + #else + sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); + + #endif +} + + +NAMESPACE_END(Grid); \ No newline at end of file diff --git a/Grid/lattice/Lattice_slicesum_gpu.h b/Grid/lattice/Lattice_slicesum_gpu.h deleted file mode 100644 index 5d2ad049..00000000 --- a/Grid/lattice/Lattice_slicesum_gpu.h +++ /dev/null @@ -1,180 +0,0 @@ -#pragma once -#if defined(GRID_CUDA) - -#include -#define gpucub cub -#define gpuMalloc cudaMalloc -#define gpuMemcpyAsync cudaMemcpyAsync -#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost -#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice -#define gpuError_t cudaError_t -#define gpuSuccess cudaSuccess - -#elif defined(GRID_HIP) - -#include -#define gpucub hipcub -#define gpuMalloc hipMalloc -#define gpuMemcpyAsync hipMemcpyAsync -#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost -#define 
gpuMemcpyHostToDevice hipMemcpyHostToDevice -#define gpuError_t hipError_t -#define gpuSuccess hipSuccess - -#endif - - -NAMESPACE_BEGIN(Grid); - -template inline void sliceSumGpu(const Lattice &Data,std::vector &result,int orthogdim) -{ - - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_object::scalar_type scalar_type; - GridBase *grid = Data.Grid(); - assert(grid!=NULL); - - const int Nd = grid->_ndimension; - const int Nsimd = grid->Nsimd(); - - assert(orthogdim >= 0); - assert(orthogdim < Nd); - - int fd=grid->_fdimensions[orthogdim]; - int ld=grid->_ldimensions[orthogdim]; - int rd=grid->_rdimensions[orthogdim]; - - int e1= grid->_slice_nblock[orthogdim]; - int e2= grid->_slice_block [orthogdim]; - int stride=grid->_slice_stride[orthogdim]; - int ostride=grid->_ostride[orthogdim]; - size_t subvol_size = e1*e2; - - Vector lvSum(rd); - Vector lsSum(ld,Zero()); - commVector reduction_buffer(rd*e1*e2); - ExtractBuffer extracted(Nsimd); - - result.resize(fd); - - for(int r=0;r offsets(rd+1,0); - - for (int i = 0; i < offsets.size(); i++) { - offsets[i] = i*subvol_size; - } - - //Allocate memory for output and offset arrays on device - gpuError_t gpuErr = gpuMalloc(&d_out,rd*sizeof(vobj)); - if (gpuErr != gpuSuccess) { - std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMalloc (d_out)! Error: " << gpuErr <Nsimd(),{ - - int n = s / e2; - int b = s % e2; - int so=r*ostride; // base offset for start of plane - int ss= so+n*stride+b; - - coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss])); - - }); - - //issue segmented reductions in computeStream - gpuErr = gpucub::DeviceSegmentedReduce::Reduce(helperArray, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), vobj_zero, computeStream); - if (gpuErr!=gpuSuccess) { - std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <iCoorFromIindex(icoor,idx); - - int ldx =rt+icoor[orthogdim]*rd; - - lsSum[ldx]=lsSum[ldx]+extracted[idx]; - - } - } - - // sum over nodes. 
- for(int t=0;t_processor_coor[orthogdim] ) { - result[t]=lsSum[lt]; - } else { - result[t]=Zero(); - } - - } - scalar_type * ptr = (scalar_type *) &result[0]; - int words = fd*sizeof(sobj)/sizeof(scalar_type); - grid->GlobalSumVector(ptr, words); -} - -NAMESPACE_END(Grid); \ No newline at end of file diff --git a/Grid/lattice/Lattice_slicesum_sycl.h b/Grid/lattice/Lattice_slicesum_sycl.h deleted file mode 100644 index 04ec8a6a..00000000 --- a/Grid/lattice/Lattice_slicesum_sycl.h +++ /dev/null @@ -1,110 +0,0 @@ -#pragma once - -NAMESPACE_BEGIN(Grid); - -template -inline void sliceSum_sycl(const Lattice &Data, std::vector &result, int orthogdim) -{ - typedef typename vobj::scalar_object sobj; - typedef typename vobj::scalar_object::scalar_type scalar_type; - - GridBase *grid = Data.Grid(); - assert(grid!=NULL); - - const int Nd = grid->_ndimension; - const size_t Nsimd = grid->Nsimd(); - - assert(orthogdim >= 0); - assert(orthogdim < Nd); - - int fd=grid->_fdimensions[orthogdim]; - int ld=grid->_ldimensions[orthogdim]; - int rd=grid->_rdimensions[orthogdim]; - - int e1= grid->_slice_nblock[orthogdim]; - int e2= grid->_slice_block [orthogdim]; - int stride=grid->_slice_stride[orthogdim]; - int ostride=grid->_ostride[orthogdim]; - size_t subvol_size = e1*e2; - - vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator); - - result.resize(fd); - - Vector lvSum(rd); - Vector lsSum(ld,Zero()); - commVector reduction_buffer(rd*subvol_size); - ExtractBuffer extracted(Nsimd); - vobj vobj_zero; - zeroit(vobj_zero); - - for(int r=0;rsubmit([&](cl::sycl::handler &cgh) { - auto Reduction = cl::sycl::reduction(mysum,std::plus<>()); - cgh.parallel_for(cl::sycl::range<1>{subvol_size}, - Reduction, - [=](cl::sycl::id<1> item, auto &sum) { - auto s = item[0]; - sum += rb_p[r*subvol_size+s]; - }); - }); - theGridAccelerator->wait(); - lvSum[r] = mysum[0]; - } - - Coordinate icoor(Nd); - - for(int rt=0;rtiCoorFromIindex(icoor,idx); - - int ldx =rt+icoor[orthogdim]*rd; - - lsSum[ldx]=lsSum[ldx]+extracted[idx]; - - } - } - - // sum over nodes. 
- for(int t=0;t_processor_coor[orthogdim] ) { - result[t]=lsSum[lt]; - } else { - result[t]=Zero(); - } - - } - scalar_type * ptr = (scalar_type *) &result[0]; - int words = fd*sizeof(sobj)/sizeof(scalar_type); - grid->GlobalSumVector(ptr, words); - -} - -NAMESPACE_END(Grid); \ No newline at end of file diff --git a/tests/core/Test_sliceSum.cc b/tests/core/Test_sliceSum.cc index 399ab899..e0e0c1ae 100644 --- a/tests/core/Test_sliceSum.cc +++ b/tests/core/Test_sliceSum.cc @@ -1,5 +1,79 @@ #include +template inline void sliceSumCPU(const Grid::Lattice &Data,std::vector &result,int orthogdim) +{ + using namespace Grid; + /////////////////////////////////////////////////////// + // FIXME precision promoted summation + // may be important for correlation functions + // But easily avoided by using double precision fields + /////////////////////////////////////////////////////// + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_object::scalar_type scalar_type; + GridBase *grid = Data.Grid(); + assert(grid!=NULL); + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + assert(orthogdim >= 0); + assert(orthogdim < Nd); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + Vector lvSum(rd); // will locally sum vectors first + Vector lsSum(ld,Zero()); // sum across these down to scalars + ExtractBuffer extracted(Nsimd); // splitting the SIMD + + result.resize(fd); // And then global sum to return the same vector to every node + for(int r=0;r_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + int ostride=grid->_ostride[orthogdim]; + + //Reduce Data down to lvSum + sliceSumReduction_cpu(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd); + + // Sum across simd lanes in the plane, breaking out orthog dir. + Coordinate icoor(Nd); + + for(int rt=0;rtiCoorFromIindex(icoor,idx); + + int ldx =rt+icoor[orthogdim]*rd; + + lsSum[ldx]=lsSum[ldx]+extracted[idx]; + + } + } + + // sum over nodes. 
+ for(int t=0;t_processor_coor[orthogdim] ) { + result[t]=lsSum[lt]; + } else { + result[t]=Zero(); + } + + } + scalar_type * ptr = (scalar_type *) &result[0]; + int words = fd*sizeof(sobj)/sizeof(scalar_type); + grid->GlobalSumVector(ptr, words); +} + int main (int argc, char ** argv) { @@ -26,7 +100,7 @@ int main (int argc, char ** argv) { //warmup for (int sweeps = 0; sweeps < 5; sweeps++) { - reduction_result = sliceSumGpu(test_data,0); + reduction_result = sliceSum(test_data,0); } int trace_id = traceStart("sliceSum benchmark"); @@ -35,23 +109,23 @@ int main (int argc, char ** argv) { RealD t=-usecond(); tracePush("sliceSum"); - sliceSum(test_data,reduction_reference,i); + sliceSumCPU(test_data,reduction_reference,i); tracePop("sliceSum"); t+=usecond(); - - std::cout << GridLogMessage << " sliceSum took "< Date: Fri, 9 Feb 2024 13:07:56 -0500 Subject: [PATCH 071/114] Undo include path changes for level zero api header --- Grid/threads/Accelerator.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index eaafea5d..ff5ccd7a 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -256,12 +256,12 @@ NAMESPACE_END(Grid); #if 0 #include #include -#include +#include #include #else #include #include -#include +#include #include #endif From b5659d106ec5997631caef6ca6d80e39cc69cfc6 Mon Sep 17 00:00:00 2001 From: dbollweg Date: Fri, 9 Feb 2024 13:37:14 -0500 Subject: [PATCH 072/114] more test cases --- tests/core/Test_sliceSum.cc | 113 +++++++++++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 1 deletion(-) diff --git a/tests/core/Test_sliceSum.cc b/tests/core/Test_sliceSum.cc index e0e0c1ae..0ca2881b 100644 --- a/tests/core/Test_sliceSum.cc +++ b/tests/core/Test_sliceSum.cc @@ -103,7 +103,8 @@ int main (int argc, char ** argv) { reduction_result = sliceSum(test_data,0); } - int trace_id = traceStart("sliceSum benchmark"); + int trace_id = traceStart("sliceSum benchmark - ComplexD"); + std::cout << GridLogMessage << "Testing ComplexD" << std::endl; for (int i = 0; i < Nd; i++) { RealD t=-usecond(); @@ -138,6 +139,116 @@ int main (int argc, char ** argv) { } traceStop(trace_id); + + LatticeColourVectorD test_data_cv(&Grid); + gaussian(pRNG,test_data_cv); + + std::vector reduction_reference_cv; + std::vector reduction_result_cv; + + //warmup + for (int sweeps = 0; sweeps < 5; sweeps++) { + reduction_result_cv = sliceSum(test_data_cv,0); + } + trace_id = traceStart("sliceSum benchmark - ColourVectorD"); + + std::cout << GridLogMessage << "Testing ColourVectorD" << std::endl; + for (int i = 0; i < Nd; i++) { + + RealD t=-usecond(); + + tracePush("sliceSum"); + sliceSumCPU(test_data_cv,reduction_reference_cv,i); + tracePop("sliceSum"); + + t+=usecond(); + std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl; + std::cout << GridLogMessage << "CPU sliceSum took "< reduction_reference_scv; + std::vector reduction_result_scv; + + //warmup + for (int sweeps = 0; sweeps < 5; sweeps++) { + reduction_result_scv = sliceSum(test_data_scv,0); + } + trace_id = traceStart("sliceSum benchmark - SpinColourVectorD"); + + std::cout << GridLogMessage << "Testing SpinColourVectorD" << std::endl; + for (int i = 0; i < Nd; i++) { + + RealD t=-usecond(); + + tracePush("sliceSum"); + sliceSumCPU(test_data_scv,reduction_reference_scv,i); + tracePop("sliceSum"); + + t+=usecond(); + std::cout << GridLogMessage << "Orthog. dir. 
= " << i << std::endl; + std::cout << GridLogMessage << "CPU sliceSum took "< Date: Mon, 12 Feb 2024 15:03:53 -0700 Subject: [PATCH 073/114] fix Simd::Nsimd typo --- Grid/qcd/smearing/HISQSmearing.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 2ae11bbb..7635ef06 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -173,7 +173,7 @@ public: int Nsites = U_v.size(); - accelerator_for(site,Nsites,Simd:Nsimd(),{ // ----------- 3-link constructs + accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 3-link constructs // for(int site=0;site Date: Tue, 13 Feb 2024 10:30:22 +0100 Subject: [PATCH 074/114] updating Eigen to 3.4.0 --- bootstrap.sh | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/bootstrap.sh b/bootstrap.sh index 4bd3de5e..571a5f4b 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -1,12 +1,12 @@ #!/usr/bin/env bash set -e -EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2' -EIGEN_SHA256SUM='685adf14bd8e9c015b78097c1dc22f2f01343756f196acdc76a678e1ae352e11' +EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.bz2' +EIGEN_SHA256SUM='b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626' echo "-- deploying Eigen source..." -ARC=`basename ${EIGEN_URL}` +ARC=$(basename ${EIGEN_URL}) wget ${EIGEN_URL} --no-check-certificate if command -v sha256sum; then echo "$EIGEN_SHA256SUM $(basename "$EIGEN_URL")" \ @@ -14,13 +14,8 @@ if command -v sha256sum; then else echo "WARNING: could not verify checksum, please install sha256sum" >&2 fi -./scripts/update_eigen.sh ${ARC} -rm ${ARC} -# patch for non-portable includes in Eigen 3.3.5 -# apparently already fixed in Eigen HEAD so it should not be -# a problem in the future (A.P.) -patch Eigen/unsupported/Eigen/CXX11/Tensor scripts/eigen-3.3.5.Tensor.patch - +./scripts/update_eigen.sh "${ARC}" +rm "${ARC}" echo '-- generating Make.inc files...' ./scripts/filelist echo '-- generating configure script...' 
From e4a641b64e45ee1d65ab5b4b4ba6f9d72e16ab70 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 13 Feb 2024 10:37:14 +0100 Subject: [PATCH 075/114] removing old Eigen tensor patch --- scripts/eigen-3.3.5.Tensor.patch | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 scripts/eigen-3.3.5.Tensor.patch diff --git a/scripts/eigen-3.3.5.Tensor.patch b/scripts/eigen-3.3.5.Tensor.patch deleted file mode 100644 index 54984b94..00000000 --- a/scripts/eigen-3.3.5.Tensor.patch +++ /dev/null @@ -1,19 +0,0 @@ ---- ./Eigen/unsupported/Eigen/CXX11/Tensor 2018-07-23 10:33:42.000000000 +0100 -+++ Tensor 2018-08-28 16:15:56.000000000 +0100 -@@ -25,7 +25,7 @@ - #include - #endif - --#include -+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" - - #include "../SpecialFunctions" - #include "src/util/CXX11Meta.h" -@@ -147,6 +147,6 @@ - - #include "src/Tensor/TensorIO.h" - --#include -+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" - - //#endif // EIGEN_CXX11_TENSOR_MODULE From 62055e04ddfda2d37069a0fb10e8af1aa45a0140 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 13 Feb 2024 18:18:27 +0100 Subject: [PATCH 076/114] missing semicolon generates error with some compilers --- Grid/qcd/smearing/StoutSmearing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/smearing/StoutSmearing.h b/Grid/qcd/smearing/StoutSmearing.h index 641331dc..787ef104 100644 --- a/Grid/qcd/smearing/StoutSmearing.h +++ b/Grid/qcd/smearing/StoutSmearing.h @@ -69,7 +69,7 @@ public: /*! Construct stout smearing object from explicitly specified rho matrix */ Smear_Stout(const std::vector& rho_) : OwnedBase{new Smear_APE(rho_)}, SmearBase{OwnedBase.get()} { - std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector& " << rho_ << " )" << std::endl + std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector& " << rho_ << " )" << std::endl; assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3"); } From 585efc6f3fce63f9766b2b66d3ae279ff0944a56 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Feb 2024 19:40:49 +0000 Subject: [PATCH 077/114] More benchmark scripts --- systems/Aurora/benchmarks/bench256.pbs | 48 ++++++++++++++++++++++++++ systems/Aurora/benchmarks/bench512.pbs | 48 ++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 systems/Aurora/benchmarks/bench256.pbs create mode 100644 systems/Aurora/benchmarks/bench512.pbs diff --git a/systems/Aurora/benchmarks/bench256.pbs b/systems/Aurora/benchmarks/bench256.pbs new file mode 100644 index 00000000..405d9ed4 --- /dev/null +++ b/systems/Aurora/benchmarks/bench256.pbs @@ -0,0 +1,48 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=256 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export 
MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 32 nodes, 384 ranks +# +CMD="mpiexec -np 3072 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 8.6.8.8 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 3072 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 8.8.4.12 --grid 128.128.128.768 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 256node.dwf.large diff --git a/systems/Aurora/benchmarks/bench512.pbs b/systems/Aurora/benchmarks/bench512.pbs new file mode 100644 index 00000000..0d8708d3 --- /dev/null +++ b/systems/Aurora/benchmarks/bench512.pbs @@ -0,0 +1,48 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=512 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 32 nodes, 384 ranks +# +CMD="mpiexec -np 6144 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 8.6.8.16 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 6144 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 8.8.8.12 --grid 256.128.128.768 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 512node.dwf.large From 1502860004f953d02e2cf8b6d892e1109d940e04 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Feb 2024 19:47:02 +0000 Subject: [PATCH 078/114] Benchmark scripts --- systems/Aurora/benchmarks/bench1024.pbs | 56 +++++++++++++++++++++++++ systems/Aurora/benchmarks/bench2048.pbs | 56 +++++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 systems/Aurora/benchmarks/bench1024.pbs create mode 100644 systems/Aurora/benchmarks/bench2048.pbs diff --git a/systems/Aurora/benchmarks/bench1024.pbs b/systems/Aurora/benchmarks/bench1024.pbs new file mode 100644 index 00000000..88f0100a --- /dev/null +++ b/systems/Aurora/benchmarks/bench1024.pbs @@ -0,0 +1,56 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=1024 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset 
MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 32 nodes, 384 ranks +# +CMD="mpiexec -np 12288 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 8.6.16.16 --grid 64.48.64.284 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 12288 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 1024node.dwf.small + +CMD="mpiexec -np 12288 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 1024node.dwf + + diff --git a/systems/Aurora/benchmarks/bench2048.pbs b/systems/Aurora/benchmarks/bench2048.pbs new file mode 100644 index 00000000..b79081a2 --- /dev/null +++ b/systems/Aurora/benchmarks/bench2048.pbs @@ -0,0 +1,56 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=2048 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 32 nodes, 384 ranks +# +CMD="mpiexec -np 24576 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 8.12.16.16 --grid 64.48.64.284 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 24576 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 128.128.128.384 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2048node.dwf.small + +CMD="mpiexec -np 24576 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 256.256.256.768 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2048node.dwf + + From 5ef4da3f29f95b843bf97bce603ba43f1c54029d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Feb 2024 19:47:36 +0000 Subject: [PATCH 079/114] Silence verbose --- systems/Aurora/benchmarks/gpu_tile_compact.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/systems/Aurora/benchmarks/gpu_tile_compact.sh b/systems/Aurora/benchmarks/gpu_tile_compact.sh index 69ba5107..5cab1ee3 100755 --- a/systems/Aurora/benchmarks/gpu_tile_compact.sh +++ b/systems/Aurora/benchmarks/gpu_tile_compact.sh @@ -28,6 +28,6 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 -echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " +#echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " numactl -m $NUMA -N $NUMAP "$@" From 303b83cdb80ad4e440785854976b34b8d2381d8e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Feb 2024 19:48:03 +0000 Subject: [PATCH 080/114] Scaling benchmarks, verbosity and MPICH aware in acceleratorInit() For some reason Dirichlet benchmark fails on several nodes; need to debug this. --- Grid/threads/Accelerator.cc | 19 ++++- benchmarks/Benchmark_dwf_fp32.cc | 20 +++--- systems/Aurora/benchmarks/bench_scaling.pbs | 80 +++++++++++++++++++++ 3 files changed, 106 insertions(+), 13 deletions(-) create mode 100644 systems/Aurora/benchmarks/bench_scaling.pbs diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 3769b2aa..19411b62 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -7,6 +7,8 @@ uint32_t accelerator_threads=2; uint32_t acceleratorThreads(void) {return accelerator_threads;}; void acceleratorThreads(uint32_t t) {accelerator_threads = t;}; +#define ENV_LOCAL_RANK_PALS "PALS_LOCAL_RANKID" +#define ENV_RANK_PALS "PALS_RANKID" #define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" #define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK" #define ENV_LOCAL_RANK_SLURM "SLURM_LOCALID" @@ -228,8 +230,17 @@ void acceleratorInit(void) { rank = atoi(localRankStr); } + if ((localRankStr = getenv(ENV_LOCAL_RANK_PALS)) != NULL) + { + rank = atoi(localRankStr); + } if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} + if ((localRankStr = getenv(ENV_RANK_PALS )) != NULL) { world_rank = atoi(localRankStr);} + + char hostname[HOST_NAME_MAX+1]; + gethostname(hostname, HOST_NAME_MAX+1); + if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname); auto devices = cl::sycl::device::get_devices(); for(int d = 0;d()); #define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld"); + if ( world_rank == 0) { - GPU_PROP_STR(vendor); - GPU_PROP_STR(version); + GPU_PROP_STR(vendor); + GPU_PROP_STR(version); // GPU_PROP_STR(device_type); /* GPU_PROP(max_compute_units); @@ -259,7 +271,8 @@ void acceleratorInit(void) GPU_PROP(single_fp_config); */ // GPU_PROP(double_fp_config); - GPU_PROP(global_mem_size); + GPU_PROP(global_mem_size); + } } if ( world_rank == 0 ) { diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index 37287595..ce4fcfab 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -90,11 +90,11 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <1 ? 
1 : 0; - Dirichlet[0] = 0; - Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; - Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; - Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; - Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; + // Dirichlet[0] = 0; + // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; + // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; + // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; + // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; Benchmark(Ls,Dirichlet); @@ -105,11 +105,11 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <1 ? 1 : 0; - Dirichlet[0] = 0; - Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0]; - Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1]; - Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2]; - Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3]; + // Dirichlet[0] = 0; + // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0]; + // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1]; + // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2]; + // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3]; Benchmark(Ls,Dirichlet); diff --git a/systems/Aurora/benchmarks/bench_scaling.pbs b/systems/Aurora/benchmarks/bench_scaling.pbs new file mode 100644 index 00000000..504fd3e9 --- /dev/null +++ b/systems/Aurora/benchmarks/bench_scaling.pbs @@ -0,0 +1,80 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=32 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 32 nodes, 384 ranks +# +CMD="mpiexec -np 384 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 4.6.4.4 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 12 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 1.2.2.3 --grid 16.64.64.96 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 1node.dwf + + +CMD="mpiexec -np 24 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.64.64.96 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2node.dwf + +CMD="mpiexec -np 48 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.2.6 --grid 32.64.64.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 4node.dwf + +CMD="mpiexec -np 96 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.4.6 --grid 32.64.128.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 8node.dwf + +CMD="mpiexec -np 192 -ppn 12 -envall \ + 
./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.4.4.6 --grid 32.128.128.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 16node.dwf + + +CMD="mpiexec -np 384 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 4.4.4.6 --grid 64.128.128.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 32node.dwf From 56827d6ad6c019e0219544cb793dcd40137a21fc Mon Sep 17 00:00:00 2001 From: david clarke Date: Wed, 14 Feb 2024 13:56:57 -0700 Subject: [PATCH 081/114] accelerator_inline bug --- Grid/qcd/smearing/HISQSmearing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 7635ef06..d2091806 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -54,7 +54,7 @@ void appendShift(std::vector& shifts, int dir, Args... args) { /*! @brief figure out the stencil index from mu and nu */ -inline int stencilIndex(int mu, int nu) { +accelerator_inline int stencilIndex(int mu, int nu) { // Nshifts depends on how you built the stencil int Nshifts = 6; return Nshifts*nu + Nd*Nshifts*mu; From 6f3455900e11eeda16ff6dc1976567509d1dfe0c Mon Sep 17 00:00:00 2001 From: dbollweg Date: Fri, 16 Feb 2024 13:15:02 -0500 Subject: [PATCH 082/114] Adding sliceSumReduction_cub_small/large since hipcub cannot deal with arb. large vobjs --- Grid/lattice/Lattice_slicesum_core.h | 65 ++++++++++++++++++++++------ tests/core/Test_sliceSum.cc | 22 ++++++---- 2 files changed, 65 insertions(+), 22 deletions(-) diff --git a/Grid/lattice/Lattice_slicesum_core.h b/Grid/lattice/Lattice_slicesum_core.h index 2548884a..f4301c33 100644 --- a/Grid/lattice/Lattice_slicesum_core.h +++ b/Grid/lattice/Lattice_slicesum_core.h @@ -1,4 +1,5 @@ #pragma once +#include #if defined(GRID_CUDA) #include @@ -26,20 +27,16 @@ NAMESPACE_BEGIN(Grid); + #if defined(GRID_CUDA) || defined(GRID_HIP) -template inline void sliceSumReduction_cub(const Lattice &Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) -{ - typedef typename vobj::scalar_object sobj; - +template inline void sliceSumReduction_cub_small(const vobj *Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { size_t subvol_size = e1*e2; - commVector reduction_buffer(rd*subvol_size); auto rb_p = &reduction_buffer[0]; + vobj zero_init; + zeroit(zero_init); - vobj vobj_zero; //Need to provide initial value for reduction operation - zeroit(vobj_zero); - void *temp_storage_array = NULL; size_t temp_storage_bytes = 0; vobj *d_out; @@ -71,8 +68,8 @@ template inline void sliceSumReduction_cub(const Lattice &Data exit(EXIT_FAILURE); } - //determine temp_storage_array size - gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), vobj_zero, computeStream); + + gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream); if (gpuErr!=gpuSuccess) { std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! 
Error: " << gpuErr < inline void sliceSumReduction_cub(const Lattice &Data exit(EXIT_FAILURE); } - autoView( Data_v, Data, AcceleratorRead); //prepare buffer for reduction //use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream) //use 2d accelerator_for to avoid launch latencies found when serially looping over rd - accelerator_for2dNB( s,subvol_size, r,rd, Nsimd,{ int n = s / e2; @@ -97,12 +92,12 @@ template inline void sliceSumReduction_cub(const Lattice &Data int so=r*ostride; // base offset for start of plane int ss= so+n*stride+b; - coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss])); + coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss])); }); //issue segmented reductions in computeStream - gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), vobj_zero, computeStream); + gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream); if (gpuErr!=gpuSuccess) { std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr < inline void sliceSumReduction_cub(const Lattice &Data } + +template inline void sliceSumReduction_cub_large(const vobj *Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { + typedef typename vobj::vector_type vector; + const int words = sizeof(vobj)/sizeof(vector); + const int osites = rd*e1*e2; + Vectorbuffer(osites); + vector *dat = (vector *)Data; + vector *buf = &buffer[0]; + Vector lvSum_small(rd); + vector *lvSum_ptr = (vector *)&lvSum[0]; + + for (int w = 0; w < words; w++) { + accelerator_for(ss,osites,1,{ + buf[ss] = dat[ss*words+w]; + }); + + sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); + + for (int r = 0; r < rd; r++) { + lvSum_ptr[w+words*r]=lvSum_small[r]; + } + + } + + +} + +template inline void sliceSumReduction_cub(const Lattice &Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) +{ + autoView(Data_v, Data, AcceleratorRead); + #if defined(GRID_CUDA) + sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); + + #elif defined (GRID_HIP) //hipcub cannot deal with large vobjs that don't fit in shared memory, therefore separate into _small/_large. 
+ if constexpr (sizeof(vobj) <= 256) { + sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); + } + else { + sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); + } + #endif +} #endif diff --git a/tests/core/Test_sliceSum.cc b/tests/core/Test_sliceSum.cc index 0ca2881b..f86f96f8 100644 --- a/tests/core/Test_sliceSum.cc +++ b/tests/core/Test_sliceSum.cc @@ -140,19 +140,21 @@ int main (int argc, char ** argv) { } traceStop(trace_id); - LatticeColourVectorD test_data_cv(&Grid); + LatticeSpinVectorD test_data_cv(&Grid); gaussian(pRNG,test_data_cv); - std::vector reduction_reference_cv; - std::vector reduction_result_cv; + std::vector reduction_reference_cv; + std::vector reduction_result_cv; //warmup for (int sweeps = 0; sweeps < 5; sweeps++) { reduction_result_cv = sliceSum(test_data_cv,0); } - trace_id = traceStart("sliceSum benchmark - ColourVectorD"); + trace_id = traceStart("sliceSum benchmark - SpinVectorD"); - std::cout << GridLogMessage << "Testing ColourVectorD" << std::endl; + std::cout << GridLogMessage << "Testing SpinVectorD" << std::endl; + std::cout << GridLogMessage << "sizeof(SpinVectorD) = " << sizeof(SpinVectorD) << std::endl; + std::cout << GridLogMessage << "sizeof(vSpinVectorD) = " << sizeof(vSpinVectorD) << std::endl; for (int i = 0; i < Nd; i++) { RealD t=-usecond(); @@ -180,9 +182,10 @@ int main (int argc, char ** argv) { for(int t=0;t hisq_fat_Cstyle(&GRID,path_coeff); - if (param.benchmark) { autoView(U_v, Umu, CpuRead); // Gauge accessor From b02d022993031f5fe58658a0ddb63a381cecb92f Mon Sep 17 00:00:00 2001 From: david clarke Date: Fri, 23 Feb 2024 17:14:28 -0700 Subject: [PATCH 087/114] fixed race condition (thx michael) --- Grid/qcd/smearing/HISQSmearing.h | 59 +++++++++++++++++--------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index ac4cb8b6..6fc6993e 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -168,25 +168,26 @@ public: // We infer some types that will be needed in the calculation. typedef decltype(gStencil.GetEntry(0,0)) stencilElement; typedef decltype(coalescedReadGeneralPermute(U_v[0](0),gStencil.GetEntry(0,0)->_permute,Nd)) U3matrix; - stencilElement SE0, SE1, SE2, SE3, SE4, SE5; - U3matrix U0, U1, U2, U3, U4, U5, W; int Nsites = U_v.size(); + auto gStencil_v = gStencil.View(); -// accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 3-link constructs - for(int site=0;site_offset; - SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; - SE2 = gStencil.GetEntry(s+2,site); int x = SE2->_offset; - SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; - SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; - SE5 = gStencil.GetEntry(s+5,site); int x_m_mu = SE5->_offset; + SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu = SE0->_offset; + SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu = SE1->_offset; + SE2 = gStencil_v.GetEntry(s+2,site); int x = SE2->_offset; + SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; + SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu = SE4->_offset; + SE5 = gStencil_v.GetEntry(s+5,site); int x_m_mu = SE5->_offset; // When you're deciding whether to take an adjoint, the question is: how is the // stored link oriented compared to the one you want? 
If I imagine myself travelling @@ -212,10 +213,12 @@ public: // But on GPU it's non-trivial and maps scalar object to vector object and vice versa. coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_3*W); } - }//) + }) -// accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 5-link - for(int site=0;site_offset; - SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; - SE2 = gStencil.GetEntry(s+2,site); int x = SE2->_offset; - SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; - SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; + SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu = SE0->_offset; + SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu = SE1->_offset; + SE2 = gStencil_v.GetEntry(s+2,site); int x = SE2->_offset; + SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; + SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu = SE4->_offset; U0 = coalescedReadGeneralPermute( U_v[x_p_mu ](nu ),SE0->_permute,Nd); U1 = coalescedReadGeneralPermute(U_3link_v[x_p_nu ](rho),SE1->_permute,Nd); @@ -248,10 +251,12 @@ public: sigmaIndex++; } } - }//) + }) -// accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 7-link - for(int site=0;site_offset; - SE1 = gStencil.GetEntry(s+1,site); int x_p_nu = SE1->_offset; - SE2 = gStencil.GetEntry(s+2,site); int x = SE2->_offset; - SE3 = gStencil.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; - SE4 = gStencil.GetEntry(s+4,site); int x_m_nu = SE4->_offset; + SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu = SE0->_offset; + SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu = SE1->_offset; + SE2 = gStencil_v.GetEntry(s+2,site); int x = SE2->_offset; + SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; + SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu = SE4->_offset; U0 = coalescedReadGeneralPermute(U_v[x_p_mu](nu),SE0->_permute,Nd); if(sigmaIndex<3) { @@ -286,7 +291,7 @@ public: sigmaIndex++; } } - }//) + }) } // end mu loop From 6cd2d8fcd597118ccfe24aecce3032eb6cb29945 Mon Sep 17 00:00:00 2001 From: Dennis Bollweg Date: Mon, 26 Feb 2024 09:55:07 -0500 Subject: [PATCH 088/114] Replace cuda/hip memcpy with Grid functions --- Grid/lattice/Lattice_slicesum_core.h | 58 +++++++--------------------- Grid/threads/Accelerator.h | 4 ++ 2 files changed, 18 insertions(+), 44 deletions(-) diff --git a/Grid/lattice/Lattice_slicesum_core.h b/Grid/lattice/Lattice_slicesum_core.h index 159a331b..7c3518cd 100644 --- a/Grid/lattice/Lattice_slicesum_core.h +++ b/Grid/lattice/Lattice_slicesum_core.h @@ -4,11 +4,6 @@ #include #define gpucub cub -#define gpuMalloc cudaMalloc -#define gpuFree cudaFree -#define gpuMemcpyAsync cudaMemcpyAsync -#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost -#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuError_t cudaError_t #define gpuSuccess cudaSuccess @@ -16,11 +11,6 @@ #include #define gpucub hipcub -#define gpuMalloc hipMalloc -#define gpuFree hipFree -#define gpuMemcpyAsync hipMemcpyAsync -#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost -#define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuError_t hipError_t #define gpuSuccess hipSuccess @@ -51,38 +41,22 @@ template inline void sliceSumReduction_cub_small(const vobj *Data, V } //Allocate memory for output and offset arrays on device - gpuError_t gpuErr = gpuMalloc(&d_out,rd*sizeof(vobj)); - if (gpuErr != gpuSuccess) { - std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMalloc (d_out)! 
Error: " << gpuErr <(acceleratorAllocDevice(rd*sizeof(vobj))); - gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream); + d_offsets = static_cast(acceleratorAllocDevice((rd+1)*sizeof(int))); + + //copy offsets to device + acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream); + + + gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream); if (gpuErr!=gpuSuccess) { std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr < inline void sliceSumReduction_cub_small(const vobj *Data, V exit(EXIT_FAILURE); } - gpuErr = gpuMemcpyAsync(&lvSum[0],d_out,rd*sizeof(vobj),gpuMemcpyDeviceToHost,computeStream); - if (gpuErr!=gpuSuccess) { - std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpuMemcpy (d_out)! Error: " << gpuErr < Date: Tue, 27 Feb 2024 11:28:32 -0500 Subject: [PATCH 089/114] Added SpinColourMatrix case to sliceSum Test --- tests/core/Test_sliceSum.cc | 58 +++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/core/Test_sliceSum.cc b/tests/core/Test_sliceSum.cc index 5275ddc2..e366f1f3 100644 --- a/tests/core/Test_sliceSum.cc +++ b/tests/core/Test_sliceSum.cc @@ -257,7 +257,65 @@ int main (int argc, char ** argv) { } traceStop(trace_id); + LatticeSpinColourMatrixD test_data_scm(&Grid); + gaussian(pRNG,test_data_scm); + + std::vector reduction_reference_scm; + std::vector reduction_result_scm; + + //warmup + for (int sweeps = 0; sweeps < 5; sweeps++) { + reduction_result_scm = sliceSum(test_data_scm,0); + } + trace_id = traceStart("sliceSum benchmark - SpinColourMatrixD"); + + std::cout << GridLogMessage << "Testing SpinColourMatrixD" << std::endl; + std::cout << GridLogMessage << "sizeof(SpinColourMatrixD) = " << sizeof(SpinColourMatrixD) << std::endl; + std::cout << GridLogMessage << "sizeof(vSpinColourMatrixD) = " << sizeof(vSpinColourMatrixD) << std::endl; + for (int i = 0; i < Nd; i++) { + + RealD t=-usecond(); + + tracePush("sliceSum"); + sliceSumCPU(test_data_scm,reduction_reference_scm,i); + tracePop("sliceSum"); + + t+=usecond(); + std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl; + std::cout << GridLogMessage << "CPU sliceSum took "< Date: Tue, 27 Feb 2024 12:41:45 -0500 Subject: [PATCH 090/114] CUDA cub refuses to reduce vSpinColourMatrix, breaking up into smaller parts like already done for HIP case. 
--- Grid/lattice/Lattice_slicesum_core.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/Grid/lattice/Lattice_slicesum_core.h b/Grid/lattice/Lattice_slicesum_core.h index 7c3518cd..9c4cc051 100644 --- a/Grid/lattice/Lattice_slicesum_core.h +++ b/Grid/lattice/Lattice_slicesum_core.h @@ -119,18 +119,13 @@ template inline void sliceSumReduction_cub_large(const vobj *Data, V template inline void sliceSumReduction_cub(const Lattice &Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { - autoView(Data_v, Data, AcceleratorRead); - #if defined(GRID_CUDA) - sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); - - #elif defined (GRID_HIP) //hipcub cannot deal with large vobjs that don't fit in shared memory, therefore separate into _small/_large. - if constexpr (sizeof(vobj) <= 256) { + autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case. + if constexpr (sizeof(vobj) <= 256) { sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); } else { sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); } - #endif } #endif @@ -215,4 +210,4 @@ template inline void sliceSumReduction(const Lattice &Data, Ve } -NAMESPACE_END(Grid); \ No newline at end of file +NAMESPACE_END(Grid); From 22b43b86cb8737cd74f322f401f65e8549798a25 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Wed, 28 Feb 2024 12:57:17 +0100 Subject: [PATCH 091/114] Make GPT test suite work with SYCL --- Grid/lattice/Lattice_basis.h | 2 +- Grid/lattice/Lattice_transfer.h | 4 +-- Grid/threads/Accelerator.h | 48 +++++++++++++++++++++------------ 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h index 9415bd4f..03a869fb 100644 --- a/Grid/lattice/Lattice_basis.h +++ b/Grid/lattice/Lattice_basis.h @@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) basis_v.push_back(basis[k].View(AcceleratorWrite)); } -#if ( !(defined(GRID_CUDA) || defined(GRID_HIP)) ) +#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) ) int max_threads = thread_max(); Vector < vobj > Bt(Nm * max_threads); thread_region diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 668ef4b4..a936b1c0 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -469,15 +469,13 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) Coordinate fine_rdimensions = fine->_rdimensions; Coordinate coarse_rdimensions = coarse->_rdimensions; - vobj zz = Zero(); - accelerator_for(sc,coarse->oSites(),1,{ // One thread per sub block Coordinate coor_c(_ndimension); Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate - vobj cd = zz; + vobj cd = Zero(); for(int sb=0;sbsubmit([&](cl::sycl::handler &cgh) { \ - unsigned long nt=acceleratorThreads(); \ - unsigned long unum1 = num1; \ - unsigned long unum2 = num2; \ - if(nt < 8)nt=8; \ - cl::sycl::range<3> local {nt,1,nsimd}; \ - cl::sycl::range<3> global{unum1,unum2,nsimd}; \ - cgh.parallel_for( \ - cl::sycl::nd_range<3>(global,local), \ - [=] (cl::sycl::nd_item<3> item) /*mutable*/ \ - [[intel::reqd_sub_group_size(16)]] \ - { \ - auto iter1 = item.get_global_id(0); \ - auto iter2 = item.get_global_id(1); \ - auto lane = item.get_global_id(2); \ - { __VA_ARGS__ }; \ - }); \ - }); + 
unsigned long nt=acceleratorThreads(); \ + if(nt < 8)nt=8; \ + unsigned long unum1 = num1; \ + unsigned long unum2 = num2; \ + unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \ + cl::sycl::range<3> local {nt,1,nsimd}; \ + cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \ + if (unum1_divisible_by_nt != unum1) { \ + cgh.parallel_for( \ + cl::sycl::nd_range<3>(global,local), \ + [=] (cl::sycl::nd_item<3> item) /*mutable*/ \ + [[intel::reqd_sub_group_size(16)]] \ + { \ + auto iter1 = item.get_global_id(0); \ + auto iter2 = item.get_global_id(1); \ + auto lane = item.get_global_id(2); \ + { if (iter1 < unum1){ __VA_ARGS__ } }; \ + }); \ + } else { \ + cgh.parallel_for( \ + cl::sycl::nd_range<3>(global,local), \ + [=] (cl::sycl::nd_item<3> item) /*mutable*/ \ + [[intel::reqd_sub_group_size(16)]] \ + { \ + auto iter1 = item.get_global_id(0); \ + auto iter2 = item.get_global_id(1); \ + auto lane = item.get_global_id(2); \ + { __VA_ARGS__ }; \ + }); \ + } \ + }); #define accelerator_barrier(dummy) { theGridAccelerator->wait(); } From 9f89486df5e65c873308df23240a3b826c257d76 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Wed, 28 Feb 2024 19:56:23 +0100 Subject: [PATCH 092/114] remove unnecessary code path --- Grid/threads/Accelerator.h | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index b4df0924..054e9fbc 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -294,29 +294,16 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \ cl::sycl::range<3> local {nt,1,nsimd}; \ cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \ - if (unum1_divisible_by_nt != unum1) { \ - cgh.parallel_for( \ - cl::sycl::nd_range<3>(global,local), \ - [=] (cl::sycl::nd_item<3> item) /*mutable*/ \ - [[intel::reqd_sub_group_size(16)]] \ - { \ - auto iter1 = item.get_global_id(0); \ - auto iter2 = item.get_global_id(1); \ - auto lane = item.get_global_id(2); \ - { if (iter1 < unum1){ __VA_ARGS__ } }; \ - }); \ - } else { \ - cgh.parallel_for( \ - cl::sycl::nd_range<3>(global,local), \ - [=] (cl::sycl::nd_item<3> item) /*mutable*/ \ - [[intel::reqd_sub_group_size(16)]] \ - { \ - auto iter1 = item.get_global_id(0); \ - auto iter2 = item.get_global_id(1); \ - auto lane = item.get_global_id(2); \ - { __VA_ARGS__ }; \ - }); \ - } \ + cgh.parallel_for( \ + cl::sycl::nd_range<3>(global,local), \ + [=] (cl::sycl::nd_item<3> item) /*mutable*/ \ + [[intel::reqd_sub_group_size(16)]] \ + { \ + auto iter1 = item.get_global_id(0); \ + auto iter2 = item.get_global_id(1); \ + auto lane = item.get_global_id(2); \ + { if (iter1 < unum1){ __VA_ARGS__ } }; \ + }); \ }); #define accelerator_barrier(dummy) { theGridAccelerator->wait(); } From f70df6e1955107d83b62df323608f969a53c7c0e Mon Sep 17 00:00:00 2001 From: david clarke Date: Thu, 29 Feb 2024 12:29:30 -0700 Subject: [PATCH 093/114] changed NO_SHIFT and BACKWARD_CONST from define to enum --- Grid/qcd/smearing/HISQSmearing.h | 5 +---- Grid/stencil/GeneralLocalStencil.h | 24 +++++++++++++++--------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/Grid/qcd/smearing/HISQSmearing.h b/Grid/qcd/smearing/HISQSmearing.h index 6fc6993e..529ea090 100644 --- a/Grid/qcd/smearing/HISQSmearing.h +++ b/Grid/qcd/smearing/HISQSmearing.h @@ -138,7 +138,7 @@ public: for(int nu=0;nu Nd! 
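For the hunk below, a usage sketch of the direction encoding (hypothetical snippet;
Coordinate, Nd, generalShift and Back are as defined in this file, and mu, nu are
direction indices less than Nd):

    Coordinate shift(Nd, 0);
    // Follow the three-link staple path: forward mu, forward nu, backward mu.
    // Back(mu) encodes the reversal as mu + shiftSignal::BACKWARD_CONST.
    generalShift(shift, mu, nu, Back(mu));
    // Net displacement is one unit in nu: shift[nu] == 1, all other entries 0.
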
@@ -150,16 +156,16 @@ public: inline int Back(const int dir) { // generalShift will use BACKWARD_CONST to determine whether we step forward or // backward. Trick inspired by SIMULATeQCD. - return dir + BACKWARD_CONST; + return dir + shiftSignal::BACKWARD_CONST; } /*! @brief shift one unit in direction dir */ template void generalShift(Coordinate& shift, int dir) { - if (dir >= BACKWARD_CONST) { - dir -= BACKWARD_CONST; + if (dir >= shiftSignal::BACKWARD_CONST) { + dir -= shiftSignal::BACKWARD_CONST; shift[dir]+=-1; - } else if (dir == NO_SHIFT) { + } else if (dir == shiftSignal::NO_SHIFT) { ; // do nothing } else { shift[dir]+=1; @@ -169,10 +175,10 @@ void generalShift(Coordinate& shift, int dir) { /*! @brief follow a path of directions, shifting one unit in each direction */ template void generalShift(Coordinate& shift, int dir, Args... args) { - if (dir >= BACKWARD_CONST) { - dir -= BACKWARD_CONST; + if (dir >= shiftSignal::BACKWARD_CONST) { + dir -= shiftSignal::BACKWARD_CONST; shift[dir]+=-1; - } else if (dir == NO_SHIFT) { + } else if (dir == shiftSignal::NO_SHIFT) { ; // do nothing } else { shift[dir]+=1; From 3c49762875832c28c0b80a8810af74bb8d1992a5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 29 Feb 2024 15:33:06 -0500 Subject: [PATCH 094/114] Propagate in the blas routine --- Grid/algorithms/blas/BatchedBlas.h | 685 +++++++++++++++++++++++++++++ 1 file changed, 685 insertions(+) create mode 100644 Grid/algorithms/blas/BatchedBlas.h diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h new file mode 100644 index 00000000..82da2d5d --- /dev/null +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -0,0 +1,685 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: BatchedBlas.h + + Copyright (C) 2023 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +#ifdef GRID_HIP +#include +#endif +#ifdef GRID_CUDA +#include +#endif +#ifdef GRID_SYCL +#error // need oneMKL version +#endif + +/////////////////////////////////////////////////////////////////////// +// Need to rearrange lattice data to be in the right format for a +// batched multiply. 
Might as well make these static, dense packed +/////////////////////////////////////////////////////////////////////// +NAMESPACE_BEGIN(Grid); +#ifdef GRID_HIP + typedef hipblasHandle_t gridblasHandle_t; +#endif +#ifdef GRID_CUDA + typedef cudablasHandle_t gridblasHandle_t; +#endif +#ifdef GRID_SYCL + typedef int32_t gridblasHandle_t; +#endif +#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + typedef int32_t gridblasHandle_t; +#endif + +enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ; + +class GridBLAS { +public: + + + static gridblasHandle_t gridblasHandle; + static int gridblasInit; + + static void Init(void) + { + if ( ! gridblasInit ) { +#ifdef GRID_CUDA + std::cout << "cublasCreate"< &Amk, // pointer list to matrices + deviceVector &Bkn, + ComplexD beta, + deviceVector &Cmn) + { + gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, + m,n,k, + alpha, + Amk, + Bkn, + beta, + Cmn); + } + void gemmBatched(int m,int n, int k, + ComplexF alpha, + deviceVector &Amk, // pointer list to matrices + deviceVector &Bkn, + ComplexF beta, + deviceVector &Cmn) + { + gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, + m,n,k, + alpha, + Amk, + Bkn, + beta, + Cmn); + } + void gemmBatched(int m,int n, int k, + RealD alpha, + deviceVector &Amk, // pointer list to matrices + deviceVector &Bkn, + RealD beta, + deviceVector &Cmn) + { + gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, + m,n,k, + alpha, + Amk, + Bkn, + beta, + Cmn); + } + void gemmBatched(int m,int n, int k, + RealF alpha, + deviceVector &Amk, // pointer list to matrices + deviceVector &Bkn, + RealF beta, + deviceVector &Cmn) + { + gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, + m,n,k, + alpha, + Amk, + Bkn, + beta, + Cmn); + } + + void gemmBatched(GridBLASOperation_t OpA, + GridBLASOperation_t OpB, + int m,int n, int k, + ComplexD alpha, + deviceVector &Amk, // pointer list to matrices + deviceVector &Bkn, + ComplexD beta, + deviceVector &Cmn) + { + RealD t2=usecond(); + int32_t batchCount = Amk.size(); + assert(Bkn.size()==batchCount); + assert(Cmn.size()==batchCount); + + int lda = m; // m x k column major + int ldb = k; // k x n column major + int ldc = m; // m x b column major + if(OpA!=GridBLAS_OP_N) + lda = k; + if(OpB!=GridBLAS_OP_N) + ldb = n; + + static deviceVector alpha_p(1); + static deviceVector beta_p(1); + // can prestore the 1 and the zero on device + acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD)); + acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD)); + RealD t0=usecond(); + // std::cout << "ZgemmBatched mnk "<gemm_batch & OneAPI +#warning "oneMKL implementation not built " +#endif +#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + // Need a default/reference implementation + for (int p = 0; p < batchCount; ++p) { + for (int mm = 0; mm < m; ++mm) { + for (int nn = 0; nn < n; ++nn) { + ComplexD c_mn(0.0); + for (int kk = 0; kk < k, ++kk) + c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; + Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + } + } + } +#endif + // synchronise(); + RealD t1=usecond(); + RealD flops = 8.0*m*n*k*batchCount; + RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount; + // std::cout < &Amk, // pointer list to matrices + deviceVector &Bkn, + ComplexF beta, + deviceVector &Cmn) + { + RealD t2=usecond(); + int32_t batchCount = Amk.size(); + + int lda = m; // m x k column major + int ldb = k; // k x n column major + int ldc = m; // m x b column major + 
if(OpA!=GridBLAS_OP_N) + lda = k; + if(OpB!=GridBLAS_OP_N) + ldb = n; + static deviceVector alpha_p(1); + static deviceVector beta_p(1); + // can prestore the 1 and the zero on device + acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF)); + acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF)); + RealD t0=usecond(); + + assert(Bkn.size()==batchCount); + assert(Cmn.size()==batchCount); +#ifdef GRID_HIP + hipblasOperation_t hOpA; + hipblasOperation_t hOpB; + if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N; + if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T; + if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C; + if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N; + if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T; + if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C; + auto err = hipblasCgemmBatched(gridblasHandle, + hOpA, + hOpB, + m,n,k, + (hipblasComplex *) &alpha_p[0], + (hipblasComplex **)&Amk[0], lda, + (hipblasComplex **)&Bkn[0], ldb, + (hipblasComplex *) &beta_p[0], + (hipblasComplex **)&Cmn[0], ldc, + batchCount); + + assert(err==HIPBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_CUDA + cublasOperation_t hOpA; + cublasOperation_t hOpB; + if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N; + if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T; + if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C; + if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N; + if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T; + if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C; + auto err = cublasCgemmBatched(gridblasHandle, + hOpA, + hOpB, + m,n,k, + (cuComplex *) &alpha_p[0], + (cuComplex **)&Amk[0], lda, + (cuComplex **)&Bkn[0], ldb, + (cuComplex *) &beta_p[0], + (cuComplex **)&Cmn[0], ldc, + batchCount); + assert(err==CUBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_SYCL + //MKL’s cblas_gemm_batch & OneAPI +#warning "oneMKL implementation not built " +#endif +#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + // Need a default/reference implementation + for (int p = 0; p < batchCount; ++p) { + for (int mm = 0; mm < m; ++mm) { + for (int nn = 0; nn < n; ++nn) { + ComplexD c_mn(0.0); + for (int kk = 0; kk < k, ++kk) + c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; + Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + } + } + } +#endif + RealD t1=usecond(); + RealD flops = 8.0*m*n*k*batchCount; + RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount; + } + + /////////////////////////////////////////////////////////////////////////// + // Single precision real GEMM + /////////////////////////////////////////////////////////////////////////// + + void gemmBatched(GridBLASOperation_t OpA, + GridBLASOperation_t OpB, + int m,int n, int k, + RealF alpha, + deviceVector &Amk, // pointer list to matrices + deviceVector &Bkn, + RealF beta, + deviceVector &Cmn) + { + RealD t2=usecond(); + int32_t batchCount = Amk.size(); + + int lda = m; // m x k column major + int ldb = k; // k x n column major + int ldc = m; // m x b column major + if(OpA!=GridBLAS_OP_N) + lda = k; + if(OpB!=GridBLAS_OP_N) + ldb = n; + static deviceVector alpha_p(1); + static deviceVector beta_p(1); + // can prestore the 1 and the zero on device + acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF)); + acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF)); + RealD t0=usecond(); + + assert(Bkn.size()==batchCount); + assert(Cmn.size()==batchCount); +#ifdef GRID_HIP + hipblasOperation_t hOpA; + hipblasOperation_t hOpB; + if ( OpA == 
GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N; + if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T; + if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C; + if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N; + if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T; + if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C; + auto err = hipblasSgemmBatched(gridblasHandle, + hOpA, + hOpB, + m,n,k, + (float *) &alpha_p[0], + (float **)&Amk[0], lda, + (float **)&Bkn[0], ldb, + (float *) &beta_p[0], + (float **)&Cmn[0], ldc, + batchCount); + assert(err==HIPBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_CUDA + cublasOperation_t hOpA; + cublasOperation_t hOpB; + if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N; + if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T; + if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C; + if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N; + if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T; + if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C; + auto err = cublasSgemmBatched(gridblasHandle, + hOpA, + hOpB, + m,n,k, + (float *) &alpha_p[0], + (float **)&Amk[0], lda, + (float **)&Bkn[0], ldb, + (float *) &beta_p[0], + (float **)&Cmn[0], ldc, + batchCount); + assert(err==CUBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_SYCL + //MKL’s cblas_gemm_batch & OneAPI +#warning "oneMKL implementation not built " +#endif +#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + // Need a default/reference implementation + for (int p = 0; p < batchCount; ++p) { + for (int mm = 0; mm < m; ++mm) { + for (int nn = 0; nn < n; ++nn) { + RealD c_mn(0.0); + for (int kk = 0; kk < k, ++kk) + c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; + Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + } + } + } +#endif + RealD t1=usecond(); + RealD flops = 2.0*m*n*k*batchCount; + RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount; + } + + + /////////////////////////////////////////////////////////////////////////// + // Double precision real GEMM + /////////////////////////////////////////////////////////////////////////// + + void gemmBatched(GridBLASOperation_t OpA, + GridBLASOperation_t OpB, + int m,int n, int k, + RealD alpha, + deviceVector &Amk, // pointer list to matrices + deviceVector &Bkn, + RealD beta, + deviceVector &Cmn) + { + RealD t2=usecond(); + int32_t batchCount = Amk.size(); + + int lda = m; // m x k column major + int ldb = k; // k x n column major + int ldc = m; // m x b column major + if(OpA!=GridBLAS_OP_N) + lda = k; + if(OpB!=GridBLAS_OP_N) + ldb = n; + + static deviceVector alpha_p(1); + static deviceVector beta_p(1); + // can prestore the 1 and the zero on device + acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD)); + acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD)); + RealD t0=usecond(); + + assert(Bkn.size()==batchCount); + assert(Cmn.size()==batchCount); +#ifdef GRID_HIP + hipblasOperation_t hOpA; + hipblasOperation_t hOpB; + if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N; + if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T; + if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C; + if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N; + if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T; + if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C; + auto err = hipblasDgemmBatched(gridblasHandle, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + m,n,k, + (double *) &alpha_p[0], + (double **)&Amk[0], lda, + (double **)&Bkn[0], ldb, + (double *) &beta_p[0], + (double **)&Cmn[0], ldc, + batchCount); + assert(err==HIPBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_CUDA 
+ cublasOperation_t hOpA; + cublasOperation_t hOpB; + if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N; + if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T; + if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C; + if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N; + if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T; + if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C; + auto err = cublasDgemmBatched(gridblasHandle, + hOpA, + hOpB, + m,n,k, + (double *) &alpha_p[0], + (double **)&Amk[0], lda, + (double **)&Bkn[0], ldb, + (double *) &beta_p[0], + (double **)&Cmn[0], ldc, + batchCount); + assert(err==CUBLAS_STATUS_SUCCESS); +#endif +#ifdef GRID_SYCL + /* + int64_t m64=m; + int64_t n64=n; + int64_t k64=k; + int64_t batchCount64=batchCount; + oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator, + onemkl::transpose::N, + onemkl::transpose::N, + &m64,&n64,&k64, + (double *) &alpha_p[0], + (double **)&Amk[0], lda, + (double **)&Bkn[0], ldb, + (double *) &beta_p[0], + (double **)&Cmn[0], ldc, + 1,&batchCount64); + */ + //MKL’s cblas_gemm_batch & OneAPI +#warning "oneMKL implementation not built " +#endif +#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + // Need a default/reference implementation + for (int p = 0; p < batchCount; ++p) { + for (int mm = 0; mm < m; ++mm) { + for (int nn = 0; nn < n; ++nn) { + RealD c_mn(0.0); + for (int kk = 0; kk < k, ++kk) + c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; + Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + } + } + } +#endif + RealD t1=usecond(); + RealD flops = 2.0*m*n*k*batchCount; + RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount; + } + + + + //////////////////////////////////////////////////////////////////////////////////////////////// + // Strided case used by benchmark, but generally unused in Grid + // Keep a code example in double complex, but don't generate the single and real variants for now + //////////////////////////////////////////////////////////////////////////////////////////////// + + void gemmStridedBatched(int m,int n, int k, + ComplexD alpha, + ComplexD* Amk, // pointer list to matrices + ComplexD* Bkn, + ComplexD beta, + ComplexD* Cmn, + int batchCount) + { + // Use C-row major storage, so transpose calls + int lda = m; // m x k column major + int ldb = k; // k x n column major + int ldc = m; // m x b column major + int sda = m*k; + int sdb = k*n; + int sdc = m*n; + deviceVector alpha_p(1); + deviceVector beta_p(1); + acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD)); + acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD)); + std::cout << "blasZgemmStridedBatched mnk "< A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD)); + deviceVector B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD)); + deviceVector C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD)); + ComplexD alpha(1.0); + ComplexD beta (1.0); + for(int i=0;i<10;i++){ + RealD t0 = usecond(); + for(int s=0;s Date: Thu, 29 Feb 2024 20:01:44 -0500 Subject: [PATCH 095/114] Benchmark development --- Grid/algorithms/blas/BatchedBlas.h | 66 +- Grid/allocator/AlignedAllocator.h | 1 + benchmarks/Benchmark_usqcd.cc | 959 +++++++++++++++++++++++++++++ systems/mac-arm/config-command-mpi | 3 +- 4 files changed, 1000 insertions(+), 29 deletions(-) create mode 100644 benchmarks/Benchmark_usqcd.cc diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index 82da2d5d..87bcbd73 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ 
b/Grid/algorithms/blas/BatchedBlas.h @@ -252,13 +252,16 @@ public: #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) // Need a default/reference implementation + int sda = lda*k; + int sdb = ldb*k; + int sdc = ldc*n; for (int p = 0; p < batchCount; ++p) { for (int mm = 0; mm < m; ++mm) { for (int nn = 0; nn < n; ++nn) { ComplexD c_mn(0.0); - for (int kk = 0; kk < k, ++kk) - c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; - Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + for (int kk = 0; kk < k; ++kk) + c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; + Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; } } } @@ -348,14 +351,19 @@ public: #warning "oneMKL implementation not built " #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + int sda = lda*k; + int sdb = ldb*k; + int sdc = ldc*n; + ComplexF alphaf(real(alpha),imag(alpha)); + ComplexF betaf(real(beta),imag(beta)); // Need a default/reference implementation for (int p = 0; p < batchCount; ++p) { for (int mm = 0; mm < m; ++mm) { for (int nn = 0; nn < n; ++nn) { - ComplexD c_mn(0.0); - for (int kk = 0; kk < k, ++kk) - c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; - Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + ComplexF c_mn(0.0); + for (int kk = 0; kk < k; ++kk) + c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; + Cmn[p][mm + nn*ldc] = (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ]; } } } @@ -444,14 +452,17 @@ public: #warning "oneMKL implementation not built " #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + int sda = lda*k; + int sdb = ldb*k; + int sdc = ldc*n; // Need a default/reference implementation for (int p = 0; p < batchCount; ++p) { for (int mm = 0; mm < m; ++mm) { for (int nn = 0; nn < n; ++nn) { RealD c_mn(0.0); - for (int kk = 0; kk < k, ++kk) - c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; - Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + for (int kk = 0; kk < k; ++kk) + c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; + Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; } } } @@ -558,14 +569,17 @@ public: #warning "oneMKL implementation not built " #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + int sda = lda*k; + int sdb = ldb*k; + int sdc = ldc*n; // Need a default/reference implementation for (int p = 0; p < batchCount; ++p) { for (int mm = 0; mm < m; ++mm) { for (int nn = 0; nn < n; ++nn) { RealD c_mn(0.0); - for (int kk = 0; kk < k, ++kk) - c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; - Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + for (int kk = 0; kk < k; ++kk) + c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; + Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; } } } @@ -638,43 +652,41 @@ public: for (int mm = 0; mm < m; ++mm) { for (int nn = 0; nn < n; ++nn) { ComplexD c_mn(0.0); - for (int kk = 0; kk < k, ++kk) + for (int kk = 0; kk < k; ++kk) c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; - Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + Cmn[mm + nn*ldc + p*sdc] = (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc]; } } } #endif } - void benchmark(int nbasis, int nrhs, int coarseVol, int nstencil) + double benchmark(int M, int N, int K, int BATCH) { - int32_t N_A = nbasis*nbasis*coarseVol*nstencil; - 
int32_t N_B = nbasis*nrhs*coarseVol*nstencil; // One leg of stencil at a time - int32_t N_C = nbasis*nrhs*coarseVol*nstencil; + int32_t N_A = M*K*BATCH; + int32_t N_B = K*N*BATCH; + int32_t N_C = M*N*BATCH; deviceVector A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD)); deviceVector B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD)); deviceVector C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD)); ComplexD alpha(1.0); ComplexD beta (1.0); + RealD flops = 8.0*M*N*K*BATCH; for(int i=0;i<10;i++){ RealD t0 = usecond(); - for(int s=0;s using cshiftAllocator = std::allocator; template using Vector = std::vector >; template using stencilVector = std::vector >; template using commVector = std::vector >; +template using deviceVector = std::vector >; template using cshiftVector = std::vector >; NAMESPACE_END(Grid); diff --git a/benchmarks/Benchmark_usqcd.cc b/benchmarks/Benchmark_usqcd.cc new file mode 100644 index 00000000..526e5659 --- /dev/null +++ b/benchmarks/Benchmark_usqcd.cc @@ -0,0 +1,959 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_usqcd.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace Grid; + +std::vector L_list; +std::vector Ls_list; +std::vector mflop_list; + +double mflop_ref; +double mflop_ref_err; + +int NN_global; + +FILE * FP; + +struct time_statistics{ + double mean; + double err; + double min; + double max; + + void statistics(std::vector v){ + double sum = std::accumulate(v.begin(), v.end(), 0.0); + mean = sum / v.size(); + + std::vector diff(v.size()); + std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); + double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); + err = std::sqrt(sq_sum / (v.size()*(v.size() - 1))); + + auto result = std::minmax_element(v.begin(), v.end()); + min = *result.first; + max = *result.second; +} +}; + +void comms_header(){ + std::cout <1) nmu++; + + std::vector t_time(Nloop); + time_statistics timestat; + + std::cout< xbuf(8); + std::vector rbuf(8); + //Grid.ShmBufferFreeAll(); + uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + for(int d=0;d<8;d++){ + xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + // bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + // bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + } + + // int ncomm; + double dbytes; + + for(int dir=0;dir<8;dir++) { + int mu =dir % 4; + if (mpi_layout[mu]>1 ) { + + std::vector times(Nloop); + for(int i=0;i > LatticeVec; + typedef iVector Vec; + + Coordinate simd_layout = GridDefaultSimd(Nd,vReal::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + fprintf(FP,"Memory Bandwidth\n\n"); + fprintf(FP,"Bytes, GB/s per node\n"); + std::cout<({45,12,81,9})); + for(int lat=8;lat<=lmax;lat+=8){ + + Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + + // NP= Grid.RankCount(); + NN =Grid.NodeCount(); + + Vec rn ; random(sRNG,rn); + + LatticeVec z(&Grid); z=Zero(); + LatticeVec x(&Grid); x=Zero(); + LatticeVec y(&Grid); y=Zero(); + double a=2.0; + + uint64_t Nloop=NLOOP; + + double start=usecond(); + for(int i=0;i > LatticeSU4; + + Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + std::cout<({45,12,81,9})); + for(int lat=8;lat<=lmax;lat+=8){ + + Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + + NN =Grid.NodeCount(); + + + LatticeSU4 z(&Grid); z=Zero(); + LatticeSU4 x(&Grid); x=Zero(); + LatticeSU4 y(&Grid); y=Zero(); + // double a=2.0; + + uint64_t Nloop=NLOOP; + + double start=usecond(); + for(int i=0;i mflops_all; + + /////////////////////////////////////////////////////// + // Set/Get the layout & grid size + /////////////////////////////////////////////////////// + int threads = GridThread::GetThreads(); + Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); + Coordinate local({L,L,L,L}); + Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); + + GridCartesian * TmpGrid = 
SpaceTimeGrid::makeFourDimGrid(latt4, + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); + NN_global=NN; + uint64_t SHM=NP/NN; + + + ///////// Welcome message //////////// + std::cout< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + typedef DomainWallFermionF Action; + typedef typename Action::FermionField Fermion; + typedef LatticeGaugeFieldF Gauge; + + ///////// Source preparation //////////// + Gauge Umu(UGrid); SU::HotConfiguration(RNG4,Umu); + Fermion src (FGrid); random(RNG5,src); + Fermion src_e (FrbGrid); + Fermion src_o (FrbGrid); + Fermion r_e (FrbGrid); + Fermion r_o (FrbGrid); + Fermion r_eo (FGrid); + Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + { + + pickCheckerboard(Even,src_e,src); + pickCheckerboard(Odd,src_o,src); + + const int num_cases = 1; + std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); + + controls Cases [] = { + { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent } + }; + + for(int c=0;cBarrier(); + for(int i=0;iBarrier(); + double t1=usecond(); + uint64_t ncall = 500; + + FGrid->Broadcast(0,&ncall,sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); + for(uint64_t i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops; + if ( mflops mflops_all; + + /////////////////////////////////////////////////////// + // Set/Get the layout & grid size + /////////////////////////////////////////////////////// + int threads = GridThread::GetThreads(); + Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); + Coordinate local({L,L,L,L}); + Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); + + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); + NN_global=NN; + uint64_t SHM=NP/NN; + + + ///////// Welcome message //////////// + std::cout< seeds4({1,2,3,4}); + GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + RealD mass=0.1; + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + + typedef ImprovedStaggeredFermionF Action; + typedef typename Action::FermionField Fermion; + typedef LatticeGaugeFieldF Gauge; + + Gauge Umu(FGrid); SU::HotConfiguration(RNG4,Umu); + + typename Action::ImplParams params; + Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params); + + ///////// Source preparation //////////// + Fermion src (FGrid); random(RNG4,src); + Fermion src_e (FrbGrid); + Fermion src_o (FrbGrid); + Fermion r_e (FrbGrid); + Fermion r_o (FrbGrid); + Fermion r_eo (FGrid); + + { + + pickCheckerboard(Even,src_e,src); + pickCheckerboard(Odd,src_o,src); + + const int num_cases = 1; + std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); + + controls Cases [] = { + { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + }; + + for(int c=0;cBarrier(); + for(int i=0;iBarrier(); + double t1=usecond(); + uint64_t ncall = 500; + + FGrid->Broadcast(0,&ncall,sizeof(ncall)); + + // 
std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); + for(uint64_t i=0;iBarrier(); + + double volume=1; for(int mu=0;mumflops_best ) mflops_best = mflops; + if ( mflops mflops_all; + + /////////////////////////////////////////////////////// + // Set/Get the layout & grid size + /////////////////////////////////////////////////////// + int threads = GridThread::GetThreads(); + Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); + Coordinate local({L,L,L,L}); + Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); + + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); + NN_global=NN; + uint64_t SHM=NP/NN; + + + ///////// Welcome message //////////// + std::cout< seeds4({1,2,3,4}); + GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + RealD mass=0.1; + RealD csw=1.0; + + typedef WilsonCloverFermionF Action; + typedef typename Action::FermionField Fermion; + typedef LatticeGaugeFieldF Gauge; + + Gauge Umu(FGrid); SU::HotConfiguration(RNG4,Umu); + + Action Dc(Umu,*FGrid,*FrbGrid,mass,csw,csw); + + ///////// Source preparation //////////// + Fermion src (FGrid); random(RNG4,src); + Fermion r (FGrid); + + { + + const int num_cases = 1; + std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); + + controls Cases [] = { + { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + }; + + for(int c=0;cBarrier(); + for(int i=0;iBarrier(); + double t1=usecond(); + uint64_t ncall = 500; + + FGrid->Broadcast(0,&ncall,sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); + for(uint64_t i=0;iBarrier(); + + double volume=1; for(int mu=0;mumflops_best ) mflops_best = mflops; + if ( mflops({8,2,2,2}); +#else + LebesgueOrder::Block = std::vector({2,2,2,2}); +#endif + Benchmark::Decomposition(); + + int do_su4=0; + int do_memory=1; + int do_comms =1; + int do_blas =1; + + int sel=4; + std::vector L_list({8,12,16,24,32}); + int selm1=sel-1; + + std::vector clover; + std::vector dwf4; + std::vector staggered; + + int Ls=1; + std::cout< Date: Thu, 29 Feb 2024 20:09:11 -0500 Subject: [PATCH 096/114] Only one rank opens --- benchmarks/Benchmark_usqcd.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/Benchmark_usqcd.cc b/benchmarks/Benchmark_usqcd.cc index 526e5659..7b1fddd4 100644 --- a/benchmarks/Benchmark_usqcd.cc +++ b/benchmarks/Benchmark_usqcd.cc @@ -842,10 +842,14 @@ public: int main (int argc, char ** argv) { - - FP = fopen("Benchmark_usqcd.csv","w"); Grid_init(&argc,&argv); + if (GlobalSharedMemory::WorldRank==0) { + FP = fopen("Benchmark_usqcd.csv","w"); + } else { + FP = fopen("/dev/null","w"); + } + CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); #ifdef KNL LebesgueOrder::Block = std::vector({8,2,2,2}); From c805f86343ce7714f9281e704a16cec3cec08401 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 1 Mar 2024 00:05:04 -0500 Subject: [PATCH 097/114] USQCD benchmark --- Grid/algorithms/blas/BatchedBlas.cc | 34 ++++++++++++++++++++ Grid/algorithms/blas/BatchedBlas.h | 37 +++++++++++---------- benchmarks/Benchmark_usqcd.cc | 50 +++++++++++++---------------- systems/Frontier/config-command | 2 
+- 4 files changed, 78 insertions(+), 45 deletions(-) create mode 100644 Grid/algorithms/blas/BatchedBlas.cc diff --git a/Grid/algorithms/blas/BatchedBlas.cc b/Grid/algorithms/blas/BatchedBlas.cc new file mode 100644 index 00000000..e79ab1a9 --- /dev/null +++ b/Grid/algorithms/blas/BatchedBlas.cc @@ -0,0 +1,34 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: BatchedBlas.h + + Copyright (C) 2023 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +NAMESPACE_BEGIN(Grid); +gridblasHandle_t GridBLAS::gridblasHandle; +int GridBLAS::gridblasInit; +NAMESPACE_END(Grid); + diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index 87bcbd73..2924350d 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -615,9 +615,10 @@ public: deviceVector beta_p(1); acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD)); acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD)); - std::cout << "blasZgemmStridedBatched mnk "< Date: Tue, 5 Mar 2024 13:38:32 -0500 Subject: [PATCH 098/114] Update setup.sh --- systems/PVC-OEM/setup.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/systems/PVC-OEM/setup.sh b/systems/PVC-OEM/setup.sh index 3b8188f0..0e780ef4 100644 --- a/systems/PVC-OEM/setup.sh +++ b/systems/PVC-OEM/setup.sh @@ -1,3 +1,5 @@ export https_proxy=http://proxy-chain.intel.com:911 module load intel-release module load intel/mpich +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" From 2ae980ae439b92285341ec5777c9c4b5ec293547 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 5 Mar 2024 13:39:18 -0500 Subject: [PATCH 099/114] Update sourceme.sh --- systems/Aurora/sourceme.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh index 7a2b3815..60abed41 100644 --- a/systems/Aurora/sourceme.sh +++ b/systems/Aurora/sourceme.sh @@ -9,4 +9,5 @@ export http_proxy=http://proxy.alcf.anl.gov:3128 export https_proxy=http://proxy.alcf.anl.gov:3128 #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 git config --global http.proxy http://proxy.alcf.anl.gov:3128 - + +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" From 30228214f737c002ca2c5636b73bc23af59a88ae Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 5 Mar 2024 23:56:10 +0000 Subject: [PATCH 100/114] SYCL conflict with Eigen --- Grid/Grid_Eigen_Dense.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
index bdd39a65..8bd1d113 100644
--- a/Grid/Grid_Eigen_Dense.h
+++ b/Grid/Grid_Eigen_Dense.h
@@ -34,7 +34,7 @@
 #pragma push_macro("__SYCL_DEVICE_ONLY__")
 #undef __SYCL_DEVICE_ONLY__
 #define EIGEN_DONT_VECTORIZE
-//#undef EIGEN_USE_SYCL
+#undef EIGEN_USE_SYCL
 #define __SYCL__REDEFINE__
 #endif

From 21bc8c24df800a480c3d36c5a135f9512658631f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 5 Mar 2024 23:58:20 +0000
Subject: [PATCH 101/114] OneMKL batched blas starting

---
 Grid/algorithms/blas/BatchedBlas.h | 40 ++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index 2924350d..5ab46333 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -34,9 +34,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include
 #endif
 #ifdef GRID_SYCL
-#error // need oneMKL version
+#include <oneapi/mkl.hpp>
+#endif
+#if 0
+#define GRID_ONE_MKL
+#endif
+#ifdef GRID_ONE_MKL
+#include <oneapi/mkl.hpp>
 #endif
-
 ///////////////////////////////////////////////////////////////////////
 // Need to rearrange lattice data to be in the right format for a
 // batched multiply. Might as well make these static, dense packed
@@ -49,9 +54,12 @@ NAMESPACE_BEGIN(Grid);
   typedef cudablasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_SYCL
-  typedef int32_t gridblasHandle_t;
+  typedef cl::sycl::queue *gridblasHandle_t;
 #endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+#ifdef GRID_ONE_MKL
+  typedef cl::sycl::queue *gridblasHandle_t;
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
   typedef int32_t gridblasHandle_t;
 #endif
@@ -76,6 +84,12 @@ public:
       hipblasCreate(&gridblasHandle);
 #endif
 #ifdef GRID_SYCL
+      gridblasHandle = theGridAccelerator;
+#endif
+#ifdef GRID_ONE_MKL
+      cl::sycl::cpu_selector selector;
+      cl::sycl::device selectedDevice { selector };
+      gridblasHandle =new sycl::queue (selectedDevice);
 #endif
       gridblasInit=1;
     }
@@ -110,6 +124,9 @@ public:
 #endif
 #ifdef GRID_SYCL
     accelerator_barrier();
+#endif
+#ifdef GRID_ONE_MKL
+    gridblasHandle->wait();
 #endif
   }
@@ -644,10 +661,19 @@ public:
				     (cuDoubleComplex *) Cmn, ldc, sdc,
				     batchCount);
 #endif
-#ifdef GRID_SYCL
-  #warning "oneMKL implementation not made "
+#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
+    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						oneapi::mkl::transpose::N,
+						oneapi::mkl::transpose::N,
+						m,n,k,
+						alpha,
+						(const ComplexD *)Amk,lda,sda,
+						(const ComplexD *)Bkn,ldb,sdb,
+						beta,
+						(ComplexD *)Cmn,ldc,sdc,
+						batchCount);
 #endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
     // Need a default/reference implementation
     for (int p = 0; p < batchCount; ++p) {
      for (int mm = 0; mm < m; ++mm) {
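
The reference branch above is truncated in this archive. As a hedged sketch of what a strided batched ZGEMM computes (illustrative code, not the library's verbatim implementation), each batch entry p forms C_p = alpha*A_p*B_p + beta*C_p, with column-major leading dimensions lda/ldb/ldc and batch strides sda/sdb/sdc:

    #include <complex>
    // Reference semantics of a strided batched ZGEMM (illustrative only):
    // each of batchCount entries multiplies an m x k by a k x n matrix.
    void zgemm_batched_ref(int m, int n, int k, std::complex<double> alpha,
                           const std::complex<double> *Amk, int lda, int sda,
                           const std::complex<double> *Bkn, int ldb, int sdb,
                           std::complex<double> beta,
                           std::complex<double> *Cmn, int ldc, int sdc,
                           int batchCount)
    {
      for (int p = 0; p < batchCount; ++p)
        for (int mm = 0; mm < m; ++mm)
          for (int nn = 0; nn < n; ++nn) {
            std::complex<double> c_mn(0.0);
            for (int kk = 0; kk < k; ++kk)
              c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
            Cmn[mm + nn*ldc + p*sdc] = alpha*c_mn + beta*Cmn[mm + nn*ldc + p*sdc];
          }
    }
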
From f8ca971daedc087fbb1f554705d27d6af44eef6f Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 5 Mar 2024 23:59:13 +0000
Subject: [PATCH 102/114] Use of a bare PRECISION macro is not namespace safe
 and collides with SYCL

---
 Grid/algorithms/approx/Zolotarev.cc | 96 ++++++++++++++---------------
 Grid/algorithms/approx/Zolotarev.h  | 11 ++--
 2 files changed, 54 insertions(+), 53 deletions(-)

diff --git a/Grid/algorithms/approx/Zolotarev.cc b/Grid/algorithms/approx/Zolotarev.cc
index c2efd41c..47779eae 100644
--- a/Grid/algorithms/approx/Zolotarev.cc
+++ b/Grid/algorithms/approx/Zolotarev.cc
@@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
 * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and * type = 1 for the approximation which is infinite at x = 0. */ -zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) { +zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) { INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F, l, invlambda, xi, xisq, *tv, s, opl; int m, czero, ts; @@ -375,12 +375,12 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) { construct_partfrac(d); construct_contfrac(d); - /* Converting everything to PRECISION for external use only */ + /* Converting everything to ZOLO_PRECISION for external use only */ zd = (zolotarev_data*) malloc(sizeof(zolotarev_data)); - zd -> A = (PRECISION) d -> A; - zd -> Delta = (PRECISION) d -> Delta; - zd -> epsilon = (PRECISION) d -> epsilon; + zd -> A = (ZOLO_PRECISION) d -> A; + zd -> Delta = (ZOLO_PRECISION) d -> Delta; + zd -> epsilon = (ZOLO_PRECISION) d -> epsilon; zd -> n = d -> n; zd -> type = d -> type; zd -> dn = d -> dn; @@ -390,24 +390,24 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) { zd -> deg_num = d -> deg_num; zd -> deg_denom = d -> deg_denom; - zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION)); - for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m]; + zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m]; free(d -> a); - zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION)); - for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m]; + zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m]; free(d -> ap); - zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION)); - for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m]; + zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m]; free(d -> alpha); - zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION)); - for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m]; + zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m]; free(d -> beta); - zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION)); - for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m]; + zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m]; free(d -> gamma); free(d); @@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata) } -zolotarev_data* higham(PRECISION epsilon, int n) { +zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) { INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq; int m, czero; zolotarev_data *zd; @@ -481,9 +481,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) { /* Converting everything to PRECISION for external use only */ zd = (zolotarev_data*) malloc(sizeof(zolotarev_data)); - zd -> A = (PRECISION) d -> A; - zd -> Delta = (PRECISION) d -> Delta; - zd -> epsilon = (PRECISION) d -> epsilon; + zd -> A = (ZOLO_PRECISION) d -> A; + zd -> Delta = (ZOLO_PRECISION) d -> Delta; + zd -> epsilon = (ZOLO_PRECISION) d -> epsilon; zd -> n = d -> n; zd -> type = d -> type; zd -> dn = d -> @@ 
-493,24 +493,24 @@ zolotarev_data* higham(PRECISION epsilon, int n) { zd -> deg_num = d -> deg_num; zd -> deg_denom = d -> deg_denom; - zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION)); - for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m]; + zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m]; free(d -> a); - zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION)); - for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m]; + zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m]; free(d -> ap); - zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION)); - for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m]; + zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m]; free(d -> alpha); - zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION)); - for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m]; + zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m]; free(d -> beta); - zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION)); - for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m]; + zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m]; free(d -> gamma); free(d); @@ -523,17 +523,17 @@ NAMESPACE_END(Grid); #ifdef TEST #undef ZERO -#define ZERO ((PRECISION) 0) +#define ZERO ((ZOLO_PRECISION) 0) #undef ONE -#define ONE ((PRECISION) 1) +#define ONE ((ZOLO_PRECISION) 1) #undef TWO -#define TWO ((PRECISION) 2) +#define TWO ((ZOLO_PRECISION) 2) /* Evaluate the rational approximation R(x) using the factored form */ -static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) { +static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { int m; - PRECISION R; + ZOLO_PRECISION R; if (rdata -> type == 0) { R = rdata -> A * x; @@ -551,9 +551,9 @@ static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) { /* Evaluate the rational approximation R(x) using the partial fraction form */ -static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) { +static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { int m; - PRECISION R = rdata -> alpha[rdata -> da - 1]; + ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1]; for (m = 0; m < rdata -> dd; m++) R += rdata -> alpha[m] / (x * x - rdata -> ap[m]); if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x); @@ -568,18 +568,18 @@ static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) { * non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0, * but with signalling overflow you will get an error message. 
*/ -static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) { +static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { int m; - PRECISION R = rdata -> beta[0] * x; + ZOLO_PRECISION R = rdata -> beta[0] * x; for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R; return R; } /* Evaluate the rational approximation R(x) using Cayley form */ -static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) { +static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { int m; - PRECISION T; + ZOLO_PRECISION T; T = rdata -> type == 0 ? ONE : -ONE; for (m = 0; m < rdata -> n; m++) @@ -607,7 +607,7 @@ int main(int argc, char** argv) { int m, n, plotpts = 5000, type = 0; float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr; zolotarev_data *rdata; - PRECISION y; + ZOLO_PRECISION y; FILE *plot_function, *plot_error, *plot_partfrac, *plot_contfrac, *plot_cayley; @@ -626,13 +626,13 @@ int main(int argc, char** argv) { } rdata = type == 2 - ? higham((PRECISION) eps, n) - : zolotarev((PRECISION) eps, n, type); + ? higham((ZOLO_PRECISION) eps, n) + : zolotarev((ZOLO_PRECISION) eps, n, type); printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t" STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION) "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION) - "\tPRECISION = " STRINGIFY(PRECISION) + "\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION) "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n" "\tDelta = %g (maximum error)\n\n" "\tA = %g (overall factor)\n", @@ -681,15 +681,15 @@ int main(int argc, char** argv) { x = 2.4 * (float) m / plotpts - 1.2; if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) { /* skip x = 0 for type 1, as R(0) is singular */ - y = zolotarev_eval((PRECISION) x, rdata); + y = zolotarev_eval((ZOLO_PRECISION) x, rdata); fprintf(plot_function, "%g %g\n", x, (float) y); fprintf(plot_error, "%g %g\n", x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta)); - ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y) + ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y) / rdata -> Delta); - ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y) + ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y) / rdata -> Delta); - ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y) + ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y) / rdata -> Delta); if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) { maxypferr = MAX(maxypferr, fabs(ypferr)); diff --git a/Grid/algorithms/approx/Zolotarev.h b/Grid/algorithms/approx/Zolotarev.h index 800cf3c7..3c983cd3 100644 --- a/Grid/algorithms/approx/Zolotarev.h +++ b/Grid/algorithms/approx/Zolotarev.h @@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx); #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY> #ifndef ZOLOTAREV_INTERNAL -#ifndef PRECISION -#define PRECISION double +#ifndef ZOLO_PRECISION +#define ZOLO_PRECISION double #endif -#define ZPRECISION PRECISION +#define ZPRECISION ZOLO_PRECISION #define ZOLOTAREV_DATA zolotarev_data #endif @@ -77,8 +77,8 @@ typedef struct { * zolotarev_data structure. The arguments must satisfy the constraints that * epsilon > 0, n > 0, and type = 0 or 1. 
 */
-ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
-ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
+ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ;
+ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type);
 void zolotarev_free(zolotarev_data *zdata);
 #endif
@@ -86,3 +86,4 @@ void zolotarev_free(zolotarev_data *zdata);
 NAMESPACE_END(Approx);
 NAMESPACE_END(Grid);
 #endif
+

From 976c3e9b59a10139611886b1cf97e40e3470cbfd Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Tue, 5 Mar 2024 23:59:57 +0000
Subject: [PATCH 103/114] Hack for flight logging CG inner products. Can be
 made to work, but could put in some more serious infrastructure for repro
 testing and blame attribution (Britney test) if necessary

---
 Grid/lattice/Lattice.h           | 2 +-
 Grid/lattice/Lattice_crc.h       | 4 ++--
 Grid/lattice/Lattice_reduction.h | 1 +
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h
index 6343db99..79572949 100644
--- a/Grid/lattice/Lattice.h
+++ b/Grid/lattice/Lattice.h
@@ -35,6 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include
 #include
 #include
+#include <Grid/lattice/Lattice_crc.h>
 #include
 #include
 #include
@@ -46,5 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include
 #include
 #include
-#include <Grid/lattice/Lattice_crc.h>
 #include

diff --git a/Grid/lattice/Lattice_crc.h b/Grid/lattice/Lattice_crc.h
index 142e2349..e31d8441 100644
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@@ -42,13 +42,13 @@ template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
   }
 }
 
-template<class vobj> uint32_t crc(Lattice<vobj> & buf)
+template<class vobj> uint32_t crc(const Lattice<vobj> & buf)
 {
   autoView( buf_v , buf, CpuRead);
   return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
 }
 
-#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
+#define CRC(U) std::cerr << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
@@ template<class vobj> inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
   GridBase *grid = left.Grid();
   ComplexD nrm = rankInnerProduct(left,right);
+  //  std::cerr<<"flight log " << std::hexfloat << nrm <<" "<<std::endl;
   grid->GlobalSum(nrm);
   return nrm;
 }

From 783a66b3485169888000cdb735c34617126f47f3 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 6 Mar 2024 00:01:37 +0000
Subject: [PATCH 104/114] Deterministic reduction please

---
 Grid/threads/Accelerator.h | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index f6efdee9..392cba61 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -255,17 +255,13 @@ inline int acceleratorIsCommunicable(void *ptr)
 #define GRID_SYCL_LEVEL_ZERO_IPC
 NAMESPACE_END(Grid);
-#if 0
-#include
-#include
-#include
-#include
-#else
+
+// Force deterministic reductions
+#define SYCL_REDUCTION_DETERMINISTIC
 #include
 #include
 #include
 #include
-#endif
 
 NAMESPACE_BEGIN(Grid);

From 1b93a9be88cad5d456a84fafdda4104d4213ff82 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 6 Mar 2024 00:01:58 +0000
Subject: [PATCH 105/114] Print out the hostname

---
 Grid/util/Init.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index d013763a..9a0b4376 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -393,6 +393,9 @@ void Grid_init(int *argc,char ***argv)
   std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
+  char hostname[HOST_NAME_MAX+1];
+  gethostname(hostname, HOST_NAME_MAX+1);
+  std::cout << GridLogMessage << "Hostname: "<< hostname << std::endl;
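
Patches 103-105 together enable "flight logging": every global inner product can be printed in hex-exact form, SYCL reductions are forced deterministic, and each rank reports its hostname, so two runs can be diffed and a misbehaving node blamed. A hedged sketch of the logging step (names and format illustrative, not Grid's exact code):

    #include <iostream>
    #include <complex>
    #include <string>
    // Print each global inner product bit-exactly (hexfloat), tagged by
    // host, so two runs can be diffed line by line for blame attribution.
    inline void flight_log(const std::complex<double> &nrm, const std::string &host) {
      std::cerr << "flight log " << std::hexfloat
                << nrm.real() << " " << nrm.imag()
                << " " << host << std::endl;
    }
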

Date: Wed, 6 Mar 2024 00:02:27 +0000
Subject: [PATCH 106/114] SPR HBM benchmarking right and also PVC batched GEMM

---
 benchmarks/Benchmark_usqcd.cc | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/benchmarks/Benchmark_usqcd.cc b/benchmarks/Benchmark_usqcd.cc
index eaa78e40..3b729b9e 100644
--- a/benchmarks/Benchmark_usqcd.cc
+++ b/benchmarks/Benchmark_usqcd.cc
@@ -219,7 +219,7 @@ public:
     uint64_t NN;
 
-    uint64_t lmax=32;
+    uint64_t lmax=40;
 #define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
 
     GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
@@ -454,11 +454,17 @@ public:
     pickCheckerboard(Even,src_e,src);
     pickCheckerboard(Odd,src_o,src);
 
-    const int num_cases = 1;
+#ifdef AVX512
+    const int num_cases = 3;
+#else
+    const int num_cases = 2;
+#endif
     std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
 
     controls Cases [] = {
-      { WilsonKernelsStatic::OptGeneric   , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }
+      { WilsonKernelsStatic::OptGeneric   , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+      { WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+      { WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }
     };
 
     for(int c=0;c<num_cases;c++) {
@@
-#ifdef KNL
-  LebesgueOrder::Block = std::vector<int>({8,2,2,2});
-#else
   LebesgueOrder::Block = std::vector<int>({2,2,2,2});
-#endif
+
   Benchmark::Decomposition();
 
   int do_su4=0;
@@ -910,7 +919,7 @@ int main (int argc, char ** argv)
   }
 
   if ( do_blas ) {
-#if defined(GRID_CUDA) || defined(GRID_HIP)
+#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
     std::cout<<
" << std::endl; - - { GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); @@ -92,7 +81,14 @@ int main (int argc, char ** argv) SchurDiagMooeeOperator HermOpEO(Ddwf); SchurDiagMooeeOperator HermOpEO_f(Ddwf_f); - std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl; + int nsecs=600; + if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ + std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds"); + GridCmdOptionInt(arg,nsecs); + } + + std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "< mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO); double t1,t2,flops; double MdagMsiteflops = 1452; // Mobius (real coeffs) @@ -101,7 +97,14 @@ int main (int argc, char ** argv) std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<gSites()*iters; std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< CG(1.0e-8,10000); - for(int i=0;i<1;i++){ + csumref=0; + int i=0; + do { + std::cerr << "******************* DOUBLE PRECISION SOLVE "<gSites()*iters; flops+= CGsiteflops*FrbGrid->gSites()*iters; - + std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< munge; - std::string format = getFormatString(); - - BinaryIO::writeLatticeObject(result_o,file1,munge, 0, format, - nersc_csum,scidac_csuma,scidac_csumb); - - std::cout << GridLogMessage << " Mixed checksums "<(result_o_2,file1,munge, 0, format, - nersc_csum,scidac_csuma,scidac_csumb); - - std::cout << GridLogMessage << " CG checksums "< Date: Wed, 6 Mar 2024 00:03:59 +0000 Subject: [PATCH 108/114] More blasted shell variables --- systems/Aurora/benchmarks/bench1024.pbs | 10 +++++++--- systems/Aurora/benchmarks/bench12.pbs | 17 ++++++++++++++++- systems/Aurora/config-command | 4 ++-- systems/Aurora/sourceme.sh | 13 +++++++++++++ 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/systems/Aurora/benchmarks/bench1024.pbs b/systems/Aurora/benchmarks/bench1024.pbs index 88f0100a..2e99ae4b 100644 --- a/systems/Aurora/benchmarks/bench1024.pbs +++ b/systems/Aurora/benchmarks/bench1024.pbs @@ -25,12 +25,16 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 -export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 export MPICH_OFI_NIC_POLICY=GPU +export FI_CXI_CQ_FILL_PERCENT=10 +export FI_CXI_DEFAULT_CQ_SIZE=262144 +#export FI_CXI_DEFAULT_CQ_SIZE=131072 +#export FI_CXI_CQ_FILL_PERCENT=20 # 12 ppn, 32 nodes, 384 ranks # @@ -45,12 +49,12 @@ CMD="mpiexec -np 12288 -ppn 12 -envall \ ./gpu_tile_compact.sh \ ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \ --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -$CMD | tee 1024node.dwf.small +$CMD | tee 1024node.dwf.small.cq CMD="mpiexec -np 12288 -ppn 12 -envall \ ./gpu_tile_compact.sh \ ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \ --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -$CMD | tee 1024node.dwf +$CMD | tee 

Date: Wed, 6 Mar 2024 00:03:59 +0000
Subject: [PATCH 108/114] More blasted shell variables

---
 systems/Aurora/benchmarks/bench1024.pbs | 10 +++++++---
 systems/Aurora/benchmarks/bench12.pbs   | 17 ++++++++++++++++-
 systems/Aurora/config-command           |  4 ++--
 systems/Aurora/sourceme.sh              | 13 +++++++++++++
 4 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/systems/Aurora/benchmarks/bench1024.pbs b/systems/Aurora/benchmarks/bench1024.pbs
index 88f0100a..2e99ae4b 100644
--- a/systems/Aurora/benchmarks/bench1024.pbs
+++ b/systems/Aurora/benchmarks/bench1024.pbs
@@ -25,12 +25,16 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
-export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
+export FI_CXI_CQ_FILL_PERCENT=10
+export FI_CXI_DEFAULT_CQ_SIZE=262144
+#export FI_CXI_DEFAULT_CQ_SIZE=131072
+#export FI_CXI_CQ_FILL_PERCENT=20
 
 # 12 ppn, 32 nodes, 384 ranks
 #
@@ -45,12 +49,12 @@ CMD="mpiexec -np 12288 -ppn 12  -envall \
 	./gpu_tile_compact.sh \
 	./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \
 	--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-$CMD | tee 1024node.dwf.small
+$CMD | tee 1024node.dwf.small.cq
 
 CMD="mpiexec -np 12288 -ppn 12  -envall \
 	./gpu_tile_compact.sh \
 	./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \
 	--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-$CMD | tee 1024node.dwf
+$CMD | tee 1024node.dwf.cq

diff --git a/systems/Aurora/benchmarks/bench12.pbs b/systems/Aurora/benchmarks/bench12.pbs
index 96f6143f..ee3cb381 100644
--- a/systems/Aurora/benchmarks/bench12.pbs
+++ b/systems/Aurora/benchmarks/bench12.pbs
@@ -17,6 +17,7 @@ source ../sourceme.sh
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
@@ -35,11 +36,25 @@ CMD="mpiexec -np 24 -ppn 12 -envall \
 	./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
 	--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 
-$CMD
+#$CMD
 
 CMD="mpiexec -np 24 -ppn 12 -envall \
 	./gpu_tile_compact.sh \
 	./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
 	--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+#$CMD
+
+CMD="mpiexec -np 1 -ppn 1 -envall \
+	./gpu_tile_compact.sh \
+	./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
+	--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+$CMD
+
+CMD="mpiexec -np 1 -ppn 1 -envall \
+	./gpu_tile_compact.sh \
+	./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
+	--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 
 $CMD

diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command
index e59ef515..689747c9 100644
--- a/systems/Aurora/config-command
+++ b/systems/Aurora/config-command
@@ -11,6 +11,6 @@ TOOLS=$HOME/tools
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
-	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
-	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \
+	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel"

diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh
index 7a2b3815..effb2d5d 100644
--- a/systems/Aurora/sourceme.sh
+++ b/systems/Aurora/sourceme.sh
@@ -3,6 +3,19 @@
 module use /soft/modulefiles
 module load intel_compute_runtime/release/agama-devel-682.22
 
+export FI_CXI_DEFAULT_CQ_SIZE=131072
+export FI_CXI_CQ_FILL_PERCENT=20
+
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
+#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
+
+#
+# -ftarget-register-alloc-mode=pvc:default
+# -ftarget-register-alloc-mode=pvc:small
+# -ftarget-register-alloc-mode=pvc:large
+# -ftarget-register-alloc-mode=pvc:auto
+#
+
 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 export http_proxy=http://proxy.alcf.anl.gov:3128

From a46a0f088276de56a68d5020e9eee6875647c688 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 6 Mar 2024 01:12:49 +0000
Subject: [PATCH 109/114] force device copyable and don't take crap from SYCL

---
 Grid/simd/Grid_vector_types.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h
index daf41cae..0a3d176f 100644
--- a/Grid/simd/Grid_vector_types.h
+++ b/Grid/simd/Grid_vector_types.h
@@ -1133,4 +1133,13 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths incorrect");
 
 NAMESPACE_END(Grid);
 
+#ifdef GRID_SYCL
+template<> struct sycl::is_device_copyable<Grid::vComplexF> : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vComplexD> : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vRealF>    : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vRealD>    : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vInteger>  : public std::true_type {};
+#endif
+
+
 #endif

From 10116b3be8730507876336b92490a82d39000f50 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 6 Mar 2024 01:13:27 +0000
Subject: [PATCH 110/114] Force device copyable and tell SYCL to shut it.

---
 Grid/tensors/Tensor_traits.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h
index 98bc3986..536e17f1 100644
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -404,3 +404,12 @@ NAMESPACE_BEGIN(Grid);
   };
 
 NAMESPACE_END(Grid);
+
+#ifdef GRID_SYCL
+template<class T> struct
+sycl::is_device_copyable<T, typename std::enable_if<
+			       Grid::isGridTensor<T>::value && (!std::is_trivially_copyable<T>::value),
+			       void>::type>
+  : public std::true_type {};
+#endif
+
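
A hedged illustration of what these specializations buy: SYCL kernel lambdas may only capture device-copyable types, and Grid's vector and tensor wrappers are not trivially copyable, so the traits above assert the property explicitly. An illustrative compile-time probe (not part of the patch; assumes the SYCL and Grid headers are in scope):

    #ifdef GRID_SYCL
    // If a captured type were not device copyable, SYCL would reject the
    // kernel at compile time; these asserts surface that failure early.
    static_assert(sycl::is_device_copyable<Grid::vComplexF>::value,
                  "vComplexF must be capturable by SYCL kernels");
    static_assert(sycl::is_device_copyable<Grid::vComplexD>::value,
                  "vComplexD must be capturable by SYCL kernels");
    #endif
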
From 891a366f73a5dfe9a6822c8d884c9b22c82de971 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 6 Mar 2024 01:22:55 +0000
Subject: [PATCH 111/114] Repro CG script

---
 systems/Aurora/tests/repro16.pbs | 40 ++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 systems/Aurora/tests/repro16.pbs

diff --git a/systems/Aurora/tests/repro16.pbs b/systems/Aurora/tests/repro16.pbs
new file mode 100644
index 00000000..28030a3d
--- /dev/null
+++ b/systems/Aurora/tests/repro16.pbs
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=16
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 16 nodes, 192 ranks
+CMD="mpiexec -np 192 -ppn 12 -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
+	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
+$CMD

From b812a7b4c67e712ec1524fa1958c825057d1e27a Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 6 Mar 2024 01:32:40 +0000
Subject: [PATCH 112/114] Staggered launch script

---
 systems/Aurora/tests/solver/stag16.pbs | 40 ++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 systems/Aurora/tests/solver/stag16.pbs

diff --git a/systems/Aurora/tests/solver/stag16.pbs b/systems/Aurora/tests/solver/stag16.pbs
new file mode 100644
index 00000000..5bfe04a6
--- /dev/null
+++ b/systems/Aurora/tests/solver/stag16.pbs
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=16
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 16 nodes, 192 ranks
+CMD="mpiexec -np 192 -ppn 12 -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \
+	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
+$CMD

From 228bbb9d81a45cd08a3c49cfbe4f3a911a15ac5e Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 6 Mar 2024 19:03:35 +0100
Subject: [PATCH 113/114] Benchmark results

---
 systems/Booster/benchmarks/Benchmark_usqcd.csv | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 systems/Booster/benchmarks/Benchmark_usqcd.csv

diff --git a/systems/Booster/benchmarks/Benchmark_usqcd.csv b/systems/Booster/benchmarks/Benchmark_usqcd.csv
new file mode 100644
index 00000000..68689deb
--- /dev/null
+++ b/systems/Booster/benchmarks/Benchmark_usqcd.csv
@@ -0,0 +1,70 @@
+Memory Bandwidth
+
+Bytes, GB/s per node
+3145728, 225.900365
+50331648, 2858.859504
+254803968, 4145.556367
+805306368, 4905.772480
+1966080000, 4978.312557
+
+
+GEMM
+
+ M, N, K, BATCH, GF/s per rank
+16, 8, 16, 256, 1.713639
+16, 16, 16, 256, 288.268316
+16, 32, 16, 256, 597.053950
+32, 8, 32, 256, 557.382591
+32, 16, 32, 256, 1100.145311
+32, 32, 32, 256, 1885.080449
+64, 8, 64, 256, 1725.163599
+64, 16, 64, 256, 3389.336566
+64, 32, 64, 256, 4168.252422
+16, 8, 256, 256, 1326.262134
+16, 16, 256, 256, 2318.095475
+16, 32, 256, 256, 3555.436503
+32, 8, 256, 256, 1920.139170
+32, 16, 256, 256, 3486.174753
+32, 32, 256, 256, 5320.821724
+64, 8, 256, 256, 2539.597502
+64, 16, 256, 256, 5003.456775
+64, 32, 256, 256, 7837.531562
+8, 256, 16, 256, 1427.848170
+16, 256, 16, 256, 2222.147815
+32, 256, 16, 256, 2877.121715
+8, 256, 32, 256, 1922.890086
+16, 256, 32, 256, 3199.469082
+32, 256, 32, 256, 4845.405343
+8, 256, 64, 256, 2639.483343
+16, 256, 64, 256, 5012.800299
+32, 256, 64, 256, 7216.006882
+
+
+
+Communications
+
+Packet bytes, direction, GB/s per node
+4718592, 2, 206.570734
+4718592, 3, 207.501847
+4718592, 6, 189.730277
+4718592, 7, 204.301218
+15925248, 2, 307.882997
+15925248, 3, 287.901076
+15925248, 6, 295.603109
+15925248, 7, 300.682033
+37748736, 2, 331.740364
+37748736, 3, 338.610627
+37748736, 6, 332.580657
+37748736, 7, 336.336579
+
+
+Per node summary table
+
+L , Wilson, DWF4, Staggered, GF/s per node
+
+8 , 16, 1165, 10
+12 , 473, 4901, 163
+16 , 1436, 8464, 442
+24 , 4133, 10139, 1530
+32 , 5726, 11487, 2518
+

From 7e5bd46dd3033aab62599c4cde1d1fc6bb7af8e7 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Wed, 6 Mar 2024 19:03:45 +0100
Subject: [PATCH 114/114] Booster update

---
 Grid/algorithms/blas/BatchedBlas.h                  |  5 +++--
 .../implementation/StaggeredKernelsImplementation.h | 12 +-----------
 systems/Booster/config-command                      |  6 ++++--
 systems/Booster/sourceme.sh                         | 10 +++++-----
 4 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h
index 5ab46333..f6418b7e 100644
--- a/Grid/algorithms/blas/BatchedBlas.h
+++ b/Grid/algorithms/blas/BatchedBlas.h
@@ -31,7 +31,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include
 #endif
 #ifdef GRID_CUDA
-#include
+#include <cublas_v2.h>
 #endif
 #ifdef GRID_SYCL
 #include <oneapi/mkl.hpp>
 #endif
@@ -51,7 +51,7 @@ NAMESPACE_BEGIN(Grid);
   typedef hipblasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_CUDA
-  typedef cudablasHandle_t gridblasHandle_t;
+  typedef cublasHandle_t gridblasHandle_t;
 #endif
@@ -78,6 +84,7 @@ public:
 #ifdef GRID_CUDA
+      std::cout << "cublasCreate"<<std::endl;

diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h
@@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
   if( interior && exterior ) {
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;}
+#ifndef GRID_CUDA
     if (Opt == OptInlineAsm  ) {  ASM_CALL(DhopSiteAsm); return;}
 #endif
   } else if( interior ) {
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;}
-#endif
   } else if( exterior ) {
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;}
-#endif
   }
   assert(0 && " Kernel optimisation case not covered ");
 }
@@ -322,19 +318,13 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
 
   if( interior && exterior ) {
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;}
-#endif
   } else if( interior ) {
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;}
-#endif
   } else if( exterior ) {
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;}
-#endif
   }
 }

diff --git a/systems/Booster/config-command b/systems/Booster/config-command
index 8530c5f9..1ba2dc7a 100644
--- a/systems/Booster/config-command
+++ b/systems/Booster/config-command
@@ -5,10 +5,12 @@ LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/
 	--enable-gen-simd-width=64 \
 	--enable-shm=nvlink \
 	--enable-accelerator=cuda \
+	--disable-gparity \
+	--disable-fermion-reps \
 	--with-lime=$LIME \
-	--disable-accelerator-cshift \
+	--enable-accelerator-cshift \
 	--disable-unified \
 	CXX=nvcc \
 	LDFLAGS="-cudart shared " \
-	CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
+	CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared -lcublas"

diff --git a/systems/Booster/sourceme.sh b/systems/Booster/sourceme.sh
index 56499be4..2341267f 100644
--- a/systems/Booster/sourceme.sh
+++ b/systems/Booster/sourceme.sh
@@ -1,5 +1,5 @@
-module load GCC/9.3.0
-module load GMP/6.2.0
-module load MPFR/4.1.0
-module load OpenMPI/4.1.0rc1
-module load CUDA/11.3
+module load GCC
+module load GMP
+module load MPFR
+module load OpenMPI
+module load CUDA
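
The cudablas-to-cublas renames above assume the standard cuBLAS handle lifecycle; a minimal hedged sketch of that pairing (error handling elided, function names illustrative):

    #include <cublas_v2.h>
    // One handle per process: create it before any cublasZgemmStridedBatched
    // call, reuse it for every batched GEMM, destroy it at shutdown.
    static cublasHandle_t gridblas_handle;
    void gridblas_init()     { cublasCreate(&gridblas_handle); }
    void gridblas_shutdown() { cublasDestroy(gridblas_handle); }
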