diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h index b7afc43a..8bfe6c1a 100644 --- a/Grid/qcd/action/fermion/WilsonFermion.h +++ b/Grid/qcd/action/fermion/WilsonFermion.h @@ -141,7 +141,7 @@ class WilsonFermion : public WilsonKernels, public WilsonFermionStatic { GridRedBlackCartesian &Hgrid, RealD _mass, const ImplParams &p = ImplParams(), const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() ); - + // DoubleStore impl dependent void ImportGauge(const GaugeField &_Umu); diff --git a/documentation/_build/latex/Grid.pdf b/documentation/Grid.pdf similarity index 50% rename from documentation/_build/latex/Grid.pdf rename to documentation/Grid.pdf index 921e48ec..8b9f2be1 100644 Binary files a/documentation/_build/latex/Grid.pdf and b/documentation/Grid.pdf differ diff --git a/documentation/conf.py b/documentation/conf.py index 3915e259..dc055c4f 100644 --- a/documentation/conf.py +++ b/documentation/conf.py @@ -80,7 +80,8 @@ primary_domain = 'cpp' # a list of builtin themes. # html_theme = 'alabaster' - +html_use_smartypants = False +smart_quotes = False # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. diff --git a/documentation/interfacing.rst b/documentation/interfacing.rst index 3fd0c8a3..79e44881 100644 --- a/documentation/interfacing.rst +++ b/documentation/interfacing.rst @@ -10,7 +10,7 @@ examples. MPI initialization ------------------- +-------------------- Grid supports threaded MPI sends and receives and, if running with more than one thread, requires the MPI_THREAD_MULTIPLE mode of message @@ -21,39 +21,46 @@ appropriate initialization call is:: assert(MPI_THREAD_MULTIPLE == provided); Grid Initialization -------------------- +--------------------- Grid itself is initialized with a call:: Grid_init(&argc, &argv); -.. todo:: CD: Where are the command-line arguments explained above? - +Command line options include:: + + --mpi n.n.n.n : default MPI decomposition + --threads n : default number of OMP threads + --grid n.n.n.n : default Grid size + where `argc` and `argv` are constructed to simulate the command-line -options described above. At a minimum one must provide the `--grid` -and `--mpi` parameters. The latter specifies the grid of processors -(MPI ranks). +options described above. At a minimum one usually provides the +`--grid` and `--mpi` parameters. The former specifies the lattice +dimensions and the latter specifies the grid of processors (MPI +ranks). If these parameters are not specified with the `Grid_init` +call, they need to be supplied later when creating Grid fields. -The following Grid procedures are useful for verifying that Grid is -properly initialized. +The following Grid procedures are useful for verifying that Grid +"default" values are properly initialized. 
============================================================= =========================================================================================================== - Grid procedure returns + Grid procedure returns ============================================================= =========================================================================================================== - std::vector GridDefaultLatt(); lattice size - std::vector GridDefaultSimd(int Nd,vComplex::Nsimd()); SIMD layout - std::vector GridDefaultMpi(); MPI layout - int Grid::GridThread::GetThreads(); number of threads + std::vector GridDefaultLatt(); lattice size + std::vector GridDefaultSimd(int Nd,vComplex::Nsimd()); SIMD layout + std::vector GridDefaultMpi(); MPI layout + int Grid::GridThread::GetThreads(); number of threads +============================================================= =========================================================================================================== + MPI coordination ---------------- Grid wants to use its own numbering of MPI ranks and its own assignment of the lattice coordinates with each rank. Obviously, the -calling program and Grid must agree on these conventions. It is -convenient to use Grid's Cartesian communicator class to discover the -processor assignments. For a four-dimensional processor grid one can -define:: +calling program and Grid must agree on these conventions. One should +use Grid's Cartesian communicator class to discover the processor +assignments. For a four-dimensional processor grid one can define:: static Grid::CartesianCommunicator *grid_cart = NULL; grid_cart = new Grid::CartesianCommunicator(processors); @@ -94,14 +101,38 @@ index number of the subcommunicator. Once this is done,:: returns a rank that agrees with Grid's `peRank`. +QMP coordination +---------------- + +If the calling program uses the SciDAC QMP message-passing package, a +call to QMP_comm_split() instead can be used to reassign the ranks. +In the example below, `peGrid` gives the processor-grid dimensions, +usually set on the command line with `-qmp-geom`. + +**Example**:: + + int NDIM = QMP_get_allocated_number_of_dimensions(); + Grid::Grid_init(argc,argv); + FgridBase::grid_initted=true; + std::vector processors; + for(int i=0;i pePos(NDIM); + for(int i=NDIM-1;i>=0;i--) + pePos[i] = grid_cart._processor_coor[i]; + int peRank = grid_cart->RankFromProcessorCoor(pePos); + QMP_comm_split(QMP_comm_get_default(),0,peRank,&qmp_comm); + QMP_comm_set_default(qmp_comm); + Mapping fields between Grid and user layouts -------------------------------------------- +--------------------------------------------- -In order to map data between layouts, it is important to know -how the lattice sites are distributed across the processor grid. A -lattice site with coordinates `r[mu]` is assigned to the processor with -processor coordinates `pePos[mu]` according to the rule:: +In order to map data between calling-program and Grid layouts, it is +important to know how the lattice sites are distributed across the +processor grid. A lattice site with coordinates `r[mu]` is assigned +to the processor with processor coordinates `pePos[mu]` according to +the rule:: pePos[mu] = r[mu]/dim[mu] @@ -116,7 +147,7 @@ defined on the appropriate grid, whether it be a full lattice (4D `GridCartesian`), one of the checkerboards (4D `GridRedBlackCartesian`), a five-dimensional full grid (5D `GridCartesian`), or a five-dimensional checkerboard (5D -`GridRedBlackCartesian`). 
For example, an improved staggered fermion +`GridRedBlackCartesian`). For example, an improved staggered-fermion color-vector field `cv` on a single checkerboard would be constructed using @@ -131,12 +162,16 @@ using typename ImprovedStaggeredFermion::FermionField cv(RBGrid); -To map data within an MPI rank, the external code must iterate over -the sites belonging to that rank (full or checkerboard as -appropriate). To import data into Grid, the external data on a single -site with coordinates `r` is first copied into the appropriate Grid -scalar object `s`. Then it is copied into the Grid lattice field `l` -with `pokeLocalSite`:: +The example above assumes that the grid default values were set in the +`Grid_init` call. If not, they can be set at this point and passed +when `GridCartesian` is instantiated here. To map data within an MPI +rank, the external code must iterate over the sites belonging to that +rank (full or checkerboard as appropriate). Note that the site +coordinates are specified relative to the origin of the lattice +subvolume on that rank. To import data into Grid, the external data on +a single site with coordinates `r` is first copied into the +appropriate Grid scalar object `s`. Then it is copied into the Grid +lattice field `l` with `pokeLocalSite`:: pokeLocalSite(const sobj &s, Lattice &l, Coordinate &r); @@ -156,7 +191,7 @@ there to the lattice colour-vector field `cv`, as defined above. indexToCoords(idx,r); ColourVector cVec; for(int col=0; col r(4); - indexToCoords(idx,r); - std::vector r5(1,0); - for( int d = 0; d < 4; d++ ) r5.push_back(r[d]); + std::vector r(4); + indexToCoords(idx,r); + std::vector r5(1,0); + for( int d = 0; d < 4; d++ ) r5.push_back(r[d]); - for( int j = 0; j < Ls; j++ ){ + for( int j = 0; j < Ls; j++ ){ r5[0] = j; ColourVector cVec; for(int col=0; colcv), r5); - } + } diff --git a/documentation/manual.rst b/documentation/manual.rst index 26a99064..1596de5e 100644 --- a/documentation/manual.rst +++ b/documentation/manual.rst @@ -1,5 +1,4 @@ .. Grid documentation - .. highlight:: cpp Welcome to Grid's documentation! @@ -70,7 +69,7 @@ a programme is simply written as a series of statements, addressing entire latti Implementation details may be provided to explain how the code works, but are not strictly part of the API. -**Example** +**Example** For example, as an implementation detail, in a single programme multiple data (SPMD) message passing supercomputer the main programme is trivially replicated on each computing node. The data parallel operations are called *collectively* by all nodes. Any scalar values returned by the various reduction routines are the same on each node, resulting in (for example) the same decision being made by all nodes to terminate an iterative solver on the same iteration. @@ -91,7 +90,8 @@ or any codes directly interacting with the * Stencil -will make use of facilities provided by to assist the creation of high performance code. The internal data layout complexities +will make use of facilities provided by to assist the creation of high performance code. +The internal data layout complexities will be exposed to some degree and the interfaces are subject to change without notice as HPC architectures change. Since some of the internal implementation details are needed to explain the design strategy of grid these will be @@ -223,36 +223,32 @@ If you want to build all the tests just use `make tests`. Detailed build configuration options ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. 
todo:: CD: The double dash here gets turned into a pdf long dash. Not good. ======================================== ============================================================================================================================== - Option usage + Option usage ======================================== ============================================================================================================================== - `--prefix=` installation prefix for Grid. - `--with-gmp=` look for GMP in the UNIX prefix `` - `--with-mpfr=` look for MPFR in the UNIX prefix `` - `--with-fftw=` look for FFTW in the UNIX prefix `` - `--with-lime=` look for c-lime in the UNIX prefix `` - `--enable-lapack[=]` enable LAPACK support in Lanczos eigensolver. A UNIX prefix containing the library can be specified (optional). - `--enable-mkl[=]` use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional). - `--enable-simd=` setup Grid for the SIMD target `` (default: `GEN`). A list of possible SIMD targets is detailed in a section below. - `--enable-gen-simd-width=` select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). - `--enable-precision={single|double}` set the default precision (default: `double`). - `--enable-precision=` use `` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. - `--enable-rng={sitmo|ranlux48|mt19937}` choose the RNG (default: `sitmo`). - `--disable-timers` disable system dependent high-resolution timers. - `--enable-chroma` enable Chroma regression tests. - `--enable-doxygen-doc` enable the Doxygen documentation generation (build with `make doxygen-doc`) + ``--prefix=path`` installation prefix for Grid. + ``--with-gmp=path`` look for GMP in the UNIX prefix `` + ``--with-mpfr=path`` look for MPFR in the UNIX prefix `` + ``--with-fftw=path`` look for FFTW in the UNIX prefix `` + ``--with-lime=path`` look for c-lime in the UNIX prefix `` + ``--enable-lapack[=path]`` enable LAPACK support in Lanczos eigensolver. A UNIX prefix containing the library can be specified (optional). + --enable-mkl[=path] use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional). + --enable-simd=code setup Grid for the SIMD target ``(default: `GEN`). A list of possible SIMD targets is detailed in a section below. + --enable-gen-simd-width=size select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). E.g. SSE 128 bit corresponds to 16 bytes. + --enable-precision=single|double set the default precision (default: `double`). + --enable-comms=mpi|none use `` for message passing (default: `none`). + --enable-rng=sitmo|ranlux48|mt19937 choose the RNG (default: `sitmo`). + --disable-timers disable system dependent high-resolution timers. + --enable-chroma enable Chroma regression tests. + --enable-doxygen-doc enable the Doxygen documentation generation (build with `make doxygen-doc`) ======================================== ============================================================================================================================== -.. todo:: CD: Somewhere, please provide more explanation of the --enable--gen-simd-width value -.. todo:: CD: Are there really two --enable-precision lines? 
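As an illustrative sketch only (the path and option choices below are placeholders rather than recommendations), a generic build using the options above might look like::

    ../configure --enable-simd=GEN \
                 --enable-comms=mpi-auto \
                 --enable-precision=double \
                 --prefix=$HOME/install/Grid

Architecture-specific recommended configurations are given later in this chapter.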
- Possible communication interfaces ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The following options can be use with the `--enable-comms=` option to target different communication interfaces: +The following options can be use with the `-\\-enable-comms=` option to target different communication interfaces: =============== ========================================================================================== Description @@ -264,14 +260,11 @@ The following options can be use with the `--enable-comms=` option to target dif For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard wrappers ( `CXX=CC` ) set up by Cray `PrgEnv` modules instead. -.. todo:: CD: Later below, there is an "mpi3". Should it be listed and - explained here? Is there an "mpit"? - Possible SIMD types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The following options can be use with the `--enable-simd=` option to target different SIMD instruction sets: +The following options can be use with the `-\\-enable-simd=` option to target different SIMD instruction sets: ============ ===================================================================================================================== `` Description @@ -304,7 +297,7 @@ Notes versions of Grid when the AVX512 support in the compiler is more advanced. * For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform. * BG/Q performances are currently rather poor. This is being investigated for future versions. -* The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`. +* The vector size for the `GEN` target can be specified with the `configure` script option `-\\-enable-gen-simd-width`. Build setup for Intel Knights Landing platform --------------------------------------------------------------------------------------- @@ -336,12 +329,12 @@ where `` is the UNIX prefix where GMP and MPFR are installed. Knight's Landing with Intel Omnipath adapters with two adapters per node presently performs better with use of more than one rank per node, using shared memory -for interior communication. This is the mpi3 communications implementation. +for interior communication. We recommend four ranks per node for best performance, but optimum is local volume dependent. 
:: ../configure --enable-precision=double\ --enable-simd=KNL \ - --enable-comms=mpi3-auto \ + --enable-comms=mpi-auto \ --enable-mkl \ CC=icpc MPICXX=mpiicpc @@ -352,7 +345,7 @@ The following configuration is recommended for the Intel Haswell platform:: ../configure --enable-precision=double\ --enable-simd=AVX2 \ - --enable-comms=mpi3-auto \ + --enable-comms=mpi-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -369,7 +362,7 @@ If you are working on a Cray machine that does not use the `mpiicpc` wrapper, pl ../configure --enable-precision=double\ --enable-simd=AVX2 \ - --enable-comms=mpi3 \ + --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -388,7 +381,7 @@ The following configuration is recommended for the Intel Skylake platform:: ../configure --enable-precision=double\ --enable-simd=AVX512 \ - --enable-comms=mpi3 \ + --enable-comms=mpi \ --enable-mkl \ CXX=mpiicpc @@ -405,7 +398,7 @@ If you are working on a Cray machine that does not use the `mpiicpc` wrapper, pl ../configure --enable-precision=double\ --enable-simd=AVX512 \ - --enable-comms=mpi3 \ + --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -431,7 +424,7 @@ The following configuration is recommended for the AMD EPYC platform:: ../configure --enable-precision=double\ --enable-simd=AVX2 \ - --enable-comms=mpi3 \ + --enable-comms=mpi \ CXX=mpicxx If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:: @@ -444,12 +437,13 @@ where `` is the UNIX prefix where GMP and MPFR are installed. Using MPICH and g++ v4.9.2, best performance can be obtained using explicit GOMP_CPU_AFFINITY flags for each MPI rank. This can be done by invoking MPI on a wrapper script omp_bind.sh to handle this. -It is recommended to run 8 MPI ranks on a single dual socket AMD EPYC, with 8 threads per rank using MPI3 and -shared memory to communicate within this node:: +It is recommended to run 8 MPI ranks on a single dual socket AMD EPYC, with 8 threads per rank using MPI and +shared memory to communicate within this + +.. describe:: command line mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4 -.. todo:: CD: Maybe need bash highlighting, not cpp below - Generates warning Where omp_bind.sh does the following:: @@ -562,9 +556,8 @@ scalar matrix and vector classes:: template class iScalar { private: vobj _internal ; } template class iVector { private: vobj _internal[N] ; } - template class iMatrix { private: vobj _internal[N] ; } + template class iMatrix { private: vobj _internal[N][N] ; } -.. todo:: CD: Why is iMatrix only [N] and not [N][N]? These are template classes and can be passed a fundamental scalar or vector type, or nested to form arbitrarily complicated tensor products of indices. All mathematical expressions @@ -586,10 +579,27 @@ For Lattice field theory, we define types according to the following tensor product structure ordering. The suffix "D" indicates either double types, and replacing with "F" gives the corresponding single precision type. -.. todo:: CD: The test cases have R, which takes the compiled default. - Do we want to expose that and say something here? -.. todo:: CD: What is "Lattice" here? This section is about "iXXX" types. - Maybe say a few more introductory words. +The test cases have R, which takes the compiled default precision (either F or D). +This is for convenience only and may be deprecated in future forcing code external +to Grid to choose the specific word size. 
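As a minimal sketch of the naming convention (these particular typedefs are assumed to be among those provided, cf. the table below)::

    SpinColourMatrixD sd;   // explicitly double precision
    SpinColourMatrixF sf;   // explicitly single precision
    SpinColourMatrix  s;    // "R" form: the compiled default precision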
+ +Type definitions are provided in qcd/QCD.h to give the internal index structures +of QCD codes. For example:: + + template + using iSinglet = iScalar > >; + using iSpinMatrix = iScalar, Ns> >; + using iColourMatrix = iScalar > > ; + using iSpinColourMatrix = iScalar, Ns> >; + using iLorentzColourMatrix = iVector >, Nd > ; + using iDoubleStoredColourMatrix = iVector >, Nds > ; + using iSpinVector = iScalar, Ns> >; + using iColourVector = iScalar > >; + using iSpinColourVector = iScalar, Ns> >; + using iHalfSpinVector = iScalar, Nhs> >; + using iHalfSpinColourVector = iScalar, Nhs> >; + +Giving the type table: ======= ======= ====== ====== =========== ======================= Lattice Lorentz Spin Colour scalar_type Field @@ -605,16 +615,22 @@ Scalar Scalar Matrix Matrix ComplexD SpinColourMatrixD The types are implemented via a recursive tensor nesting system. -.. todo:: CD: What choices are available for vtype? Is the "v" for "variable"? -.. todo:: CD: Should we say iLorentzColourMatrix is a Grid-provided typename? - Is there a list of similar convenience types? - -**Example** we declare:: +**Example** - template - using iLorentzColourMatrix = iVector >, Nd > ; +Here, the prefix "i" indicates for internal use, preserving the template nature of the class. +Final types are declared with vtype selected to be both scalar and vector, appropriate to a +single datum, or stored in a partial SoA transformed lattice object:: + + + // LorentzColour + typedef iLorentzColourMatrix LorentzColourMatrix; + typedef iLorentzColourMatrix LorentzColourMatrixF; + typedef iLorentzColourMatrix LorentzColourMatrixD; + + typedef iLorentzColourMatrix vLorentzColourMatrix; + typedef iLorentzColourMatrix vLorentzColourMatrixF; + typedef iLorentzColourMatrix vLorentzColourMatrixD; - typedef iLorentzColourMatrix LorentzColourMatrixD; Arbitrarily deep tensor nests may be formed. Grid uses a positional and numerical rule to associate indices for contraction in the Einstein summation sense. @@ -697,12 +713,6 @@ General code can access any specific index by number with a peek/poke semantic:: // poke index number "Level" of a matrix index template void pokeIndex (vtype &pokeme,arg,int i,int j) - -.. todo:: CD: The are the choices for "vtype"? - -.. todo:: CD: The example below does not use the template pair shown - above. It is good, but perhaps, also show the pair form of - the same example if there is one. **Example**:: @@ -804,9 +814,7 @@ The traceless anti-Hermitian part is taken with:: template iMatrix Ta(const iMatrix &arg) -Reunitarisation (or reorthogonalisation) is enabled by:: - -.. todo:: CD: U(3) or SU(3) projection? +SU(N) Reunitarisation (or reorthogonalisation) is enabled by:: template iMatrix ProjectOnGroup(const iMatrix &arg) @@ -977,18 +985,16 @@ Internally, Grid defines a portable abstraction SIMD vectorisation, via the foll * vComplexD -.. todo:: CD: Maybe say something about how SIMD vectorization works - here. Does a vRealF collect values for several SIMD lanes - at once? - These have the usual range of arithmetic operators and functions acting upon them. They do not form part of the API, but are mentioned to (partially) explain the need for controlling the -layout transformation in lattice objects. +layout transformation in lattice objects. They contain a number consecutive elements of the appropriate +Real/Complex type, where number is architecture depemendent. 
The number may be queried at runtime using:: + vComplexF::Nsimd(); + +The layout transformations in indexing functions in the Grid objects as completely parameterised by this Nsimd(). They are documented further in the Internals chapter. -.. todo:: CD: Might they be needed for interfacing with external code? - Coordinates ------------ @@ -1016,15 +1022,6 @@ This enables the coordinates to be manipulated without heap allocation or thread and avoids introducing STL functions into GPU code, but does so at the expense of introducing a maximum dimensionality. This limit is easy to change (lib/util/Coordinate.h). -.. todo:: CD: It would be very useful to explain how the communicator - works. That would include how the processor grid is - organized, how the lattice is subdivided across MPI ranks, - why Grid prefers to renumber the MPI ranks, what coordinates - go with what ranks? Ordinarily, this is hidden from the - user, but it is important for interfacing with external - code. Some methods and members of the communicator class - need to be "exposed" to make that possible. This might be a - good place for such a subsection? Grids ------------- @@ -1037,9 +1034,6 @@ are decomposed across MPI tasks and SIMD lanes of the vector length. We use a partial vectorisation transformation, must select which space-time dimensions participate in SIMD vectorisation. The Lattice containers are defined to have opaque internal layout, hiding this layout transformation. - -.. todo:: CD: The constructor simply defines the layout parameters. - It doesn't allocate space, right? Might be good to say. We define GridCartesian and GridRedBlackCartesian which both inherit from GridBase:: @@ -1071,10 +1065,16 @@ The Grid object provides much `internal` functionality to map a lattice site to a node and lexicographic index. These are not needed by code interfacing to the data parallel layer. -.. todo:: CD: What is specified with "split_rank" above? -.. todo:: CD: Maybe list the exposed Grid options within the "SpaceTimeGrid" - class. +When the requested processor grid is smaller than the parent's processor grid, multiple copies of the +same geometry communicator are created, indexed by spli_rank. This can be convenient to split +a job into multiple independent sub jobs (a long present feature of MPI). It can be particularly +effective in valence analysis, where for example, many inversions are performed on a single configuration. +These can be made on smaller communicators in parallel and communications overheads minimised. Routines:: + Grid_split + Grid_unsplit + +are provided to communicate fields between different communicators (e.g. between inversion and contraction phases). **Example** (tests/solver/Test_split_grid.cc):: @@ -1110,6 +1110,32 @@ to the data parallel layer. GridRedBlackCartesian * SrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid); GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,SGrid); + /////////////////////////////////////////////////////////////// + // split the source out using MPI instead of I/O + /////////////////////////////////////////////////////////////// + Grid_split (Umu,s_Umu); + Grid_split (src,s_src); + +**Internals** + +The processor Grid is defined by data values in the Communicator object:: + + int _Nprocessors; // How many in all + std::vector _processors; // Which dimensions get relayed out over processors lanes. 
+ int _processor; // linear processor rank + std::vector _processor_coor; // linear processor coordinate + unsigned long _ndimension; + Grid_MPI_Comm communicator; + +The final of these is potentially an MPI Cartesian communicator, mapping some total number of processors +to an N-dimensional coordinate system. This is used by Grid to geometrically decompose the subvolumes of a +lattice field across processing elements. Grid is aware of multiple ranks per node and attempts to ensure +that the geometrical decomposition keeps as many neigbours as possible on the same node. This is done +by reordering the ranks in the constructor of a Communicator object once the topology requested has +been indicated, via an internal call to the method OptimalCommunicator(). The reordering is chosen +by Grid to trick MPI, which makes a simple lexicographic assignment of ranks to coordinate, to ensure +that the simple lexicographic assignment of the reordered ranks is the optimal choice. MPI does not do this +by default and substantial improvements arise from this design choice. Lattice containers ----------------------------------------- @@ -1137,6 +1163,7 @@ The full range of QCD relevant lattice objects is given below. ======= ======= ====== ====== =========== ============================= ===================== Lattice Lorentz Spin Colour scalar_type Field Synonym ======= ======= ====== ====== =========== ============================= ===================== +Vector Scalar Scalar Scalar Integer LatticeInteger N/A Vector Scalar Scalar Scalar RealD LatticeRealD N/A Vector Scalar Scalar Scalar ComplexD LatticeComplexD N/A Vector Scalar Scalar Matrix ComplexD LatticeColourMatrixD LatticeGaugeLink @@ -1148,10 +1175,7 @@ Vector Scalar Matrix Matrix ComplexD LatticeSpinColourMatrixD Additional single precison variants are defined with the suffix "F". Other lattice objects can be defined using the sort of typedef's shown above if needed. - -.. todo:: CD: Are there others to expose, such as LatticeInteger, - LatticeFermionD, LatticeGaugeFieldD, LatticePropagatorD, - etc? If so, could this list be made complete? +LatticeInteger is typically only used in the form of predicate fields for where statements. Opaque containers ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1167,7 +1191,8 @@ are provided (lib/lattice/Lattice_transfer.h):: unvectorizeToLexOrdArray(std::vector &out, const Lattice &in); vectorizeFromLexOrdArray(std::vector &in , Lattice &out); -.. todo:: CD: Explain the choices for sobj and vobj. +sobj and vobj should be a matching pair of scalar and vector objects of the same internal structure. +The compiler will let you know with a long and verbose complaint if they are not. The Lexicographic order of data in the external vector fields is defined by (lib/util/Lexicographic.h):: @@ -1203,9 +1228,6 @@ peeking and poking specific indices in a data parallel manner:: template // Matrix poke void PokeIndex(Lattice &lhs,const Lattice<> & rhs,int i,int j) - -.. todo:: CD: Maybe mention that these match operations with scalar - objects, as listed above under "Internal index manipulation." The inconsistent capitalisation on the letter P is due to an obscure bug in g++ that has not to our knowledge been fixed in any version. The bug was reported in 2016. 
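**Example** (a minimal sketch; `Umu` is assumed to be a `LatticeGaugeFieldD` defined on a grid pointer `UGrid`)::

    LatticeColourMatrixD U(UGrid);

    // peek the Lorentz index: extract the link matrix in direction mu
    U = PeekIndex<LorentzIndex>(Umu, mu);

    // ... operate on U ...

    // poke the modified link back into the gauge field
    PokeIndex<LorentzIndex>(Umu, U, mu);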
@@ -1363,7 +1385,7 @@ Logical are defined on LatticeInteger types:: operator|| -Ternary operator, logical operatons and where +Ternary operator, logical operations and where ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Within the data parallel level of the API the only way to perform operations @@ -1400,9 +1422,12 @@ A usage example is given result = where(mod(coor,block)==(block-1),x,z); -.. todo:: CD: A few words motivating this example? +This example takes result to be either "x" or "z" in a coordinate dependent way. +When third (z) lattice coordinate lies at the boundaries of a block size (periodic arithmetic). +This example is lifted and paraphrased from code that (data parallel) evaluates matrix elements +for a coarse representation of the Dirac operator in multigrid. -(Other usage cases of LatticeCoordinate include the generation of plane wave momentum phases.) +Other usage cases of LatticeCoordinate include the generation of plane wave momentum phases. Site local fused operations ------------------------------------------ @@ -1470,7 +1495,16 @@ accelerator_loops The second parallel primitive is the "accelerator_loop". -.. todo:: CD: What is the difference between these two loops? +The thread loop runs on host processor cores only. If the enabled architecture is +VGPU, if Grid is configured with + + --enable-simd=VGPU, + +the acccelerator_loop may run on a GPU if present. On non-accelerated architectures, +the accelerator_loop will simply run as a an OpenMP thread_loop. + +It is planned to support multiple forms of acccelerator in future, including OpenMP 5.0 offload, +and possibly SyCL based offload. **Example**:: @@ -1532,50 +1566,6 @@ lattice site :math:`x_\mu = 1` in the rhs to :math:`x_\mu = 0` in the result. } -CovariantCshift -^^^^^^^^^^^^^^^^^^^^ - -Covariant Cshift operations are provided for common cases of the boundary condition. These may be further optimised -in future:: - - template - Lattice CovShiftForward(const Lattice &Link, int mu, - const Lattice &field); - - template - Lattice CovShiftBackward(const Lattice &Link, int mu, - const Lattice &field); - -Boundary conditions -^^^^^^^^^^^^^^^^^^^^ - -The covariant shift routines occur in namespaces PeriodicBC and ConjugateBC. The correct covariant shift -for the boundary condition is passed into the gauge actions and wilson loops via an -"Impl" template policy class. - -The relevant staples, plaquettes, and loops are formed by using the provided method:: - - Impl::CovShiftForward - Impl::CovShiftBackward - -etc... This makes physics code transform appropriately with externally supplied rules about -treating the boundary. - -**Example** (lib/qcd/util/WilsonLoops.h):: - - static void dirPlaquette(GaugeMat &plaq, const std::vector &U, - const int mu, const int nu) { - // ___ - //| | - //|<__| - plaq = Gimpl::CovShiftForward(U[mu],mu, - Gimpl::CovShiftForward(U[nu],nu, - Gimpl::CovShiftBackward(U[mu],mu, - Gimpl::CovShiftIdentityBackward(U[nu], nu)))); - } - -.. todo:: CD: This example uses Gimpl instead of Impl. What is the - difference, and what are the exposed choices for Impl? Inter-grid transfer operations @@ -1649,11 +1639,6 @@ Growing a lattice by a multiple factor, with periodic replication:: That latter is useful to, for example, pre-thermalise a smaller volume and then grow the volume in HMC. It was written while debugging G-parity boundary conditions. 
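**Example** (a sketch, assuming the `Replicate(coarse, fine)` helper of lib/lattice/Lattice_transfer.h and placeholder grid names; the two grids must share the same processor layout)::

    // CoarseGrid might be 8^4 and FineGrid 16^4
    LatticeGaugeFieldD Usmall(CoarseGrid);
    LatticeGaugeFieldD Ubig  (FineGrid);

    // ... thermalise Usmall ...

    Replicate(Usmall, Ubig);  // periodically replicate the small volume into the large one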
-Input/Output facilities ---------------------------------------------- - - - Random number generators ========================================= @@ -1687,18 +1672,32 @@ The interface is as follows:: class GridSerialRNG { GridSerialRNG(); void SeedFixedIntegers(const std::vector &seeds); + void SeedUniqueString(const std::string &s); } class GridParallelRNG { GridParallelRNG(GridBase *grid); void SeedFixedIntegers(const std::vector &seeds); + void SeedUniqueString(const std::string &s); } - template void random(GridParallelRNG &rng,Lattice &l) { rng.fill(l,rng._uniform); } - template void gaussian(GridParallelRNG &rng,Lattice &l) { rng.fill(l,rng._gaussian); } +* Seeding - template void random(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._uniform ); } - template void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); } +The SeedUniqueString uses a 256bit SHA from the OpenSSL library to construct integer seeds. +The reason for this is to enable reproducible seeding in the measurement sector of physics codes. +For example, labelling a random drawn by a string representation the physics information, and the +appending trajectory number will give a unique set of seeds for each measurement on each trajectory. +This string based functionality is probably not expected to be used in a lattice evolution, except for +possibly the initial state. Subsequent evolution should checkpoint and restore lattice RNG state using +the interfaces below. + +These may be drawn as follows:: + + void random(GridParallelRNG &rng,Lattice &l) { rng.fill(l,rng._uniform); } + void gaussian(GridParallelRNG &rng,Lattice &l) { rng.fill(l,rng._gaussian); } + + void random(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._uniform ); } + void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); } * Serial RNG's are used to assign scalar fields. @@ -1982,7 +1981,7 @@ The routines in this section rely on the c-lime library being enabled in the Gri (http://usqcd-software.github.io/c-lime/). The configure script searches for `lime`, but the location can be indicated with the - `--with-lime=prefix` + `-\\-with-lime=prefix` flag. @@ -2129,25 +2128,28 @@ And a derived class adding methods suitable to red black preconditioning:: virtual void MooeeInvDag (const Field &in, Field &out)=0; }; -============= ============================================== +A checkerboard is defined as a parity :math:`(x+y+z+t)|2`, and half checker board operations supported +for red black preconditioning. + + +============= ==================================================================== Member Description -============= ============================================== -M -Mdag -MdagM -Mdiag -Mdir -Meooe -Mooee -MooeInv -MeooeDag -MooeeDag -MooeeInvDag -============= ============================================== +============= ==================================================================== +M Apply matrix +Mdag Apply matrix adjoint +MdagM Apply Matrix then adjoin matrix +Mdiag Apply site diagonal part of matrix +Mdir Apply terms involving hopping one direction +Meooe Apply even/odd matrix. ResultCB determined by InputCB +Mooee Apply site diagonal terms to a CB field +MooeInv Apply inverse of site diagonal terms to a CB field +MeooeDag Apply adjoint of Meooe +MooeeDag Apply adjoint of Mooee +MooeeInvDag Apply adjoint inverse of Mooee +============= ==================================================================== All Fermion operators will derive from this base class. -.. todo:: CD: Descriptions needed. 
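**Example** (a minimal sketch; `Dw` is assumed to be a Wilson operator constructed as in the Lattice actions chapter, with `FGrid`/`FrbGrid` the full and red-black grids)::

    LatticeFermionD src(FGrid), result(FGrid);
    LatticeFermionD src_o(FrbGrid), res_e(FrbGrid), res_o(FrbGrid);

    Dw.M(src, result);                  // full-grid operator
    pickCheckerboard(Odd, src_o, src);  // select the odd-site half of src
    Dw.Meooe(src_o, res_e);             // hopping term: odd input, even output
    Dw.Mooee(src_o, res_o);             // site-diagonal term on the odd checkerboard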
Linear Operators ------------------- @@ -2160,7 +2162,6 @@ between RB and non-RB variants. Sparse matrix is like the fermion action def, an the wrappers implement the specialisation of "Op" and "AdjOp" to the cases minimising replication of code. -.. todo:: CD: Descriptions needed below. **Abstract base**:: @@ -2177,20 +2178,20 @@ replication of code. virtual void HermOp(const Field &in, Field &out)=0; }; -============== ============================================== +============== ========================================================================== Member Description -============== ============================================== -OpDiag -OpDir -Op -AdjOp -HermOpAndNorm -HermOp -============== ============================================== +============== ========================================================================== +OpDiag Diagonal term +OpDir Terms involving hopping in one direction +Op Full operator +AdjOp Full operator adjoint +HermOp A hermitian version of operator (possibly squared) +HermOpAndNorm A hermitian version of operator (possibly squared) returns norm result +============== ========================================================================== - MdagMLinearOperator -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +MdagMLinearOperator +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This Linear operator takes a SparseMatrix (Fermion operator) and implements the unpreconditioned MdagM operator with the above interface:: @@ -2276,13 +2277,12 @@ The meaning of these different operators is ======================= ====================================================================================== Operator Description ======================= ====================================================================================== -SchurDiagMooeeeOperator -SchurDiagOneLH -SchurDiagOneRH +SchurDiagMooeeeOperator :math:`M_{oo} + M_{oe} M_ee^{-1} M_{eo}` +SchurDiagOneLH :math:`1 + M_{oo}^{-1}M_{oe} M_ee^{-1} M_{eo}` +SchurDiagOneRH :math:`1 + M_{oe} M_ee^{-1} M_{eo}M_{oo}^{-1}` SchurStaggeredOperator ======================= ====================================================================================== -.. todo:: CD: Descriptions needed. Operator Functions =================== @@ -2332,121 +2332,984 @@ Audit this:: Algorithms ========================================= -.. todo:: CD: The whole section needs to be completed, of course +In this section we describe a number of algorithmic areas present in the core Grid library. + +* Approximation: Chebyshev and Rational +* Krylov solvers: Conjugate Gradient +* Eigensolver: Chebyshev preconditioned Lanczos +* FFT: multidimensional FFT of arbitrary lattice fields Approximation -------------- +Both Chebyshev and Rational approximation codes are included. + + Polynomial ^^^^^^^^^^^^^ A polynomial of an operator with a given set of coefficients can be applied:: - template - class Polynomial : public OperatorFunction { - Polynomial(std::vector &_Coeffs) ; - }; - + template + class Polynomial : public OperatorFunction { + Polynomial(std::vector &_Coeffs) ; + }; Chebyshev ^^^^^^^^^^^ +Class:: + + template class Chebyshev : public OperatorFunction + +Defines constructors:: + + Chebyshev(ChebyParams p); + Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ); + Chebyshev(RealD _lo,RealD _hi,int _order) ; + +and methods:: + + RealD approx(RealD x); + + void operator() (LinearOperatorBase &Linop, const Field &in, Field &out) ; + +This will apply the appropriate polynomial of the LinearOperator Linop to the type Field. 
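A minimal usage sketch (here `Dw`, the interval and the order are placeholders; the operator is wrapped with `MdagMLinearOperator` as described above)::

    MdagMLinearOperator<WilsonFermionD, LatticeFermionD> HermOp(Dw);
    Chebyshev<LatticeFermionD> Cheby(0.1, 40.0, 16);  // [lo,hi] interval, polynomial order
    Cheby(HermOp, src, result);                       // result = P(HermOp) src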
+The coefficient for the standard Chebyshev approximation to an arbitrary function, over tha range +[hi,lo] can be set up with the appropriate constructor call. + Remez ^^^^^^^^^^^ -Iterative ------------- +We adopt the Remez class written by Kate Clark with minimal modifications for compatibility + +Class:: + + class AlgRemez + + +Iterative solvers and algorithms +----------------------------------- + +We document a number of iterative algorithms of topical relevance to Lattice Gauge theory. +These are written for application to arbitrary fields and arbitrary operators using type +templating, by implementating them as arbitrary OperatorFunctions. + +Most of these algorithms these algorithms operate on a generic matrix class, which +derives from LinearOperatorBase. + +Linear operators +^^^^^^^^^^^^^^^^^ + +By sharing the class for Sparse Matrix across multiple operator wrappers, we can share code +between RB and non-RB variants. Sparse matrix is an abstract fermion action def, and then +the LinearOperator wrappers implement the specialisation of "Op" and "AdjOp" to the cases minimising +replication of code. + +algorithms/LinearOperator.h + +Class:: + + template class LinearOperatorBase { + public: + + // Support for coarsening to a multigrid + virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base + virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base + + virtual void Op (const Field &in, Field &out) = 0; // Abstract base + virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base + virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0; + virtual void HermOp(const Field &in, Field &out)=0; + }; + +The specific operators are: + + template class MdagMLinearOperator : public LinearOperatorBase + template class ShiftedMdagMLinearOperator : public LinearOperatorBase + template class HermitianLinearOperator : public LinearOperatorBase + template class SchurOperatorBase : public LinearOperatorBase + template class SchurDiagOneRH : public SchurOperatorBase + template class SchurDiagOneLH : public SchurOperatorBase + template class SchurStaggeredOperator : public SchurOperatorBase + Conjugate Gradient ^^^^^^^^^^^^^^^^^^^ +algorithms/iterative/ConjugateGradient.h + +Class:: + + template class ConjugateGradient : public OperatorFunction + +with methods:: + + ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true); + + void operator()(LinearOperatorBase &Linop, const Field &src, Field &psi) ; + Multishift Conjugate Gradient ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -MultiRHS Conjugate Gradient -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +algorithms/iterative/ConjugateGradientMultiShift.h + +Class:: template class ConjugateGradientMultiShift : public OperatorMultiFunction, public OperatorFunction + +with methods:: + + ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true); + + void operator()(LinearOperatorBase &Linop, const Field &src, Field &psi) ; + Block Conjugate Gradient ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +algorithms/iterative/BlockConjugateGradient.h + +Class:: + + template class BlockConjugateGradient : public OperatorFunction + +Several options are possible. The behaviour is controlled by an enumeration. 
+ +Enum:: + + enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS }; + +Constructor:: + + BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) + +With operator:: + + void operator()(LinearOperatorBase &Linop, const Field &Src, Field &Psi) + + +* CGmultiRHS + + +This applies conjugate gradient to multiple right hand sides concurrently making +use of a separate Krylov space for each. There is no cross coupling and +the routine is equivalent to running each of these independently one after the other +in term of iteration count. + +* BlockCGrQ + +This applies block conjugate gradient to multiple right hand sides concurrently making +use of a shared Krylov space for each. The cross coupling may in some cases lead to +acceleration of convergence and reduced matrix multiplies for multiple solves. + Mixed precision Conjugate Gradient ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Conjugate Gradient Reliable Update -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Class:: + + template class MixedPrecisionConjugateGradient : public LinearFunction + + +Applies an inner outer mixed precision Conjagate Gradient. It has constructor:: + + MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase &_Linop_f, LinearOperatorBase &_Linop_d) : + + +Where the linear operators are for the single precision and double precision operators respectively. +The operator to apply the inversion is:: + + void operator() (const FieldD &src_d_in, FieldD &sol_d){ -Adef Generic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Preconditioned Conjugate Gradient -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Preconditioned Conjugate Residual ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Preconditioned Generalised Conjugate Residual -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Class:: + + template class PrecConjugateResidual : public OperatorFunction + +Constructor:: + + PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction &Prec) + + +Solve method:: + + void operator() (LinearOperatorBase &Linop,const Field &src, Field &psi) + Implicitly restarted Lanczos ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Helpers and wrappers ----------------------- +Class:: + + template class ImplicitlyRestartedLanczos + +Solve method:: + + void calc(std::vector& eval, std::vector& evec, const Field& src, int& Nconv, bool reverse=false) -Normal Equations -^^^^^^^^^^^^^^^^^ Schur decomposition ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +======================= ====================================================================================== +Operator Description +======================= ====================================================================================== +SchurDiagMooeeeOperator :math:`M_{oo} + M_{oe} M_ee^{-1} M_{eo}` +SchurDiagOneLH :math:`1 + M_{oo}^{-1}M_{oe} M_ee^{-1} M_{eo}` +SchurDiagOneRH :math:`1 + M_{oe} M_ee^{-1} M_{eo}M_{oo}^{-1}` +SchurStaggeredOperator :math:`m^2 - M_{oe} M_{eo}` +======================= ====================================================================================== + +Associated with these operators are convenience wrappers for Schur +decomposed solution of the full system are provided (red-black preconditioning, algorithms/iterative/SchurRedBlack.h): + +Class:: + + template class SchurRedBlackStaggeredSolve + template class SchurRedBlackDiagMooeeSolve + template class SchurRedBlackDiagOneLHSolve + template class SchurRedBlackDiagOneRHSolve + +Constructors:: + + SchurRedBlackStaggeredSolve(OperatorFunction 
&HermitianRBSolver); + SchurRedBlackDiagMooeeSolve(OperatorFunction &HermitianRBSolver,int cb=0); + SchurRedBlackDiagOneLHSolve(OperatorFunction &HermitianRBSolver,int cb=0); + SchurRedBlackDiagOneRHSolve(OperatorFunction &HermitianRBSolver,int cb=0); + +The cb parameter specifies which checkerboard the SchurDecomposition factorises around, and the +HermitianRBSolver parameter is an algorithm class, such as conjugate gradients, for solving the +system of equations on a single checkerboard. + + +All have the operator method, returning both checkerboard solutions:: + + template void operator() (Matrix & _Matrix,const Field &in, Field &out) + +In order to allow for deflation of the preconditioned system, and external guess constructor is possible:: + + template void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){ + Lattice Gauge theory utilities ========================================= -.. todo:: CD: The whole section needs to be completed, of course - -.. todo:: CD: Gamma matrices? - Spin projection, reconstruction? - Lie Algebra? - -Types --------------- Spin -------- +See, for example, tests/core/Test_gamma.cc for a complete self test of the +Grid Dirac algebra, spin projectors, and Gamma multiplication table. + +The spin basis is: + +.. math:: \gamma_x= \left(\begin{array}{cccc} 0& 0& 0& i\\ 0& 0& i& 0\\ 0&-i& 0& 0\\ -i& 0& 0& 0 \end{array}\right) + +.. math:: \gamma_y= \left(\begin{array}{cccc} 0& 0& 0&-1\\ 0& 0& 1& 0\\ 0& 1& 0& 0\\ -1& 0& 0& 0 \end{array}\right) + +.. math:: \gamma_z= \left(\begin{array}{cccc} 0& 0& i& 0\\ 0& 0& 0&-i\\ -i& 0& 0& 0\\ 0& i& 0& 0 \end{array}\right) + +.. math:: \gamma_t= \left(\begin{array}{cccc} 0& 0& 1& 0\\ 0& 0& 0& 1\\ 1& 0& 0& 0\\ 0& 1& 0& 0 \end{array}\right) + +.. math:: \gamma_5= \left(\begin{array}{cccc} 1& 0& 0& 0\\ 0& 1& 0& 0\\ 0& 0&-1 &0\\ 0& 0& 0&-1 \end{array}\right) + +These can be accessed via a strongly typed enumeration to avoid multiplication by zeros. +The values are considered opaque, and symbolic names must be used. 
+These are signed (prefixes like MinusIdentity also work):: + + Gamma::Algebra::Identity + Gamma::Algebra::Gamma5 + Gamma::Algebra::GammaT + Gamma::Algebra::GammaTGamma5 + Gamma::Algebra::GammaX + Gamma::Algebra::GammaXGamma5 + Gamma::Algebra::GammaY + Gamma::Algebra::GammaYGamma5 + Gamma::Algebra::GammaZ + Gamma::Algebra::GammaZGamma5 + Gamma::Algebra::SigmaXT + Gamma::Algebra::SigmaXZ + Gamma::Algebra::SigmaXY + Gamma::Algebra::SigmaYT + Gamma::Algebra::SigmaYZ + Gamma::Algebra::SigmaZT + +**Example** + +They can be used, for example (benchmarks/Benchmark_wilson.cc):: + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + + { // Naive wilson implementation + ref = zero; + for(int mu=0;mu void spProjXp (iVector &hspin,const iVector &fspin) + template void spProjYp (iVector &hspin,const iVector &fspin) + template void spProjZp (iVector &hspin,const iVector &fspin) + template void spProjTp (iVector &hspin,const iVector &fspin) + template void spProj5p (iVector &hspin,const iVector &fspin) + template void spProjXm (iVector &hspin,const iVector &fspin) + template void spProjYm (iVector &hspin,const iVector &fspin) + template void spProjZm (iVector &hspin,const iVector &fspin) + template void spProjTm (iVector &hspin,const iVector &fspin) + template void spProj5m (iVector &hspin,const iVector &fspin) + +and there are associated reconstruction routines for assembling four spinors from these two spinors:: + + template void spReconXp (iVector &fspin,const iVector &hspin) + template void spReconYp (iVector &fspin,const iVector &hspin) + template void spReconZp (iVector &fspin,const iVector &hspin) + template void spReconTp (iVector &fspin,const iVector &hspin) + template void spRecon5p (iVector &fspin,const iVector &hspin) + template void spReconXm (iVector &fspin,const iVector &hspin) + template void spReconYm (iVector &fspin,const iVector &hspin) + template void spReconZm (iVector &fspin,const iVector &hspin) + template void spReconTm (iVector &fspin,const iVector &hspin) + template void spRecon5m (iVector &fspin,const iVector &hspin) + + template void accumReconXp (iVector &fspin,const iVector &hspin) + template void accumReconYp (iVector &fspin,const iVector &hspin) + template void accumReconZp (iVector &fspin,const iVector &hspin) + template void accumReconTp (iVector &fspin,const iVector &hspin) + template void accumRecon5p (iVector &fspin,const iVector &hspin) + template void accumReconXm (iVector &fspin,const iVector &hspin) + template void accumReconYm (iVector &fspin,const iVector &hspin) + template void accumReconZm (iVector &fspin,const iVector &hspin) + template void accumReconTm (iVector &fspin,const iVector &hspin) + template void accumRecon5m (iVector &fspin,const iVector &hspin) + +These ca + + SU(N) -------- +A generic Nc qcd/utils/SUn.h is provided. 
This defines a template class:: + + template class SU ; + +The most important external methods are:: + + static void printGenerators(void) ; + template static void generator(int lieIndex, iSUnMatrix &ta) ; + + static void SubGroupHeatBath(GridSerialRNG &sRNG, GridParallelRNG &pRNG, RealD beta, // coeff multiplying staple in action (with no 1/Nc) + LatticeMatrix &link, + const LatticeMatrix &barestaple, // multiplied by action coeffs so th + int su2_subgroup, int nheatbath, LatticeInteger &wheremask); + + static void GaussianFundamentalLieAlgebraMatrix(GridParallelRNG &pRNG, + LatticeMatrix &out, + Real scale = 1.0) ; + static void GaugeTransform( GaugeField &Umu, GaugeMat &g) + static void RandomGaugeTransform(GridParallelRNG &pRNG, GaugeField &Umu, GaugeMat &g); + + static void HotConfiguration(GridParallelRNG &pRNG, GaugeField &out) ; + static void TepidConfiguration(GridParallelRNG &pRNG,GaugeField &out); + static void ColdConfiguration(GaugeField &out); + + static void taProj( const LatticeMatrixType &in, LatticeMatrixType &out); + static void taExp(const LatticeMatrixType &x, LatticeMatrixType &ex) ; + + static int su2subgroups(void) ; // returns how many subgroups + + +Specific instantiations are defined:: + + typedef SU<2> SU2; + typedef SU<3> SU3; + typedef SU<4> SU4; + typedef SU<5> SU5; + +For example, Quenched QCD updating may be run as (tests/core/Test_quenched_update.cc):: + + for(int sweep=0;sweep<1000;sweep++){ + + RealD plaq = ColourWilsonLoops::avgPlaquette(Umu); + + std::cout<(Umu,mu); + + for( int subgroup=0;subgroup(Umu,link,mu); + + //reunitarise link; + ProjectOnGroup(Umu); + } + } + } + + Space time grids ---------------- -Random configurations and random gauge transforms ---------------------------------------------------- - - -Wilson loops --------------- Lattice actions ========================================= -.. todo:: CD: The whole section needs to be completed, of course - -Gauge --------- +We discuss in some detail the implementation of the lattice actions. +The action is a sum of terms, each of which must inherit from and provide the following interface. + +lib/qcd/action/ActionBase.h:: + + class Action + { + + public: + bool is_smeared = false; + // Heatbath? + virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions + virtual RealD S(const GaugeField& U) = 0; // evaluate the action + virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative + virtual std::string action_name() = 0; // return the action name + virtual std::string LogParameters() = 0; // prints action parameters + virtual ~Action(){} + }; + +Fermion Lattice actions are defined in the qcd/action/fermion subdirectory and in the +qcd/action/gauge subdirectories. For Hybrid Monte Carlo and derivative sampling algorithm +Pseudofermoin actions are defined in the qcd/action/pseudofermion subdirectory. + +The simplest lattice action is the Wilson plaquette action, and we start by considering the Wilson loops +facilities as this is illustrative of the implementation policy design approach. + +Wilson loops +-------------- + +Wilson loops are common objects in Lattice Gauge Theory. +A utility class is provided to assist assembling these as they occur both in common observable construction but +also in actions such as the Wilson plaquette and the rectangle actions. + +Since derivatives with respect to gauge links are required for evolution codes, non-closed staples of +various types are also provided. 
The gauge actions are all assembled consistently from the Wilson loops +class. + +**Implementation policies** + +The Wilson loops class is templated to take a implementation policy class parameter. The covarian Cshift is inherity from +the policy and implements boundary conditions, such as period, anti-period or charge conjugate. In this +way the Wilson loop code can automatically transform with the boundary condition and give the right plaquette, +force terms etc... for the boundary conditions passed in external to the class. + + +This implementation policy class is called an "impl", and a class that bundles together all the required +rules to assemble a gauge action is called a Gimpl. + +There are several facilities provided by a Gimpl. + +These include Boundary conditions and consequently a CovariantCshift. + +CovariantCshift +^^^^^^^^^^^^^^^^^^ + +Covariant Cshift operations are provided for common cases of the boundary condition. These may be further optimised +in future:: + + template + Lattice CovShiftForward(const Lattice &Link, int mu, + const Lattice &field); + + template + Lattice CovShiftBackward(const Lattice &Link, int mu, + const Lattice &field); + +Boundary conditions +^^^^^^^^^^^^^^^^^^^^^^^^ + +The covariant shift routines occur in namespaces PeriodicBC and ConjugateBC. The correct covariant shift +for the boundary condition is passed into the gauge actions and wilson loops via an +"Impl" template policy class. + +The relevant staples, plaquettes, and loops are formed by using the provided method:: + + Impl::CovShiftForward + Impl::CovShiftBackward + +etc... This makes physics code transform appropriately with externally supplied rules about +treating the boundary. + +**Example** (lib/qcd/util/WilsonLoops.h):: + + static void dirPlaquette(GaugeMat &plaq, const std::vector &U, + const int mu, const int nu) { + // ___ + //| | + //|<__| + plaq = Gimpl::CovShiftForward(U[mu],mu, + Gimpl::CovShiftForward(U[nu],nu, + Gimpl::CovShiftBackward(U[mu],mu, + Gimpl::CovShiftIdentityBackward(U[nu], nu)))); + } + +Currently provided predefined implementations are (qcd/action/gauge/GaugeImplementations.h):: + + typedef PeriodicGaugeImpl PeriodicGimplR; // Real.. whichever prec + typedef PeriodicGaugeImpl PeriodicGimplF; // Float + typedef PeriodicGaugeImpl PeriodicGimplD; // Double + + typedef PeriodicGaugeImpl PeriodicGimplAdjR; // Real.. whichever prec + typedef PeriodicGaugeImpl PeriodicGimplAdjF; // Float + typedef PeriodicGaugeImpl PeriodicGimplAdjD; // Double + + typedef ConjugateGaugeImpl ConjugateGimplR; // Real.. whichever prec + typedef ConjugateGaugeImpl ConjugateGimplF; // Float + typedef ConjugateGaugeImpl ConjugateGimplD; // Double + +Gauge Actions +--------------- + +lib/qcd/action/gauge/Photon.h defines the U(1) field:: + + class Photon + +using Fourier techniques. 
+Gauge Actions
+---------------
+
+lib/qcd/action/gauge/Photon.h defines the U(1) field::
+
+  class Photon
+
+using Fourier techniques.
+
+lib/qcd/action/gauge/WilsonGaugeAction.h defines the standard plaquette action::
+
+  template
+  class WilsonGaugeAction : public Action ;
+
+This action is suitable for use in a Hybrid Monte Carlo evolution as an action term and has the constructor::
+
+  WilsonGaugeAction(RealD beta_);
+
+lib/qcd/action/gauge/PlaqPlusRectangleAction.h defines the standard plaquette plus rectangle class of action::
+
+  template
+  class PlaqPlusRectangleAction : public Action ;
+
+The constructor is::
+
+  PlaqPlusRectangleAction(RealD b, RealD c);
+
+Due to varying conventions, convenience wrappers are provided::
+
+  template class RBCGaugeAction      : public PlaqPlusRectangleAction;
+  template class IwasakiGaugeAction  : public RBCGaugeAction ;
+  template class SymanzikGaugeAction : public RBCGaugeAction ;
+  template class DBW2GaugeAction     : public RBCGaugeAction ;
+
+These have convenience constructors that set the rectangle coefficient automatically to popular values::
+
+  SymanzikGaugeAction(RealD beta) : RBCGaugeAction(beta,-1.0/12.0) {};
+  IwasakiGaugeAction(RealD beta)  : RBCGaugeAction(beta,-0.331) {};
+  DBW2GaugeAction(RealD beta)     : RBCGaugeAction(beta,-1.4067) {};
+
 Fermion
 --------
 
+These classes all make use of a Fermion Implementation (Fimpl) policy class to provide
+facilities such as boundary conditions and covariant transportation rules.
+
+All Fermion operators inherit from a common base class
+that conforms to the CheckerBoardedSparseMatrix interface and constrains these objects to conform to the
+interface expected by the general algorithms in Grid::
+
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  // Interface defining what I expect of a general sparse matrix, such as a Fermion action
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  template class SparseMatrixBase {
+  public:
+    virtual GridBase *Grid(void) =0;
+    // Full checkerboard operations
+    virtual RealD M    (const Field &in, Field &out)=0;
+    virtual RealD Mdag (const Field &in, Field &out)=0;
+    virtual void  MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
+      Field tmp (in._grid);
+      ni=M(in,tmp);
+      no=Mdag(tmp,out);
+    }
+    virtual void Mdiag (const Field &in, Field &out)=0;
+    virtual void Mdir  (const Field &in, Field &out,int dir, int disp)=0;
+  };
+
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  // Interface augmented by a red black sparse matrix, such as a Fermion action
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  template class CheckerBoardedSparseMatrixBase : public SparseMatrixBase {
+  public:
+    virtual GridBase *RedBlackGrid(void)=0;
+    // half checkerboard operations
+    virtual void Meooe       (const Field &in, Field &out)=0;
+    virtual void Mooee       (const Field &in, Field &out)=0;
+    virtual void MooeeInv    (const Field &in, Field &out)=0;
+
+    virtual void MeooeDag    (const Field &in, Field &out)=0;
+    virtual void MooeeDag    (const Field &in, Field &out)=0;
+    virtual void MooeeInvDag (const Field &in, Field &out)=0;
+  };
+
+The base class for Fermion operators inherits from these::
+
+  template
+  class FermionOperator : public CheckerBoardedSparseMatrixBase, public Impl
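+
+As a concrete, minimal sketch of constructing one such operator and driving it through this interface
+(assuming Grid has been initialised and its namespaces are in scope; the grid, RNG and field names here
+are illustrative, and the Wilson constructor used is the one documented below)::
+
+  GridCartesian         *UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
+                                                                  GridDefaultSimd(Nd, vComplex::Nsimd()),
+                                                                  GridDefaultMpi());
+  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+
+  std::vector<int> seeds({1, 2, 3, 4});
+  GridParallelRNG pRNG(UGrid);
+  pRNG.SeedFixedIntegers(seeds);
+
+  LatticeGaugeField Umu(UGrid);
+  SU3::HotConfiguration(pRNG, Umu);        // random gauge configuration
+
+  RealD mass = 0.1;
+  WilsonFermionR Dw(Umu, *UGrid, *UrbGrid, mass);
+
+  LatticeFermion src(UGrid);  gaussian(pRNG, src);
+  LatticeFermion result(UGrid);
+  Dw.M(src, result);                       // apply the Dirac operator
+
+  // the same object can be handed to the generic algorithms, e.g. a CG solve of Mdag M
+  MdagMLinearOperator<WilsonFermionR, LatticeFermion> HermOp(Dw);
+  ConjugateGradient<LatticeFermion> CG(1.0e-8, 10000);
+  LatticeFermion psi(UGrid);  psi = zero;  // initial guess
+  CG(HermOp, src, psi);
+
+The same pattern applies to the other fermion types listed below.
+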
+These all make use of an implementation template class, and the possible implementations include::
+
+  typedef WilsonImpl WilsonImplF; // Float
+  typedef WilsonImpl WilsonImplD; // Double
+
+Staggered fermions make use of a spin-index-free field via the StaggeredImpl::
+
+  typedef StaggeredImpl StaggeredImplF; // Float
+  typedef StaggeredImpl StaggeredImplD; // Double
+
+A number of alternate, non-fundamental Fermion representations are supported. Note that the Fermion
+action code is common to each of these, demonstrating the utility of the template Fimpl classes for separating
+the code that varies from the invariant sections::
+
+  typedef WilsonImpl WilsonAdjImplF; // Float
+  typedef WilsonImpl WilsonAdjImplD; // Double
+
+  typedef WilsonImpl WilsonTwoIndexSymmetricImplF; // Float
+  typedef WilsonImpl WilsonTwoIndexSymmetricImplD; // Double
+
+  typedef WilsonImpl WilsonTwoIndexAntiSymmetricImplF; // Float
+  typedef WilsonImpl WilsonTwoIndexAntiSymmetricImplD; // Double
+
+G-parity boundary conditions are supported, with an additional flavour index inserted on the Fermion field via the Gparity implementation::
+
+  typedef GparityWilsonImpl GparityWilsonImplF; // Float
+  typedef GparityWilsonImpl GparityWilsonImplD; // Double
+
+ZMobius Fermions use complex rather than real action coefficients and are supported via an alternate implementation::
+
+  typedef WilsonImpl ZWilsonImplF; // Float
+  typedef WilsonImpl ZWilsonImplD; // Double
+
+Some example constructor calls are given below for Wilson and Clover fermions::
+
+  template class WilsonFermion;
+
+With constructor::
+
+  WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
+                GridRedBlackCartesian &Hgrid, RealD _mass,
+                const ImplParams &p = ImplParams(),
+                const WilsonAnisotropyCoefficients &anis = WilsonAnisotropyCoefficients() );
+
+and::
+
+  template class WilsonCloverFermion : public WilsonFermion;
+
+with constructor::
+
+  WilsonCloverFermion(GaugeField &_Umu, GridCartesian &Fgrid,
+                      GridRedBlackCartesian &Hgrid,
+                      const RealD _mass,
+                      const RealD _csw_r = 0.0,
+                      const RealD _csw_t = 0.0,
+                      const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
+                      const ImplParams &impl_p = ImplParams());
+
+Additional parameters allow anisotropic versions to be created; they take default values corresponding
+to the isotropic case.
+
+The constructor signatures can be found in the header files in qcd/action/fermion/.
+A complete list of the 4D ultralocal Fermion types is::
+
+  WilsonFermion                 WilsonFermionF;
+  WilsonFermion                 WilsonAdjFermionF;
+  WilsonFermion                 WilsonTwoIndexSymmetricFermionF;
+  WilsonFermion                 WilsonTwoIndexAntiSymmetricFermionF;
+  WilsonTMFermion               WilsonTMFermionF;
+  WilsonCloverFermion           WilsonCloverFermionF;
+  WilsonCloverFermion           WilsonCloverAdjFermionF;
+  WilsonCloverFermion           WilsonCloverTwoIndexSymmetricFermionF;
+  WilsonCloverFermion           WilsonCloverTwoIndexAntiSymmetricFermionF;
+  ImprovedStaggeredFermion      ImprovedStaggeredFermionF;
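+
+The chiral fermion types listed next act on five-dimensional fields, so their constructors take both the
+four-dimensional and the five-dimensional grids. A sketch (reusing `Umu`, `UGrid` and `UrbGrid` from the
+example above; the call mirrors the HMC example later in this chapter, and the mass and M5 values are
+illustrative only)::
+
+  const int Ls = 8;
+  GridCartesian         *FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
+  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
+
+  RealD mdwf = 0.04;
+  RealD M5   = 1.8;   // domain wall height
+  DomainWallFermionR Ddwf(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mdwf, M5);
+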
+Cayley form chiral fermions (incl. domain wall)::
+
+  DomainWallFermion                     DomainWallFermionF;
+  DomainWallEOFAFermion                 DomainWallEOFAFermionF;
+  MobiusFermion                         MobiusFermionF;
+  MobiusEOFAFermion                     MobiusEOFAFermionF;
+  ZMobiusFermion                        ZMobiusFermionF;
+  ScaledShamirFermion                   ScaledShamirFermionF;
+  MobiusZolotarevFermion                MobiusZolotarevFermionF;
+  ShamirZolotarevFermion                ShamirZolotarevFermionF;
+  OverlapWilsonCayleyTanhFermion        OverlapWilsonCayleyTanhFermionF;
+  OverlapWilsonCayleyZolotarevFermion   OverlapWilsonCayleyZolotarevFermionF;
+
+Continued fraction overlap::
+
+  OverlapWilsonContFracTanhFermion      OverlapWilsonContFracTanhFermionF;
+  OverlapWilsonContFracZolotarevFermion OverlapWilsonContFracZolotarevFermionF;
+
+Partial fraction overlap::
+
+  OverlapWilsonPartialFractionTanhFermion      OverlapWilsonPartialFractionTanhFermionF;
+  OverlapWilsonPartialFractionZolotarevFermion OverlapWilsonPartialFractionZolotarevFermionF;
+
+Gparity cases; a partial list is defined until tested::
+
+  WilsonFermion          GparityWilsonFermionF;
+  DomainWallFermion      GparityDomainWallFermionF;
+  DomainWallEOFAFermion  GparityDomainWallEOFAFermionF;
+
+  WilsonTMFermion        GparityWilsonTMFermionF;
+  MobiusFermion          GparityMobiusFermionF;
+  MobiusEOFAFermion      GparityMobiusEOFAFermionF;
+
+For each action, the suffix "F" can be replaced with "D" to obtain a double precision version. More generally,
+it is possible to perform communications in a different precision from the computation.
+The number of combinations is too large to list, but in the above listing the substitution is the
+obvious one.
+
+========== ================ ==================
+Suffix     Computation      Communication
+========== ================ ==================
+F          fp32             fp32
+D          fp64             fp64
+R          default          default
+FH         fp32             fp16
+DF         fp64             fp32
+RL         default          lower
+========== ================ ==================
+
 Pseudofermion
 ---------------
 
+Pseudofermion actions are defined in qcd/action/pseudofermion/.
+These action terms are built from template classes::
+
+  // Base even odd HMC on the normal Mee based schur decomposition.
+  //
+  //   M = (Mee Meo)  =  (1            0)  (Mee  0                 )  (1  Mee^{-1} Meo)
+  //       (Moe Moo)     (Moe Mee^-1   1)  (0    Moo-Moe Mee^-1 Meo)  (0  1           )
+  //
+  // Determinant is det of middle factor. This assumes Mee is indept of U.
+  template class SchurDifferentiableOperator ;
+
+  // S = phi^dag (Mdag M)^-1 phi
+  template class TwoFlavourPseudoFermionAction ;
+
+  // S = phi^dag V (Mdag M)^-1 Vdag phi
+  template class TwoFlavourRatioPseudoFermionAction ;
+
+  // S = phi^dag (Mdag M)^-1 phi (odd)
+  //   + phi^dag (Mdag M)^-1 phi (even)
+  template class TwoFlavourEvenOddPseudoFermionAction;
+
+  // S = phi^dag V (Mdag M)^-1 Vdag phi
+  template class TwoFlavourEvenOddRatioPseudoFermionAction;
+
+Rational HMC pseudofermion terms::
+
+  // S_f = chi^dag * N(M^dag*M)/D(M^dag*M) * chi
+  //
+  // Here, M is some operator
+  // N and D make up the rat. poly
+  template class OneFlavourRationalPseudoFermionAction;
+
+  // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi
+  //
+  // Here P/Q \sim R_{1/4}  ~ (V^dagV)^{1/4}
+  // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}
+  template class OneFlavourRatioRationalPseudoFermionAction;
+
+  // S = phi^dag (Mdag M)^-1/2 phi
+  template class OneFlavourEvenOddRationalPseudoFermionAction;
+
+  // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi
+  //
+  // Here P/Q \sim R_{1/4}  ~ (V^dagV)^{1/4}
+  // Here N/D \sim R_{-1/2} ~ (M^dagM)^{-1/2}
+  template class OneFlavourEvenOddRatioRationalPseudoFermionAction;
+
+The relevant Fermion operators are constructed externally,
+and references are passed in to these object constructors. Thus, they work for any Fermion operator and the code
+can be shared. For example, one of the constructors is given as::
+
+  TwoFlavourEvenOddRatioPseudoFermionAction(FermionOperator &_NumOp,
+                                            FermionOperator &_DenOp,
+                                            OperatorFunction &DS,
+                                            OperatorFunction &AS);
+
+The exact one flavour algorithm for Domain Wall Fermions is present but is not documented here::
+
+  #include
+
 HMC
 =========================================
 
-.. todo:: CD: The whole section needs to be completed, of course
+There are a large number of examples under tests/hmc/ .
+
+The most important data structure associated with (R)HMC describes the action
+and the integration scheme.
+
+The action is a sum of terms, possibly with nested timesteps.
+
+This is assembled in an ActionSet object (qcd/action/ActionSet.h).
+The timesteps are managed by Levels. The ActionSet object maintains a list
+of ActionLevel objects::
+
+  // Define the ActionSet
+  template
+  using ActionSet = std::vector >;
+
+Each ActionLevel is associated with one level of the
+nested integration scheme, schematically::
+
+  template
+  struct ActionLevel {
+  public:
+    unsigned int multiplier;
+    // Fundamental repr actions separated because of the smearing
+    typedef Action* ActPtr;
+    std::vector& actions;
+  };
+
+The outer loop, running MD trajectories, Metropolis steps, and the saving and restoring of configurations,
+is generic and managed by a "Runner" class.
+
+There are a number of possible integrators: LeapFrog, MinimumNorm2, and ForceGradient.
+These are a template parameter to the HMCRunner class. We will take as an example tests/hmc/Test_hmc_EODWFRatio.cc::
+
+  typedef GenericHMCRunner HMCWrapper;  // Uses the default minimum norm
+
+The test defines the Fermion action and Gauge action types::
+
+  typedef WilsonImplR FermionImplPolicy;
+  typedef DomainWallFermionR FermionAction;
+  typedef typename FermionAction::FermionField FermionField;
+  HMCWrapper TheHMC;
+
+The HMC runner is given the Grid information (lifted from the standard Grid --grid Lx.Ly.Lz.Lt command line)::
+
+  TheHMC.Resources.AddFourDimGrid("gauge");
+
+The checkpointing strategy is defined::
+
+  // Checkpointer definition
+  CheckpointerParameters CPparams;
+  CPparams.config_prefix = "ckpoint_EODWF_lat";
+  CPparams.rng_prefix    = "ckpoint_EODWF_rng";
+  CPparams.saveInterval  = 5;
+  CPparams.format        = "IEEE64BIG";
+
+  TheHMC.Resources.LoadNerscCheckpointer(CPparams);
+
+.. todo:: HOW TO resume from saved RNGs. Guido changed this.
+
+Random number generators are seeded::
+
+  RNGModuleParameters RNGpar;
+  RNGpar.serial_seeds   = "1 2 3 4 5";
+  RNGpar.parallel_seeds = "6 7 8 9 10";
+  TheHMC.Resources.SetRNGSeeds(RNGpar);
+
+Observables measured at the end of the trajectory can be registered::
+
+  // Construct observables
+  // here there is too much indirection
+  typedef PlaquetteMod PlaqObs;
+  TheHMC.Resources.AddObservable();
+
+The action must be defined::
+
+  RealD beta = 5.6 ;
+  WilsonGaugeActionR Waction(beta);
+
+  const int Ls = 8;
+  auto GridPtr   = TheHMC.Resources.GetCartesian();
+  auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
+  auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
+  auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
+
+  LatticeGaugeField U(GridPtr);
+
+  Real mass = 0.04;
+  Real pv   = 1.0;
+  RealD M5  = 1.5;
+
+  FermionAction DenOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,mass,M5);
+  FermionAction NumOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv,  M5);
+
+  double StoppingCondition = 1.0e-8;
+  double MaxCGIterations = 10000;
+  ConjugateGradient CG(StoppingCondition,MaxCGIterations);
+  TwoFlavourEvenOddRatioPseudoFermionAction Nf2(NumOp, DenOp,CG,CG);
+
+  // Set smearing (true/false), default: false
+  Nf2.is_smeared = false;
+
+  // Collect actions
+  ActionLevel Level1(1);
+  Level1.push_back(&Nf2);
+
+  ActionLevel Level2(4);
+  Level2.push_back(&Waction);
+
+  TheHMC.TheAction.push_back(Level1);
+  TheHMC.TheAction.push_back(Level2);
+
+And the HMC can be set up and run::
+
+  TheHMC.Parameters.MD.MDsteps = 20;
+  TheHMC.Parameters.MD.trajL   = 1.0;
+  TheHMC.ReadCommandLine(argc, argv);  // these can be parameters from file
+
+  TheHMC.Run();  // no smearing
+
+This puts together the pieces of the previous sections (actions, Fermion operators, solver algorithms etc.) into a full application.
 
 Development of the internals
 ========================================
 
@@ -2477,20 +3340,9 @@ Optimised fermion operators
 Optimised communications
 ---------------------------------------------
 
-Interfacing with external software
-========================================
-
-.. todo:: CD: Such a section should be very useful
-.. todo:: CD: The whole section needs to be completed, of course
+.. include:: interfacing.rst
 
-MPI initialization and coordination
------------------------------------
-
-Creating Grid fields
---------------------
-
-Mapping fields between Grid and user layouts
---------------------------------------------
 
 .. image:: logo.png
    :width: 200px

diff --git a/tests/hmc/Test_hmc_EODWFRatio.cc b/tests/hmc/Test_hmc_EODWFRatio.cc
index dc903f4d..9a755dd5 100644
--- a/tests/hmc/Test_hmc_EODWFRatio.cc
+++ b/tests/hmc/Test_hmc_EODWFRatio.cc
@@ -146,13 +146,7 @@ int main(int argc, char **argv) {
   std::cout << GridLogMessage << "Denominator report, Dw(m) term (includes CG) : " << std::endl;
   DenOp.Report();
-
-
-
   Grid_finalize();
-
-
-
 } // main