1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-10-24 17:54:47 +01:00

Compare commits

...

1332 Commits

Author SHA1 Message Date
Peter Boyle
aa13118127 Missing conjugate already fixed in develop 2020-04-10 11:11:24 -04:00
Peter Boyle
6cdb09c884 Faster copy region 2020-04-10 11:10:52 -04:00
Peter Boyle
a65bc64f10 Accelerator peek poke 2020-04-10 11:09:59 -04:00
Peter Boyle
11dec4883c Don't throw assert 2020-04-10 11:09:11 -04:00
Peter Boyle
afa458c812 Extra solvers 2020-04-10 11:08:19 -04:00
Peter Boyle
dc50190b8f Faster GPU basis rotation
May need to later include Regensburg optimised CPU variant
2020-04-10 11:06:04 -04:00
Peter Boyle
8a5c13d5fb Still fast moving in changes 2020-02-06 17:57:26 -05:00
Peter Boyle
bdccb0c91f Working 2 types of decomposition 2020-02-06 17:26:55 -05:00
Peter Boyle
68b45f6444 Lower left/upper right region cut paste 2020-02-06 15:50:26 -05:00
Peter Boyle
ef9b3e658a extra typedef 2020-02-06 15:47:14 -05:00
Peter Boyle
b9ca40cc44 More precise power method at start 2020-02-06 10:09:14 -05:00
Peter Boyle
2f421a5db1 Comment fix 2020-02-06 10:08:27 -05:00
Peter Boyle
852fc1b001 True Hierarchical multigrid for DWF 2020-01-27 13:45:10 -05:00
Peter Boyle
2b5de5bba5 MdagM operator without norm option 2020-01-27 13:44:30 -05:00
Peter Boyle
2e85cae74e Add Jacobi polynomials 2020-01-27 13:43:49 -05:00
Peter Boyle
76c823781e Much faster coarsening 2020-01-27 13:43:19 -05:00
Peter Boyle
114db3b99d Optional MdagM without norms 2020-01-27 13:42:51 -05:00
Peter Boyle
49e123dbda Use explicit linalg calls to get coalesce optimisations on GPU 2020-01-27 12:44:51 -05:00
Peter Boyle
8cec294ec9 Make CG a bit less verbose as getting annoying in nested algorithms.
Can use Iterative logging if you want to see more
2020-01-27 12:44:04 -05:00
Peter Boyle
eb5b720e94 Normal Equations can be used in HDCR now 2020-01-27 12:43:29 -05:00
Peter Boyle
b2736ec80b Make PrecGCR recursive - it can precondition itself 2020-01-27 12:42:48 -05:00
Peter Boyle
086256a032 Less sloppy convergence test on PowerMethod 2020-01-27 12:41:59 -05:00
Peter Boyle
afc7426f39 Much bigger pointer cache in case of Nvidia due to cost of setting up UVM allocations 2020-01-27 12:41:16 -05:00
Peter Boyle
7c061e20c9 All directions of dirac operator for fast coarsening 2020-01-27 12:40:13 -05:00
Peter Boyle
e5d1c09665 Faster DhopDirAll for little dirac operator coarsening 2020-01-27 12:38:54 -05:00
Peter Boyle
8016a465ae Remove extraneous variable 2020-01-27 12:35:37 -05:00
Peter Boyle
d8b9742092 DhopDirAll for faster matrix elements of little Dirac operator 2020-01-27 12:34:54 -05:00
Peter Boyle
1bd87c35d7 Read coalescing on Nvidia 2020-01-27 12:29:56 -05:00
Peter Boyle
fa856c9669 Disable information message 2020-01-27 12:28:46 -05:00
Peter Boyle
48008e4d8b Thread coordinate creation loop 2020-01-27 12:28:16 -05:00
Peter Boyle
55cdb17691 Integer divide for blocking 2020-01-27 12:27:45 -05:00
Peter Boyle
554542b773 Merge branch 'feature/hdcr' of https://github.com/paboyle/Grid into feature/hdcr 2020-01-06 11:47:56 -05:00
Peter Boyle
03da4040e2 Make summit happy 2020-01-06 11:47:48 -05:00
Peter Boyle
e583035614 Change to interface to minimise comms in evaluating coarse space operator 2020-01-06 11:43:59 -05:00
Peter Boyle
3c3d6a94f3 Optimising the force term a bit 2020-01-04 03:16:23 -05:00
Peter Boyle
205ea4bbb2 More verbose Lanczos 2020-01-04 03:13:40 -05:00
Peter Boyle
039eb7b2eb Make the force term and coarsening multigrid more optimised 2020-01-04 03:12:17 -05:00
Peter Boyle
f7e4bd1f6d Getting more optimised 2020-01-04 03:11:53 -05:00
Peter Boyle
0afecfcae7 Nearing well optimised state 2020-01-04 03:11:19 -05:00
Peter Boyle
ba40a3f763 Alternate low pass filter option 2020-01-03 05:29:09 -05:00
Peter Boyle
aa920aa532 Improved DWF multigrid 2019-12-28 10:32:35 -05:00
Peter Boyle
c0d8e4dce5 Improved Multigrid for DWF 2019-12-28 10:32:15 -05:00
Peter Boyle
9cfd64c604 Coarse grid on GPU, not fast enough yet. Need a 10x 2019-12-17 05:24:45 -05:00
Peter Boyle
e478404291 Tuned up significantly on GPU, but another 10x in coarse space required 2019-12-17 05:03:25 -05:00
Peter Boyle
9aafd20468 Simple block project promote runs faster on GPU 2019-12-17 05:01:39 -05:00
Peter Boyle
9e15474999 Accelerator loop attempt at speed up 2019-12-14 05:28:16 -05:00
Peter Boyle
152b525a4d Typo fix 2019-12-13 22:44:42 -05:00
Peter Boyle
d18994eddc offload more of mgrid to GPU 2019-12-13 22:08:11 -05:00
Peter Boyle
736b19485e Faster set up and some dead code ifdef'ed out 2019-12-13 21:30:48 -05:00
Peter Boyle
5bfd1470ad Merge branch 'develop' into feature/hdcr 2019-12-10 21:51:06 -05:00
Peter Boyle
6957b0b58a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-12-10 21:50:42 -05:00
Peter Boyle
d73f0b8618 Verbose for temporary debug 2019-12-10 21:50:06 -05:00
Peter Boyle
0b3a3562c3 Some MPI (summit) create sigusr2, so trap that 2019-12-10 21:49:12 -05:00
Peter Boyle
710fee5d26 Subspace setup testing code
and timing verbose
2019-12-10 21:48:42 -05:00
Peter Boyle
bab0bf2e93 Merge branch 'develop' into feature/hdcr 2019-12-10 21:47:41 -05:00
Peter Boyle
848079e8ba Merge pull request #235 from grid-test-organisation/feature/5d-improvement
MooeeInv and M5D optimisations + enable threading with nvcc
2019-12-10 21:45:03 -05:00
Peter Boyle
f2a4f13111 Must offload the Coarsened matrix if Stencil buffers are device resident 2019-12-10 19:32:12 -05:00
b9b9fcbfa0 Merge pull request #229 from nils-asmussen/feature/JacobiSmear
MSource::jacobi smear + sort file contents of Modules.hpp and modules.inc
2019-12-09 22:50:02 +00:00
bbe48998a8 sort Modules.hpp and modules.inc + add module JacobiSmear 2019-12-09 18:06:29 +00:00
6446671a9c Merge pull request #241 from nils-asmussen/fix/remQCDns_ignore_ws
Undo whitespace changes in fix/removeQCDremnants to allow comparing relevant changes
2019-12-09 18:02:21 +00:00
110373ea79 Merge pull request #204 from nils-asmussen/sha256sum_Eigen_download
bootstrap.sh: verify checksum of Eigen tar file
2019-12-09 18:01:46 +00:00
a986786192 bootstrap.sh: verify checksum of Eigen tar file if sha256sum is installed 2019-12-09 17:11:21 +00:00
Peter Boyle
edd1c924eb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-12-09 03:53:01 -05:00
Peter Boyle
9b6b0caa55 Junk commit fix 2019-12-09 03:01:58 -05:00
Peter Boyle
2a48617ac5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-12-09 03:00:00 -05:00
Peter Boyle
876d9c957c QMR 2019-12-09 02:59:49 -05:00
Peter Boyle
295e535f93 QMR 2019-12-09 02:59:35 -05:00
Peter Boyle
58a31f0763 QMR implemented, preserve even if not used much 2019-12-09 02:59:13 -05:00
Peter Boyle
3d2fe80780 Temporary size depends on checkerboard/uncheckerboard. The Mdir cares 2019-12-09 02:58:24 -05:00
Peter Boyle
e43fce1083 Clean up and simplify a little. 2019-12-09 02:55:45 -05:00
Peter Boyle
0dfdf80407 Logging 2019-12-09 02:54:52 -05:00
Peter Boyle
2912071f83 Add non hermitian operator 2019-12-09 02:51:53 -05:00
Peter Boyle
26605ef387 HDCR back to working 2019-12-09 02:51:01 -05:00
1e5ac576d9 Merge commit 'f7698b93ca57ea3aa4d72b133ad9ca5d1e703661' into develop
# Conflicts:
#	Hadrons/Modules.hpp
#	Hadrons/modules.inc
2019-12-06 11:59:21 +00:00
d5492b426f Hadrons: better order in module list 2019-12-06 11:56:26 +00:00
d428858c9d Merge pull request #255 from fionnoh/feature/sparseNoise
Feature/sparse noise
2019-12-06 11:43:27 +00:00
ferben
f7698b93ca corrected comments about quark line directions 2019-12-06 09:46:52 +00:00
ferben
7ce77690b8 Naming convention also applied to metadata 2019-12-05 17:38:43 +00:00
ferben
164ed9c434 Naming convention also applied to metadata 2019-12-05 17:38:00 +00:00
ferben
a54157e682 more definitions changed 2019-12-05 17:08:09 +00:00
ferben
58b6a0d8d1 changed some naming conditions to resemble rare-kaons 2019-12-05 16:56:54 +00:00
ferben
1a5e562bde only one FIMPL left! 2019-12-05 16:46:58 +00:00
Fionn O hOgain
45be26cf3f Merge branch 'develop' of https://github.com/fionnoh/Grid into feature/sparseNoise 2019-12-05 16:18:47 +00:00
Fionn O hOgain
5227ffccb7 Added James' sparse noise code and a module to use it 2019-12-05 15:50:03 +00:00
a0b47cc0be Merge pull request #254 from fionnoh/bugfix/eigenMigration
Updated Eigen URL after migration to gitlab
2019-12-05 15:26:38 +00:00
ferben
b766038810 new syntax after merge 2019-12-04 18:08:00 +00:00
ferben
cd9fd80a5d merged in develop 2019-12-04 17:12:46 +00:00
d6100cc35a Merge pull request #253 from mmphys/feature/distil
Fix phase convention adjustment error
2019-12-04 14:58:51 +00:00
Fionn O hOgain
29a1530510 Updated Eigen URL after migration to gitlab 2019-12-04 13:49:22 +00:00
Michael Marshall
15119eaf03 Fix phase convention adjustment error (and make no assumptions about node layout) 2019-12-04 09:59:58 +00:00
188e12ffbb Merge pull request #249 from mmphys/feature/distil
Feature/distil
2019-12-03 18:06:00 +00:00
ferben
e940f4db7e removed unused parameter parity 2019-12-03 12:01:31 +00:00
ferben
9c7f269489 typo in fimpl4 2019-12-03 11:19:54 +00:00
ferben
07feaf9531 updated ascii-doc preamble 2019-12-03 11:17:35 +00:00
Michael Marshall
7983ff2fdd Merge branch 'develop' into feature/distil
* develop:
  Change to reporting
  NVCC timer support
  Fix nocompilee under NVCC
  --enable-summit flag
  IBM summit optimisation. Synchronise in node is still between 2 halves of AC922, so could be a little faster
  Sliced propagator contraction was not producing any results because buf.size()=0
  several typos in hadrons
2019-11-30 16:47:03 +00:00
Michael Marshall
2db814f2b7 Resolve conflicts in BaryonUtils (just use latest from develop) 2019-11-29 18:19:35 +00:00
Michael Marshall
6418f06771 Add option to save the eigenvectors of the Laplacian.
If they are saved, then metadata saved are:
solverXml	Parameters for this LapEvec module instance
OperatorXml	module type and parameters (if any) for the module that created the gauge field
2019-11-29 18:06:18 +00:00
8a5576f73c cleared up how exactly q_spec has to be defined 2019-11-28 12:35:18 +00:00
Peter Boyle
997790ad24 Allow subspace setup to not converge 2019-11-26 14:04:28 -05:00
Peter Boyle
900d6fad21 fp16 mandatory. Use SFW is not available as hdw 2019-11-26 13:26:43 -05:00
799ff0c96e speed-up 2019-11-26 15:28:47 +00:00
5fd5c25114 now two separate functions for Eye and NonEye 2019-11-26 13:44:55 +00:00
62b3799c77 Merge pull request #251 from fionnoh/bugfix/WallWallMeson
MContraction::Meson bugfix
2019-11-26 12:46:03 +00:00
Peter Boyle
d1a89af8c9 Change to reporting 2019-11-22 10:49:10 -05:00
Peter Boyle
d91ba1f6cc NVCC timer support 2019-11-21 20:11:19 +00:00
Peter Boyle
f4d27e7090 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-11-21 20:09:31 +00:00
Peter Boyle
feb1ff3494 Fix nocompilee under NVCC 2019-11-21 20:03:39 +00:00
Peter Boyle
8ef6175acc Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-11-21 15:02:21 -05:00
Peter Boyle
e4399e3ee1 --enable-summit flag 2019-11-21 15:02:10 -05:00
Peter Boyle
98ea67b636 IBM summit optimisation. Synchronise in node is still between 2 halves of AC922, so could
be a little faster
2019-11-21 15:00:46 -05:00
ferben
421a4395af Sigma to Nucleon contractions 2019-11-21 17:25:37 +00:00
Fionn O hOgain
cf95a460a5 Sliced propagator contraction was not producing any results because buf.size()=0 2019-11-21 17:17:55 +00:00
a60e20f265 Merge pull request #250 from mmphys/hadrons-typos
several typos in hadrons
2019-11-20 17:10:08 +00:00
ferben
9261c0da89 several typos in hadrons 2019-11-20 17:06:32 +00:00
ferben
b350a24ded fixed test_distil 2019-11-18 15:29:20 +00:00
Michael Marshall
13a0db7162 Reverse changes not intended to be part of distillation release 2019-11-18 12:34:49 +00:00
Michael Marshall
18177d9709 Review changes 2019-11-18 11:59:13 +00:00
Michael Marshall
7bf42b9c0e HADRONS_ERROR 2019-11-18 10:27:35 +00:00
ferben
2d6f4e0c09 fixed issue with HADRONS_ERROR, no idea why this works 2019-11-15 13:46:47 +00:00
ferben
7f06c40107 _var -> var_ 2019-11-15 13:26:24 +00:00
ferben
9f75065205 eigen_strong_inline gone 2019-11-15 13:22:20 +00:00
ferben
271a02230e assert -> ERROR 2019-11-15 11:11:50 +00:00
ferben
b1e8b5b5ce changed default behaviour as discussed with antonin 2019-11-15 11:00:25 +00:00
ferben
25d2521d77 small stuff 2019-11-13 16:34:09 +00:00
ferben
500ef17143 beauty 2019-11-13 15:14:51 +00:00
ferben
ee9dd22643 worked on test_distil 2019-11-13 14:59:44 +00:00
ferben
a977d9901b cleanup 2019-11-13 14:52:06 +00:00
ferben
667ffb70db changed error type 2019-11-13 12:16:56 +00:00
ferben
65b3059bd7 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-11-13 11:51:14 +00:00
ferben
5238808ccd No DistilVectors specified in xml now throws an error 2019-11-13 11:50:55 +00:00
Michael Marshall
8f88fee680 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil
* 'feature/distil' of github.com:mmphys/Grid:
  made notation DPar->dp consistent over modules
2019-11-13 11:34:10 +00:00
Michael Marshall
fcc412a1c2 Remove conditional compilation to support GPU build 2019-11-13 11:32:23 +00:00
ferben
12e415330f made notation DPar->dp consistent over modules 2019-11-13 11:21:08 +00:00
Michael Marshall
66e0811317 Attempt to fix cuda build 2019-11-13 00:02:51 +00:00
Michael Marshall
55e743aad6 Streamline 2019-11-12 23:57:28 +00:00
Michael Marshall
e2ab0d671e Implement destructors 2019-11-12 23:18:37 +00:00
Michael Marshall
7a4c5dbbd5 Restoring previous version for _reduced variables 2019-11-12 22:12:35 +00:00
Michael Marshall
3f00b8f6c7 Switch to std::unique_ptr<GridCartesian> grid3d;
Remove hand-coded reference to pi - switch to <math.h> definition
2019-11-12 21:53:09 +00:00
Michael Marshall
6d7043e0c2 NamedTensor changes done 2019-11-12 17:31:42 +00:00
ferben
b0f24ec302 Test works now 2019-11-12 15:14:13 +00:00
Michael Marshall
fb2834bf82 Oops 2019-11-12 14:01:20 +00:00
Michael Marshall
78f75b0e9f Better than graffiti 2019-11-12 14:00:46 +00:00
Michael Marshall
62dd0bfe58 New parameter module compiles. Untested. 2019-11-12 13:59:53 +00:00
ferben
db952993fa envCreate problem.. 2019-11-12 12:23:34 +00:00
ferben
b8f0878981 removed most default behaviour 2019-11-11 17:49:38 +00:00
ferben
df586a142d added DistilPar-module and cleaned up some code 2019-11-11 17:29:55 +00:00
ferben
7a446d5b7f removed default filenames 2019-11-11 14:36:45 +00:00
ferben
e7d7ea4f8f added LoadNoise module 2019-11-11 12:55:45 +00:00
Michael Marshall
f8e1941327 Implemented specialisations of NamedTensor as derived classes, however this suffers a number of problems:
1) virtual functions not available in base class constructor where I'd like to use them - e.g. IndexNames
2) Must define new constructors in derived classes
... so the specialisations are fatter than I'd like. Would prefer to revert to specifying tensor name and index name defaults in template
2019-11-08 11:55:00 +00:00
65aa54804e added comments 2019-11-08 11:15:51 +00:00
ferben
293bfe17d1 added code to the noise module... 2019-11-07 14:00:40 +00:00
ferben
a8f3a111a5 added Serial RNG - code compiles but not tested! 2019-11-07 13:45:38 +00:00
ferben
5c23abe507 commented on Notation 2019-11-07 11:57:40 +00:00
Michael Marshall
22c654182a Fixes for GPU compile 2019-11-04 17:24:34 +00:00
Michael Marshall
6f0439c0e4 Remove unnecessary cast 2019-11-04 15:50:14 +00:00
Michael Marshall
4f9a7c5d76 Back out unnecessary change 2019-11-02 16:50:29 +00:00
Michael Marshall
fcd90705bc Beautification 2019-11-02 16:15:48 +00:00
Michael Marshall
4bcdb4ff95 Remove accidental check-in of local debugging 2019-11-02 15:24:12 +00:00
Michael Marshall
1c10933db1 Rationalisation of NamedTensor (Perambulator) 2019-11-02 14:58:32 +00:00
Michael Marshall
52d8d576d0 Removed SliceShare as a reusable routine 2019-11-01 20:10:51 +00:00
Michael Marshall
ada0a7a83b C++11 case comparison of named tensor index names 2019-11-01 16:05:08 +00:00
Michael Marshall
efe2f2d48b Merge branch 'develop' into feature/distil
* develop:
  Summit jsrun GPU mapping updates. Configure with --enable-jsrun
  Fixed Lanczos calling aligned alloc in threaded region hitting up against pointer-cache no-threading restrictions Fixed Lattice::reset not compiling with new Grid explicit memory region handling Fixed memory leak in Lattice::resize that occurs when data region has been previously allocated
2019-11-01 15:38:48 +00:00
Michael Marshall
45d4cf0971 Cleanup in progress 2019-11-01 15:35:07 +00:00
Peter Boyle
ac614cbc53 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-10-31 11:46:43 -04:00
Peter Boyle
ec8e060ec7 Summit jsrun GPU mapping updates. Configure with --enable-jsrun 2019-10-31 11:46:09 -04:00
Felix Erben
5c54f27ac1 some cleanup, but hard-coded src in LapEvec unclear 2019-10-31 11:51:05 +00:00
Felix Erben
4ed9379535 some cleanup 2019-10-31 11:45:50 +00:00
Michael Marshall
858e348a6d Cleanup of messages 2019-10-31 11:11:52 +00:00
Michael Marshall
3b3680c64e Reversed Felix's interim A2Autils.h changes ... these were finished and went into develop via a separate branch 2019-10-30 15:50:04 +00:00
Michael Marshall
2a926b3dc6 Merged latest changes from develop, in preparation for release. 2019-10-30 14:52:34 +00:00
Chris K
845a045493 Merge pull request #233 from giltirn/lanczos_fix
A few run /compile / memory leak fixes
2019-10-30 10:21:59 -04:00
Michael Marshall
eb8848a071 Merge branch 'develop' into feature/distil
* develop: (27 commits)
  Update README.md
  result layout standardised, iterator size more elegant
  updated syntax in Test_hadrons_spectrum
  chroma-regression test now prints difference correctly
  baryon input strings are now pairs of pairs of gammas - still ugly!!
  second update to pull request
  Changing back interface for Gamma3pt
  Removing old debug code
  Changes to A2Autils
  suggested changes for 1st pull request implemented
  changed input parameters for easier use
  Should compile everywhere now
  changed baryon interface
  added author information
  ready for pull request
  code compiling now - still need to test
  Baryons module works in 1 of 3 cases - still need SlicedProp and Msource part!!
  thread_for caused the problems - slow for loop for now
  still bugfix
  weird bug...
  ...

# Conflicts:
#	Hadrons/Modules.hpp
#	Hadrons/modules.inc
2019-10-30 14:13:00 +00:00
Peter Boyle
f31e3278a6 Update README.md 2019-10-25 11:43:55 -04:00
Michael Marshall
ca234325bc Fix single-precision error 2019-10-23 21:49:32 +01:00
c97f780784 Merge pull request #243 from fionnoh/feature/A2A_current_insertion
Feature/a2 a current insertion
2019-10-22 13:55:53 +01:00
Michael Marshall
78bdb0ff6a Grid 2019-10-20 14:22:45 +01:00
Michael Marshall
decab587a0 PerambFileName defaults to object name if empty 2019-10-20 14:14:06 +01:00
202f025fc7 Merge pull request #242 from mmphys/feature/baryons
Feature/baryons
2019-10-16 15:06:32 +01:00
Felix Erben
3c702b510b result layout standardised, iterator size more elegant 2019-10-15 18:48:51 +01:00
Michael Marshall
519ce19128 Fixes to enable GPU build. NB: Contractor and ContractorBenchmark still not working 2019-10-14 22:40:13 +01:00
Felix Erben
8d166a81c0 updated syntax in Test_hadrons_spectrum 2019-10-14 13:41:08 +01:00
Felix Erben
aa62ca9046 chroma-regression test now prints difference correctly 2019-10-10 11:07:20 +01:00
Felix Erben
2dee4791db baryon input strings are now pairs of pairs of gammas - still ugly!! 2019-10-09 17:56:09 +01:00
Felix Erben
548b3bf43c second update to pull request 2019-10-09 14:52:33 +01:00
Fionn O hOgain
a55d0ba8fe Changing back interface for Gamma3pt 2019-10-08 15:52:01 +01:00
Fionn O hOgain
5de9547db5 Removing old debug code 2019-10-08 15:51:28 +01:00
Fionn O hOgain
6a3b09cf02 Merge branch 'develop' of github.com:fionnoh/Grid into feature/A2A_current_insertion 2019-10-08 13:25:51 +01:00
Fionn O hOgain
10de4bfc23 Changes to A2Autils 2019-10-08 13:24:56 +01:00
Felix Erben
2ce7f2b4d8 suggested changes for 1st pull request implemented 2019-10-08 13:19:47 +01:00
Michael Marshall
88d6ff8f1d Peter's bugfix in ImplicitlyRestartedLanczos.h
My bugfix in MomentumPhase.hpp
2019-10-07 17:36:11 +01:00
Michael Marshall
803329af99 Merge branch 'develop' into feature/distil
* develop:
  Fix after GPU merge: Phase in Free Propagator
  z2-momentum phase module

# Conflicts:
#	Hadrons/Modules/MSource/MomentumPhase.hpp
2019-10-07 13:09:52 +01:00
Michael Marshall
9d96899aa8 Doc bugfix 2019-10-07 13:05:04 +01:00
Michael Marshall
86939dbf1a Removed unnecessary function (for getting a parameter) 2019-10-04 13:59:59 +01:00
317645aaeb undo (most) whitespace changes in the two files HMC/Mobius2p1fEOFA{,_F1}.cc 2019-10-02 16:25:23 +01:00
Felix Erben
e280ec6b0b changed input parameters for easier use 2019-10-02 16:14:06 +01:00
d5a180d914 Merge branch 'fix/removeQCDremnants' into fix/remQCDns_ignore_ws 2019-10-02 16:11:27 +01:00
d2928761dd Merge pull request #240 from guelpers/feature/bugfixafterGPUmerge
Fix after GPU merge: Phase in Free Propagator
2019-10-02 15:00:15 +01:00
f2a74c603f Merge pull request #239 from mmphys/z2_momentum
z2-momentum phase module
2019-10-02 14:57:59 +01:00
5f22810f55 Fix after GPU merge: Phase in Free Propagator 2019-10-02 14:49:35 +01:00
Michael Marshall
92e25488f8 Added MomentumPhase Hadrons module from z2_momentum branch (thankyou, Felix) so I can run Z_2 wall with momenta easily 2019-10-02 14:13:35 +01:00
Michael Marshall
89ef2b7dc2 Should compile everywhere now 2019-10-02 13:20:07 +01:00
Michael Marshall
7606554b76 Remove references to unused modules (now part of separate Baryons branch) 2019-10-02 13:16:58 +01:00
Felix Erben
c8fc0b3e0c changed baryon interface 2019-10-02 11:36:39 +01:00
Felix Erben
ccb5e8374b z2-momentum phase module 2019-09-30 17:36:15 +01:00
Felix Erben
b88fd436e7 added author information 2019-09-30 17:07:46 +01:00
Felix Erben
155bcd4ff3 ready for pull request 2019-09-30 16:58:20 +01:00
Fionn O hOgain
d1daab601a Merge branch 'develop' of github.com:fionnoh/Grid into feature/A2A_current_insertion
Peter's GPU branch changes merged with A2A CI code
2019-09-30 16:53:44 +01:00
Felix Erben
e5d7910fa7 code compiling now - still need to test 2019-09-30 13:55:26 +01:00
Felix Erben
94b9a9474c Baryons module works in 1 of 3 cases - still need SlicedProp and Msource part!! 2019-09-27 15:08:56 +01:00
Felix Erben
bf62ec163d thread_for caused the problems - slow for loop for now 2019-09-26 13:33:49 +01:00
Felix Erben
8415e23fc6 still bugfix 2019-09-26 11:09:09 +01:00
Felix Erben
76c93aa44e weird bug... 2019-09-17 14:36:26 +01:00
Michael Marshall
3137628222 BaryonUtils.h is now part of Baryons 2019-09-17 13:19:20 +01:00
Michael Marshall
ce965ee6bb Cleanup tests that are no longer required 2019-09-17 13:10:59 +01:00
Michael Marshall
911fbb0f36 Cleanup modules that are no longer required 2019-09-17 13:06:52 +01:00
Michael Marshall
eb293e9909 Restore Baryons modules per develop branch 2019-09-16 20:29:37 +01:00
Felix Erben
f548114ff6 bugfix 2019-09-16 17:55:58 +01:00
Felix Erben
dab8c01c3d added Baryon code 2019-09-16 17:20:54 +01:00
Michael Marshall
2f3dd0703d Ensure Distillation test (Test_distil) works 2019-09-16 17:00:46 +01:00
Michael Marshall
2e963d1a78 Fix location of Grid.h and remove reference to QCD namespace 2019-09-16 15:34:47 +01:00
Michael Marshall
bf52e7cc96 Latest BaryonUtils.h from Felix + my fixes 2019-09-13 18:11:10 +01:00
Michael Marshall
61d017d0a5 Merge GPU support (upstream/develop) into distillation branch.
This compiles and looks right ... but may need some testing

* develop: (762 commits)
  Tensor ambiguous fix
  Fix for GCC preprocessor/pragma handling bug
  Trips up NVCC for reasons I dont understand on summit
  Fix GCC complaint
  Zero() change
  Force a couple of things to compile on NVCC
  Remove debug code
  nvcc error suppress
  Merge develop
  Reduction finished and hopefully fixes CI regression fail on single precision and force
  Double precision variants for summation accuracy
  Update todo list
  Freeze the seed
  Fix compiling of MSource::Gauss for single precision
  Think the reduction is now sorted and cleaned up
  Fix force term
  Printing improvement
  GPU reduction fix and also exit backtrace option
  GPU friendly
  Simplify the comms benchmark
  ...

# Conflicts:
#	Grid/communicator/SharedMemoryMPI.cc
#	Grid/qcd/action/fermion/WilsonKernelsAsm.cc
#	Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
#	Grid/qcd/smearing/StoutSmearing.h
#	Hadrons/Modules.hpp
#	Hadrons/Utilities/Contractor.cc
#	Hadrons/modules.inc
#	tests/forces/Test_dwf_force_eofa.cc
#	tests/forces/Test_dwf_gpforce_eofa.cc
2019-09-13 13:30:00 +01:00
Michael Marshall
04a661cafe Remove unused modules BC2 and Baryon2 2019-09-10 14:49:24 +01:00
gfilaci
a7fa86dc29 MooeeInv improvement for DW EOFA + comments 2019-09-05 12:05:21 +01:00
gfilaci
0c1efa5235 pass OpenMP flag to host compiler 2019-09-03 12:12:25 +01:00
gfilaci
fdd9b14e82 speed up MooeeInvDag for DWF EOFA 2019-09-02 14:49:51 +01:00
gfilaci
e66669d300 fast MooeeInv for EOFA 2019-09-02 14:26:13 +01:00
gfilaci
0efaf3c4fa access M5D coeffs through pointers 2019-09-02 11:33:00 +01:00
gfilaci
3ef519aaa4 fast MooeeInv 2019-09-02 11:18:14 +01:00
Peter Boyle
b473405652 Tensor ambiguous fix 2019-08-29 09:36:41 -05:00
Christopher Kelly
114ebb7914 Fixed Lanczos calling aligned alloc in threaded region hitting up against pointer-cache no-threading restrictions
Fixed Lattice::reset not compiling with new Grid explicit memory region handling
Fixed memory leak in Lattice::resize that occurs when data region has been previously allocated
2019-08-26 16:47:44 -04:00
Peter Boyle
9b7a6d197f Fix for GCC preprocessor/pragma handling bug 2019-08-23 14:37:46 +01:00
Peter Boyle
59cd7f3b70 Trips up NVCC for reasons I dont understand on summit 2019-08-23 06:03:49 -04:00
Peter Boyle
28d6be2a4e Fix GCC complaint 2019-08-22 18:56:37 +01:00
6b6c5aa626 remove namespace QCD from directory tests 2019-08-20 15:35:36 +01:00
9210b0aa6e remove namespace QCD from directory HMC 2019-08-20 15:21:23 +01:00
ad01290545 remove remnants of the namespace QCD 2019-08-19 20:30:33 +01:00
Fionn O hOgain
25150eb2e0 3pt contraction now takes a list of gammas 2019-08-15 12:09:30 +01:00
Peter Boyle
95f66cc93c Merge branch 'feature/gpu-port' into develop 2019-08-15 02:19:31 +01:00
Peter Boyle
12eb2a6a34 Zero() change 2019-08-15 01:43:00 +01:00
Peter Boyle
7c8902b04f Merge branch 'develop' into feature/gpu-port 2019-08-15 01:33:07 +01:00
Peter Boyle
4278caa030 Force a couple of things to compile on NVCC 2019-08-15 01:32:03 +01:00
Peter Boyle
be37dfb6f8 Remove debug code 2019-08-15 01:31:40 +01:00
Peter Boyle
5e8437029f nvcc error suppress 2019-08-15 01:31:12 +01:00
Peter Boyle
e279b2be29 Merge develop 2019-08-14 23:01:59 +01:00
Peter Boyle
48e6efc7c9 Merge branch 'develop' into feature/gpu-port
Conflicts:
	Grid/qcd/action/fermion/WilsonKernelsAsm.cc
	Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
	Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
	benchmarks/Benchmark_comms.cc
2019-08-14 18:56:54 +01:00
Peter Boyle
3e49dc8a67 Reduction finished and hopefully fixes CI regression fail on single precision and force 2019-08-14 15:18:34 +01:00
Peter Boyle
96ac56cace Double precision variants for summation accuracy 2019-08-14 13:08:01 +01:00
Peter Boyle
2b037e3daa Update todo list 2019-08-14 13:07:26 +01:00
Peter Boyle
2d2de7aede Freeze the seed 2019-08-14 13:07:11 +01:00
Peter Boyle
ce97638bac Think the reduction is now sorted and cleaned up 2019-08-11 11:09:01 +01:00
Peter Boyle
53e3ab4131 Fix force term 2019-08-11 11:06:13 +01:00
Fionn O hOgain
d566637cec Merge branch 'develop' of github.com:fionnoh/Grid into feature/A2A_current_insertion 2019-08-07 12:11:40 +01:00
Felix Erben
51bed48cd2 added selfcontract module 2019-08-05 17:46:42 +01:00
Felix Erben
b875edceab Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil
Conflicts:
	Grid/qcd/utils/BaryonUtils.h
	Hadrons/Modules/MContraction/Baryon2.hpp
2019-08-05 14:19:43 +01:00
Felix Erben
29df60c0cb some debugging stuff 2019-08-05 14:10:04 +01:00
Michael Marshall
8d97e2a02a Say which A2AMatrix is being loaded, and which contraction is being performed (m of n) 2019-08-02 19:23:18 +01:00
Michael Marshall
ed23f6be20 Remove blank line from log 2019-08-02 15:59:18 +01:00
Michael Marshall
cad76827b0 Be consistent about separator usage. Log start / stop / duration 2019-08-02 15:47:20 +01:00
Michael Marshall
310867d46a Additional option to specify the separator used between terms in correlator 2019-08-02 11:25:29 +01:00
Michael Marshall
e598178d94 TODO: Felix, please fix. I commented this out because of compiler errors 2019-08-01 20:51:51 +01:00
Michael Marshall
723457d467 Contractor updates ready for test on Tesseract:
1) Move definitions of serialisable objects into header for re-use by external programs/utilities
2) Add "-s" switch for "Simple" correlators, i.e. only include A2AMatrix info for the actual fields included in each contraction
2019-08-01 20:35:55 +01:00
Michael Marshall
6f40021842 Fixed compiler errors: TODO: Felix, please validate 2019-08-01 19:57:59 +01:00
Peter Boyle
9cd33a7b9c Printing improvement 2019-07-31 08:01:24 +01:00
Peter Boyle
639dc1ab21 GPU reduction fix and also exit backtrace option 2019-07-31 01:23:23 +01:00
Peter Boyle
9117f61109 GPU friendly 2019-07-31 01:22:54 +01:00
Felix Erben
622d5eaa3e Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-07-30 13:47:22 +01:00
Felix Erben
e66d48c142 second way to compute baryons - qdp style 2019-07-30 13:46:59 +01:00
Peter Boyle
9dad7a0094 Reproducible reduction and axpy_norm offload from Gianluca.
Hopefully get CG running entirely on GPU
2019-07-30 00:14:12 +01:00
Peter Boyle
1282e1067f Do the force term on the accelerator too. Needed particularly because comms buffers
are device memory.
2019-07-29 22:58:35 +01:00
Michael Marshall
f5ad4f3de8 Added the ability to write a version of the validated XML file excluding any of the module IDs supplied in a separate exclude file 2019-07-26 19:46:55 +01:00
Peter Boyle
275c1c920f More info dump on error from CUDA 2019-07-26 12:18:53 +01:00
Peter Boyle
fe700a183a Getting HMC to run 2019-07-26 12:18:29 +01:00
Peter Boyle
34108296cd Merge branch 'develop' into feature/gpu-port
Conflicts:
	Grid/simd/Grid_avx512.h
2019-07-20 17:05:35 +01:00
Peter Boyle
ce255ec359 Relocate to fix build failure for comms none 2019-07-20 16:37:03 +01:00
Peter Boyle
1c096626cb Hypercube defaults to on if HPE detected, but override to off possible 2019-07-20 16:06:16 +01:00
Peter Boyle
ce8b247426 Compiles 2019-07-20 15:16:02 +01:00
Peter Boyle
80481f81be Constructor typo 2019-07-20 09:58:24 +01:00
Peter Boyle
d85dcc72df Multinode fix 2019-07-20 07:13:28 +01:00
Peter Boyle
3fedcd6d52 Compiles 2019-07-20 07:12:44 +01:00
Michael Marshall
e7050a7aed Support gamma structure names that have trailing white space 2019-07-19 11:58:56 +01:00
Felix Erben
e138bc7204 debug output 2019-07-19 11:16:35 +01:00
Peter Boyle
25ba4c5f80 Merge branch 'develop' into feature/gpu-port
Conflicts:
	HMC/Mobius2p1fEOFA.cc
	tests/forces/Test_rect_force.cc
2019-07-19 11:01:55 +01:00
Michael Marshall
6d4fb35d84 Ready for testing 2019-07-19 10:33:03 +01:00
Peter Boyle
775eaee199 Fix for suspected Intel 2018.1 compiler bug under O3 2019-07-19 07:57:34 +01:00
Peter Boyle
0fd2827d5d Fix fail in single 2019-07-19 05:28:26 +01:00
Peter Boyle
bdd79f9ef8 TODO update 2019-07-18 22:04:28 +01:00
Peter Boyle
0695f8cec2 Single precision compile fix. Soon deprecate single precision 2019-07-18 22:02:31 +01:00
Peter Boyle
9fa705c5a0 comma fix 2019-07-18 21:38:11 +01:00
Felix Erben
56cefadf9b gamma matrices as input 2019-07-18 17:46:43 +01:00
ferben
9d82855c5d bugfix in Baryonutils 2019-07-18 15:45:43 +01:00
ferben
97d61f2564 bugfix in Baryonutils 2019-07-18 14:57:10 +01:00
Peter Boyle
331f5a53dc New header 2019-07-18 14:51:09 +01:00
Peter Boyle
a23dc295ac Remove compiler errors and warnings 2019-07-18 14:47:02 +01:00
ferben
11a8668d19 bugfix in Baryonutils 2019-07-18 14:44:55 +01:00
ferben
cded7670d0 new utils for baryons 2019-07-18 14:29:04 +01:00
ferben
feb029fb66 new utils for baryons 2019-07-18 14:24:16 +01:00
Peter Boyle
08904f830e Merge develop 2019-07-16 11:59:56 +01:00
Peter Boyle
fa9cd50c5b Merge branch 'develop' into feature/gpu-port 2019-07-16 11:55:17 +01:00
Felix Erben
5a62ebe7b1 general baryons case added 2019-07-15 15:26:30 +01:00
Peter Boyle
7c11525d1a Local stencil for complex wilson loops etc 2019-07-14 14:05:09 +01:00
Peter Boyle
42c1dbb1d1 General local stencil first cut for Patrick force term 2019-07-14 14:04:28 +01:00
Peter Boyle
6179acfda0 Put back a call that was required 2019-07-14 13:59:54 +01:00
Michael Marshall
fa747173d1 Debugging references were to l-values, so added const to stop errors 2019-07-14 11:08:00 +01:00
Peter Boyle
07601ac1f5 Replace instantiation of Gparity 2019-07-12 17:18:12 +01:00
Peter Boyle
705a8098b2 Merge branch 'feature/gpu-port' of https://github.com/paboyle/Grid into feature/gpu-port
Conflicts:
	Grid/stencil/Stencil.h
2019-07-12 17:14:11 +01:00
Peter Boyle
a29b43d755 Stencil comms cleaner 2019-07-12 17:12:25 +01:00
Peter Boyle
368c8369ce Merge branch 'feature/gpu-port' of https://github.com/paboyle/Grid into feature/gpu-port 2019-07-12 17:11:29 +01:00
Peter Boyle
c0d89a2dbb TODO updates 2019-07-12 17:11:15 +01:00
Peter Boyle
78ebd93281 Cuda 9.1 happy 2019-07-12 17:11:00 +01:00
Peter Boyle
3d58daf70f Safety check 2019-07-12 17:10:35 +01:00
Peter Boyle
bd155ca5c0 Overlap comms with compute now supported 2019-07-12 09:09:40 +01:00
Peter Boyle
91e2cf9b40 All axes can be used for comms now 2019-07-12 09:08:26 +01:00
Peter Boyle
3cc9947731 Better welcome printing 2019-07-12 06:47:51 +01:00
Peter Boyle
f15eeb0283 localise scope of variables declared in macro 2019-07-12 06:47:01 +01:00
Peter Boyle
0996ba9396 Pretty messaging 2019-07-12 06:45:31 +01:00
Michael Marshall
12afb0395f Debugging transposeSpin - seems just not to be implemented for Lattice<x> 2019-07-11 17:42:26 +01:00
Felix Erben
ec4aa978ab why cant I spinTranspose 2019-07-11 14:01:41 +01:00
Peter Boyle
966a203dcb Interactions with GPU compilation 2019-07-11 03:16:17 +01:00
Peter Boyle
44170cc15f Initialise CUDA device prior to entering MPI.
This may or may not interact with Summit which configures MPI - CUDA mapping with jsrun.
TBD
Cases of OpenMPI and MVAPICH are covered, and default to cudaSetDevice(0) otherwise
2019-07-11 03:14:23 +01:00
Michael Marshall
7bc4a06f3f This is probably what you want ... 2019-07-10 12:29:33 +01:00
Michael Marshall
cd659525e1 You probably want to add this to the build. And you may need to do a bootstrap 2019-07-10 12:08:37 +01:00
Felix Erben
dc2240d2d8 why does sliceSum in Nucleon.hpp not work 2019-07-10 11:34:16 +01:00
Felix Erben
98cf20cf06 continued work on baryons 2019-07-09 17:42:36 +01:00
Felix Erben
cc3346073e continued work on baryons 2019-07-09 17:30:32 +01:00
Felix Erben
3848da7c50 added nucleon module (non-distillation) 2019-07-08 17:43:14 +01:00
Felix Erben
b7d0cf6751 bugfix in diquark sum / baryons 2019-07-04 22:06:37 +01:00
Felix Erben
2c1a077369 continued on baryons 2019-07-02 17:55:28 +01:00
Peter Boyle
6e3c3214a3 Offload loops 2019-07-02 17:25:40 +01:00
Peter Boyle
d6ffadb33b Coalesced write 2019-07-02 17:25:13 +01:00
Michael Marshall
ae3abbe53d Added the ability for Perambulator module to save unsmeared sinks through the addition of two optional parameters:
UnsmearedSinkFileName: If present, specifies the filename to write to
UnsmearedSinkMultiFile: defaults to true to write each sink vector to a different file, but can be set to 0 for a single file
2019-07-01 17:28:27 +01:00
Felix Erben
5fc0188205 started saving sinks 2019-07-01 14:51:59 +01:00
Peter Boyle
4c3225412b Drop 5dVEC 2019-07-01 07:31:26 +01:00
Peter Boyle
b8f7bfbb26 Dont stream as poor perf in some cases 2019-07-01 07:30:25 +01:00
Peter Boyle
7b7c470917 Accelerator loop 2019-07-01 07:29:51 +01:00
Peter Boyle
532e226b22 cuda 9.1 fixes 2019-07-01 07:29:22 +01:00
Peter Boyle
6a13731818 Move GPU cuda call earlier 2019-07-01 07:28:41 +01:00
fionnoh
67690df3bd Changes needed to have a current insertion on every second time slice - avoids unnecessary contractions 2019-06-28 15:18:28 +08:00
fionnoh
ce29b18dc9 New modules for loading in MFs as diskvectors and producing propagators from 4 quark contractions 2019-06-27 13:46:06 +08:00
fionnoh
421a0a8a36 Changes to A2Autils, A2AMatrix and DiskVector code that is needed for Hadrons 4 quark contraction module 2019-06-27 13:45:20 +08:00
fionnoh
ac530636ca A2Aloop bugfix 2019-06-27 13:44:47 +08:00
Michael Marshall
2d940a598c Inserted four extra parameters just to make this test compile. Needs to be fixed properly 2019-06-19 10:37:50 +01:00
Michael Marshall
c28c5fc61b Inserted four extra parameters just to make this test compile. Needs to be fixed properly 2019-06-19 10:31:41 +01:00
Michael Marshall
015340d60c Elided superfluous copy on write 2019-06-19 09:37:03 +01:00
Peter Boyle
1cd4ee0706 Thrust used on GPU builds 2019-06-18 12:50:35 +01:00
Peter Boyle
b8f71b6777 Fix NVCC warning unused variable 2019-06-17 13:58:45 +01:00
Peter Boyle
703dc20377 Compile tests fix 2019-06-16 13:59:29 +01:00
Peter Boyle
d976e5c514 Pow is being awkward in thrust for reasons I don't understand. Possible thrust bug. 2019-06-16 12:05:11 +01:00
Peter Boyle
d7b3efe893 Compile fix 2019-06-15 17:03:15 +01:00
Peter Boyle
f710d7bd45 TODO list update 2019-06-15 12:54:27 +01:00
Peter Boyle
cb336aa8f8 Thread loop constructs changing a little 2019-06-15 12:54:11 +01:00
Peter Boyle
462900b48d Modified entire test directory to suit new GPU constructs for looping 2019-06-15 12:53:27 +01:00
Peter Boyle
0561c2edeb Benchmarks modified for new GPU constructs 2019-06-15 12:52:56 +01:00
Peter Boyle
0184719216 Change to predicate type 2019-06-15 12:52:26 +01:00
Peter Boyle
24202dbc51 Thread loop construct change 2019-06-15 12:52:07 +01:00
Peter Boyle
d763c303c5 Clean accelerator barrier 2019-06-15 12:51:45 +01:00
Peter Boyle
8e394d3bf9 New loop construct 2019-06-15 12:51:15 +01:00
Peter Boyle
b881d5489b Move SchurDiagTwoKappa to Algorithms 2019-06-15 12:50:45 +01:00
Peter Boyle
82306913a8 Move Schur operator into correct place 2019-06-15 12:49:22 +01:00
Peter Boyle
49f90cc7eb use pragma once 2019-06-15 12:45:22 +01:00
Peter Boyle
b77af0210b Thread loop. Probably deprecate this impl 2019-06-15 12:44:56 +01:00
Peter Boyle
5254ede2d8 New loops. Revisit as accelerator loop in future audit 2019-06-15 12:44:29 +01:00
Peter Boyle
16e5d7945e Hard to make 5D vec work with GPU code 2019-06-15 12:43:43 +01:00
Peter Boyle
decc99ca76 Accelerator version 2019-06-15 12:43:00 +01:00
Peter Boyle
464cd65931 Still to test this fully 2019-06-15 12:35:14 +01:00
Peter Boyle
a1ec2f4723 Still to test this routine fully 2019-06-15 12:33:55 +01:00
Peter Boyle
ea9662ec85 Thread loop changes 2019-06-15 09:09:57 +01:00
Peter Boyle
52c74f1cac Thread loop changes 2019-06-15 09:08:16 +01:00
Peter Boyle
9a13d2992c lean up 2019-06-15 09:05:16 +01:00
Peter Boyle
b0449ae270 Thread loop changes 2019-06-15 09:04:19 +01:00
Peter Boyle
1299225105 Accelerator loop changes 2019-06-15 09:03:46 +01:00
Peter Boyle
5925e7f405 Thread for changes 2019-06-15 09:01:30 +01:00
Peter Boyle
be1fd4930f Template instantiation make happy changes 2019-06-15 08:37:34 +01:00
Peter Boyle
377fa5dec1 looping construct 2019-06-15 08:36:48 +01:00
Peter Boyle
e8b78f596e Looping construct changes 2019-06-15 08:35:57 +01:00
Peter Boyle
09720c40cd Coalesced loops 2019-06-15 08:35:26 +01:00
Peter Boyle
bb024dd114 Loop construct changed 2019-06-15 08:30:05 +01:00
Peter Boyle
52456b9ec7 New loop construct 2019-06-15 08:28:45 +01:00
Peter Boyle
b285138be4 Better checking on types 2019-06-15 08:27:48 +01:00
Peter Boyle
c7dbf4c87e Scalar support for GPU threads 2019-06-15 08:25:43 +01:00
Peter Boyle
1e889c93b8 Insert a GPU synchronise 2019-06-15 08:23:26 +01:00
Peter Boyle
7379047482 Threading and acceleration primitives further changes. accelerator_barrier() needed and used 2019-06-15 08:22:48 +01:00
Peter Boyle
d836ce3b78 Clean up of acceleration and threading primitives 2019-06-15 08:14:21 +01:00
Peter Boyle
cefaacbc07 Changing accelerator loop. Still have work to do for multi-GPU code 2019-06-15 08:10:24 +01:00
Peter Boyle
0074ef7f69 thread loops 2019-06-15 08:04:29 +01:00
Peter Boyle
20359ca15f Coalesced loops. 2019-06-15 08:03:57 +01:00
Peter Boyle
736358b0cb Coalesced loops 2019-06-15 08:03:13 +01:00
Peter Boyle
6b692aa726 Thread loops 2019-06-15 08:02:26 +01:00
Peter Boyle
7f99e1cd3b Coalesced loops 2019-06-15 08:01:39 +01:00
Peter Boyle
f3c89df948 Thread loop changes 2019-06-15 08:00:37 +01:00
Peter Boyle
b7e6d111d7 Thread loop changes. Need to offload this file 2019-06-15 07:59:10 +01:00
Peter Boyle
f39cf69c33 Accelerator loop change 2019-06-15 07:58:23 +01:00
Peter Boyle
8e27338df2 Rationalise number of loop macros 2019-06-15 07:57:40 +01:00
Peter Boyle
bcbb5e9d26 Remove assembly tests 2019-06-15 07:57:05 +01:00
Peter Boyle
0ea7f5279d Accelerator loop changes 2019-06-15 07:56:14 +01:00
Peter Boyle
18e5de426d There is a stray use of predicatedWhere introduced by Andrew Lawson in the conserved currents.
The conserved currents need to be rewritten using data parallel operations.
2019-06-15 07:53:58 +01:00
Peter Boyle
e896d81235 Accelerator loop redefine. Coalesce most accesses, but ET engine still to go clean. 2019-06-15 07:52:44 +01:00
Peter Boyle
7b8ccff4f4 Accelerated coalesced loops in most cases 2019-06-15 07:48:00 +01:00
Peter Boyle
68541606ab Thread loop changes. Soon try these with accelerator loops and benchmark 2019-06-15 07:46:42 +01:00
Peter Boyle
339ea10cc7 First touch only on CPU code 2019-06-15 07:45:43 +01:00
Peter Boyle
d0d8dc8042 Thread loop changes 2019-06-15 07:45:09 +01:00
Peter Boyle
81eb1fd9f2 Accelerator loop changes for coalesced access 2019-06-15 07:44:47 +01:00
Peter Boyle
cb93d32cd9 Thread loop changes 2019-06-15 07:44:08 +01:00
Peter Boyle
8f223962ff Thread loop changed 2019-06-15 07:43:42 +01:00
Michael Marshall
9a8a63467e BC2 now runs. setup() runs twice, which had resulted in doubling up of momenta. Also fixed initialisation of momentum phases. 2019-06-12 15:25:59 +01:00
Peter Boyle
36f06555a2 Simplify Impl 2019-06-09 22:26:27 +01:00
Peter Boyle
d6c0e0756d Remove GPU version 2019-06-09 11:23:42 +01:00
Peter Boyle
3e41b1055c Remove Gpu only kernels. 2019-06-09 11:20:01 +01:00
Peter Boyle
9fbcfe612c Update TODO list 2019-06-09 11:19:38 +01:00
Peter Boyle
e78a5e7838 ASM instantiation without link errors 2019-06-09 01:25:21 +01:00
Peter Boyle
da8d87e9da Cuda switch off 2019-06-08 17:11:38 +01:00
Peter Boyle
8e3a05d89b Moving the instantiation into a cleaner structure 2019-06-08 13:48:33 +01:00
Peter Boyle
8adc5da7dd Testing out approaches to kernel writing introducing SIMT_loop temporarily 2019-06-08 13:47:04 +01:00
Peter Boyle
29a244e423 Test of using a lane variable instead of repeated reference to threadIdx.y 2019-06-08 13:46:26 +01:00
Peter Boyle
18cbfecf02 Use symlinks in find command 2019-06-08 13:45:46 +01:00
Peter Boyle
c933ac2248 Temporarily introduce a SIMT_loop to test out approaches prior to making a global change to
accelerator_loop
2019-06-08 13:44:27 +01:00
Peter Boyle
ad2c433574 Instantiations move. Tried using Gianluca's suggestion about avoiding threadIdx but doesn't
seem to make a difference. Will revisit this and probably remove the lane parameter from the coalescedRead
2019-06-08 13:43:12 +01:00
Peter Boyle
86e7fb6e86 Instantiation relocation 2019-06-08 13:42:46 +01:00
Peter Boyle
fb91dda7be Hand instantiation moved location 2019-06-08 13:42:26 +01:00
Peter Boyle
82cf7bc5ab Move instantiation into fermion/instantiation 2019-06-08 13:41:46 +01:00
Peter Boyle
e452cc0a22 Move static variables into instantiation .cc file 2019-06-08 13:41:20 +01:00
Peter Boyle
4d2b938166 Remove explicit instantiation from here 2019-06-08 13:41:01 +01:00
Peter Boyle
10d16ab76c Remove explicit instantiation from here 2019-06-08 13:40:32 +01:00
Peter Boyle
1f997fa484 Instantiate via explicit .cc files for parallel make. 2019-06-08 13:39:51 +01:00
Peter Boyle
dc5024e88c The GPU reduction was not working for me and causing errors. Need to revisit.
Gianluca is working on deterministic reduction.
2019-06-08 13:39:11 +01:00
Peter Boyle
6d77941990 Drop the 5D vec actions 2019-06-08 13:38:05 +01:00
Peter Boyle
0ee6e77cbc Compiles GPU and CPU, still gives good performance on CPU 2019-06-05 13:28:16 +01:00
Peter Boyle
18d3cde29a Compile on GPU works 2019-06-05 00:14:58 +01:00
Peter Boyle
7323099966 Instantiation fix 2019-06-05 00:14:38 +01:00
Peter Boyle
6379651cdd Generic or GPU ready for benchmark test on GPU 2019-06-05 00:13:52 +01:00
Peter Boyle
ba4fd756b9 Fix signature, but deprecating this loops style 2019-06-05 00:12:36 +01:00
Peter Boyle
d185fc1ebf clean up instantiation 2019-06-05 00:11:52 +01:00
Peter Boyle
96b36d8367 Instantiation clean up 2019-06-05 00:11:27 +01:00
Peter Boyle
899f8b5065 Instantiation clean up 5d vec removal 2019-06-05 00:11:05 +01:00
Peter Boyle
c8d0483fe9 Remove 5d vectorisation 2019-06-05 00:10:37 +01:00
Peter Boyle
0f214e5f76 Clean up instantiation 2019-06-05 00:10:13 +01:00
Peter Boyle
8eea568426 GPU loop ; presently differentiated with ifdef, find a way to unify. 2019-06-05 00:09:28 +01:00
Peter Boyle
9636324069 GPU happy code 2019-06-05 00:08:54 +01:00
Peter Boyle
8a5489d9e6 Move the loop into a central kernel call. 2019-06-05 00:08:13 +01:00
Peter Boyle
8113845f9c coalesce loop. Need to rationalise this file 2019-06-04 23:49:29 +01:00
Peter Boyle
b47f73c222 GPU happy 2019-06-04 21:30:39 +01:00
Peter Boyle
5720ced0fd Simplifying 2019-06-04 21:30:08 +01:00
Peter Boyle
2c87b56b53 Making GPU happier 2019-06-04 21:29:44 +01:00
Peter Boyle
dbad48d802 Remove Ls vectorised DWF 2019-06-04 21:27:40 +01:00
Peter Boyle
4557a1365a Remove Ls vectorised DWF 2019-06-04 20:59:59 +01:00
Peter Boyle
16e9b87d98 Remove Ls vectorised DWF as unused and hard to maintain 2019-06-04 20:59:01 +01:00
Peter Boyle
685eea3d0f Small cosmetic 2019-06-04 20:58:14 +01:00
Peter Boyle
65b48831fb Simplify code 2019-06-04 20:56:30 +01:00
Peter Boyle
57396fc595 Simplify code 2019-06-04 20:56:23 +01:00
Peter Boyle
a2e199df50 Simplifying Cayley cases. 2019-06-04 20:54:52 +01:00
Peter Boyle
020346c848 Work list. Will have to clean up Fermion sector. 2019-06-04 20:54:00 +01:00
Peter Boyle
c2625a127e Non blocking loop. Want to change the naming here. 2019-06-04 20:52:59 +01:00
Peter Boyle
8794d35c78 GPU 2019-06-04 20:52:27 +01:00
Peter Boyle
24bff6dbe6 Minor improvements 2019-06-04 20:51:48 +01:00
Peter Boyle
45b15d10d3 GPU happy changes 2019-06-04 20:49:16 +01:00
Peter Boyle
33d6bbe32b GPU must use accelerator vectors 2019-06-04 20:48:52 +01:00
Peter Boyle
7a1569bd46 Annoying, cannot rely on equivalence of Grid ComplexD and Eigen Complex type on GPU.
Solve with ComplexD typecasts but must be a better way
2019-06-04 20:47:49 +01:00
Peter Boyle
6e2e904a0e NVCC compiles happy. Start to develop strategy for writing generic
code for GPU kernels and CPU kernels.
2019-06-04 20:46:35 +01:00
Peter Boyle
d92a17f359 Suppress NVCC warnings in pugixml with pragma 2019-06-04 20:45:53 +01:00
Peter Boyle
47c063f984 Remove Ls Vec cases from benchmarks 2019-06-04 20:45:35 +01:00
Peter Boyle
7e27a5213a Tests builds clean. 2019-06-04 20:45:20 +01:00
Michael Marshall
fe72dc099b Upgrade to Mojave forced me to reinstall MacPorts. These are the ports I installed to get Grid working 2019-06-04 16:12:24 +01:00
Peter Boyle
ade4a126da Getting closer on the GPU port, but will start deleting 5th dim vectorised variants
for code maintainability
2019-06-04 11:53:44 +01:00
Peter Boyle
7b59ab5bd7 Compiling after reorganisation 2019-06-03 15:46:26 +01:00
Peter Boyle
fcd8cfe257 Gparity in 2019-06-03 15:45:09 +01:00
Peter Boyle
b4b53812cb Move implementation to specific implementation headers 2019-06-03 15:43:01 +01:00
Peter Boyle
085cac583f Implementation in header 2019-06-03 15:42:36 +01:00
Peter Boyle
25e3b8640c Move to header 2019-06-03 15:42:05 +01:00
Michael Marshall
54edb9906e Housekeeping. #include <Grid.h> ---> #include <Grid/Grid.h> 2019-06-03 15:20:46 +01:00
Peter Boyle
44bbec50b0 Making GPU compile happy 2019-06-03 14:57:04 +01:00
Peter Boyle
ec68b67d5d Attempt at unified GPU and CPU kernel 2019-06-03 14:55:51 +01:00
Peter Boyle
778450e0c8 Move to implementation subdir 2019-06-03 14:53:56 +01:00
Peter Boyle
567aa5f366 Move to implementation subdir 2019-06-03 14:53:33 +01:00
Peter Boyle
2ab7e2b175 Force instantiation in .cc files.
Eventually move into multiple files
2019-06-03 14:52:59 +01:00
Peter Boyle
6f61be044d Dont instantiate in header 2019-06-03 14:52:01 +01:00
Peter Boyle
269e00509e Don't instantiate in header 2019-06-03 14:51:24 +01:00
Peter Boyle
a5e90b0ddc Making the kernels more GPU happy 2019-06-03 14:50:54 +01:00
Peter Boyle
5622faf226 pragma once ifdef guard 2019-06-03 14:50:26 +01:00
Peter Boyle
82ecd520c7 Macos happy fix under nvcc 2019-06-03 14:48:50 +01:00
Michael Marshall
9ff459816f ReadBinary needs to do case insensitive name comparison (since I changed the default case of perambulator column names) 2019-06-01 13:50:27 +01:00
Michael Marshall
eb737daeb5 Merge branch 'develop' into feature/distil
* develop: (34 commits)
  Hadrons: EMLepton: Wall source
  Revert "cleaning up Kl2 contraction"
  cleaning up Kl2 contraction
  possibility to save/load schedules directly from the application parameters
  moving VERSION file to the empty ChangeLog one, this create compilation problems with #include <version> in recent versions of LLVM and case-insensitive FS (typically macOS)
  Added precision tuning to Hadrons parameterfile writing
  Kl2 QED cleanup
  Added ZFIMPL to SeqGamma
  Added ZFIMPL to SeqConserved module
  F1 ensemble running with 96%~ acceptance etc..
  Make detection of HPE 8600 automatic
  Added variables that were missing from wall source setup
  Exposed a coulomb/landau enum to the gauge fixing module
  Coulomb gauge added as an option
  More logging, timing, and 4d/5d logic for eigpack gauge transforms
  Added gauge transform option to eigpack IO
  Hadrons: Lepton Propagator for kl2, sign swap for antiperiodic boundary
  A2A Lepton-Meson Field contraction
  Verbose
  Iteration range fix
  ...
2019-05-31 18:20:43 +01:00
Peter Boyle
ffde81f22a Nsimd() and coalesced support 2019-05-25 12:44:07 +01:00
Peter Boyle
d8098f1ecd coalesced support 2019-05-25 12:43:31 +01:00
Peter Boyle
aca788cf4f Move coalesced read into tensors 2019-05-25 12:43:00 +01:00
Peter Boyle
a0e9f3b0a0 Plan for GPU port 2019-05-20 09:46:19 +01:00
Peter Boyle
a9342c6ae5 Update TODO after Gianluca merge 2019-05-18 22:58:25 +01:00
Peter Boyle
ee6f96d85c Merge pull request #210 from grid-test-organisation/feature/gpu-port-develop
Cayley fermion functions for GPUs
2019-05-18 19:06:20 +01:00
Peter Boyle
4e9df9e93c GPU patches 2019-05-18 17:43:11 +01:00
Peter Boyle
9fe68857a9 Runs multiGPU with coalesced access on tesseract 2019-05-18 17:42:41 +01:00
Peter Boyle
37336c9e0c Allow compress to be either vector or scalar types 2019-05-18 17:41:13 +01:00
Peter Boyle
6c4da3bbc7 Stencil now runs with coalesced accesses 2019-05-18 17:40:35 +01:00
Peter Boyle
a584b16c4a Adding a non-blocking kernel launch 2019-05-18 17:39:54 +01:00
Felix Erben
8ce7ebdca3 fixed contraction issue 2019-05-17 10:52:55 +01:00
Felix Erben
435653490e fixed contraction issue 2019-05-17 10:50:15 +01:00
Michael Marshall
10a052d695 3 issues preventing compilation under clang. Marked these with FELIX_ISSUE and made minimal change to make compile (as fix not obvious) 2019-05-17 09:59:01 +01:00
Felix Erben
acd5a01b65 some work on baryons 2019-05-16 15:11:50 +01:00
gfilaci
1a82533d22 fix inner product with thrust reduction 2019-05-14 15:35:54 +01:00
Michael Marshall
ec7d96ce3b Merge branch 'develop' into feature/distil
* develop:
  Hadron WeakEye and A2ALoop bug fixes, and WWVVContraction bug fix
  DiskVector: fix of memory bug triggering segfault when the cache is accessed following a certain pattern
  MFermion::GaugeProp fix for 4d fields
2019-05-14 13:10:40 +01:00
gfilaci
e3c56fd9b3 CayleyZeroCounters before benchmark loop 2019-05-13 15:52:00 +01:00
gfilaci
955cc7790f MooeeInvDag offloaded to GPU 2019-05-13 14:25:29 +01:00
gfilaci
1179123ac2 MooeeInv offloaded to GPU 2019-05-13 12:37:12 +01:00
gfilaci
22e35c9ddd M5Ddag offloaded to GPU 2019-05-10 12:23:39 +01:00
gfilaci
698b45e163 remove unused typedef 2019-05-09 11:19:39 +01:00
gfilaci
f1744b3f01 M5D offloaded to GPU 2019-05-09 11:17:55 +01:00
gfilaci
2b3c22f03d bandwidth dependent on grid default precision 2019-05-08 12:01:11 +01:00
gfilaci
8423a05940 duplicate CayleyFermion5D for gpu 2019-05-08 11:51:37 +01:00
Michael Marshall
c16916cc45 Multiple local slice fixes 2019-05-06 10:35:42 +01:00
Michael Marshall
a865caf0d2 Forgot a const in IndexName only version of NamedTensor constructor 2019-05-03 22:17:25 +01:00
Michael Marshall
9ae4d369f3 Use the definition of the Perambulator Index names given in Hadrons::MDistil 2019-05-03 22:00:50 +01:00
Michael Marshall
ec24a1f828 Fixed 2 bugs in LapEvec: 1) InsertLocalSlice 2) ensure convergence assertion stops entire machine 2019-05-03 16:03:56 +01:00
Michael Marshall
0efe63f6fa 3D smearing fix 2019-05-02 19:37:59 +01:00
Michael Marshall
b7ead6c16a Fixed bug: iff stout smearing disabled then gauge field uninitialised 2019-05-02 18:20:49 +01:00
gfilaci
d9438627d9 M5D benchmark without vector copy overhead 2019-05-02 11:10:57 +01:00
gfilaci
b23305dbe2 fix M5D flop count 2019-05-02 11:08:21 +01:00
gfilaci
d3b5c02e2d measure M5D bandwidth and fix M5D flop count 2019-05-02 11:02:39 +01:00
gfilaci
8b6541fb60 Fix gpu MultRealPart and MaddRealPart bug 2019-05-02 10:58:17 +01:00
gfilaci
6da9aa9971 replace std::vector with Vector in benchmark 2019-05-02 10:56:22 +01:00
gfilaci
44e0360b97 replace std::vector with Vector 2019-05-02 10:55:36 +01:00
gfilaci
9003c4a07c allocator copy constructor (to be fixed) 2019-05-02 10:53:37 +01:00
gfilaci
b52fa38f8c seed initialisation of RNG5 2019-05-02 10:36:09 +01:00
gfilaci
3f1c4d8789 fix comment hash 2019-05-02 10:24:36 +01:00
Michael Marshall
62692b68b9 I'd forgotten that Intel '17 doesn't like auto var{value}; syntax 2019-05-01 20:45:16 +01:00
Michael Marshall
311c35a15c Looking for fixes for Intel '17 compiler errors. std::cout << complex number ? 2019-05-01 18:22:08 +01:00
Michael Marshall
a3fe57f430 NamedTensor writes to tag NamedTensor by default (not filename) - so still usable in case user renames file.
Also tweaked tensor index name checking (which is used to ensure tensor is correct type)
2019-05-01 18:11:37 +01:00
Michael Marshall
8dc0587621 Post Michael / Felix review. Ready for Peter / Antonin review 2019-05-01 13:04:51 +01:00
Michael Marshall
cfe5fa7a35 1) Don't write Laplacian eigenvectors to disk 2) Add a test that loads perambulators from disk 2019-05-01 09:50:23 +01:00
Michael Marshall
e72e26c899 Get rid of unnecessary multiFile options 2019-05-01 08:53:08 +01:00
Michael Marshall
334f29becb Fairly close to ready for release. Felix and I to review, then submit for release 2019-04-30 23:53:57 +01:00
Michael Marshall
e56ead55ef WIP 2019-04-30 14:41:48 +01:00
Michael Marshall
d74d443d1b Pre-release cleanup in progress 2019-04-29 22:18:29 +01:00
Michael Marshall
4203105104 Part-way through release tidy-up 2019-04-29 18:40:38 +01:00
Michael Marshall
ac19c0e04f This will need to be removed eventually, but should save us fiddling about with each release 2019-04-29 09:20:08 +01:00
Michael Marshall
b48ca8a6ef Merge branch 'develop' into feature/distil
* develop: (36 commits)
  Mobius 2+1f sign off.
  Integrator logging on by default
  RHMC for mobius
  HMC make file
  Update
  Simple check
  Simple checks
  Mobius HMC
  Changes locally
  Power method
  Momentum rescaling
  Bounds checking
  Bounds checking
  Scale momentum convention to CPS/UKQCD MD time
  Add bounds checking
  Updated documentation after Peter's review. 1) Removed version numbers from Grid dependencies 2) Explained in a little more detail how to use Xcode to build Grid and Hadrons libraries
  Remove bundled Eigen stuff
  Fix typo so it matches develop
  Remove bundled source from my local repository
  Slightly generalize interface to SchurRedBlackBase and derived solver classes so we can pass forecasted initial guesses in EOFA heatbath correctly
  ...
2019-04-29 08:37:39 +01:00
Michael Marshall
c48ae4f3ad 1) Only the boss should write the perambulator - possibly was a source of intermittent corruption?
2) Implemented and test a perambulator conversion utility in Test_distil (commented out near the start of main)
2019-04-28 23:24:57 +01:00
Michael Marshall
fb74de0798 Making sure Hdf5 is an optional dependency (default to binary writer if not present) 2019-04-28 20:23:44 +01:00
Michael Marshall
adc1eaee68 Switched to Hdf5 format for perambulators. Ready for first test on Tesseract. 2019-04-28 17:53:42 +01:00
Peter Boyle
60330e05a3 NVCC wacky compiler options frozen. Possibly Cuda 9.2 specific 2019-04-28 07:39:33 +01:00
Peter Boyle
f9b8c0cccf Vector changes for UVM 2019-04-28 07:38:57 +01:00
Peter Boyle
3cad67e569 Compile on tesseract 2019-04-28 07:38:09 +01:00
Peter Boyle
170ba4e619 Ensure different MPI ranks use different GPUs. The mapping works on Tesseract. 2019-04-28 07:32:30 +01:00
Peter Boyle
204a090497 Inner product is not working on GPU. Why? 2019-04-28 07:31:56 +01:00
Peter Boyle
3c717c47ef GPU no compile on Wilson Multigrid fixed 2019-04-28 07:31:19 +01:00
Michael Marshall
5aca4e8670 Just realised that the trace is at every lattice site, so moved the check for no smearing further up 2019-04-26 17:23:18 +01:00
Michael Marshall
e223d0b99f Need to validate range about which exp^iQ is considered unity 2019-04-26 16:00:35 +01:00
Michael Marshall
2e220456d3 First attempt at minimising smearing 2019-04-26 15:54:05 +01:00
Felix Erben
4333d97958 fixed parameter 2019-04-26 14:29:21 +01:00
Felix Erben
55c9c45d4b Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-04-26 14:28:01 +01:00
Felix Erben
e70e03f560 started stout smearing for small w 2019-04-26 14:27:40 +01:00
Michael Marshall
ff5e2e0f47 Debug output fix. Meant to print the rho matrix for stout smearing ... not the address of the function that creates it 2019-04-26 12:30:41 +01:00
Michael Marshall
4f3d1ea6e8 Two heads are better than one. Combined effort and hopefully spatial smearing now fixed! 2019-04-26 12:18:11 +01:00
Author Name
b1768ba820 Urgh! 2019-04-26 10:04:27 +01:00
Michael Marshall
3ac5a69a57 Ready to test spatial smearing (again) 2019-04-26 08:54:30 +01:00
Michael Marshall
50a74eaea3 Doesn't compile. Does it still need to be maintained? 2019-04-26 08:33:10 +01:00
Michael Marshall
8419fbb335 Renamed PerambLight module. Check with Felix whether Test_24 and Test_tesseract still need to be maintained 2019-04-26 08:23:15 +01:00
Michael Marshall
23a9b93cda More dependencies for Distil.hpp move and (C) 2019 only 2019-04-26 07:39:05 +01:00
Michael Marshall
ecdc3ddebf Moved Distil.hpp and added GNU license to all files 2019-04-26 07:24:56 +01:00
Michael Marshall
606698511c Seems we've not been keeping the test up-to-date 2019-04-22 19:03:24 +01:00
Michael Marshall
a97b814f0c Remove redundancy in LapEvec filename 2019-04-19 14:09:36 +01:00
Michael Marshall
7214681e11 Spatial smearing doesn't work yet. Fixed inconsistency in naming of perambulator in PerambLight.hpp 2019-04-19 13:54:25 +01:00
Michael Marshall
143b75956c Stout smearing 3D fixes. Changed LapEvec to perform spatial smearing only 2019-04-19 11:54:02 +01:00
Felix Erben
4a4203c610 fixed stout smearing for now 2019-04-18 19:10:49 +01:00
Felix Erben
2b598294c9 added distil source module 2019-04-18 17:47:09 +01:00
Michael Marshall
d111c70c38 Merge branch 'develop' into feature/distil
* develop:
  Make sure Grid::Serializable can write Eigen Tensors to output streams. NB: 1) The Eigen package defines operator<< for Eigen tensors, but this format is different, hence Grid::Serializable::WriteMember 2) For simplification, the contents are written in memory order. I.e. Different results will be obtained depending on whether the tensor is row- or column-major
  ... this time without the new Distillation modules ...
  Eigen tensor serialisation fixes after Antonin's review
  Iterator added. Will wait for review comments before finalising.
  Fix build with Intel '17 compiler, i.e. workaround incorrect auto types for c++ style definitions. E.g. assuming T::rank is an int, then objects defined like so:     const auto rank{T::rank}; should also be int. Unfortunately, Intel '17 instead defines them to be std::initializer_list<int>, then proceeds to complain where these variables are used that they cannot be converted to int. NB: This was fixed under Intel '18
  Pushed paboyle's changes: Updates for clang happy
  Merge paboyle's no compile in single precision Intel 2019 fix
  Eigen::Tensor serialisation. Tested on single and double precision builds
2019-04-10 13:14:24 +01:00
Michael Marshall
ed2427d5f7 Make sure Grid::Serializable can write Eigen Tensors to output streams. NB:
1) The Eigen package defines operator<< for Eigen tensors, but this format is different, hence Grid::Serializable::WriteMember
2) For simplification, the contents are written in memory order. I.e. Different results will be obtained depending on whether the tensor is row- or column-major
2019-04-06 15:37:53 +01:00
Michael Marshall
ea2f34de7b Updated documentation after Peter's review.
1) Removed version numbers from Grid dependencies
2) Explained in a little more detail how to use Xcode to build Grid and Hadrons libraries
2019-04-06 13:37:47 +01:00
Michael Marshall
63dc0fa7e9 Fixed memory leak ... without breaking semantics of prior code. Possibly should change the semantics? For Peter / Antonin to comment 2019-04-04 16:00:17 +01:00
Felix Erben
5e6104e683 Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-04-04 12:13:35 +01:00
Felix Erben
25e4ee3a49 3D Stout smearing added 2019-04-04 12:13:16 +01:00
Michael Marshall
4161429dcc Serialisation fixes after Antonin's review 2019-04-03 22:30:07 +01:00
Michael Marshall
b5eb97206b Merge branch 'develop' into feature/distil
* develop:
  MGauge::GaugeFix use standard convention for fields
  fix bug: MGauge::GaugeFix should not modify its input
  add gauge transformation matrix as output to module MGauge/GaugeFix
2019-04-03 16:24:49 +01:00
Michael Marshall
0da906cf66 Merge branch 'develop' into feature/distil
* develop:
  Documentation for using Grid with Xcode on Mac OS
2019-03-27 23:08:29 +00:00
Michael Marshall
3decb5f886 Merge branch 'develop' of github.com:paboyle/Grid into feature/distil
* 'develop' of github.com:paboyle/Grid:
  endianness fix in resilient IO
2019-03-27 20:39:23 +00:00
Michael Marshall
faa8bb9bc6 Fixed funny memory leak 2019-03-27 17:55:52 +00:00
Michael Marshall
4c02ed6d0c Updated GridXcode documentation 2019-03-27 13:54:39 +00:00
ferben
f757b80e1c tried to fix mem leak 2019-03-27 12:00:36 +00:00
ferben
b8581be1da : 2019-03-27 11:59:06 +00:00
Michael Marshall
9fce1263be Fixed bug in LapEvec if machine running spread-out in time 2019-03-26 13:24:39 +00:00
Michael Marshall
ae565b006a Compiling in single-precision now works 2019-03-25 22:56:01 +00:00
Michael Marshall
8502660023 Begin fixes for single precision 2019-03-25 20:40:05 +00:00
Michael Marshall
625a97a466 cosmetic 2019-03-25 18:16:04 +00:00
Felix Erben
bce2766fef Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-03-25 16:38:42 +00:00
Felix Erben
ce501afec6 bugfix 2019-03-25 16:38:25 +00:00
Michael Marshall
1d10a3b3de Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil
* 'feature/distil' of github.com:mmphys/Grid:
  bugfix
2019-03-25 15:50:57 +00:00
Michael Marshall
d1e02f50ff Added iterator for Eigen tensors 2019-03-25 15:50:29 +00:00
Felix Erben
48b03c4590 bugfix 2019-03-25 15:45:35 +00:00
ferben
b3b9e608e1 added new module for noises 2019-03-25 14:13:03 +00:00
Michael Marshall
4e87cbd400 Fix build with Intel '17 compiler, i.e. workaround incorrect auto types for c++ style definitions.
E.g. assuming T::rank is an int, then objects defined like so:
const auto rank{T::rank};
should also be int. Unfortunately, Intel '17 instead defines them to be std::initializer_list<int>, then proceeds to complain where these variables are used that they cannot be converted to int. NB: This was fixed under Intel '18
2019-03-23 09:28:41 +00:00
ferben
4fc045b563 added module to load perambulators from disk 2019-03-22 13:50:47 +00:00
ferben
fbf286b0e3 added Spin dilution 2019-03-22 13:30:11 +00:00
Michael Marshall
9dc3fe9922 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil
* 'feature/distil' of github.com:mmphys/Grid:
  modules list
2019-03-22 13:00:06 +00:00
Felix Erben
6c9029fab7 Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-03-22 12:41:56 +00:00
Felix Erben
8700dd4d0d modules list 2019-03-22 12:41:53 +00:00
Michael Marshall
9c16391e55 Merge branch 'develop' into feature/distil
* develop:
  Updates for clang happy
2019-03-22 12:08:50 +00:00
Michael Marshall
685d9bafef Merge branch 'develop' into feature/distil
* develop:
  No compile in single precision Intel 2019 fix
2019-03-21 16:36:48 +00:00
Michael Marshall
d2d26b302d Removed the module we don't need from modules.inc (so make now works)
i.e. removed Modules/MDistil/PerambMultipleSolves.hpp from Hadrons/modules.inc
2019-03-20 22:59:20 +00:00
Michael Marshall
88cb004731 Fixed single-precision issues in Test_serialisation 2019-03-20 22:05:16 +00:00
ferben
a66bb8acba fixed possible memory leak 2019-03-20 14:41:36 +00:00
ferben
4ae35000a9 removed module which we do not need 2019-03-20 13:36:57 +00:00
Michael Marshall
02b96b4602 Fixed module list (messed up when I merged from develop) 2019-03-20 11:20:40 +00:00
Michael Marshall
11dded61e8 Merge branch 'develop' into feature/distil
* develop: (29 commits)
  precision fix
  Updates after review with Peter.
  Wilson clover multi grid for lime lattice
  Recommendations for Traits classes
  Hadrons: uninitialised pointer fix (might have been harmless)
  Hadrons: beware of the nasty uninitialised twists
  Smearing test. Test on free field.
  Smearing for quark observables
  Smearing
  Hadrons: XML validator utility
  display relative norm during field IO norm check
  possibility to set a build number
  IO norm check on relative norm
  Output field norm check during IO
  Hadrons: random vector utility module I/O
  quieter initialisation
  fix patch command for eigen in bootstrap.sh
  Mres changes and gauge xform mat changes
  Hadrons: 32 bit I/O directly in Lanczos module
  Hadrons: copyright update
  ...

# Conflicts:
#	Grid/tensors/Tensor_traits.h
#	Hadrons/Modules.hpp
#	Hadrons/modules.inc
2019-03-20 10:35:36 +00:00
Michael Marshall
24cf3b9df5 Ignore Version.h as it's created by automake/autoconf 2019-03-19 12:12:39 +00:00
Michael Marshall
9c8aa2047d Put GridXcode doc in subdirectory 2019-03-19 07:33:19 +00:00
Michael Marshall
204cfa1c5a Added documentation for Grid using Xcode 2019-03-19 07:28:29 +00:00
Michael Marshall
fe6845d38b Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-03-18 14:44:18 +00:00
bff4eeec41 Added disclaimer on half-precision types 2019-03-18 12:15:25 +00:00
Felix Erben
d1fe4dce33 new idea to get multiple perambulators 2019-03-15 10:28:02 +00:00
ferben
50ca3101de bug in multiSolves and new test prog 2019-03-13 17:25:55 +00:00
ferben
0faf40e207 last commit did not compile - fixed this 2019-03-13 13:24:18 +00:00
ferben
5313e44d11 some cleanup 2019-03-13 13:15:12 +00:00
ferben
6bb9b67c93 externalised gauge field reading to hadrons module 2019-03-13 12:09:12 +00:00
a0405c6d84 PerambMultipleSolves.hpp compiles (not had time to test) 2019-03-12 14:01:29 +00:00
ferben
c2a3231cdf added testing module for multiple perambulators 2019-03-11 18:05:39 +00:00
ferben
5fb2ee89bb modified test so that it runs 2019-03-08 16:50:21 +00:00
ferben
608a98d870 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-03-08 16:28:34 +00:00
ferben
2df396380d solver is now external 2019-03-08 16:28:21 +00:00
Felix Erben
64ba664637 changed debug options 2019-03-08 12:25:00 +00:00
ferben
4a70b2ffd4 Aslash insertions work now? 2019-03-08 12:23:22 +00:00
2d659015ff Serialisation is fully functional. Ready for review. 2019-03-08 00:30:43 +00:00
e63019ac50 Tensor serialisation is fully functional 2019-03-08 00:01:45 +00:00
Felix Erben
dde118fed9 added everything to compute sequential aslash fields 2019-03-07 17:36:53 +00:00
Felix Erben
1538bf8c34 added everything to compute sequential aslash fields 2019-03-07 17:36:22 +00:00
Felix Erben
4abc498ae3 Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-03-07 15:34:10 +00:00
Felix Erben
93dfbfbfcd added module to compute perambulator from a solve 2019-03-07 15:33:50 +00:00
f9e273d4bf Making sure same as Traits-recommend 2019-03-07 14:33:04 +00:00
584fa0a633 Changes after review with Peter 2019-03-07 12:53:34 +00:00
ferben
73cdca3973 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-03-06 13:55:51 +00:00
ferben
d716f8a0c9 new module for baryon contraction 2019-03-06 13:55:36 +00:00
aa24f04911 Changed EigenIO to use GridTypeMapper type traits 2019-03-06 12:55:05 +00:00
1880e6d12d Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-03-06 11:16:34 +00:00
4a00513e65 Moving Eigen tensor utilities to separate (optional) header 2019-03-06 11:16:22 +00:00
ferben
7718ee199a efficient baryon test program 2019-03-05 17:16:42 +00:00
ferben
d7c7bff065 added output for source meson fields on all tsrc 2019-03-05 12:01:55 +00:00
ferben
802675f062 baryons should compile now... 2019-03-04 17:31:21 +00:00
d56d8c923f Replaced an error in A2AUtils.h that was stopping the build with an assert() 2019-03-02 00:36:53 +00:00
00c3c6fc54 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-03-02 00:24:47 +00:00
b3d4ba8657 Fixed issues with Eigen Tensor serialisation. Fixed issues with precision to text streams 2019-03-02 00:24:37 +00:00
Felix Erben
a4d578bd5d baryons work now??? 2019-03-01 14:44:39 +00:00
Felix Erben
7653649389 baryons working now 2019-03-01 12:57:41 +00:00
a344a2227e Fixing build errors 2019-02-28 20:30:16 +00:00
4b9200b35c Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-28 19:06:36 +00:00
91be028507 Still one issue on write 2019-02-28 19:06:25 +00:00
3b05f91f5c Prototype for template traits recommendations 2019-02-28 19:04:44 +00:00
Felix Erben
8804271339 efficient baryons compile! 2019-02-28 16:32:40 +00:00
6d9f377913 added parity 2019-02-28 11:05:31 +00:00
18b603c5ae simple but hopefully efficient baryon field 2019-02-28 10:27:05 +00:00
ferben
e9784572af baryons... 2019-02-27 17:51:25 +00:00
Felix Erben
f168a9e7ee continued with baryons 2019-02-26 16:41:52 +00:00
Felix Erben
50b6db75da Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-02-26 15:57:09 +00:00
Felix Erben
df065f1d57 first test configs 2019-02-26 15:57:01 +00:00
578eb177e7 Tweaked format and memory use on Xml format. Still crashes (out of memory) on large read on my laptop 2019-02-25 22:03:21 +00:00
81b3f3d2ca Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 15:39:07 +00:00
7c7ffa3b10 Added text read/write 2019-02-25 15:38:47 +00:00
ferben
1f098ceecf Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 15:36:23 +00:00
ferben
c47c1a2472 started working on baryons - this time efficiently 2019-02-25 15:36:11 +00:00
ec45b16840 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 14:10:34 +00:00
9288019789 Added Xml IO (has one deficiency: the format for multi-dimensional data is flat) 2019-02-25 14:10:24 +00:00
ferben
9c04139362 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 12:40:44 +00:00
ferben
cfc14a7432 more adjustments to test 2019-02-25 12:40:32 +00:00
31e40c26fa Oops. Forgot to delete SortNode (prevented linking) 2019-02-25 11:35:33 +00:00
ferben
3f2fe5c7e7 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 11:18:34 +00:00
ferben
76b6e8a01e first tesseract test 2019-02-25 11:18:25 +00:00
f9543982e4 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-25 11:07:43 +00:00
3c9f2d4106 Chunking layout reasonably efficient. Looks for small prime factors of each dimension, falling back to approximate size if needed. 2019-02-25 11:07:29 +00:00
ferben
cad26a736e quick&dirty fix for g5*field 2019-02-22 17:05:16 +00:00
ferben
4f2ac433f1 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-22 16:31:26 +00:00
ferben
f9e505108b test Aslash 2019-02-22 16:31:17 +00:00
Felix Erben
d2aced13da Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-02-22 16:30:40 +00:00
Felix Erben
03d031d623 tesseract test 2019-02-22 16:30:22 +00:00
44a2d4854a Ensured Hdf5 chunk size always less than 4GB 2019-02-22 15:14:32 +00:00
292ff33f7f Removed issue with std::string_literal 2019-02-21 16:51:05 +00:00
55886cf9db ran make_module_list.sh 2019-02-21 16:14:13 +00:00
c640923159 Fixed reference to depth from test 2019-02-21 15:48:52 +00:00
752530f352 Gotten rid of c++17 in Test_serialisation.cc 2019-02-21 14:43:07 +00:00
34b9450fc9 Gotten rid of c++17 2019-02-21 14:22:48 +00:00
ferben
5d6462b706 bugfix 2019-02-21 11:13:10 +00:00
f70c5b004a some cleanup in Baryon2pt 2019-02-20 12:56:13 +00:00
5bb9de9242 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-19 17:37:37 +00:00
982a24514b Binary IO also implemented and tested 2019-02-19 17:37:21 +00:00
ferben
97c6f770b4 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-19 17:22:41 +00:00
ferben
4522f1e831 separated final 2pt Contraction 2019-02-19 17:22:30 +00:00
c14547ddbe EigenIO writing rationalised. All indices (trivial or not) written 2019-02-19 16:12:55 +00:00
63c97db414 Prior to rationalising 2 versions of BaseIO::write (scalar and vector) 2019-02-19 13:29:08 +00:00
6ebb32ffbf Rationalised Test_serialisation 2019-02-18 21:40:53 +00:00
07c97cb424 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-18 17:12:36 +00:00
04b58de5de Read-back working. 2019-02-18 17:12:27 +00:00
ferben
6e822b7201 added sign for contraction sum 2019-02-18 15:21:13 +00:00
ferben
625ccfcd72 continued baryon contraction code 2019-02-18 13:10:34 +00:00
c77069244d Nearly ready. Just finishing off readback and compare 2019-02-18 08:55:50 +00:00
9815ddb853 Started read routines. Introduced readMultiDim and tested I didnt break anything 2019-02-16 19:30:33 +00:00
74a3a5b825 Fixed existing bug in Hdf5Reader::readDefault for std::vector<U> 2019-02-16 18:45:46 +00:00
00e9416e0a Tweak to initialisation example 2019-02-16 17:08:22 +00:00
b6803a070a Making sure I understand row-major vs column-major ordering 2019-02-16 16:18:28 +00:00
ferben
bfd2770657 started on baryon flavour sums 2019-02-15 15:51:46 +00:00
ferben
668b1e77c7 small changes 2019-02-15 15:31:53 +00:00
ferben
e51744260f Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-15 14:32:26 +00:00
ferben
e0987d7d81 first contraction version done 2019-02-15 14:32:17 +00:00
26b94d7bda Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-15 13:53:00 +00:00
df0c8b5d84 Test of Eigen slices 2019-02-15 13:52:49 +00:00
ferben
a111d814db Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-15 10:47:41 +00:00
e8bd8767c0 Get rid of declarations inside constexpr functions. if constexpr warning remains 2019-02-15 10:06:15 +00:00
8cb96cb693 Hmmm lots of warnings depending on compiler ... 2019-02-14 19:17:12 +00:00
b9bee45277 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-14 19:05:47 +00:00
bee24655cd Finalising traits 2019-02-14 19:05:35 +00:00
886c895f81 baryon field structure is now eigentensor - started on contractions for 2pt functions 2019-02-14 16:44:54 +00:00
59c8cc1588 Minor bugfix 2019-02-13 22:11:24 +00:00
11467a994d Enough for tonight 2019-02-13 21:48:35 +00:00
ferben
9f2ca98dfc ensemble can now be specified in LapEvec 2019-02-13 13:54:31 +00:00
bf434b6bef Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-13 12:14:18 +00:00
41ff592515 Moved serialisation tests into Test_serialisation 2019-02-13 12:14:01 +00:00
ferben
48ec937c55 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-13 11:48:57 +00:00
ferben
65731546b7 merge... 2019-02-13 11:48:34 +00:00
76c6a6772a Added rank_non_trivial 2019-02-12 22:15:55 +00:00
e7048231bc Working version with additional Grid traits pre: review by Antonin 2019-02-12 13:59:48 +00:00
49babeab19 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-11 23:26:46 +00:00
fb2cb3015e Writing of Eigen::Tensor of grid objects now works (for Hdf5) 2019-02-11 23:26:18 +00:00
ferben
53f45d2c7e Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-11 17:39:55 +00:00
ferben
d889cebc60 unique string is now used 2019-02-11 17:39:42 +00:00
9a225235b6 Can write both fixed and dynamic sized tensors (small tidy) 2019-02-11 17:15:38 +00:00
dff7d9261d Can write both fixed and dynamic sized tensors 2019-02-11 15:47:40 +00:00
6f2663edf6 Serialisation of an object containing an Eigen::Tensor works for Hdf5. Still quite a lot of tidying up to do. 2019-02-10 23:19:20 +00:00
d5024bd07e Hdf5 writing of scalar (i.e. no Grid subtypes) Eigen::Tensor works. But issues when adding Eigen::Tensor to serialisable object. 2019-02-10 15:33:16 +00:00
9c4189484a Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-09 17:12:43 +00:00
3720103f41 Adding Eigen::Tensor still WIP 2019-02-09 17:12:36 +00:00
ferben
c4d27ee30f added parity operator to baryon fields 2019-02-08 15:49:52 +00:00
ferben
d26a5dce12 bugfix 2019-02-08 14:37:09 +00:00
ferben
5843a943d9 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-08 14:36:08 +00:00
ferben
c1341b8ed2 bugfix 2019-02-08 14:33:06 +00:00
6a4515d0cd baryons have now the correct (?) structure - also easier! 2019-02-07 12:27:57 +00:00
a0a39e4b00 Fixed initialisation of vector of Complex 2019-02-06 21:56:44 +00:00
b9fb16077c Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-06 21:37:54 +00:00
4b3c566c89 ../tests/hadrons/Test_hadrons_distil.cc 2019-02-06 21:36:46 +00:00
ferben
cbd2dfe53f Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-06 12:56:49 +00:00
ferben
6cdb1eb62c BContraction now computes what might be a baryon function, but probably isn't 2019-02-06 12:23:52 +00:00
ed7175076b Turned off warning of unused variable line 150 2019-02-06 09:32:13 +00:00
27677b3870 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-06 09:25:39 +00:00
7423f5af1a Examples of how to access Grid Tensors 2019-02-06 09:25:24 +00:00
ferben
21d6dbe0b6 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-02-05 17:32:39 +00:00
ferben
1ee84509b5 added baryons project - not working yet 2019-02-05 17:32:26 +00:00
57e57d162f Removed Eigen::DontAlign attribute 2019-02-05 12:50:28 +00:00
5b0870bb19 Added Scalar_ length and Scalar_Unit_Size to Perambulator file for validation 2019-02-05 09:07:05 +00:00
7f5354630a Updated perambulator binary format to save payload in big endian format on disk 2019-02-04 23:07:59 +00:00
008ac6b5ae Perambulator is read back from disk if it exists instead of being created 2019-02-04 12:06:32 +00:00
c7aa4e0c1f Perambulator filename can be specified in xml. NB: Perambulator binary format now includes data size in bytes to avoid type mismatches. 2019-02-04 11:30:30 +00:00
43bd918a47 Logging tweak 2019-02-03 21:48:50 +00:00
7eda54bb87 Only write indices with dimension!=1 2019-02-03 20:58:58 +00:00
bd75b843fa Added checksum to data 2019-02-03 20:31:42 +00:00
8865bf5d7c Implemented perambulator read/write ... but in binary format. Will switch to Hdf5 when I have Antonins feedback 2019-02-03 17:05:19 +00:00
ferben
caabbcd951 minor change 2019-02-01 17:50:18 +00:00
48528c5b1d Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil
Added index names to Perambulator
2019-02-01 15:31:27 +00:00
f7b90a0c14 Added index names to perambulator 2019-02-01 15:20:35 +00:00
ferben
a9848becb0 unsmeared sinks can now be computed - new test program available 2019-02-01 13:23:42 +00:00
7cc13f48d5 added some TODO comments; needs discussion 2019-01-31 16:54:11 +00:00
b6b267fd4b Fixed new test parameters 2019-01-31 15:11:12 +00:00
9671a61bb2 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-31 15:07:45 +00:00
d7dc617746 Switched perambulator to use Eigen::Tensor (file write temporarily excluded) 2019-01-31 15:06:52 +00:00
32cb2e1a9a Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-31 13:01:31 +00:00
3d31113337 added test t5 to compute meson fields of different quarks. Different nvec are allowed. 2019-01-31 13:01:16 +00:00
48b6f7e6ad Changed PerambLight<FIMPL> to PerambLight<GIMPL> 2019-01-31 12:37:00 +00:00
0da411fe60 LapEvec fixes 2019-01-31 12:28:38 +00:00
d7b9ed199d PerambLight fixes 2019-01-31 12:24:32 +00:00
7e74f7bec4 tsrc != 0 now works 2019-01-31 11:35:05 +00:00
dae7b30b92 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-30 21:16:30 +00:00
f7e4661ca0 Fixed grid3d leak in PerambLight 2019-01-30 21:16:09 +00:00
ferben
7b66197534 meson fields are now the same 2019-01-30 18:03:34 +00:00
ferben
c3273eff20 agreement up to laph vectors 2019-01-30 11:20:22 +00:00
ferben
67a3d7aeed added debug output, perambulators now agree up to 8 digits 2019-01-29 16:24:59 +00:00
ferben
d8831fe925 changed parameters to match Test_Distil 2019-01-29 13:40:26 +00:00
c7ceff6a21 Switched to Gauge field (GIMPL) 2019-01-28 12:28:35 +00:00
ferben
5580b3a7d1 bugfix in DistilVectors 2019-01-28 12:24:47 +00:00
33d8fb2dd9 Default 2019-01-25 19:21:12 +00:00
9f6f776460 ensured there is a default test to run 2019-01-25 19:14:22 +00:00
ferben
84fe36d084 meson functions work until to be saved 2019-01-25 17:26:43 +00:00
ferben
3438dde8df test prog now computes everything up to meson fields 2019-01-25 15:19:18 +00:00
ferben
aea49bc349 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-25 13:44:30 +00:00
ferben
9ef6f9878e test works up to perambulators now 2019-01-25 13:44:19 +00:00
708ca8585a Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-25 13:26:56 +00:00
d15bf4b8e1 Added trajectory number to output file 2019-01-25 13:26:48 +00:00
ferben
7496da0987 bugfix in PerambLight 2019-01-25 13:08:56 +00:00
ferben
2568f5b925 bugfix in PerambLight 2019-01-25 12:37:18 +00:00
577cdf1d72 Simplified tests 2019-01-24 18:50:18 +00:00
f92ed659a7 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-24 16:30:28 +00:00
dfb7fb1d9f LapEvec test works on --grid 4.4.4.8 2019-01-24 16:30:13 +00:00
a4c1ab6147 all modules linked in test prog 2019-01-24 16:12:19 +00:00
cf85f0388d Still debugging eigenvector parameters 2019-01-24 13:26:05 +00:00
00b0f75b0d Eigenvectors created. Still need to correctly set parameters for test. 2019-01-24 12:44:06 +00:00
b45586e81c Discovered bug root cause. setup() is called multiple times. Now ready to copy-paste the LapEvec code 2019-01-23 21:17:56 +00:00
2c7e6bf58b Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-23 15:20:06 +00:00
7c5a06f6d0 Trying to work out why LapEvec constructor not being called 2019-01-23 15:19:51 +00:00
ferben
068ef85b05 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-23 15:08:24 +00:00
ferben
a6ab742fdb added perambs to test 2019-01-23 13:58:20 +00:00
2062a8d578 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-23 13:00:20 +00:00
3d3e8f4f9f Structured objects passed into LapEvec 2019-01-23 12:59:55 +00:00
ferben
2756f16a5e created test prog for perambs 2019-01-23 12:49:20 +00:00
ferben
d7908c33de moved hard-coded parameters in DistilVectors to module input 2019-01-23 11:32:53 +00:00
ferben
4cc2ebc9e4 moved hard-coded parameters to module input 2019-01-23 11:26:07 +00:00
ferben
b8afa7314c Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-01-23 10:51:23 +00:00
ferben
be5605931c merge 2019-01-23 10:51:09 +00:00
09fa821510 Added remaining methods to Perambulator 2019-01-22 17:59:55 +00:00
ferben
f45d2d5dcc perambLight done, but SliceShare and Write does not work yet 2019-01-22 15:52:26 +00:00
ferben
0a82fae45c moved perambulator definition to shared header file 2019-01-22 15:06:45 +00:00
ferben
46b05aa9c5 cleaned up, deleted commented out old code 2019-01-22 13:48:44 +00:00
ferben
813c1ab1f1 Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-01-22 13:28:09 +00:00
ferben
b1c27a141d DistilVectors complete and compiling - not tested at all! 2019-01-22 13:27:51 +00:00
81bb361299 Test program ready 2019-01-22 13:19:39 +00:00
ferben
79d533550d continued on DistilVectors.hpp 2019-01-21 16:45:31 +00:00
ferben
b8c106f320 working on DistilVectors, initialisation done and compiles 2019-01-21 16:04:18 +00:00
b74492a805 Merge branch 'feature/distil' of github.com:mmphys/Grid into feature/distil 2019-01-21 10:40:01 +00:00
c93a43f158 Added test program 2019-01-21 10:39:28 +00:00
Felix Erben
0ff410ae19 copied perambulator code into PerambLight.hpp 2019-01-18 17:47:41 +00:00
Felix Erben
ced30b61e2 added phi vectors - still commented out and does not compile otherwise 2019-01-18 16:38:13 +00:00
Felix Erben
2b782df290 Merge branch 'feature/distil' of https://github.com/mmphys/Grid into feature/distil 2019-01-18 15:58:51 +00:00
Felix Erben
f0f1ba0307 uses evec4d now 2019-01-18 15:58:10 +00:00
2343e621e6 Bananas 2019-01-18 13:32:27 +00:00
Felix Erben
2568504821 small change 2019-01-18 13:23:03 +00:00
b821dde020 Initial version 2019-01-18 13:14:28 +00:00
ae3b053334 Initial version 2019-01-18 13:10:02 +00:00
Peter Boyle
c5e081d69c Re-Merge branch 'develop' into feature/gpu-port
Pull in Regensburg MultiGrid pull request
2019-01-03 01:50:16 +00:00
Peter Boyle
535a6aaf05 Update todo list 2019-01-02 22:07:51 +00:00
Peter Boyle
e73b909a48 Make tests running past nvcc. Different NVCC versions proving tricky to keep happy. This is 9.2 2019-01-02 12:05:30 +00:00
Peter Boyle
a4d9200293 Fixing AVX 512 instantiation error. Need to move to extern templates urgently. 2019-01-02 00:27:07 +00:00
Peter Boyle
350508bdb3 pugixml problem 2019-01-01 16:38:54 +00:00
Peter Boyle
38852737e4 No compile fix on clang 2019-01-01 15:55:13 +00:00
Peter Boyle
802404c78c Remove warnings under NVCC and move parallel_for to thread-loop 2019-01-01 15:08:09 +00:00
Peter Boyle
0e9b591c1c NVCC warning suppression 2019-01-01 15:07:47 +00:00
Peter Boyle
c43a2b599a GPU support 2019-01-01 15:07:29 +00:00
Peter Boyle
8c91e82ee8 GPU clean up, remove parallel_for. Split into accelerator_loop, thread_loop
cases, and collides with parallel_for in thrust
2019-01-01 15:06:46 +00:00
Peter Boyle
9d866d062a GPU support improvements 2019-01-01 15:05:03 +00:00
Peter Boyle
3a4e397e72 Deprecating JSON, too hard to support under NVCC 2019-01-01 15:04:33 +00:00
Peter Boyle
2b6cfe555f Disable JSON on NVCC. Maybe unsupport JSON full stop. XML and JSON is too many formats in my view. 2019-01-01 15:03:50 +00:00
Peter Boyle
7df58dd883 Photon syntax gave problems with NVCC 2019-01-01 15:03:29 +00:00
Peter Boyle
4bf86ae60a NVCC clean up 2019-01-01 15:02:50 +00:00
Peter Boyle
07ee87ff5a GPU happy. Still need to prevent hand kernels being callable under NVCC 2019-01-01 15:00:33 +00:00
Peter Boyle
0c2498fe2f Explicit instantiation needed for NVCC 2019-01-01 13:55:12 +00:00
Peter Boyle
ad2e65dad5 GPU related updates 2019-01-01 13:54:40 +00:00
Peter Boyle
715babeac8 GPU reductions first cut; use thrust, non-reproducible. Inclusive scan can fix this if desired.
Local reduction to LatticeComplex and then further reduction.
2019-01-01 13:53:37 +00:00
Peter Boyle
3eae9a9e3f update NVCC flags 2019-01-01 13:49:15 +00:00
Peter Boyle
186aad065f Roll forward Eigen in attempt to make CUDA happy 2019-01-01 13:48:32 +00:00
Peter Boyle
bf5685eb11 Update todo list 2019-01-01 13:48:06 +00:00
Peter Boyle
4a96c067ae Remove warnings from NVCC 2019-01-01 13:43:09 +00:00
Peter Boyle
ab063f33c0 Offload the linear combinations in CG 2019-01-01 13:42:13 +00:00
Peter Boyle
9efcc535bc Cleaner drop from CUDA mode around Eigen includes. Remains difficult to let Eigen compile under nvcc with version issues. 2019-01-01 13:39:10 +00:00
Peter Boyle
231b61d012 std::array by default 2019-01-01 13:37:35 +00:00
Peter Boyle
e898f4f0b0 Whitespace 2019-01-01 13:36:55 +00:00
Peter Boyle
d5db5f5242 Wrong dimension used in a temporary 2018-12-20 10:49:45 +00:00
Peter Boyle
2fcedb13dd Step size modification in HMC; ICC happy thread pragmas 2018-12-20 09:32:33 +00:00
Peter Boyle
35ed1defac Passes make check now single and double compile 2018-12-19 11:09:32 +00:00
Peter Boyle
4e95accf80 Namespace fix 2018-12-15 21:46:17 +00:00
Peter Boyle
422764757d Updates in tests to make all of Grid compile 2018-12-14 16:55:54 +00:00
Peter Boyle
afc462bd58 Bracketing issue in macro 2018-12-13 10:53:22 +00:00
Peter Boyle
b57a4d32aa Merge branch 'develop' into feature/gpu-port 2018-12-13 05:11:34 +00:00
Peter Boyle
adbdc4e65b Half comms not working on GPU yet, so disable. 2018-09-11 05:15:22 +01:00
Peter Boyle
e4deea4b94 Weird bug appears with Vector<Vector<>>.
"fix" with std::vector<Vector<>>

Lies in the face table code. But think there is some latent problem.
Possibly in my allocator since it is caching, but could simplify or eliminate the caching
option and retest. One to look at later.
2018-09-11 04:36:57 +01:00
Peter Boyle
94d721a20b Comments on further topology discovery work 2018-09-11 04:20:04 +01:00
Peter Boyle
7bf82f5b37 Offload the face handling to GPU 2018-09-10 11:28:42 +01:00
Peter Boyle
f02c7ea534 Peer to peer on GPU's setup 2018-09-10 11:26:20 +01:00
Peter Boyle
bc503b60e6 Offloadable gather code 2018-09-10 11:21:25 +01:00
Peter Boyle
704ca162c1 Offloadable compression 2018-09-10 11:20:50 +01:00
Peter Boyle
b5329d8852 Protect against zero length loops giving a kernel call failure 2018-09-10 11:20:07 +01:00
Peter Boyle
f27b9347ff Better unquiesce MPI coverage 2018-09-10 11:19:39 +01:00
Peter Boyle
b4967f0231 Verbose and error trapping cleaner 2018-09-09 14:28:02 +01:00
Peter Boyle
6d0f1aabb1 Fix the multi-node path 2018-09-09 14:27:37 +01:00
Peter Boyle
f4bfeb835d Drop back to smaller Ls 2018-09-09 14:25:06 +01:00
Peter Boyle
394b7b6276 Verbose decrease 2018-09-09 14:24:46 +01:00
Peter Boyle
da17a015c7 Pack the stencil smaller for 128 bit access 2018-07-23 06:12:45 -04:00
Peter Boyle
1fd08c21ac make simd width configure time option for GPU 2018-07-23 06:10:55 -04:00
Peter Boyle
28db0631ff Hack to force 128bit accesses 2018-07-23 06:10:27 -04:00
Peter Boyle
b35401b86b Fix CUDA_ARCH. Need to simplify. See when new eigen release happens 2018-07-23 06:09:33 -04:00
Peter Boyle
a0714de8ec Define vector length for GPU 2018-07-23 06:09:05 -04:00
Peter Boyle
21a1710b43 Verbose vector length 2018-07-23 06:08:39 -04:00
Peter Boyle
b2b5137d28 Finally starting to get decent performance on Volta 2018-07-13 12:06:18 -04:00
Peter Boyle
2cc07450f4 Fastest option for the dslash 2018-07-05 09:57:55 -04:00
Peter Boyle
c0e8bc9da9 Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 07:10:25 -04:00
Peter Boyle
b1265ae867 Prettify code 2018-07-05 07:08:06 -04:00
Peter Boyle
32bb85ea4c Standard extractLane is fast 2018-07-05 07:07:30 -04:00
Peter Boyle
ca0607b6ef Clearer kernel call meaning 2018-07-05 07:06:15 -04:00
Peter Boyle
19b527e83f Better extract merge for GPU. Let the SIMD header files define the pointer type for
access. GPU redirects through builtin float2, double2 for complex
2018-07-05 07:05:13 -04:00
Peter Boyle
4730d4692a Fast lane extract, saturates bandwidth on Volta for SU3 benchmarks 2018-07-05 07:03:33 -04:00
Peter Boyle
1bb456c0c5 Minor GPU vector width change 2018-07-05 07:02:04 -04:00
Peter Boyle
4b04ae3611 Printing improvement 2018-07-05 06:59:38 -04:00
Peter Boyle
2f776d51c6 Gpu specific benchmark saturates memory. Can enhance Grid to do this for expressions,
but a bit of (known) work.
2018-07-05 06:58:37 -04:00
paboyle
3a50afe7e7 GPU dslash updates 2018-06-27 22:32:21 +01:00
paboyle
f8e880b445 Loop for s and xyzt offlow 2018-06-27 21:49:57 +01:00
paboyle
3e947527cb Move looping over "s" and "site" into kernels for GPU optimisation 2018-06-27 21:29:43 +01:00
paboyle
31f65beac8 Move site and Ls looping into the kernels 2018-06-27 21:28:48 +01:00
paboyle
38e2a32ac9 Single SIMD lane operations for CUDA 2018-06-27 21:28:06 +01:00
paboyle
efa84ca50a Keep Cuda 9.1 happy 2018-06-27 21:27:32 +01:00
paboyle
5e96d6d04c Keep CUDA happy 2018-06-27 21:27:11 +01:00
paboyle
df30bdc599 CUDA happy 2018-06-27 21:26:49 +01:00
paboyle
7f45222924 Diagnostics on memory alloc fail 2018-06-27 21:26:20 +01:00
paboyle
dd891f5e3b Use NVCC to suppress device Eigen 2018-06-27 21:25:17 +01:00
paboyle
6c97a6a071 Coalescing version of the kernel 2018-06-13 20:52:29 +01:00
paboyle
73bb2d5128 Ugly hack to speed up compile on GPU; we don't use the hand kernels on GPU anyway so why compile 2018-06-13 20:35:28 +01:00
paboyle
b710fec6ea Gpu code first version of specialised kernel 2018-06-13 20:34:39 +01:00
paboyle
b2a8cd60f5 Doubled gauge field is useful 2018-06-13 20:27:47 +01:00
paboyle
867ee364ab Explicit instantiation hooks 2018-06-13 20:27:12 +01:00
paboyle
25becc9324 GPU tweaks for benchmarking; really necessary? 2018-06-13 20:26:07 +01:00
paboyle
94d1ae4c82 Some prep work for GPU shared memory. Need to be careful, as will try GPU direct
RDMA and inter-GPU memory sharing on Summit later
2018-06-13 20:24:06 +01:00
paboyle
2075b177ef CUDA_ARCH more careful treatment 2018-06-13 20:22:34 +01:00
paboyle
847c761ccc Move sfw IEEE fp16 into central location 2018-06-13 20:22:01 +01:00
paboyle
8287ed8383 New GPU vector targets 2018-06-13 20:21:35 +01:00
paboyle
e6be7416f4 Use managed memory 2018-06-13 20:14:00 +01:00
paboyle
26863b6d95 User Managed memory 2018-06-13 20:13:42 +01:00
paboyle
ebd730bd54 Adding 2D loops 2018-06-13 20:13:01 +01:00
paboyle
066be31a3b Optional GPU target SIMD types; work in progress and trying experiments 2018-06-13 20:07:55 +01:00
paboyle
7a4c142955 Add GPU specific simd targets 2018-06-13 19:55:30 +01:00
Peter Boyle
eb7d34a4cc GPU version 2018-05-14 19:41:47 -04:00
Peter Boyle
aab27a655a Start of GPU kernels 2018-05-14 19:41:17 -04:00
Peter Boyle
93280bae85 Gpu option 2018-05-14 19:40:58 -04:00
Peter Boyle
c5f93abcd7 GPU clean up 2018-05-14 19:40:33 -04:00
Peter Boyle
d5deef782d Useful debug comments 2018-05-14 19:39:52 -04:00
Peter Boyle
5f50473c0d Clean up 2018-05-14 19:39:11 -04:00
Peter Boyle
13f50406e3 Suppress print statement 2018-05-12 18:00:00 -04:00
Peter Boyle
09cd46d337 Lane by Lane operation 2018-05-12 17:59:35 -04:00
Peter Boyle
d3f51065c2 Give command line control of blocks/threads split 2018-05-12 17:58:56 -04:00
Peter Boyle
925ac4173d Thread count control for warp scheduler thingy doodaa thing 2018-05-12 17:58:22 -04:00
Peter Boyle
eb921041d0 Perf count control 2018-05-12 17:57:32 -04:00
Peter Boyle
87c5c0271b Fixing Eigen 2018-04-16 19:08:07 -04:00
Peter Boyle
a3f5a13591 Better Eigen handling 2018-04-16 18:02:55 -04:00
Peter Boyle
9fe28f00eb Eigen sim link off head revision 2018-04-16 17:54:46 -04:00
Peter Boyle
a8a0bb85cc Control scalar execution or vector under generic. Disable Eigen vectorisation on powerpc / Summit 2018-04-12 12:32:57 -04:00
Peter Boyle
6411caad67 work distribution 2018-04-12 11:41:41 -04:00
Peter Boyle
7533035a99 Control Eigen vectorisation 2018-04-12 11:40:56 -04:00
Peter Boyle
b15db11c60 Kernels -> pure static object to enable device execution 2018-03-24 19:35:20 -04:00
Peter Boyle
f6077f9d48 Kernels -> not instantiated otherwise object ref on GPU 2018-03-24 19:33:44 -04:00
Peter Boyle
572954ef12 Kernels not an instantiated object, just static 2018-03-24 19:33:13 -04:00
Peter Boyle
cedeaae7db Lebesgue -> StencilView if necessary 2018-03-24 19:32:41 -04:00
Peter Boyle
e6cf0b1e17 View typedefs go to OperatorImpl 2018-03-24 19:32:11 -04:00
Peter Boyle
5412628ea6 begin end lambda 2018-03-24 19:31:45 -04:00
Peter Boyle
1f70cedbab Have to make all kernel called routines static since object reference will be a host pointer on GPU 2018-03-24 19:29:26 -04:00
Peter Boyle
b50f37cfb4 Remove overlap comms flag 2018-03-24 19:28:53 -04:00
Peter Boyle
cb0d2a1b03 threaded rng init; I thought this was on 2018-03-24 19:28:17 -04:00
Peter Boyle
6fe9b28a82 Cosmetic 2018-03-24 19:27:14 -04:00
Peter Boyle
b002587d7c Simplify 2018-03-24 19:26:44 -04:00
Peter Boyle
6c08385782 Simplify 2018-03-24 19:26:19 -04:00
Peter Boyle
4e1272fabf Kernels need to be static to work on GPU. No reference to host resident data 2018-03-22 18:44:53 -04:00
Peter Boyle
607dc2d3c6 Remove lebesgue order 2018-03-22 18:23:09 -04:00
Peter Boyle
23c880b009 Remove lebesgue order; stick in stencil if need 2018-03-22 18:13:41 -04:00
Peter Boyle
334bb6792f Lebesgue order removed. Stick in the stencil view 2018-03-22 18:12:12 -04:00
Peter Boyle
a3690071b4 Warm up GPu 2018-03-22 18:05:20 -04:00
Peter Boyle
299d119013 GPU work allocation improved 2018-03-22 18:04:24 -04:00
Peter Boyle
55be842d23 Dont force l1p.h so early 2018-03-22 18:01:43 -04:00
Peter Boyle
9875c446c6 Clean up pragmas 2018-03-20 07:19:17 -04:00
Peter Boyle
9c25eb35ca Eigen develop branch for now 2018-03-20 07:18:56 -04:00
Peter Boyle
5ac96dbdc6 Warm behaviour in SU3 benchmark 2018-03-20 07:18:31 -04:00
Peter Boyle
5cc9aca85d Use 64bit index for looping 2018-03-20 06:34:52 -04:00
Peter Boyle
ac29ebcb95 Clean up debug prints 2018-03-20 06:33:59 -04:00
Peter Boyle
a5cfb89304 Update eigen process direct from develop on github. Dangerous, but needed from GPU 2018-03-19 07:20:48 -04:00
Peter Boyle
f04a7251cc Gpu welcome message and device info 2018-03-19 07:12:12 -04:00
Peter Boyle
d4ce7d9905 GPU friendly Stencil needs a view 2018-03-19 07:11:21 -04:00
Peter Boyle
8a1d303ab9 GPU friendly stencil improvements 2018-03-19 07:11:03 -04:00
Peter Boyle
bf0a4de919 GPU friendly params object 2018-03-19 07:10:12 -04:00
Peter Boyle
6fe5885fe4 Warning suppress 2018-03-19 07:09:49 -04:00
Peter Boyle
17ac309e84 Fix the compile 2018-03-19 07:08:59 -04:00
Peter Boyle
7467a1c027 Latest eigen needed for GPU 2018-03-19 07:08:10 -04:00
Peter Boyle
fdfb8a26a8 Disable eigen vectorisation on GPU because of Summit compile issues 2018-03-19 07:07:30 -04:00
paboyle
2df4e422ad Merge branch 'feature/gpu-port' of https://github.com/paboyle/Grid into feature/gpu-port 2018-03-18 14:45:41 +00:00
paboyle
3a3e3cac40 Pull the trigger on offload 2018-03-18 14:45:29 +00:00
paboyle
b1c02ec310 MallocManaged in GPU 2018-03-18 14:44:46 +00:00
paboyle
38eadee2c9 Prettier code 2018-03-18 14:44:22 +00:00
paboyle
42c70437be Views 2018-03-18 14:43:47 +00:00
paboyle
65274b4d7f Tidy up 2018-03-18 14:43:16 +00:00
Peter Boyle
7e8be32755 Typo fix 2018-03-13 19:22:31 -04:00
paboyle
ff761ea4e6 Bound check improvement 2018-03-09 20:00:46 +00:00
paboyle
a31d3e60d8 Better bounds check 2018-03-09 18:10:21 +00:00
paboyle
4d60b92b7f Update oSites 2018-03-08 21:00:25 +00:00
paboyle
c159c70c84 View introduced 2018-03-08 14:58:04 +00:00
paboyle
28b5572755 Merge branch 'feature/gpu-port' of https://github.com/paboyle/Grid into feature/gpu-port 2018-03-08 13:01:42 +00:00
paboyle
5fac7080bc Adding -fno-strict-aliasing by default 2018-03-08 13:01:24 +00:00
Peter Boyle
4548523ecc This modification eliminates what looks like a compiler bug
on Intel 2017.
2018-03-08 04:41:16 -08:00
paboyle
4154fc6f44 Revert a change 2018-03-07 16:54:11 +00:00
paboyle
4e3458516a Reverting after fixing issue with extract merge 2018-03-07 16:50:13 +00:00
Peter Boyle
90a2efb9b3 Hit an annoying strict alias optimisation in GCC 4.9 through 6.3
Chris K was correct. It appears that an additional memcpy (UGHHH) is enough
to suppress the compiler
2018-03-07 07:27:26 -08:00
paboyle
40699221e2 Dont alias lhs and rhs in a where statement 2018-03-06 04:14:13 -08:00
paboyle
3cb1b545d0 Don't alias the variables with a where statement. 2018-03-06 04:13:26 -08:00
paboyle
e199ba7e88 Fix the Charge conjugate BC's 2018-03-05 13:59:02 +00:00
paboyle
4d53703c67 Scalar type differing allowed, e.g. precision change 2018-03-05 11:39:52 +00:00
paboyle
d506c59efa Warnings disabled 2018-03-05 11:39:20 +00:00
paboyle
44188a5c6f AVX512 fix 2018-03-05 00:32:24 +00:00
paboyle
2018077770 Make NVCC happy with the compile. This is warning free on 9.1 on my laptop (both make and make tests). 2018-03-05 00:28:24 +00:00
paboyle
984e06e2b5 Introduce view objects that can safely be copied to GPU for access 2018-03-04 16:40:11 +00:00
paboyle
aead94e9a7 View introduced 2018-03-04 16:39:29 +00:00
paboyle
3277bda130 View introduction to prepare for accelerator offload.
Probably same problem exists for stencil object
2018-03-04 16:38:08 +00:00
paboyle
442b0b406c View related changes 2018-03-04 16:34:14 +00:00
paboyle
8824a54269 View related changes 2018-03-04 16:33:33 +00:00
paboyle
c03423250f Indexable changes 2018-03-04 16:31:35 +00:00
paboyle
317fd0da44 Views introduced. Need to accelerator offload these routines. 2018-03-04 16:30:45 +00:00
paboyle
783795a44a Views introduced 2018-03-04 16:12:49 +00:00
paboyle
0e6197fbed Introduce accelerator friendly expression template rewrite.
Must obtain and access lattice indexing through a view object that is safe
to copy construct in copy to GPU (without copying the lattice).
2018-03-04 16:03:19 +00:00
paboyle
dad7862f91 Go through a view object that can be copied to GPU 2018-03-04 16:02:02 +00:00
paboyle
c89a883448 where was deprecated and integrated to ET engine a long time ago. Remove dead old original code 2018-03-04 15:58:02 +00:00
paboyle
c204288fbc Remove a couple of print statements 2018-03-04 15:57:15 +00:00
paboyle
ad739f042a Introduce views for passing lattice indexing to accelerators. 2018-03-04 15:56:14 +00:00
paboyle
db988301d0 Introduce view objects for indexing lattices. Used to pass the view to acccelerators 2018-03-04 15:55:16 +00:00
paboyle
9b1f29c4c2 Support a view for passing to accelerator 2018-03-04 15:54:35 +00:00
paboyle
e5ea04ee0c Need to support precision change, and real replication in multiple simd lanes 2018-03-04 15:53:04 +00:00
paboyle
c92a3c6068 Need to support any vector type template and run on accelerator 2018-03-04 15:52:14 +00:00
paboyle
03f8da8fbc enable-debug option for debug flags in compile 2018-03-04 15:51:47 +00:00
paboyle
78a9e31ff0 options more obvious 2018-02-24 22:26:32 +00:00
paboyle
c1fc947bb8 Coordinate handling GPU friendly + some GPU merge/extract improvements 2018-02-24 22:26:10 +00:00
paboyle
ff7b19a71b Coordinate handling GPU ready avoid malloc 2018-02-24 22:25:39 +00:00
paboyle
1c16ffa1c1 Coordinate GPU ready. No malloc 2018-02-24 22:25:09 +00:00
paboyle
4962f59477 Eliminate both GPU issue and threading bottle neck by avoiding malloc in coordinate handling 2018-02-24 22:24:37 +00:00
paboyle
e158b60bce GPU friendly coords 2018-02-24 22:23:47 +00:00
paboyle
34820bec27 Coordinate handling GPU ready. No malloc 2018-02-24 22:23:18 +00:00
paboyle
eed9aa9f0c Extract merge gpu ready 2018-02-24 22:23:01 +00:00
paboyle
8792ff6439 Coordinate handling gpu ready 2018-02-24 22:22:43 +00:00
paboyle
078901278c Coordinate handling gpu friendly 2018-02-24 22:22:02 +00:00
paboyle
bf5fb89aff Coordinate handling GPU friendly 2018-02-24 22:21:36 +00:00
paboyle
7574c18cef Massive clean up extract merge.
Simpler and GPU friendly
2018-02-24 22:21:08 +00:00
paboyle
36ea5f6b77 gpu friendly coordinates ; no std::vector on GPU 2018-02-24 22:20:14 +00:00
paboyle
285deab432 Coordinate handling GPU friendly. Avoid std::vector 2018-02-24 22:19:28 +00:00
paboyle
bb7d87d0a0 Coordinate handling gpu friendly 2018-02-24 22:18:33 +00:00
paboyle
b9b5bdfc3a Proper offload (accelerator access) will require a mutable copy lambda. 2018-02-02 11:38:19 +00:00
paboyle
51eb2c5dfc Make referencign the stencil and all info required to evaluate the kernel
accelerator marked up
2018-02-02 11:37:13 +00:00
paboyle
ede0dff794 Mark up as an accelerator function 2018-02-02 11:36:44 +00:00
paboyle
aa6de818e2 Copy data needed by Kernels out of the grid object to avoid host reference 2018-02-02 11:36:11 +00:00
paboyle
dcf6517a93 Accelerator offload and copy Opt into the kernel for GPU host var safety 2018-02-02 11:35:35 +00:00
paboyle
a308dff410 accelerator loop, copy Opt into the GPU 2018-02-02 11:34:37 +00:00
paboyle
14ba20898a Accelerator loop the key kernel call 2018-02-02 11:30:07 +00:00
paboyle
a53d3ee19a Add Opt to the lambda capture to get it into the GPU 2018-02-02 11:28:39 +00:00
paboyle
5df435319d Use constexpr 2018-02-02 11:27:56 +00:00
paboyle
0da2d3e222 accelerator off load some more stuff 2018-02-02 11:27:35 +00:00
paboyle
9c9dfbfa78 Force accelerator 2018-02-02 11:25:09 +00:00
paboyle
e4df025d01 Accelerator related 2018-02-01 23:20:05 +00:00
paboyle
cfeda9d536 constexpr on const ints 2018-02-01 22:59:12 +00:00
paboyle
4450b1993a Offload 2018-02-01 22:45:47 +00:00
paboyle
d03ce5c2a4 Provide a way to get around std::vector for a known type on device.
Use template specialisation to access a private member in the Clang++ STL implementation
2018-02-01 22:44:25 +00:00
paboyle
7d6522c1ef Accelerator inline 2018-02-01 22:43:56 +00:00
paboyle
b96832a922 Accelerator inline 2018-02-01 22:43:26 +00:00
paboyle
5d7af47b05 accelerator_inline 2018-02-01 22:42:54 +00:00
paboyle
053ef25c90 constexpr makes GPU happy 2018-02-01 22:42:29 +00:00
paboyle
8ae77d3706 Small simplification of FermionOperatorImpl towards GPU but not there yet 2018-02-01 22:41:54 +00:00
paboyle
79b50feacf fixme updates 2018-01-29 16:00:40 +00:00
paboyle
c67c1544cd abs no compile on travis fix attempt 2018-01-28 10:26:04 +00:00
paboyle
e657f9a344 OMP collapse changes to make NVCC happy 2018-01-28 01:21:53 +00:00
paboyle
b6ebf35af5 Intel compiler doesn't like Nvidia error disable pragmas 2018-01-28 01:03:10 +00:00
paboyle
604c05f4b8 parallel_for elimination -> thread_loop 2018-01-28 01:01:36 +00:00
paboyle
70e276e1ab parallel_for elimination -> thread_loop 2018-01-28 01:01:14 +00:00
paboyle
9472b02771 Parallel_for elimination -> thread_loop. 2018-01-28 01:00:55 +00:00
paboyle
9597ab94eb Zero changes, swap on lattice type. 2018-01-27 23:51:40 +00:00
paboyle
ce4da83bc2 Zero changes, literally 2018-01-27 23:51:10 +00:00
paboyle
d557f3ef77 Zero changes (literally) and also a warning elimination 2018-01-27 23:50:43 +00:00
paboyle
f574c20118 Zero changes, __VA_ARGS__ and swap 2018-01-27 23:50:17 +00:00
paboyle
f102897385 VA_ARGS to make comma safe automatic 2018-01-27 23:49:47 +00:00
paboyle
d6fce3e498 Zero changes, literally 2018-01-27 23:48:01 +00:00
paboyle
2d0bcc2606 Zero changes, acceleartor on kernels and some thread loop changes 2018-01-27 23:47:38 +00:00
paboyle
45df59720e Zero changes and VA_ARGS changes 2018-01-27 23:46:58 +00:00
paboyle
44ef5bc207 Zero changes (literally speaking). 2018-01-27 23:46:28 +00:00
paboyle
98af36217a Zero changes. (I mean literally) 2018-01-27 23:46:02 +00:00
paboyle
be7b37b9c9 Mistake on openmp 2018-01-27 00:05:11 +00:00
paboyle
c4f82e072b _grid becomes private ; use Grid() 2018-01-27 00:04:12 +00:00
paboyle
3f9654e397 Hiding internals 2018-01-26 23:09:03 +00:00
paboyle
912b50f6fa Hiding lattice internals 2018-01-26 23:08:45 +00:00
paboyle
2a4a0e43c1 Hide internals 2018-01-26 23:08:27 +00:00
paboyle
32523a229c Hide internals 2018-01-26 23:08:02 +00:00
paboyle
1ebd56c3fb Hide internal data 2018-01-26 23:07:34 +00:00
paboyle
8dccffdfd5 Hide internal data 2018-01-26 23:06:51 +00:00
paboyle
5642ea270f Hide internal data 2018-01-26 23:06:28 +00:00
paboyle
43cea62855 Hide internal data 2018-01-26 23:06:03 +00:00
paboyle
2b4067bb71 Hide internal data 2018-01-26 23:05:32 +00:00
paboyle
85771e97e9 Hide internal data 2018-01-26 23:04:46 +00:00
paboyle
8b371ffa94 Hide internal data 2018-01-26 23:03:54 +00:00
paboyle
bf659dfd92 Hide the ._odata 2018-01-26 22:27:47 +00:00
paboyle
76a4dd36d9 Fix no compile of test serialisation 2018-01-26 00:13:21 +00:00
paboyle
f4010023ca Warning fixes 2018-01-25 23:46:47 +00:00
paboyle
24a4589def Changes to interface a little 2018-01-25 23:37:34 +00:00
paboyle
c904822e74 Warning removal 2018-01-25 23:37:15 +00:00
paboyle
40ee1e1957 Zero() 2018-01-25 23:36:58 +00:00
paboyle
461df78a3f Better to use Zero(), and not zero static data 2018-01-25 23:36:22 +00:00
paboyle
db9c9475d4 const 2018-01-25 23:36:06 +00:00
paboyle
214f7a6f13 Drop std::vector container for the lattice data 2018-01-25 23:35:04 +00:00
paboyle
c844cfcda8 Remove commAllocator; make more simple; option to switch off the pointer caceh 2018-01-25 23:33:57 +00:00
paboyle
a3e3034e6f Host compile 2018-01-25 23:33:00 +00:00
paboyle
e7cba358c2 Temporary update to reflect the new dropping of std::vector in Lattice
Will update again to hide the internals in an interface
2018-01-25 23:31:41 +00:00
paboyle
99329197ee Rename header to .h 2018-01-24 14:10:09 +00:00
paboyle
421401af55 Remove IMCI as really don't support 2018-01-24 13:53:21 +00:00
paboyle
0626c1e39e Accelerator flagging and thrust complex for NVCC 2018-01-24 13:50:41 +00:00
paboyle
725f03e2e2 Accelerator markup and thrust complex on nvcc 2018-01-24 13:50:10 +00:00
paboyle
65f77112e0 Thread loops done properly 2018-01-24 13:49:39 +00:00
paboyle
408b868475 Generic for GPU needs accelerator markup of functions 2018-01-24 13:49:12 +00:00
paboyle
1c797deb04 Accelerator tweaks 2018-01-24 13:43:43 +00:00
paboyle
b9d5a42b57 Should be able to eliminate the COMMA_SAFE with VA_ARGS trick ; revisit this file 2018-01-24 13:42:06 +00:00
paboyle
e737591918 Accelerator loops 2018-01-24 13:41:12 +00:00
paboyle
ba5ea5830b Acceleartor loops 2018-01-24 13:40:56 +00:00
paboyle
43f244badf Thread loops for now; figure out what can be GPU accelerated later here 2018-01-24 13:40:30 +00:00
paboyle
e9c8ba5ef7 Accelerator loosp 2018-01-24 13:39:54 +00:00
paboyle
d70709a8e8 Thread construct changes 2018-01-24 13:39:06 +00:00
paboyle
733f8ff0b2 Still using parallel_for -- don't know how to implement reduction on GPU yet. Look at some sample code is best. 2018-01-24 13:38:13 +00:00
paboyle
0bfa5bb213 Accelerator loosp 2018-01-24 13:37:26 +00:00
paboyle
1f26a234f9 CPU loops explicit for peek poke 2018-01-24 13:36:31 +00:00
paboyle
13f0116425 Accelerator loops 2018-01-24 13:35:55 +00:00
paboyle
25f589b064 Accelerator loops 2018-01-24 13:35:36 +00:00
paboyle
210c50a278 Accelerator prep work 2018-01-24 13:35:13 +00:00
paboyle
549a143e78 Accelerator related 2018-01-24 13:34:46 +00:00
paboyle
277301486d Simple warning elimination 2018-01-24 13:34:15 +00:00
paboyle
c851b39a49 Nicer way of including aggregate 2018-01-24 13:33:34 +00:00
paboyle
15cc12eb6c Delete the old non ET file 2018-01-24 13:33:07 +00:00
paboyle
ae4f1f8c12 New file, split out two from Lattice_reduction 2018-01-24 13:32:43 +00:00
paboyle
5609624b44 Threading constructs replaced 2018-01-24 13:32:24 +00:00
paboyle
b5a947dd79 Change to make NVCC happy 2018-01-24 13:32:02 +00:00
paboyle
ee16f62322 stray semicolon elimination. NVCC is picky, but eventually picked up these diags
with a pragma to suppress
2018-01-24 13:31:17 +00:00
paboyle
3318de27d6 Thread macro changes 2018-01-24 13:30:23 +00:00
paboyle
ac56965306 GPU changes and threading macros replaced 2018-01-24 13:28:30 +00:00
paboyle
8e99264f40 Accelerator mark up of entire tensore space for offload 2018-01-24 13:27:30 +00:00
paboyle
69327db9a9 Improviements for NVCC. Eigen is not compat with CUDA 9 and must hack to disable device
compilation
2018-01-24 13:25:07 +00:00
paboyle
7331ee2d80 Warnings control to overpower the NVCC compiler 2018-01-24 13:24:36 +00:00
paboyle
918c105c57 NVCC warning elimination 2018-01-24 13:23:59 +00:00
paboyle
be1511d469 Remove old macros for threading 2018-01-24 13:23:24 +00:00
paboyle
f1c31df9d2 updated Eigen version. Still didn't fix CUDA 9 no compile.
Worked around by switching off __NVCC__ during the include of Eigen and switching it
back on after. Note that no Eigen code can be offloaded as a result of this. No harm done.
2018-01-24 13:19:29 +00:00
paboyle
ff7b587fad Ugly... nvcc needs -x cu to compile .cc as cuda.
Since CXXFLAGS is Also passed to linker, and -x cu breaks link phase must replace
CXX and CXXLD with nvcc -x cu and nvcc -link respectively.
2018-01-24 13:18:19 +00:00
paboyle
4e1135b214 Updated pugixml to v1.8; still didn't fix no compile under nvcc.
Turns out nvcc was right; must do an explicit template instantiation that was missing
but left gcc, icpc and clang happy for some reason.
Fix this.
2018-01-24 13:17:10 +00:00
paboyle
acd4955a18 remove rdtsc on __NVCC__ as may be device called 2018-01-24 13:16:18 +00:00
paboyle
bd08dc4f45 Pragma use for nvcc, warning elimination. 2018-01-24 13:15:43 +00:00
paboyle
22d137d4e5 Namespace, nvcc warning elimination. 2018-01-24 13:14:43 +00:00
paboyle
87ee592176 Pragma changes and layout and warning elimination for nvcc 2018-01-24 13:14:09 +00:00
paboyle
063603b1ea Warning elimination 2018-01-24 13:12:14 +00:00
paboyle
f292106db6 Split out pragms from threads.h;
More work needed; renam threads directory to "parallelism" or something like that
2018-01-24 13:11:04 +00:00
paboyle
9d08aebea9 Compile through nvcc ; warning elimination fixes 2018-01-24 13:09:53 +00:00
paboyle
4e30739093 First compile OK through nvcc on host 2018-01-24 13:08:47 +00:00
paboyle
90ea472411 Auto emacs format C++ no namespace indent 2018-01-15 11:44:54 +00:00
paboyle
56999474e2 Indent 2018-01-15 11:44:45 +00:00
paboyle
d74c21a386 GLobal edit for QCD namespace removal & NAMESPACE macros 2018-01-15 09:37:58 +00:00
paboyle
ca6bdd7302 Useful drive to emacs C++ mode 2018-01-15 00:24:41 +00:00
paboyle
6f20f1d224 Namespace 2018-01-15 00:24:20 +00:00
paboyle
d0e357ef89 CLeanup and no QCD namespace 2018-01-15 00:23:51 +00:00
paboyle
21251f2e1b Namespace and formatting changes 2018-01-15 00:21:27 +00:00
paboyle
fcf1ccf669 Namespace, indent, badly formatted 2018-01-15 00:17:58 +00:00
paboyle
49cce514f1 Namespace 2018-01-15 00:17:11 +00:00
paboyle
695af98a1d Namespace, indent, tidy 2018-01-15 00:16:13 +00:00
paboyle
f8cb46d360 Namspace, indent, badly formatted code fixed 2018-01-15 00:14:47 +00:00
paboyle
0da64dea90 Namespace, indent 2018-01-15 00:13:32 +00:00
paboyle
2cceebbf12 Namespace, indent 2018-01-15 00:12:20 +00:00
paboyle
40232dcefe Namespce 2018-01-15 00:11:19 +00:00
paboyle
dbd86bb95b CLeanup, namespace, indent 2018-01-15 00:10:11 +00:00
paboyle
b8fd2c161f Indent, namespace 2018-01-15 00:09:33 +00:00
paboyle
df9b979583 Indent, namespace 2018-01-15 00:08:40 +00:00
paboyle
23ef0e3e19 Namespace and indentation 2018-01-15 00:07:46 +00:00
paboyle
ae9175735a Indentation, Namespace 2018-01-15 00:07:10 +00:00
paboyle
2d13ea1a22 Namespace and indentation emacs choices 2018-01-15 00:05:55 +00:00
paboyle
8c675064bd Namespace and indentation 2018-01-15 00:04:43 +00:00
paboyle
550b905bb8 Namespace nd indentation 2018-01-15 00:03:49 +00:00
paboyle
edb79dc088 Namespce,and indent 2018-01-15 00:02:33 +00:00
paboyle
88e635c5d1 Namepscae, format 2018-01-15 00:02:01 +00:00
paboyle
ecb4a24de8 Namespace 2018-01-15 00:01:25 +00:00
paboyle
c8c1d36710 Namespace, indent 2018-01-15 00:00:52 +00:00
paboyle
b4bb428d9b Namespace, indent 2018-01-14 23:59:57 +00:00
paboyle
e9ef7e3852 Namespace, indent 2018-01-14 23:59:23 +00:00
paboyle
31cbbfc07e Namespace, indent 2018-01-14 23:58:44 +00:00
paboyle
4eb0552d1d Namespace, indnet 2018-01-14 23:58:03 +00:00
paboyle
08f2a4564f Namespace, formatting 2018-01-14 23:56:33 +00:00
paboyle
7e00f643f8 Namespace indent 2018-01-14 23:55:44 +00:00
paboyle
c19ccdad7c Namespace, indent 2018-01-14 23:55:07 +00:00
paboyle
8aed4181e1 Namespace, indent 2018-01-14 23:54:25 +00:00
paboyle
06ab7f5661 Namespace 2018-01-14 23:53:31 +00:00
paboyle
645ec8eba0 Namespace 2018-01-14 23:52:26 +00:00
paboyle
72ffa8a88e Namespace 2018-01-14 23:51:38 +00:00
paboyle
4c829b410e Namespace 2018-01-14 23:50:20 +00:00
paboyle
eda4fd9912 Namespace 2018-01-14 23:49:11 +00:00
paboyle
041d9137c0 Namespace 2018-01-14 23:48:27 +00:00
paboyle
eeacdfe031 Namespace 2018-01-14 23:47:37 +00:00
paboyle
e5535f4d72 Namespace, indent 2018-01-14 23:46:51 +00:00
paboyle
044a292281 Namespace, indnet 2018-01-14 23:46:07 +00:00
paboyle
fe0467df1e Namespace, indenting 2018-01-14 23:45:19 +00:00
paboyle
19234fb40e Namespace, format 2018-01-14 23:44:16 +00:00
paboyle
f445257d28 Namespace, indenting 2018-01-14 23:43:36 +00:00
paboyle
bdc2a987aa Namespace, indent 2018-01-14 23:42:47 +00:00
paboyle
72acb0e48f Namespace, indent 2018-01-14 23:41:59 +00:00
paboyle
b4e9211df7 Namespace, indent 2018-01-14 23:40:38 +00:00
paboyle
97019d2997 Namespace, format 2018-01-14 23:39:57 +00:00
paboyle
83c5f05094 Namespace, indent 2018-01-14 23:39:13 +00:00
paboyle
1619e42d90 Indent and Namespace changes 2018-01-14 23:38:25 +00:00
paboyle
9f6cebe5ff Namespace and format changes 2018-01-14 23:37:40 +00:00
paboyle
a84ebe5624 Namespace, format change 2018-01-14 23:36:45 +00:00
paboyle
c527e39881 Namespace, format indent change 2018-01-14 23:36:07 +00:00
paboyle
a0f4687887 Namespace, formatting indent changes 2018-01-14 23:35:16 +00:00
paboyle
3ef7b2389e Format eamcs style after NAMESPCCE change 2018-01-14 23:34:08 +00:00
paboyle
7dfa3d0b50 Namespace, format 2018-01-14 23:33:16 +00:00
paboyle
bf629dddce Namespace, format improved 2018-01-14 23:32:19 +00:00
paboyle
7747b95430 Namespace, formatting emacs style 2018-01-14 23:31:28 +00:00
paboyle
ccd75c039a Namespace, fmt 2018-01-14 23:30:34 +00:00
paboyle
493ea80208 Namespace 2018-01-14 23:29:53 +00:00
paboyle
229baf3aba Namespace, emacs fmt 2018-01-14 23:29:02 +00:00
paboyle
0ce4ecfc84 Emacs format indent 2018-01-14 23:28:12 +00:00
paboyle
ddfaae8ea6 Namespace 2018-01-14 23:27:49 +00:00
paboyle
70c5b781e5 Namespace, clean up 2018-01-14 23:26:41 +00:00
paboyle
901e359d28 Namespace changes; need to simplify the EOFA as too many cases and duplicated from Mobius 2018-01-14 23:25:51 +00:00
paboyle
e857d4d4c8 Namespace, indent 2018-01-14 23:24:51 +00:00
paboyle
e5b77c7fd8 Namespace, indent 2018-01-14 23:24:06 +00:00
paboyle
3b5d629048 Namespace, format 2018-01-14 23:23:26 +00:00
paboyle
08772d5e0c Namespace, indent 2018-01-14 23:22:42 +00:00
paboyle
017dcd69a6 Namespace, indent 2018-01-14 23:21:40 +00:00
paboyle
8178a17b88 Namespace, indent 2018-01-14 23:20:55 +00:00
paboyle
c5c1b53e54 Namespace, indent 2018-01-14 23:20:08 +00:00
paboyle
440f9e2013 Namespace, indent 2018-01-14 23:19:22 +00:00
paboyle
c98657d588 Namespace 2018-01-14 23:18:46 +00:00
paboyle
f450857716 Namespce, indent 2018-01-14 23:17:33 +00:00
paboyle
9ec238df9e Namespace, indent 2018-01-14 23:16:49 +00:00
paboyle
3ba8eb1500 Namespace, indent 2018-01-14 23:16:08 +00:00
paboyle
8da49c5a34 Namespace 2018-01-14 23:15:26 +00:00
paboyle
e04f61b1fa Namespace 2018-01-14 23:14:46 +00:00
paboyle
115e13b227 Namespace 2018-01-14 23:13:49 +00:00
paboyle
75f3062a80 Think this should move to the algorithms directory 2018-01-14 23:12:14 +00:00
paboyle
b460cd3ef1 Namespace, format 2018-01-14 23:11:24 +00:00
paboyle
0e6727a33b Namespace, format; possibly some conflict with Azusa beware 2018-01-14 23:10:21 +00:00
paboyle
4c6745cb4c Namespace 2018-01-14 23:09:44 +00:00
paboyle
efdd0e572c Namespace 2018-01-14 23:09:10 +00:00
paboyle
ca60a218ac Namespace 2018-01-14 23:08:35 +00:00
paboyle
03633d709e Namespace 2018-01-14 23:07:36 +00:00
paboyle
4de58c4aab Namespace 2018-01-14 23:06:47 +00:00
paboyle
4f8b1c1940 Namespace 2018-01-14 23:05:23 +00:00
paboyle
dec39b313d Namespace and format 2018-01-14 23:04:37 +00:00
paboyle
dc835ad1cb Namespace 2018-01-14 23:03:49 +00:00
paboyle
71c8c9e4fb Pretty 2018-01-14 23:03:01 +00:00
paboyle
a935ef7b39 Namespace 2018-01-14 23:01:07 +00:00
paboyle
a97ad1a51d Namespce 2018-01-14 23:01:01 +00:00
paboyle
5ab9129db3 Namespace 2018-01-14 22:58:42 +00:00
paboyle
634943c11f Namepsace 2018-01-14 22:57:59 +00:00
paboyle
e598e65f69 Namespace 2018-01-14 22:57:10 +00:00
paboyle
291407dc7f Namespace 2018-01-14 22:54:42 +00:00
paboyle
641a28aa1d Namespace 2018-01-14 22:53:50 +00:00
paboyle
75207fa010 FOrmat 2018-01-14 22:53:13 +00:00
paboyle
c2b0e0269a Namespace 2018-01-14 22:52:22 +00:00
paboyle
7828887604 Namespace, indent 2018-01-14 22:51:18 +00:00
paboyle
e6efc93a7c Namespace 2018-01-14 22:50:35 +00:00
paboyle
ff7e773d5e Namesapce 2018-01-14 22:49:48 +00:00
paboyle
a0380fad72 Namespace 2018-01-14 22:48:57 +00:00
paboyle
61e9a33777 Namesapce 2018-01-14 22:48:08 +00:00
paboyle
3e139b52d3 Namespace 2018-01-14 22:47:24 +00:00
paboyle
fd6031b005 Namespace 2018-01-14 22:46:17 +00:00
paboyle
fe44fc50d9 Namespace 2018-01-14 22:45:29 +00:00
paboyle
2dd88cf3f8 Namespace 2018-01-14 22:44:41 +00:00
paboyle
6b7e82f1a9 Namespace, indentation 2018-01-14 22:44:06 +00:00
paboyle
be612b3931 Namespace, indentation 2018-01-14 22:43:27 +00:00
paboyle
f5e74033f9 Namespace 2018-01-14 22:42:31 +00:00
paboyle
8d52e0a349 Namespace 2018-01-14 22:41:23 +00:00
paboyle
a60f6d353e Namespace 2018-01-14 22:40:29 +00:00
paboyle
5d3b574325 Missing banner; should recreate globally 2018-01-14 22:39:24 +00:00
paboyle
6ee5ea6b32 Namespace QCD gone 2018-01-14 22:38:22 +00:00
paboyle
cc349c6512 Namespace 2018-01-14 22:36:59 +00:00
paboyle
fde2e07bf4 Namespace 2018-01-14 22:36:15 +00:00
paboyle
2f38fe8d45 Namespace 2018-01-14 22:35:24 +00:00
paboyle
813af84ae8 Format emacs C++ mode 2018-01-14 22:34:12 +00:00
paboyle
cfe6c6838f Namespace 2018-01-14 22:33:18 +00:00
paboyle
12a7216dfe Namespace 2018-01-14 22:32:29 +00:00
paboyle
71ebd61327 Namespace 2018-01-14 22:31:39 +00:00
paboyle
2c2da60cc2 Namespace 2018-01-14 22:30:54 +00:00
paboyle
7631ed9c56 Namespace 2018-01-14 22:30:09 +00:00
paboyle
65669b116e Namespace 2018-01-14 22:29:18 +00:00
paboyle
ae2a6cfc6e Namespace 2018-01-14 22:27:32 +00:00
paboyle
c36223055e Namespace 2018-01-14 22:26:55 +00:00
paboyle
e42de105c5 Namespace 2018-01-14 22:26:11 +00:00
paboyle
b08dae0809 Namespace 2018-01-14 22:25:29 +00:00
paboyle
3bf8fddbb5 Namespace 2018-01-14 22:24:47 +00:00
paboyle
d29fa23ebc Namespace 2018-01-14 22:23:49 +00:00
paboyle
c978c88521 Namespace 2018-01-14 22:22:27 +00:00
paboyle
93f09818da Namespace 2018-01-14 22:21:40 +00:00
paboyle
54a8ea93ec Namespace QCD gone 2018-01-14 22:20:42 +00:00
paboyle
56e87d6e55 Namespace 2018-01-14 22:19:25 +00:00
paboyle
df29cc19ab Namespace 2018-01-14 22:18:27 +00:00
paboyle
e61189db3f Namespace 2018-01-14 22:17:43 +00:00
paboyle
361ce948c3 Namespace 2018-01-14 22:16:33 +00:00
paboyle
049b4a4631 Namespace 2018-01-14 22:15:55 +00:00
paboyle
9f2f294a27 Namespace 2018-01-14 22:14:58 +00:00
paboyle
81dcd0e6ea Namespace 2018-01-14 22:13:46 +00:00
paboyle
34a788331f Namespace 2018-01-14 22:13:02 +00:00
paboyle
e2c39945b3 Namespace 2018-01-14 22:11:03 +00:00
paboyle
1591d391b9 Namespace 2018-01-14 22:09:42 +00:00
paboyle
f4c06ed8c0 Namespace 2018-01-14 22:08:25 +00:00
paboyle
1f49f781bf Namespace 2018-01-14 22:07:27 +00:00
paboyle
3a9f746421 Namespace 2018-01-14 22:06:01 +00:00
paboyle
4491d87766 Namespace 2018-01-14 22:04:21 +00:00
paboyle
0e080a7abc Namespace 2018-01-14 22:03:14 +00:00
paboyle
8bf78846ee Namespace 2018-01-14 22:02:09 +00:00
paboyle
9aa34dc803 Namespace 2018-01-14 22:01:17 +00:00
paboyle
fdcbe0a0d1 Namespace 2018-01-14 22:00:29 +00:00
paboyle
6a62a9c6a5 Namespace 2018-01-14 21:59:48 +00:00
paboyle
b331ecea78 Namespace 2018-01-14 21:58:47 +00:00
paboyle
66f8a2f082 Namespace 2018-01-14 21:57:46 +00:00
paboyle
d58b7cf9b9 Namespace changes 2018-01-14 21:56:55 +00:00
paboyle
0d749becff Namespace 2018-01-14 21:55:47 +00:00
paboyle
1dbea9aa69 Namespace 2018-01-14 21:54:28 +00:00
paboyle
c1438cbbe3 Namespace 2018-01-14 21:53:39 +00:00
paboyle
f4623fd551 Namespace 2018-01-14 21:53:05 +00:00
paboyle
59ba9ff3bb NAMESPACE & format 2018-01-14 21:52:27 +00:00
paboyle
1fbab4032b Namespace changes 2018-01-14 21:51:19 +00:00
paboyle
c037244874 Tensor reformatted with NAMESPACE too 2018-01-13 00:31:02 +00:00
paboyle
f4272aa6fd Clean up 2018-01-13 00:19:19 +00:00
paboyle
8cb7a1a887 Format 2018-01-13 00:17:16 +00:00
paboyle
b45bd8e097 NAMESPACE 2018-01-13 00:16:34 +00:00
paboyle
5e48b701ec FOrmatting 2018-01-13 00:11:53 +00:00
paboyle
7f6bffe5ad NAMESPACE 2018-01-13 00:11:30 +00:00
paboyle
6bf5fb1924 Clean up and format NAMESPACE 2018-01-13 00:08:25 +00:00
paboyle
086db7bd19 NAMESPACE and reformat 2018-01-13 00:05:33 +00:00
paboyle
c0a9b38c02 C++ NAMESPACE format emacs happy 2018-01-13 00:03:57 +00:00
paboyle
6d7bdfb5f5 Emacs happy 2018-01-13 00:02:53 +00:00
paboyle
be5d70ae6e C++ happy 2018-01-13 00:02:10 +00:00
paboyle
ab1068044e C++ emacs happy 2018-01-13 00:01:58 +00:00
paboyle
dda151250f Emacs format 2018-01-12 23:59:58 +00:00
paboyle
18daf85069 Emacs format 2018-01-12 23:58:23 +00:00
paboyle
81cc28f6ca Format 2018-01-12 23:57:22 +00:00
paboyle
c01a1e02fe Namespace, format 2018-01-12 23:55:38 +00:00
paboyle
7e70f4ed9c Format, NAMESPACE 2018-01-12 23:55:03 +00:00
paboyle
1056e36f11 Format, NAMESPACE 2018-01-12 23:49:46 +00:00
paboyle
0b8a88978b Format, NAMESPACE 2018-01-12 23:47:24 +00:00
paboyle
59b31b6bb8 Format, NAMESPACE 2018-01-12 23:43:44 +00:00
paboyle
69496482fc Format, NAMESPACE 2018-01-12 23:42:22 +00:00
paboyle
4be31ad1f6 C++ indentation 2018-01-12 23:39:49 +00:00
paboyle
176a021ce9 Formatting, NAMESPACE 2018-01-12 23:38:15 +00:00
paboyle
b673174b71 FOrmat, NAMESPACE 2018-01-12 23:29:22 +00:00
paboyle
e6f7a5a818 Namespace 2018-01-12 23:28:01 +00:00
paboyle
68b69a2ac0 Namespace management 2018-01-12 23:26:14 +00:00
paboyle
bd15c38ae8 Formatting emacs compliant 2018-01-12 23:25:02 +00:00
paboyle
b815f5f764 Formatting 2018-01-12 23:23:21 +00:00
paboyle
4da437431e Reformat 2018-01-12 23:22:46 +00:00
paboyle
3c7bf211a9 Reformat 2018-01-12 23:22:18 +00:00
paboyle
347d5404dd format 2018-01-12 23:21:25 +00:00
paboyle
5e2cd0d07c Format 2018-01-12 23:18:22 +00:00
paboyle
62fcee72c5 Format, NAMESPACE 2018-01-12 23:16:37 +00:00
paboyle
0a6168eef0 Format emacs style 2018-01-12 23:11:22 +00:00
paboyle
63865e4232 format 2018-01-12 23:10:48 +00:00
paboyle
c64deedf74 Format 2018-01-12 23:09:35 +00:00
paboyle
3281559ec3 Format 2018-01-12 23:09:01 +00:00
paboyle
6a2eca2ec2 NAMESAPCE 2018-01-12 23:00:03 +00:00
paboyle
d8ff895e74 NAMESPACE and format 2018-01-12 18:27:22 +00:00
paboyle
00c49d4c17 Format 2018-01-12 18:25:39 +00:00
paboyle
ec89714cce NAMESPACE 2018-01-12 18:24:16 +00:00
paboyle
6ab744c720 NAMESPACE and formatting 2018-01-12 18:11:04 +00:00
paboyle
bbb657da5c NAMESPACE and formatting 2018-01-12 18:10:11 +00:00
paboyle
fbc2380cb8 NAMESPACE & format 2018-01-12 18:05:36 +00:00
paboyle
08682c5461 NAMESPACE and format to my liking 2018-01-12 18:03:57 +00:00
paboyle
13bce2a6bf NAMESPACE 2018-01-12 17:58:53 +00:00
paboyle
70e689900b NAMESPACE 2018-01-12 17:58:13 +00:00
842 changed files with 48999 additions and 52793 deletions

View File

@@ -30,8 +30,34 @@ directory
#ifndef DISABLE_WARNINGS_H
#define DISABLE_WARNINGS_H
#if defined __GNUC__ && __GNUC__>=6
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
//disables an Intel-compiler-specific warning (in json.hpp)
#pragma warning disable 488
#ifdef __NVCC__
//disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register"
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon
//Eigen only
#endif
// Disable vectorisation in Eigen on the Power8/9 and PowerPC
#ifdef __ALTIVEC__
#define EIGEN_DONT_VECTORIZE
#endif
#ifdef __VSX__
#define EIGEN_DONT_VECTORIZE
#endif
#endif

View File

@@ -38,16 +38,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_BASE_H
#define GRID_BASE_H
#include <Grid/GridStd.h>
#include <Grid/DisableWarnings.h>
#include <Grid/Namespace.h>
#include <Grid/GridStd.h>
#include <Grid/threads/Pragmas.h>
#include <Grid/perfmon/Timer.h>
#include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h>
#include <Grid/log/Log.h>
#include <Grid/allocator/AlignedAllocator.h>
#include <Grid/simd/Simd.h>
#include <Grid/serialisation/Serialisation.h>
#include <Grid/threads/Threads.h>
#include <Grid/util/Util.h>
#include <Grid/serialisation/Serialisation.h>
#include <Grid/util/Sha.h>
#include <Grid/communicator/Communicator.h>
#include <Grid/cartesian/Cartesian.h>
@@ -57,5 +60,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/stencil/Stencil.h>
#include <Grid/parallelIO/BinaryIO.h>
#include <Grid/algorithms/Algorithms.h>
NAMESPACE_CHECK(GridCore)
#endif

View File

@@ -38,5 +38,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/qcd/spin/Spin.h>
#include <Grid/qcd/utils/Utils.h>
#include <Grid/qcd/representations/Representations.h>
NAMESPACE_CHECK(GridQCDCore);
#endif

View File

@@ -7,6 +7,7 @@
#include <cassert>
#include <complex>
#include <vector>
#include <array>
#include <string>
#include <iostream>
#include <iomanip>

View File

@@ -1,14 +1,41 @@
#include <Grid/GridCore.h>
#pragma once
// Force Eigen to use MKL if Grid has been configured with --enable-mkl
#ifdef USE_MKL
#define EIGEN_USE_MKL_ALL
#endif
#if defined __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
/* NVCC save and restore compile environment*/
#ifdef __NVCC__
#pragma push
#pragma diag_suppress code_is_unreachable
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")
#undef __NVCC__
#undef __CUDACC__
#undef __CUDA_ARCH__
#define __NVCC__REDEFINE__
#endif
#include <Grid/Eigen/Dense>
#include <Grid/Eigen/unsupported/CXX11/Tensor>
/* NVCC restore */
#ifdef __NVCC__REDEFINE__
#pragma pop_macro("__CUDACC__")
#pragma pop_macro("__NVCC__")
#pragma pop_macro("__CUDA_ARCH__")
#pragma pop
#endif
#if defined __GNUC__
#pragma GCC diagnostic pop
#endif

1
Grid/Grid_Eigen_Tensor.h Normal file
View File

@@ -0,0 +1 @@
#include <Grid/Grid_Eigen_Dense.h>

38
Grid/Namespace.h Normal file
View File

@@ -0,0 +1,38 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Namespace.h
Copyright (C) 2016
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <type_traits>
#include <cassert>
// Open namespace A. Every use must be paired with NAMESPACE_END.
#define NAMESPACE_BEGIN(A) namespace A {
// Close the innermost open namespace. The argument is not used in the expansion;
// it only labels which namespace the closing brace is meant to match.
#define NAMESPACE_END(A) }
// Shorthand for opening / closing the library's top-level Grid namespace.
#define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
#define GRID_NAMESPACE_END NAMESPACE_END(Grid)
// Compile-time check that the expansion site is at global scope.
// Declares an empty struct namespaceTEST<x> in the current scope and static_asserts
// that it is the same type as ::namespaceTEST<x>. If a preceding header left a
// namespace open, the struct lands inside that namespace, the global name does not
// resolve to it, and compilation fails at this line. x must be a unique identifier
// per use, since it is pasted into the struct name.
#define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" );

View File

@@ -35,6 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/approx/Zolotarev.h>
#include <Grid/algorithms/approx/Chebyshev.h>
#include <Grid/algorithms/approx/JacobiPolynomial.h>
#include <Grid/algorithms/approx/Remez.h>
#include <Grid/algorithms/approx/MultiShiftFunction.h>
#include <Grid/algorithms/approx/Forecast.h>

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,5 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -24,8 +24,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef _GRID_FFT_H_
#define _GRID_FFT_H_
@@ -38,64 +38,64 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif
namespace Grid {
NAMESPACE_BEGIN(Grid);
template<class scalar> struct FFTW { };
template<class scalar> struct FFTW { };
#ifdef HAVE_FFTW
template<> struct FFTW<ComplexD> {
public:
template<> struct FFTW<ComplexD> {
public:
typedef fftw_complex FFTW_scalar;
typedef fftw_plan FFTW_plan;
typedef fftw_complex FFTW_scalar;
typedef fftw_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist,
FFTW_scalar *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist,
FFTW_scalar *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftw_flops(p,add,mul,fmas);
}
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftw_flops(p,add,mul,fmas);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftw_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftw_destroy_plan(p);
}
};
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftw_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftw_destroy_plan(p);
}
};
template<> struct FFTW<ComplexF> {
public:
template<> struct FFTW<ComplexF> {
public:
typedef fftwf_complex FFTW_scalar;
typedef fftwf_plan FFTW_plan;
typedef fftwf_complex FFTW_scalar;
typedef fftwf_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist,
FFTW_scalar *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist,
FFTW_scalar *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftwf_flops(p,add,mul,fmas);
}
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftwf_flops(p,add,mul,fmas);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftwf_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftwf_destroy_plan(p);
}
};
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftwf_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftwf_destroy_plan(p);
}
};
#endif
@@ -104,203 +104,188 @@ namespace Grid {
#define FFTW_BACKWARD (+1)
#endif
class FFT {
private:
class FFT {
private:
GridCartesian *vgrid;
GridCartesian *sgrid;
GridCartesian *vgrid;
GridCartesian *sgrid;
int Nd;
double flops;
double flops_call;
uint64_t usec;
int Nd;
double flops;
double flops_call;
uint64_t usec;
std::vector<int> dimensions;
std::vector<int> processors;
std::vector<int> processor_coor;
Coordinate dimensions;
Coordinate processors;
Coordinate processor_coor;
public:
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
double Flops(void) {return flops;}
double MFlops(void) {return flops/usec;}
double USec(void) {return (double)usec;}
double Flops(void) {return flops;}
double MFlops(void) {return flops/usec;}
double USec(void) {return (double)usec;}
FFT ( GridCartesian * grid ) :
FFT ( GridCartesian * grid ) :
vgrid(grid),
Nd(grid->_ndimension),
dimensions(grid->_fdimensions),
processors(grid->_processors),
processor_coor(grid->_processor_coor)
{
flops=0;
usec =0;
std::vector<int> layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors);
};
~FFT ( void) {
delete sgrid;
}
template<class vobj>
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,std::vector<int> mask,int sign){
conformable(result._grid,vgrid);
conformable(source._grid,vgrid);
Lattice<vobj> tmp(vgrid);
tmp = source;
for(int d=0;d<Nd;d++){
if( mask[d] ) {
FFT_dim(result,tmp,d,sign);
tmp=result;
}
}
}
template<class vobj>
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
std::vector<int> mask(Nd,1);
FFT_dim_mask(result,source,mask,sign);
}
template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW
assert(0);
#else
conformable(result._grid,vgrid);
conformable(source._grid,vgrid);
int L = vgrid->_ldimensions[dim];
int G = vgrid->_fdimensions[dim];
std::vector<int> layout(Nd,1);
std::vector<int> pencil_gd(vgrid->_fdimensions);
pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors);
// Construct pencils
typedef typename vobj::scalar_object sobj;
typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g);
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
int Ncomp = sizeof(sobj)/sizeof(scalar);
int Nlow = 1;
for(int d=0;d<dim;d++){
Nlow*=vgrid->_ldimensions[d];
}
int rank = 1; /* 1d transforms */
int n[] = {G}; /* 1d transforms of length G */
int howmany = Ncomp;
int odist,idist,istride,ostride;
idist = odist = 1; /* Distance between consecutive FT's */
istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
int *inembed = n, *onembed = n;
scalar div;
if ( sign == backward ) div = 1.0/G;
else if ( sign == forward ) div = 1.0;
else assert(0);
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[0];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[0];
p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
in,inembed,
istride,idist,
out,onembed,
ostride, odist,
sign,FFTW_ESTIMATE);
}
// Barrel shift and collect global pencil
std::vector<int> lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) {
PARALLEL_REGION
{
std::vector<int> cbuf(Nd);
sobj s;
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<sgrid->lSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L;
// cbuf[dim]+=p*L;
pokeLocalSite(s,pgbuf,cbuf);
}
}
if (p != processors[dim] - 1)
{
result = Cshift(result,dim,L);
}
}
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
timer.Start();
PARALLEL_REGION
{
std::vector<int> cbuf(Nd);
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<NN;idx++) {
pencil_g.LocalIndexToLocalCoor(idx, cbuf);
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[idx];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[idx];
FFTW<scalar>::fftw_execute_dft(p,in,out);
}
}
}
timer.Stop();
// performance counting
double add,mul,fma;
FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
flops_call = add+mul+2.0*fma;
usec += timer.useconds();
flops+= flops_call*NN;
// writing out result
PARALLEL_REGION
{
std::vector<int> clbuf(Nd), cgbuf(Nd);
sobj s;
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<sgrid->lSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf,cgbuf);
pokeLocalSite(s,result,clbuf);
}
}
result = result*div;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif
}
{
flops=0;
usec =0;
Coordinate layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors);
};
}
// Release the grid that the constructor allocated with `new` into sgrid.
// vgrid is the caller's pointer and is not deleted here.
// NOTE(review): no copy constructor or copy assignment is declared in the visible
// class, so a copied FFT would delete the same sgrid twice — confirm FFT objects
// are never copied.
~FFT ( void) {
delete sgrid;
}
// Fourier transform `source` along every dimension d (0 <= d < Nd) whose mask[d]
// is non-zero, applying the one-dimensional transforms in increasing d, and store
// the outcome in `result`.
//
//   result : output lattice; must live on the grid this FFT was built for.
//   source : input lattice on the same grid; it is copied, never modified.
//   mask   : one entry per dimension; non-zero selects that dimension.
//   sign   : FFT::forward or FFT::backward, passed through to FFT_dim.
//
// If no dimension is selected the transform is the identity, so `result` receives
// a copy of `source` instead of being left untouched.
template<class vobj>
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){
  conformable(result.Grid(),vgrid);
  conformable(source.Grid(),vgrid);
  Lattice<vobj> tmp(vgrid);
  tmp = source;
  bool transformed = false;
  for(int d=0;d<Nd;d++){
    if( mask[d] ) {
      // Feed the previous stage's output into this stage. Copying here, rather than
      // after each FFT_dim call, skips a lattice copy after the final dimension.
      if ( transformed ) tmp = result;
      FFT_dim(result,tmp,d,sign);
      transformed = true;
    }
  }
  // Empty mask: nothing was transformed, so hand back the (copied) input.
  if ( !transformed ) result = tmp;
}
// Transform `source` along all Nd dimensions by delegating to FFT_dim_mask with
// every mask entry set to 1. `sign` is forwarded unchanged.
template<class vobj>
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
  Coordinate every_dimension(Nd,1);
  FFT_dim_mask(result,source,every_dimension,sign);
}
// One-dimensional Fourier transform of `source` along dimension `dim`, written to `result`.
//   sign == forward  : result is the transform, unscaled (div = 1).
//   sign == backward : result is the transform multiplied by 1/G.
//   any other sign   : assert(0).
// Built without HAVE_FFTW the whole body is just assert(0).
// Side effects: accumulates into the members flops and usec, and overwrites flops_call.
template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW
assert(0);
#else
conformable(result.Grid(),vgrid);
conformable(source.Grid(),vgrid);
// L and G are vgrid's _ldimensions / _fdimensions entries for dim
// (presumably the per-rank local extent and the full extent — confirm against GridBase).
int L = vgrid->_ldimensions[dim];
int G = vgrid->_fdimensions[dim];
Coordinate layout(Nd,1);
Coordinate pencil_gd(vgrid->_fdimensions);
// Enlarge the pencil grid along dim by the number of ranks in that direction.
pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors);
// Construct pencils
typedef typename vobj::scalar_object sobj;
typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g);
auto pgbuf_v = pgbuf.View();
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
// Number of scalar components per site object; each gets its own 1d transform.
int Ncomp = sizeof(sobj)/sizeof(scalar);
// Product of vgrid's _ldimensions below dim; used for the stride between FT elements.
int Nlow = 1;
for(int d=0;d<dim;d++){
Nlow*=vgrid->_ldimensions[d];
}
int rank = 1; /* 1d transforms */
int n[] = {G}; /* 1d transforms of length G */
int howmany = Ncomp;
int odist,idist,istride,ostride;
idist = odist = 1; /* Distance between consecutive FT's */
istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
int *inembed = n, *onembed = n;
scalar div;
// NOTE(review): when assert is compiled out, an unrecognised sign leaves div unset.
if ( sign == backward ) div = 1.0/G;
else if ( sign == forward ) div = 1.0;
else assert(0);
// Build one in-place plan (in == out); it is re-executed below at different base addresses.
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[0];
p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
in,inembed,
istride,idist,
out,onembed,
ostride, odist,
sign,FFTW_ESTIMATE);
}
// Barrel shift and collect global pencil
// NOTE(review): lcoor and gcoor are not referenced anywhere else in this function.
Coordinate lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
// The loop index p shadows the plan p inside this loop; the plan is not used here.
for(int p=0;p<processors[dim];p++) {
// Copy the current contents of result into pgbuf at block ((pc+p) % processors[dim]) along dim.
thread_for(idx, sgrid->lSites(),{
Coordinate cbuf(Nd);
sobj s;
sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L;
// cbuf[dim]+=p*L;
pokeLocalSite(s,pgbuf,cbuf);
});
// Shift result by L along dim for the next iteration; skipped after the final block.
if (p != processors[dim] - 1) {
result = Cshift(result,dim,L);
}
}
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
timer.Start();
thread_for( idx,NN,{
Coordinate cbuf(Nd);
pencil_g.LocalIndexToLocalCoor(idx, cbuf);
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
// Execute the plan in place, starting at this site of the pencil buffer.
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[idx];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[idx];
FFTW<scalar>::fftw_execute_dft(p,in,out);
}
});
timer.Stop();
// performance counting
double add,mul,fma;
FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
flops_call = add+mul+2.0*fma;
usec += timer.useconds();
// NOTE(review): the plan is executed only for indices with cbuf[dim]==0, yet the
// per-plan flop count is multiplied by all NN sites — confirm NN is the intended factor.
flops+= flops_call*NN;
// writing out result
// Copy this rank's block (offset L*pc along dim) of the pencil buffer back into result.
thread_for(idx,sgrid->lSites(),{
Coordinate clbuf(Nd), cgbuf(Nd);
sobj s;
sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf,cgbuf);
pokeLocalSite(s,result,clbuf);
});
// Apply the scaling chosen above (1/G for backward, 1 for forward).
result = result*div;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -24,153 +24,189 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALGORITHM_LINEAR_OP_H
#define GRID_ALGORITHM_LINEAR_OP_H
*************************************************************************************/
/* END LEGAL */
#pragma once
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////
// LinearOperators Take a something and return a something.
/////////////////////////////////////////////////////////////////////////////////////////////
//
// Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugate (transpose if real):
//SBase
// i) F(a x + b y) = aF(x) + b F(y).
// ii) <x|Op|y> = <y|AdjOp|x>^\ast
//
// Would be fun to have a test linearity & Herm Conj function!
/////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class LinearOperatorBase {
public:
/////////////////////////////////////////////////////////////////////////////////////////////
// LinearOperators Take a something and return a something.
/////////////////////////////////////////////////////////////////////////////////////////////
//
// Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugate (transpose if real):
//SBase
// i) F(a x + b y) = aF(x) + b F(y).
// ii) <x|Op|y> = <y|AdjOp|x>^\ast
//
// Would be fun to have a test linearity & Herm Conj function!
/////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class LinearOperatorBase {
public:
// Support for coarsening to a multigrid
virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
// Support for coarsening to a multigrid
virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
virtual void OpDirAll (const Field &in, std::vector<Field> &out) = 0; // Abstract base
virtual void Op (const Field &in, Field &out) = 0; // Abstract base
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0;
virtual void HermOp(const Field &in, Field &out)=0;
};
virtual void Op (const Field &in, Field &out) = 0; // Abstract base
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
virtual void HermOp(const Field &in, Field &out)=0;
};
/////////////////////////////////////////////////////////////////////////////////////////////
// By sharing the class for Sparse Matrix across multiple operator wrappers, we can share code
// between RB and non-RB variants. Sparse matrix is like the fermion action def, and then
// the wrappers implement the specialisation of "Op" and "AdjOp" to the cases minimising
// replication of code.
//
// I'm not entirely happy with implementation; to share the Schur code between herm and non-herm
// while still having a "OpAndNorm" in the abstract base I had to implement it in both cases
// with an assert trap in the non-herm. This isn't right; there must be a better C++ way to
// do it, but I fear it required multiple inheritance and mixed in abstract base classes
/////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////
// By sharing the class for Sparse Matrix across multiple operator wrappers, we can share code
// between RB and non-RB variants. Sparse matrix is like the fermion action def, and then
// the wrappers implement the specialisation of "Op" and "AdjOp" to the cases minimising
// replication of code.
//
// I'm not entirely happy with implementation; to share the Schur code between herm and non-herm
// while still having a "OpAndNorm" in the abstract base I had to implement it in both cases
// with an assert trap in the non-herm. This isn't right; there must be a better C++ way to
// do it, but I fear it required multiple inheritance and mixed in abstract base classes
/////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
// Construct herm op from non-herm matrix
////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class MdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
MdagMLinearOperator(Matrix &Mat): _Mat(Mat){};
////////////////////////////////////////////////////////////////////
// Construct herm op from non-herm matrix
////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class MdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
MdagMLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MdagM(in,out,n1,n2);
}
void HermOp(const Field &in, Field &out){
RealD n1,n2;
HermOpAndNorm(in,out,n1,n2);
}
};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MdagM(in,out,n1,n2);
}
void HermOp(const Field &in, Field &out){
_Mat.MdagM(in,out);
}
};
////////////////////////////////////////////////////////////////////
// Construct herm op and shift it for mgrid smoother
////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD _shift;
public:
ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
assert(0);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
assert(0);
}
void Op (const Field &in, Field &out){
_Mat.M(in,out);
assert(0);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
assert(0);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MdagM(in,out,n1,n2);
out = out + _shift*in;
////////////////////////////////////////////////////////////////////
// Construct herm op and shift it for mgrid smoother
////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD _shift;
public:
ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
assert(0);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
assert(0);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
assert(0);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
assert(0);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MdagM(in,out,n1,n2);
out = out + _shift*in;
ComplexD dot;
dot= innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
RealD n1,n2;
HermOpAndNorm(in,out,n1,n2);
}
};
ComplexD dot;
dot= innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
RealD n1,n2;
HermOpAndNorm(in,out,n1,n2);
}
};
////////////////////////////////////////////////////////////////////
// Wrap an already herm matrix
////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class HermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
_Mat.M(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.M(in,out);
////////////////////////////////////////////////////////////////////
// Wrap an already herm matrix
////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class HermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
_Mat.M(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.M(in,out);
ComplexD dot= innerProduct(in,out); n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
_Mat.M(in,out);
}
};
ComplexD dot= innerProduct(in,out); n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
_Mat.M(in,out);
}
};
////////////////////////////////////////////////////////////////////
// Wrap a matrix without assuming it is Hermitian.
// Op / AdjOp and the multigrid coarsening hooks are forwarded to the
// wrapped matrix; the two HermOp entry points trap with assert(0).
////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class NonHermitianLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat; // wrapped matrix, held by reference (not owned)
public:
  NonHermitianLinearOperator(Matrix &Mat)
    : _Mat(Mat)
  {}

  // Coarsening support: forwards to Mdiag / Mdir / MdirAll of the wrapped matrix
  void OpDiag (const Field &in, Field &out)
  {
    _Mat.Mdiag(in,out);
  }
  void OpDir (const Field &in, Field &out,int dir,int disp)
  {
    _Mat.Mdir(in,out,dir,disp);
  }
  void OpDirAll (const Field &in, std::vector<Field> &out)
  {
    _Mat.MdirAll(in,out);
  }

  // out = M in  and  out = Mdag in
  void Op (const Field &in, Field &out)
  {
    _Mat.M(in,out);
  }
  void AdjOp (const Field &in, Field &out)
  {
    _Mat.Mdag(in,out);
  }

  // Not provided by this wrapper: both variants trap
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)
  {
    assert(0);
  }
  void HermOp(const Field &in, Field &out)
  {
    assert(0);
  }
};
//////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several
@@ -183,13 +219,13 @@ namespace Grid {
virtual RealD Mpc (const Field &in, Field &out) =0;
virtual RealD MpcDag (const Field &in, Field &out) =0;
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp(in._grid);
tmp.checkerboard = in.checkerboard;
Field tmp(in.Grid());
tmp.Checkerboard() = in.Checkerboard();
ni=Mpc(in,tmp);
no=MpcDag(tmp,out);
}
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
out.checkerboard = in.checkerboard;
out.Checkerboard() = in.Checkerboard();
MpcDagMpc(in,out,n1,n2);
}
virtual void HermOp(const Field &in, Field &out){
@@ -209,6 +245,9 @@ namespace Grid {
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
};
};
template<class Matrix,class Field>
class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
@@ -216,20 +255,20 @@ namespace Grid {
Matrix &_Mat;
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
tmp.checkerboard = !in.checkerboard;
Field tmp(in.Grid());
tmp.Checkerboard() = !in.Checkerboard();
//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
_Mat.Meooe(in,tmp);
_Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp);
//std::cout << "cb in " << in.checkerboard << " cb out " << out.checkerboard << std::endl;
//std::cout << "cb in " << in.Checkerboard() << " cb out " << out.Checkerboard() << std::endl;
_Mat.Mooee(in,out);
return axpy_norm(out,-1.0,tmp,out);
}
virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in._grid);
Field tmp(in.Grid());
_Mat.MeooeDag(in,tmp);
_Mat.MooeeInvDag(tmp,out);
@@ -247,7 +286,7 @@ namespace Grid {
SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
Field tmp(in.Grid());
_Mat.Meooe(in,out);
_Mat.MooeeInv(out,tmp);
@@ -257,7 +296,7 @@ namespace Grid {
return axpy_norm(out,-1.0,tmp,in);
}
virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in._grid);
Field tmp(in.Grid());
_Mat.MooeeInvDag(in,out);
_Mat.MeooeDag(out,tmp);
@@ -275,7 +314,7 @@ namespace Grid {
SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
Field tmp(in.Grid());
_Mat.MooeeInv(in,out);
_Mat.Meooe(out,tmp);
@@ -285,7 +324,7 @@ namespace Grid {
return axpy_norm(out,-1.0,tmp,in);
}
virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in._grid);
Field tmp(in.Grid());
_Mat.MeooeDag(in,out);
_Mat.MooeeInvDag(out,tmp);
@@ -297,7 +336,7 @@ namespace Grid {
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi
// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
@@ -315,7 +354,7 @@ namespace Grid {
double tMeo;
double taxpby_norm;
uint64_t ncall;
public:
public:
void Report(void)
{
std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
@@ -333,17 +372,17 @@ namespace Grid {
taxpby_norm=0;
ncall=0;
}
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
ncall++;
tMpc-=usecond();
n2 = Mpc(in,out);
n2 = Mpc(in,out);
tMpc+=usecond();
tIP-=usecond();
ComplexD dot= innerProduct(in,out);
ComplexD dot= innerProduct(in,out);
tIP+=usecond();
n1 = real(dot);
}
virtual void HermOp(const Field &in, Field &out){
n1 = real(dot);
}
virtual void HermOp(const Field &in, Field &out){
ncall++;
tMpc-=usecond();
_Mat.Meooe(in,out);
@@ -352,135 +391,145 @@ namespace Grid {
taxpby_norm-=usecond();
axpby(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
}
// Preconditioned operator application: in -> out.
// Applies _Mat.Meooe twice (in -> out -> tmp), then combines the result with `in`
// through axpby_norm(out,-1.0,mass*mass,tmp,in) and returns that call's value.
// `out` serves as scratch for the intermediate before receiving the final result.
// NOTE(review): tmp, mass, tMeo and taxpby_norm are not declared in this method;
// presumably they are members of the enclosing class — confirm.
virtual RealD Mpc (const Field &in, Field &out) {
tMeo-=usecond(); // start of the timed region for the two Meooe applications
_Mat.Meooe(in,out);
_Mat.Meooe(out,tmp);
tMeo+=usecond(); // tMeo accumulates the elapsed time
taxpby_norm-=usecond(); // same accumulate-by-difference timing for the linear combination
RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
return nn;
}
// Adjoint application: forwards directly to Mpc, i.e. the same operator is used
// for both the forward and the daggered direction.
virtual RealD MpcDag (const Field &in, Field &out){
return Mpc(in,out);
}
// Deliberately unsupported: any call trips assert(0).
// With assertions compiled out (NDEBUG) the call returns without touching out, ni or no.
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
assert(0);// Never need with staggered
}
};
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
}
// Preconditioned operator application: in -> out, returning the value of
// axpby_norm(out,-1.0,mass*mass,tmp,in).
// NOTE(review): the two Mooee calls write `out` and `tmp`, but both are overwritten by the
// Meooe calls below before being read, and `tmp2` is never used. As written the Mooee work
// has no effect on the result — confirm whether Meooe(out,tmp2) and a combination of
// tmp/tmp2 was intended.
// NOTE(review): mass, tMeo and taxpby_norm are declared outside this method (presumably
// class members) — confirm.
virtual RealD Mpc (const Field &in, Field &out)
{
Field tmp(in.Grid());
Field tmp2(in.Grid());
// std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
_Mat.Mooee(in,out);
_Mat.Mooee(out,tmp);
// std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
tMeo-=usecond(); // timed region: two Meooe applications (in -> out -> tmp)
_Mat.Meooe(in,out);
_Mat.Meooe(out,tmp);
tMeo+=usecond();
taxpby_norm-=usecond(); // timed region: final linear combination
RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
return nn;
}
// Adjoint application: forwards directly to Mpc, i.e. the same operator is used
// for both the forward and the daggered direction.
virtual RealD MpcDag (const Field &in, Field &out){
return Mpc(in,out);
}
// Deliberately unsupported: any call trips assert(0).
// With assertions compiled out (NDEBUG) the call returns without touching out, ni or no.
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
assert(0);// Never need with staggered
}
};
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
/////////////////////////////////////////////////////////////
// Base classes for functions of operators
/////////////////////////////////////////////////////////////
// Abstract functor: applies some function of a linear operator to a field.
// Derived classes implement the single-field overload; the vector overload
// applies it element by element.
template<class Field> class OperatorFunction {
public:
  // Polymorphic base: a virtual destructor makes deletion through a base pointer well defined.
  virtual ~OperatorFunction() = default;

  // out = f(Linop) in
  virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;

  // Batched form: out[k] = f(Linop) in[k]; in and out must have equal length.
  virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
    assert(in.size()==out.size());
    // index with the container's unsigned size type to avoid a signed/unsigned comparison
    for(std::size_t k=0;k<in.size();k++){
      (*this)(Linop,in[k],out[k]);
    }
  };
};
// Abstract functor mapping one field to another: out = F(in).
template<class Field> class LinearFunction {
public:
  // Polymorphic base: a virtual destructor makes deletion through a base pointer well defined.
  virtual ~LinearFunction() = default;
  virtual void operator() (const Field &in, Field &out) = 0;
};
// LinearFunction that copies its input to its output unchanged.
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
public:
  // `override` makes the compiler reject any drift from the base signature.
  void operator() (const Field &in, Field &out) override {
    out = in;
  };
};
/////////////////////////////////////////////////////////////
// Base classes for Multishift solvers for operators
/////////////////////////////////////////////////////////////
// Abstract functor producing several output fields from one input and one operator.
template<class Field> class OperatorMultiFunction {
public:
  // Polymorphic base: a virtual destructor makes deletion through a base pointer well defined.
  virtual ~OperatorMultiFunction() = default;
  virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, std::vector<Field> &out) = 0;
};
// FIXME : To think about
// Chroma functionality list defining LinearOperator
/*
virtual void operator() (T& chi, const T& psi, enum PlusMinus isign) const = 0;
virtual void operator() (T& chi, const T& psi, enum PlusMinus isign, Real epsilon) const
virtual const Subset& subset() const = 0;
virtual unsigned long nFlops() const { return 0; }
virtual void deriv(P& ds_u, const T& chi, const T& psi, enum PlusMinus isign) const
class UnprecLinearOperator : public DiffLinearOperator<T,P,Q>
const Subset& subset() const {return all;}
};
*/
////////////////////////////////////////////////////////////////////////////////////////////
// Hermitian operator Linear function and operator function
////////////////////////////////////////////////////////////////////////////////////////////
// OperatorFunction whose function is the operator itself: out = Linop.HermOp(in).
template<class Field>
class HermOpOperatorFunction : public OperatorFunction<Field> {
public:
  // The override was previously in the class's default (private) section, so it could only
  // be reached through a base reference. Re-export the base overload set so the
  // vector form is not hidden either.
  using OperatorFunction<Field>::operator();

  void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) override {
    Linop.HermOp(in,out);
  };
};
// Adapts a LinearOperatorBase to the LinearFunction interface by calling HermOp.
// Holds a reference only: the wrapped operator must outlive this object.
template<typename Field>
class PlainHermOp : public LinearFunction<Field> {
public:
  LinearOperatorBase<Field> &_Linop;

  PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop)
  {}

  // out = HermOp(in); `override` guards against signature drift from the base.
  void operator()(const Field& in, Field& out) override {
    _Linop.HermOp(in,out);
  }
};
// Binds an OperatorFunction to a specific operator so the pair can be used as a
// LinearFunction: out = poly(Linop) in.
// Holds references only: both referenced objects must outlive this object.
template<typename Field>
class FunctionHermOp : public LinearFunction<Field> {
public:
  OperatorFunction<Field> & _poly;
  LinearOperatorBase<Field> &_Linop;

  FunctionHermOp(OperatorFunction<Field> & poly,LinearOperatorBase<Field>& linop)
    : _poly(poly), _Linop(linop) {};

  // `override` guards against signature drift from the base.
  void operator()(const Field& in, Field& out) override {
    _poly(_Linop,in,out);
  }
};
template<class Field>
class Polynomial : public OperatorFunction<Field> {
private:
std::vector<RealD> Coeffs;
public:
Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Field AtoN(in._grid);
Field Mtmp(in._grid);
AtoN = in;
out = AtoN*Coeffs[0];
for(int n=1;n<Coeffs.size();n++){
Mtmp = AtoN;
Linop.HermOp(Mtmp,AtoN);
out=out+AtoN*Coeffs[n];
}
};
/////////////////////////////////////////////////////////////
// Base classes for functions of operators
/////////////////////////////////////////////////////////////
// Abstract functor: applies some function of a linear operator to a field.
// Derived classes implement the single-field overload; the vector overload
// applies it element by element.
template<class Field> class OperatorFunction {
public:
  // Polymorphic base: a virtual destructor makes deletion through a base pointer well defined.
  virtual ~OperatorFunction() = default;

  // out = f(Linop) in
  virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;

  // Batched form: out[k] = f(Linop) in[k]; in and out must have equal length.
  virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
    assert(in.size()==out.size());
    // index with the container's unsigned size type to avoid a signed/unsigned comparison
    for(std::size_t k=0;k<in.size();k++){
      (*this)(Linop,in[k],out[k]);
    }
  };
};
}
// Abstract functor mapping one field to another: out = F(in).
template<class Field> class LinearFunction {
public:
  // Polymorphic base: a virtual destructor makes deletion through a base pointer well defined.
  virtual ~LinearFunction() = default;
  virtual void operator() (const Field &in, Field &out) = 0;
};
#endif
// LinearFunction that copies its input to its output unchanged.
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
public:
  // `override` makes the compiler reject any drift from the base signature.
  void operator() (const Field &in, Field &out) override {
    out = in;
  };
};
/////////////////////////////////////////////////////////////
// Base classes for Multishift solvers for operators
/////////////////////////////////////////////////////////////
// Abstract functor producing several output fields from one input and one operator.
template<class Field> class OperatorMultiFunction {
public:
  // Polymorphic base: a virtual destructor makes deletion through a base pointer well defined.
  virtual ~OperatorMultiFunction() = default;
  virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, std::vector<Field> &out) = 0;
};
// FIXME : To think about
// Chroma functionality list defining LinearOperator
/*
virtual void operator() (T& chi, const T& psi, enum PlusMinus isign) const = 0;
virtual void operator() (T& chi, const T& psi, enum PlusMinus isign, Real epsilon) const
virtual const Subset& subset() const = 0;
virtual unsigned long nFlops() const { return 0; }
virtual void deriv(P& ds_u, const T& chi, const T& psi, enum PlusMinus isign) const
class UnprecLinearOperator : public DiffLinearOperator<T,P,Q>
const Subset& subset() const {return all;}
};
*/
////////////////////////////////////////////////////////////////////////////////////////////
// Hermitian operator Linear function and operator function
////////////////////////////////////////////////////////////////////////////////////////////
// OperatorFunction whose function is the operator itself: out = Linop.HermOp(in).
template<class Field>
class HermOpOperatorFunction : public OperatorFunction<Field> {
public:
  // The override was previously in the class's default (private) section, so it could only
  // be reached through a base reference. Re-export the base overload set so the
  // vector form is not hidden either.
  using OperatorFunction<Field>::operator();

  void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) override {
    Linop.HermOp(in,out);
  };
};
// Adapts a LinearOperatorBase to the LinearFunction interface by calling HermOp.
// Holds a reference only: the wrapped operator must outlive this object.
template<typename Field>
class PlainHermOp : public LinearFunction<Field> {
public:
  LinearOperatorBase<Field> &_Linop;

  PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop)
  {}

  // out = HermOp(in); `override` guards against signature drift from the base.
  void operator()(const Field& in, Field& out) override {
    _Linop.HermOp(in,out);
  }
};
// Binds an OperatorFunction to a specific operator so the pair can be used as a
// LinearFunction: out = poly(Linop) in.
// Holds references only: both referenced objects must outlive this object.
template<typename Field>
class FunctionHermOp : public LinearFunction<Field> {
public:
  OperatorFunction<Field> & _poly;
  LinearOperatorBase<Field> &_Linop;

  FunctionHermOp(OperatorFunction<Field> & poly,LinearOperatorBase<Field>& linop)
    : _poly(poly), _Linop(linop) {};

  // `override` guards against signature drift from the base.
  void operator()(const Field& in, Field& out) override {
    _poly(_Linop,in,out);
  }
};
// Plain power-series polynomial of an operator:
//   out = sum_n Coeffs[n] * HermOp^n in
// evaluated by repeatedly applying HermOp to a running power AtoN.
template<class Field>
class Polynomial : public OperatorFunction<Field> {
private:
  std::vector<RealD> Coeffs;
public:
  using OperatorFunction<Field>::operator();

  // Coefficients are copied; taking them by const reference also admits temporaries
  // and const vectors.
  Polynomial(const std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };

  // Implement the required interface
  void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) override {
    // Coeffs[0] is read unconditionally below; an empty coefficient list is a caller error.
    assert(!Coeffs.empty());

    Field AtoN(in.Grid());
    Field Mtmp(in.Grid());
    AtoN = in;
    out = AtoN*Coeffs[0];
    for(std::size_t n=1;n<Coeffs.size();n++){
      Mtmp = AtoN;               // HermOp is given distinct input and output fields
      Linop.HermOp(Mtmp,AtoN);   // AtoN <- HermOp^n in
      out=out+AtoN*Coeffs[n];
    }
  };
};
NAMESPACE_END(Grid);

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,24 +23,24 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PRECONDITIONER_H
#define GRID_PRECONDITIONER_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
// Marker interface for preconditioners: a LinearFunction mapping src -> psi.
template<class Field> class Preconditioner : public LinearFunction<Field> {
public:
  // Declared public: in the class's default (private) section this re-declaration made the
  // call operator inaccessible through a Preconditioner reference or pointer.
  virtual void operator()(const Field &src, Field & psi)=0;
};
// Marker interface for preconditioners: a LinearFunction mapping src -> psi.
template<class Field> class Preconditioner : public LinearFunction<Field> {
public:
  // Declared public: in the class's default (private) section this re-declaration made the
  // call operator inaccessible through a Preconditioner reference or pointer.
  virtual void operator()(const Field &src, Field & psi)=0;
};
// Identity preconditioner: psi = src.
template<class Field> class TrivialPrecon : public Preconditioner<Field> {
public:
  // `override` makes the compiler reject any drift from the base signature.
  void operator()(const Field &src, Field & psi) override {
    psi = src;
  }
  TrivialPrecon(void){};
};
// Identity preconditioner: psi = src.
template<class Field> class TrivialPrecon : public Preconditioner<Field> {
public:
  // `override` makes the compiler reject any drift from the base signature.
  void operator()(const Field &src, Field & psi) override {
    psi = src;
  }
  TrivialPrecon(void){};
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,57 +23,62 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALGORITHM_SPARSE_MATRIX_H
#define GRID_ALGORITHM_SPARSE_MATRIX_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////
// Interface defining what I expect of a general sparse matrix, such as a Fermion action
/////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SparseMatrixBase {
public:
  // Polymorphic base: a virtual destructor makes deletion through a base pointer well defined.
  virtual ~SparseMatrixBase() = default;

  virtual GridBase *Grid(void) =0;

  // Full checkerboard operations
  virtual RealD M (const Field &in, Field &out)=0;
  virtual RealD Mdag (const Field &in, Field &out)=0;

  // out = Mdag M in, composed from the two primitives above.
  // ni receives the value returned by M, no the value returned by Mdag.
  virtual void MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
    Field tmp (in._grid);
    ni=M(in,tmp);
    no=Mdag(tmp,out);
  }

  virtual void Mdiag (const Field &in, Field &out)=0;
  virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
};
/////////////////////////////////////////////////////////////////////////////////////////////
// Interface defining what I expect of a general sparse matrix, such as a Fermion action
/////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SparseMatrixBase {
public:
  // Polymorphic base: a virtual destructor makes deletion through a base pointer well defined.
  virtual ~SparseMatrixBase() = default;

  virtual GridBase *Grid(void) =0;

  // Full checkerboard operations
  virtual RealD M (const Field &in, Field &out)=0;
  virtual RealD Mdag (const Field &in, Field &out)=0;

  // out = Mdag M in, composed from the two primitives above.
  // ni receives the value returned by M, no the value returned by Mdag.
  virtual void MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
    Field tmp (in.Grid());
    ni=M(in,tmp);
    no=Mdag(tmp,out);
  }

  // Convenience form for callers that do not need the two returned values.
  virtual void MdagM(const Field &in, Field &out) {
    RealD ni, no;
    MdagM(in,out,ni,no);
  }

  virtual void Mdiag (const Field &in, Field &out)=0;
  virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
  virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
};
/////////////////////////////////////////////////////////////////////////////////////////////
// Interface augmented by a red black sparse matrix, such as a Fermion action
/////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
public:
virtual GridBase *RedBlackGrid(void)=0;
/////////////////////////////////////////////////////////////////////////////////////////////
// Interface augmented by a red black sparse matrix, such as a Fermion action
/////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
public:
virtual GridBase *RedBlackGrid(void)=0;
//////////////////////////////////////////////////////////////////////
// Query the even even properties to make algorithmic decisions
//////////////////////////////////////////////////////////////////////
virtual RealD Mass(void) { return 0.0; };
virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden
virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better
//////////////////////////////////////////////////////////////////////
// Query the even even properties to make algorithmic decisions
//////////////////////////////////////////////////////////////////////
virtual RealD Mass(void) { return 0.0; };
virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden
virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better
// half checkerboard operations
virtual void Meooe (const Field &in, Field &out)=0;
virtual void Mooee (const Field &in, Field &out)=0;
virtual void MooeeInv (const Field &in, Field &out)=0;
// half checkerboard operations
virtual void Meooe (const Field &in, Field &out)=0;
virtual void Mooee (const Field &in, Field &out)=0;
virtual void MooeeInv (const Field &in, Field &out)=0;
virtual void MeooeDag (const Field &in, Field &out)=0;
virtual void MooeeDag (const Field &in, Field &out)=0;
virtual void MooeeInvDag (const Field &in, Field &out)=0;
virtual void MeooeDag (const Field &in, Field &out)=0;
virtual void MooeeDag (const Field &in, Field &out)=0;
virtual void MooeeInvDag (const Field &in, Field &out)=0;
};
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -25,14 +25,14 @@ Author: Christoph Lehner <clehner@bnl.gov>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CHEBYSHEV_H
#define GRID_CHEBYSHEV_H
#include <Grid/algorithms/LinearOperator.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
struct ChebyParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams,
@@ -41,337 +41,357 @@ struct ChebyParams : Serializable {
int, Npoly);
};
////////////////////////////////////////////////////////////////////////////////////////////
// Generic Chebyshev approximations
////////////////////////////////////////////////////////////////////////////////////////////
template<class Field>
class Chebyshev : public OperatorFunction<Field> {
private:
std::vector<RealD> Coeffs;
int order;
RealD hi;
RealD lo;
////////////////////////////////////////////////////////////////////////////////////////////
// Generic Chebyshev approximations
////////////////////////////////////////////////////////////////////////////////////////////
template<class Field>
class Chebyshev : public OperatorFunction<Field> {
private:
using OperatorFunction<Field>::operator();
public:
void csv(std::ostream &out){
RealD diff = hi-lo;
RealD delta = (hi-lo)*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) {
delta*=1.1;
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
return;
std::vector<RealD> Coeffs;
int order;
RealD hi;
RealD lo;
public:
void csv(std::ostream &out){
RealD diff = hi-lo;
RealD delta = diff*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) {
delta*=1.1;
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
return;
}
// Convenience for plotting the approximation
void PlotApprox(std::ostream &out) {
out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
for(RealD x=lo;x<hi;x+=(hi-lo)/50.0){
out <<x<<"\t"<<approx(x)<<std::endl;
}
};
// Default construction performs no initialisation; call one of the Init overloads before use.
Chebyshev(){};
// Construct from a parameter bundle: forwards (alpha, beta, Npoly) to Init(lo,hi,order).
Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);};
// Construct an approximation to `func` on [_lo,_hi] with _order coefficients.
Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
// Construct with only the highest coefficient set (see Init(lo,hi,order)).
Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
////////////////////////////////////////////////////////////////////////////////////////////////////
// c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
////////////////////////////////////////////////////////////////////////////////////////////////////
// CJ: the one we need for Lanczos
// Set the interval [_lo,_hi] and select the single Chebyshev term of highest index:
// all coefficients are zero except Coeffs[order-1] = 1.
// Terminates the program via exit(-1) when _order < 2.
void Init(RealD _lo,RealD _hi,int _order)
{
  lo=_lo;
  hi=_hi;
  order=_order;

  if(order < 2) exit(-1);

  // std::vector::assign takes (count, value). The previous call passed (0., order), which
  // set the size to ZERO, so the write below landed outside the vector's valid range.
  Coeffs.assign(order,0.0);
  Coeffs[order-1] = 1.;
};
// PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's.
// Similar kick effect below the threshold as Lanczos filter approach
void InitLowPass(RealD _lo,RealD _hi,int _order)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD k=(order-1.0);
RealD s=std::cos( j*M_PI*(k+0.5)/order );
Coeffs[j] = s * 2.0/order;
}
// Convenience for plotting the approximation
void PlotApprox(std::ostream &out) {
out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
for(RealD x=lo;x<hi;x+=(hi-lo)/50.0){
out <<x<<"\t"<<approx(x)<<std::endl;
}
};
// Default construction performs no initialisation; call one of the Init overloads before use.
Chebyshev(){};
// Construct from a parameter bundle: forwards (alpha, beta, Npoly) to Init(lo,hi,order).
Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);};
// Construct an approximation to `func` on [_lo,_hi] with _order coefficients.
Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
// Construct with only the highest coefficient set (see Init(lo,hi,order)).
Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
////////////////////////////////////////////////////////////////////////////////////////////////////
// c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
////////////////////////////////////////////////////////////////////////////////////////////////////
// CJ: the one we need for Lanczos
// Set the interval [_lo,_hi] and select the single Chebyshev term of highest index:
// all coefficients are zero except Coeffs[order-1] = 1.
// Terminates the program via exit(-1) when _order < 2.
void Init(RealD _lo,RealD _hi,int _order)
{
  lo=_lo;
  hi=_hi;
  order=_order;

  if(order < 2) exit(-1);

  // std::vector::assign takes (count, value). The previous call passed (0., order), which
  // set the size to ZERO, so the write below landed outside the vector's valid range.
  Coeffs.assign(order,0.0);
  Coeffs[order-1] = 1.;
};
// Compute Chebyshev coefficients approximating `func` on [_lo,_hi]:
//   Coeffs[j] = (2/order) * sum_k func(x_k) * cos( j*pi*(k+0.5)/order )
// where x_k are the Chebyshev nodes mapped from [-1,1] onto [lo,hi].
// Terminates the program via exit(-1) when _order < 2.
void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
{
  lo=_lo;
  hi=_hi;
  order=_order;

  if(order < 2) exit(-1);
  Coeffs.resize(order);

  // func(x_k) does not depend on j: sample it once per node instead of once per (j,k) pair.
  std::vector<RealD> fnode(order);
  for(int k=0;k<order;k++){
    RealD y=std::cos(M_PI*(k+0.5)/order);
    RealD x=0.5*(y*(hi-lo)+(hi+lo));
    fnode[k]=func(x);
  }

  for(int j=0;j<order;j++){
    RealD s=0;
    for(int k=0;k<order;k++){
      s=s+fnode[k]*std::cos( j*M_PI*(k+0.5)/order );
    }
    Coeffs[j] = s * 2.0/order;
  }
};
void JacksonSmooth(void){
RealD M=order;
RealD alpha = M_PI/(M+2);
RealD lmax = std::cos(alpha);
RealD sumUsq =0;
std::vector<RealD> U(M);
std::vector<RealD> a(M);
std::vector<RealD> g(M);
for(int n=0;n<=M;n++){
U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax));
sumUsq += U[n]*U[n];
}
sumUsq = std::sqrt(sumUsq);
for(int i=1;i<=M;i++){
a[i] = U[i]/sumUsq;
}
g[0] = 1.0;
for(int m=1;m<=M;m++){
g[m] = 0;
for(int i=0;i<=M-m;i++){
g[m]+= a[i]*a[m+i];
}
}
for(int m=1;m<=M;m++){
Coeffs[m]*=g[m];
}
}
// Scalar evaluation of the stored series at x (handy for plotting):
//   0.5*Coeffs[0] + sum_{i>=1} Coeffs[i]*T_i(y),  T_{i+1} = 2 y T_i - T_{i-1}
RealD approx(RealD x)
{
  // map x in [lo,hi] onto y in [-1,1]
  const RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));

  RealD Tprev=1;   // T_0
  RealD Tcur =y;   // T_1

  RealD sum = 0.5*Coeffs[0]*Tprev;
  sum+= Coeffs[1]*Tcur;

  for(int i=2;i<order;i++){
    const RealD Tnext=2*y*Tcur-Tprev;
    Tprev=Tcur;
    Tcur =Tnext;
    sum+= Tcur*Coeffs[i];
  }
  return sum;
};
// Derivative with respect to x of the series evaluated by approx():
//   sum_{n>=1} n*Coeffs[n]*U_{n-1}(y) / (0.5*(hi-lo)),  U_{n+1} = 2 y U_n - U_{n-1}
RealD approxD(RealD x)
{
  RealD Un;
  RealD Unm;
  RealD Unp;

  RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));

  RealD U0=1;
  RealD U1=2*y;

  RealD sum;
  sum = Coeffs[1]*U0;

  // With only two coefficients there is no Coeffs[2]; the unconditional read that used to
  // follow was out of bounds for order == 2.
  if(order > 2){
    sum+= Coeffs[2]*U1*2.0;

    Un =U1;
    Unm=U0;
    for(int i=2;i<order-1;i++){
      Unp=2*y*Un-Unm;
      Unm=Un;
      Un =Unp;
      sum+= Un*Coeffs[i+1]*(i+1.0);
    }
  }
  // chain rule: dy/dx = 1/(0.5*(hi-lo))
  return sum/(0.5*(hi-lo));
};
// Solve approx(x) = z by Newton iteration starting from x0.
// Returns the iterate once |approx(x)-z| / |z| drops below resid; returns quiet NaN if
// that does not happen within maxiter iterations.
RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) {
  RealD guess = x0;
  for (int iter=0; iter<maxiter; iter++) {
    const RealD err = approx(guess) - z;
    if (fabs(err / z) < resid) {
      return guess;
    }
    guess = guess - err / approxD(guess);   // Newton step
  }
  return std::numeric_limits<double>::quiet_NaN();
}
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in._grid;
// std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
//std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
int vol=grid->gSites();
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
T1=y*xscale+in*mscale;
// sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y);
y=xscale*y+mscale*(*Tn);
*Tnp=2.0*y-(*Tnm);
out=out+Coeffs[n]* (*Tnp);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
};
// Compute Chebyshev coefficients approximating `func` on [_lo,_hi]:
//   Coeffs[j] = (2/order) * sum_k func(x_k) * cos( j*pi*(k+0.5)/order )
// where x_k are the Chebyshev nodes mapped from [-1,1] onto [lo,hi].
// Terminates the program via exit(-1) when _order < 2.
void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
{
  lo=_lo;
  hi=_hi;
  order=_order;

  if(order < 2) exit(-1);
  Coeffs.resize(order);

  // func(x_k) does not depend on j: sample it once per node instead of once per (j,k) pair.
  std::vector<RealD> fnode(order);
  for(int k=0;k<order;k++){
    RealD y=std::cos(M_PI*(k+0.5)/order);
    RealD x=0.5*(y*(hi-lo)+(hi+lo));
    fnode[k]=func(x);
  }

  for(int j=0;j<order;j++){
    RealD s=0;
    for(int k=0;k<order;k++){
      s=s+fnode[k]*std::cos( j*M_PI*(k+0.5)/order );
    }
    Coeffs[j] = s * 2.0/order;
  }
};
template<class Field>
class ChebyshevLanczos : public Chebyshev<Field> {
private:
std::vector<RealD> Coeffs;
int order;
RealD alpha;
RealD beta;
RealD mu;
void JacksonSmooth(void){
RealD M=order;
RealD alpha = M_PI/(M+2);
RealD lmax = std::cos(alpha);
RealD sumUsq =0;
std::vector<RealD> U(M);
std::vector<RealD> a(M);
std::vector<RealD> g(M);
for(int n=0;n<=M;n++){
U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax));
sumUsq += U[n]*U[n];
}
sumUsq = std::sqrt(sumUsq);
public:
ChebyshevLanczos(RealD _alpha,RealD _beta,RealD _mu,int _order) :
for(int i=1;i<=M;i++){
a[i] = U[i]/sumUsq;
}
g[0] = 1.0;
for(int m=1;m<=M;m++){
g[m] = 0;
for(int i=0;i<=M-m;i++){
g[m]+= a[i]*a[m+i];
}
}
for(int m=1;m<=M;m++){
Coeffs[m]*=g[m];
}
}
// Scalar evaluation of the stored series at x (handy for plotting):
//   0.5*Coeffs[0] + sum_{i>=1} Coeffs[i]*T_i(y),  T_{i+1} = 2 y T_i - T_{i-1}
RealD approx(RealD x)
{
  // map x in [lo,hi] onto y in [-1,1]
  const RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));

  RealD Tprev=1;   // T_0
  RealD Tcur =y;   // T_1

  RealD sum = 0.5*Coeffs[0]*Tprev;
  sum+= Coeffs[1]*Tcur;

  for(int i=2;i<order;i++){
    const RealD Tnext=2*y*Tcur-Tprev;
    Tprev=Tcur;
    Tcur =Tnext;
    sum+= Tcur*Coeffs[i];
  }
  return sum;
};
// Derivative with respect to x of the series evaluated by approx():
//   sum_{n>=1} n*Coeffs[n]*U_{n-1}(y) / (0.5*(hi-lo)),  U_{n+1} = 2 y U_n - U_{n-1}
RealD approxD(RealD x)
{
  RealD Un;
  RealD Unm;
  RealD Unp;

  RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));

  RealD U0=1;
  RealD U1=2*y;

  RealD sum;
  sum = Coeffs[1]*U0;

  // With only two coefficients there is no Coeffs[2]; the unconditional read that used to
  // follow was out of bounds for order == 2.
  if(order > 2){
    sum+= Coeffs[2]*U1*2.0;

    Un =U1;
    Unm=U0;
    for(int i=2;i<order-1;i++){
      Unp=2*y*Un-Unm;
      Unm=Un;
      Un =Unp;
      sum+= Un*Coeffs[i+1]*(i+1.0);
    }
  }
  // chain rule: dy/dx = 1/(0.5*(hi-lo))
  return sum/(0.5*(hi-lo));
};
// Solve approx(x) = z by Newton iteration starting from x0.
// Returns the iterate once |approx(x)-z| / |z| drops below resid; returns quiet NaN if
// that does not happen within maxiter iterations.
RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) {
  RealD guess = x0;
  for (int iter=0; iter<maxiter; iter++) {
    const RealD err = approx(guess) - z;
    if (fabs(err / z) < resid) {
      return guess;
    }
    guess = guess - err / approxD(guess);   // Newton step
  }
  return std::numeric_limits<double>::quiet_NaN();
}
// Implement the required interface
// Apply the stored Chebyshev series of Linop to `in`, leaving the result in `out`:
//   out = 0.5*Coeffs[0]*T0 + sum_{n>=1} Coeffs[n]*T_n
// The T_n are built with the recurrence T_{n+1} = 2 x T_n - T_{n-1}, where x is the
// rescaled operator xscale*HermOp + mscale. Three work fields are rotated through the
// pointers Tnm/Tn/Tnp so no field copies are needed per iteration.
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in.Grid();
// std::cout << "Chevyshef(): in.Grid()="<<in.Grid()<<std::endl;
//std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
int vol=grid->gSites(); // NOTE(review): vol is not read anywhere below in this function
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
axpby(T1,xscale,mscale,y,in);
// sum = .5 c[0] T0 + c[1] T1
// out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1);
for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y);
// The three calls below replace these expression forms:
// y=xscale*y+mscale*(*Tn);
// *Tnp=2.0*y-(*Tnm);
// out=out+Coeffs[n]* (*Tnp);
axpby(y,xscale,mscale,y,(*Tn)); // y is both an input and the output here
axpby(*Tnp,2.0,-1.0,y,(*Tnm));
axpy(out,Coeffs[n],*Tnp,out);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
};
template<class Field>
class ChebyshevLanczos : public Chebyshev<Field> {
private:
std::vector<RealD> Coeffs;
int order;
RealD alpha;
RealD beta;
RealD mu;
public:
// Store the spectral parameters and select the single Chebyshev term of highest index:
// all coefficients are zero except Coeffs[order-1] = 1.
ChebyshevLanczos(RealD _alpha,RealD _beta,RealD _mu,int _order) :
  alpha(_alpha),
  beta(_beta),
  mu(_mu)
{
  order=_order;
  Coeffs.assign(order,0.0);   // `order` zero-valued coefficients
  Coeffs[order-1]=1.0;
};
void csv(std::ostream &out){
for (RealD x=-1.2*alpha; x<1.2*alpha; x+=(2.0*alpha)/10000) {
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
return;
}
// Scalar evaluation of the stored series (handy for plotting). The argument is first
// mapped through
//   y = ( 2*(xx-mu)^2 - (alpha^2+beta^2) ) / (alpha^2-beta^2)
// and the series 0.5*Coeffs[0] + sum_{i>=1} Coeffs[i]*T_i(y) is then summed with the
// recurrence T_{i+1} = 2 y T_i - T_{i-1}.
RealD approx(RealD xx)
{
  // Use RealD throughout, matching AminusMuSq, so the result does not depend on what the
  // separate `Real` alias resolves to.
  const RealD aa = alpha * alpha;
  const RealD bb = beta * beta;
  const RealD y = ( 2.0 * (xx-mu)*(xx-mu) - (aa+bb) ) / (aa-bb);

  RealD Tnm=1;   // T_0
  RealD Tn =y;   // T_1

  RealD sum;
  sum = 0.5*Coeffs[0]*Tnm;
  sum+= Coeffs[1]*Tn;

  for(int i=2;i<order;i++){
    const RealD Tnp=2*y*Tn-Tnm;
    Tnm=Tn;
    Tn =Tnp;
    sum+= Tn*Coeffs[i];
  }
  return sum;
};
// shift_Multiply in Rudy's code
// out = 2/(alpha^2-beta^2) * (HermOp - mu)^2 in  -  (alpha^2+beta^2)/(alpha^2-beta^2) * in
void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out)
{
  Field tmp(in._grid);

  const RealD aa= alpha*alpha;
  const RealD bb= beta * beta;
  const RealD scale = 2.0/ (aa-bb);
  const RealD shift = (aa+bb)/(aa-bb);

  // first application of (HermOp - mu): in -> out
  Linop.HermOp(in,out);
  out = out - mu*in;

  // second application of (HermOp - mu): out -> tmp
  Linop.HermOp(out,tmp);
  tmp = tmp - mu * out;

  out = scale * tmp - shift*in;
};
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in._grid;
int vol=grid->gSites();
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M )*in
AminusMuSq(Linop,T0,T1);
// sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
for(int n=2;n<order;n++){
AminusMuSq(Linop,*Tn,y);
*Tnp=2.0*y-(*Tnm);
out=out+Coeffs[n]* (*Tnp);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
beta(_beta),
mu(_mu)
{
order=_order;
Coeffs.resize(order);
for(int i=0;i<_order;i++){
Coeffs[i] = 0.0;
}
Coeffs[order-1]=1.0;
};
}
void csv(std::ostream &out){
for (RealD x=-1.2*alpha; x<1.2*alpha; x+=(2.0*alpha)/10000) {
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
return;
}
// Scalar evaluation of the stored series (handy for plotting). The argument is first
// mapped through
//   y = ( 2*(xx-mu)^2 - (alpha^2+beta^2) ) / (alpha^2-beta^2)
// and the series 0.5*Coeffs[0] + sum_{i>=1} Coeffs[i]*T_i(y) is then summed with the
// recurrence T_{i+1} = 2 y T_i - T_{i-1}.
RealD approx(RealD xx)
{
  // Use RealD throughout, matching AminusMuSq, so the result does not depend on what the
  // separate `Real` alias resolves to.
  const RealD aa = alpha * alpha;
  const RealD bb = beta * beta;
  const RealD y = ( 2.0 * (xx-mu)*(xx-mu) - (aa+bb) ) / (aa-bb);

  RealD Tnm=1;   // T_0
  RealD Tn =y;   // T_1

  RealD sum;
  sum = 0.5*Coeffs[0]*Tnm;
  sum+= Coeffs[1]*Tn;

  for(int i=2;i<order;i++){
    const RealD Tnp=2*y*Tn-Tnm;
    Tnm=Tn;
    Tn =Tnp;
    sum+= Tn*Coeffs[i];
  }
  return sum;
};
// shift_Multiply in Rudy's code
// out = 2/(alpha^2-beta^2) * (HermOp - mu)^2 in  -  (alpha^2+beta^2)/(alpha^2-beta^2) * in
void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out)
{
  Field tmp(in.Grid());

  const RealD aa= alpha*alpha;
  const RealD bb= beta * beta;
  const RealD scale = 2.0/ (aa-bb);
  const RealD shift = (aa+bb)/(aa-bb);

  // first application of (HermOp - mu): in -> out
  Linop.HermOp(in,out);
  out = out - mu*in;

  // second application of (HermOp - mu): out -> tmp
  Linop.HermOp(out,tmp);
  tmp = tmp - mu * out;

  out = scale * tmp - shift*in;
};
// Implement the required interface
// Apply the stored series to `in`, leaving the result in `out`:
//   out = 0.5*Coeffs[0]*T0 + sum_{n>=1} Coeffs[n]*T_n
// with T0 = in, T1 = AminusMuSq(T0) and T_{n+1} = 2*AminusMuSq(T_n) - T_{n-1}.
// Three work fields are rotated through the pointers Tnm/Tn/Tnp so no field copies are
// needed per iteration.
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in.Grid();
int vol=grid->gSites(); // NOTE(review): vol is not read anywhere below in this function
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M )*in
AminusMuSq(Linop,T0,T1);
// sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
for(int n=2;n<order;n++){
AminusMuSq(Linop,*Tn,y); // y = shifted/rescaled operator applied to T_n
*Tnp=2.0*y-(*Tnm);
out=out+Coeffs[n]* (*Tnp);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -26,127 +26,127 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
/* END LEGAL */
#ifndef INCLUDED_FORECAST_H
#define INCLUDED_FORECAST_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
// Abstract base class.
// Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
// and returns a forecasted solution to the system D*psi = phi (psi).
template<class Matrix, class Field>
class Forecast
// Abstract base class.
// Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
// and returns a forecasted solution to the system D*psi = phi (psi).
// Interface for solution forecasting: given a matrix, a source phi and a set of fields
// chi, return a forecasted solution field.
template<class Matrix, class Field>
class Forecast
{
public:
  // Polymorphic base: a virtual destructor makes deletion through a base pointer well defined.
  virtual ~Forecast() = default;
  virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
};
// Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
// used to forecast solutions across poles of the EOFA heatbath.
//
// Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
template<class Matrix, class Field>
class ChronoForecast : public Forecast<Matrix,Field>
{
public:
Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
{
public:
virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
int degree = prev_solns.size();
Field chi(phi); // forecasted solution
// Trivial cases
if(degree == 0){ chi = Zero(); return chi; }
else if(degree == 1){ return prev_solns[0]; }
// RealD dot;
ComplexD xp;
Field r(phi); // residual
Field Mv(phi);
std::vector<Field> v(prev_solns); // orthonormalized previous solutions
std::vector<Field> MdagMv(degree,phi);
// Array to hold the matrix elements
std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
// Solution and source vectors
std::vector<ComplexD> a(degree);
std::vector<ComplexD> b(degree);
// Orthonormalize the vector basis
for(int i=0; i<degree; i++){
v[i] *= 1.0/std::sqrt(norm2(v[i]));
for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
}
// Perform sparse matrix multiplication and construct rhs
for(int i=0; i<degree; i++){
b[i] = innerProduct(v[i],phi);
Mat.M(v[i],Mv);
Mat.Mdag(Mv,MdagMv[i]);
G[i][i] = innerProduct(v[i],MdagMv[i]);
}
// Construct the matrix
for(int j=0; j<degree; j++){
for(int k=j+1; k<degree; k++){
G[j][k] = innerProduct(v[j],MdagMv[k]);
G[k][j] = conjugate(G[j][k]);
}}
// Gauss-Jordan elimination with partial pivoting
for(int i=0; i<degree; i++){
// Perform partial pivoting
int k = i;
for(int j=i+1; j<degree; j++){ if(abs(G[j][j]) > abs(G[k][k])){ k = j; } }
if(k != i){
xp = b[k];
b[k] = b[i];
b[i] = xp;
for(int j=0; j<degree; j++){
xp = G[k][j];
G[k][j] = G[i][j];
G[i][j] = xp;
}
}
// Convert matrix to upper triangular form
for(int j=i+1; j<degree; j++){
xp = G[j][i]/G[i][i];
b[j] -= xp * b[i];
for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; }
}
}
// Use Gaussian elimination to solve equations and calculate initial guess
chi = Zero();
r = phi;
for(int i=degree-1; i>=0; i--){
a[i] = 0.0;
for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
a[i] = (b[i]-a[i])/G[i][i];
chi += a[i]*v[i];
r -= a[i]*MdagMv[i];
}
RealD true_r(0.0);
ComplexD tmp;
for(int i=0; i<degree; i++){
tmp = -b[i];
for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
tmp = conjugate(tmp)*tmp;
true_r += std::sqrt(tmp.real());
}
RealD error = std::sqrt(norm2(r)/norm2(phi));
std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;
return chi;
};
};
// Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
// used to forecast solutions across poles of the EOFA heatbath.
//
// Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
template<class Matrix, class Field>
class ChronoForecast : public Forecast<Matrix,Field>
{
public:
  // Forecast a solution chi of (Mdag M) chi = phi from a set of previous solutions.
  //
  //   Mat        : operator providing M(in,out) and Mdag(in,out)
  //   phi        : source vector
  //   prev_solns : previously obtained solutions; they are copied and orthonormalised
  //                locally, the caller's vectors are not modified
  //
  // The forecast is chi = sum_i a_i v_i, where v_i is the orthonormalised basis and the
  // coefficients solve the small dense system  G a = b  with
  //   G_ij = <v_i, Mdag M v_j> ,   b_i = <v_i, phi> .
  // Returns zero when no previous solution is supplied and the single previous solution
  // unchanged when exactly one is supplied.
  //
  // NOTE(review): linearly dependent previous solutions give a zero-norm vector during
  // the orthonormalisation below; that case is not guarded against here.
  Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
  {
    int degree = prev_solns.size();
    Field chi(phi); // forecasted solution

    // Trivial cases
    if(degree == 0){ chi = zero; return chi; }
    else if(degree == 1){ return prev_solns[0]; }

    ComplexD xp;
    Field r(phi); // residual
    Field Mv(phi);
    std::vector<Field> v(prev_solns); // orthonormalized previous solutions
    std::vector<Field> MdagMv(degree,phi);

    // Array to hold the matrix elements
    std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));

    // Solution and source vectors
    std::vector<ComplexD> a(degree);
    std::vector<ComplexD> b(degree);

    // Orthonormalize the vector basis (modified Gram-Schmidt)
    for(int i=0; i<degree; i++){
      v[i] *= 1.0/std::sqrt(norm2(v[i]));
      for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
    }

    // Perform sparse matrix multiplication and construct rhs
    for(int i=0; i<degree; i++){
      b[i] = innerProduct(v[i],phi);
      Mat.M(v[i],Mv);
      Mat.Mdag(Mv,MdagMv[i]);
      G[i][i] = innerProduct(v[i],MdagMv[i]);
    }

    // Construct the matrix; only the upper triangle is computed, the lower
    // triangle is filled in by Hermiticity
    for(int j=0; j<degree; j++){
    for(int k=j+1; k<degree; k++){
      G[j][k] = innerProduct(v[j],MdagMv[k]);
      G[k][j] = std::conj(G[j][k]);
    }}

    // Gaussian elimination with partial (row) pivoting
    for(int i=0; i<degree; i++){

      // Pick the row whose entry in column i is largest in magnitude. Only rows are
      // exchanged below, so the element that ends up on the pivot position is G[k][i];
      // the selection therefore has to look at column i rather than at the diagonal.
      int k = i;
      for(int j=i+1; j<degree; j++){ if(std::abs(G[j][i]) > std::abs(G[k][i])){ k = j; } }
      if(k != i){
        xp = b[k];
        b[k] = b[i];
        b[i] = xp;
        for(int j=0; j<degree; j++){
          xp = G[k][j];
          G[k][j] = G[i][j];
          G[i][j] = xp;
        }
      }

      // Convert matrix to upper triangular form
      for(int j=i+1; j<degree; j++){
        xp = G[j][i]/G[i][i];
        b[j] -= xp * b[i];
        for(int c=0; c<degree; c++){ G[j][c] -= xp*G[i][c]; }
      }
    }

    // Back substitution: solve for the coefficients and accumulate the initial guess
    // together with its residual r = phi - sum_i a_i MdagM v_i
    chi = zero;
    r = phi;
    for(int i=degree-1; i>=0; i--){
      a[i] = 0.0;
      for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
      a[i] = (b[i]-a[i])/G[i][i];
      chi += a[i]*v[i];
      r -= a[i]*MdagMv[i];
    }

    RealD error = std::sqrt(norm2(r)/norm2(phi));
    std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;

    return chi;
  };
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,129 @@
#ifndef GRID_JACOBIPOLYNOMIAL_H
#define GRID_JACOBIPOLYNOMIAL_H
#include <Grid/algorithms/LinearOperator.h>
NAMESPACE_BEGIN(Grid);
// Jacobi polynomial P_order^{(alpha,beta)} of a Hermitian operator whose spectrum
// interval [lo,hi] is mapped linearly onto [-1,1].
//
// The polynomial is built with the three-term recurrence
//   cnp P_n = (cny y + cn1) P_{n-1} + cnm P_{n-2}
// starting from P_0 = 1 and P_1 = (alpha-beta)/2 + (alpha+beta+2)/2 y.
// approx() evaluates it for a real argument, operator() applies it to a field.
template<class Field>
class JacobiPolynomial : public OperatorFunction<Field> {
 private:
  using OperatorFunction<Field>::operator();

  int order;    // polynomial degree
  RealD hi;     // upper end of the mapped interval
  RealD lo;     // lower end of the mapped interval
  RealD alpha;  // Jacobi parameter alpha
  RealD beta;   // Jacobi parameter beta

 public:
  // Write "x approx(x)" pairs over the interval [lo,hi] to out.
  void csv(std::ostream &out){
    csv(out,lo,hi);
  }
  // Write "x approx(x)" pairs to out, stepping through [llo,hhi] in increments of
  // 1e-5 of the interval width (starting one step below llo).
  void csv(std::ostream &out,RealD llo,RealD hhi){
    RealD diff = hhi-llo;
    RealD delta = diff*1.0e-5;
    // A zero step (hhi==llo) would never advance x and loop forever;
    // a negative or NaN step never entered the loop anyway.
    if ( !(delta > 0.0) ) return;
    for (RealD x=llo-delta; x<=hhi; x+=delta) {
      RealD f = approx(x);
      out<< x<<" "<<f <<std::endl;
    }
    return;
  }

  JacobiPolynomial(){};
  JacobiPolynomial(RealD _lo,RealD _hi,int _order,RealD _alpha, RealD _beta)
  {
      lo=_lo;
      hi=_hi;
      alpha=_alpha;
      beta=_beta;
      order=_order;
  };

  // Scalar evaluation of the polynomial at x; convenience for plotting the approximation.
  RealD approx(RealD x)
  {
    // Map [lo,hi] onto [-1,1]
    RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));

    RealD Tnm=1.0;                                       // P_0
    if ( order < 1 ) return Tnm;                         // nothing to recurse on

    RealD Tn =(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;   // P_1

    for(int n=2;n<=order;n++){
      RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
      RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
      RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
      RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
      RealD Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
      Tnm=Tn;
      Tn =Tnp;
    }
    // Tn always holds the highest degree computed so far (P_1 when order==1)
    return Tn;
  };

  // Implement the required interface: out = P_order( mapped Linop ) in
  void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {

    // P_0 is the identity
    if ( order < 1 ) { out = in; return; }

    GridBase *grid=in.Grid();

    Field T0(grid);
    Field T1(grid);
    Field T2(grid);
    Field y(grid);

    Field *Tnm = &T0;
    Field *Tn  = &T1;
    Field *Tnp = &T2;

    //    RealD T0=1.0;
    T0=in;

    //    RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
    //           = x * 2/(hi-lo) - (hi+lo)/(hi-lo)
    RealD xscale = 2.0/(hi-lo);
    RealD mscale = -(hi+lo)/(hi-lo);
    Linop.HermOp(T0,y);
    y=y*xscale+in*mscale;

    // RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;
    RealD halfAmB  = (alpha-beta)*0.5;
    RealD halfApBp2= (alpha+beta+2.0)*0.5;
    T1 = halfAmB * in + halfApBp2*y;

    for(int n=2;n<=order;n++){

      // y <- (mapped operator) applied to P_{n-1}
      Linop.HermOp(*Tn,y);
      y=xscale*y+mscale*(*Tn);

      RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
      RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
      RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
      RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);

      //      Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
      // Each coefficient is normalised by cnp exactly once, matching approx()
      cny=cny/cnp;
      cn1=cn1/cnp;
      cnm=cnm/cnp;

      *Tnp=cny*y + cn1 *(*Tn) + cnm * (*Tnm);

      // Cycle pointers to avoid copies
      Field *swizzle = Tnm;
      Tnm    =Tn;
      Tn     =Tnp;
      Tnp    =swizzle;
    }
    // After the rotation the newest polynomial lives in *Tn; *Tnp is scratch
    // space holding an older term.
    out=*Tn;
  }
};
NAMESPACE_END(Grid);
#endif

View File

@@ -27,7 +27,8 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
/* END LEGAL */
#include <Grid/GridCore.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
double MultiShiftFunction::approx(double x)
{
double a = norm;
@@ -53,4 +54,4 @@ void MultiShiftFunction::csv(std::ostream &out)
}
return;
}
}
NAMESPACE_END(Grid);

View File

@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef MULTI_SHIFT_FUNCTION
#define MULTI_SHIFT_FUNCTION
namespace Grid {
NAMESPACE_BEGIN(Grid);
class MultiShiftFunction {
public:
@@ -63,5 +63,5 @@ public:
}
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -298,7 +298,7 @@ void AlgRemez::stpini(bigfloat *step) {
// Search for error maxima and minima
void AlgRemez::search(bigfloat *step) {
bigfloat a, q, xm, ym, xn, yn, xx0, xx1;
int i, j, meq, emsign, ensign, steps;
int i, meq, emsign, ensign, steps;
meq = neq + 1;
bigfloat *yy = new bigfloat[meq];
@@ -306,7 +306,6 @@ void AlgRemez::search(bigfloat *step) {
bigfloat eclose = 1.0e30;
bigfloat farther = 0l;
j = 1;
xx0 = apstrt;
for (i = 0; i < meq; i++) {

View File

@@ -58,8 +58,8 @@
/* Compute the partial fraction expansion coefficients (alpha) from the
* factored form */
namespace Grid {
namespace Approx {
NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Approx);
static void construct_partfrac(izd *z) {
int dn = z -> dn, dd = z -> dd, type = z -> type;
@@ -516,7 +516,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
free(d);
return zd;
}
}}
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#ifdef TEST
@@ -585,6 +587,7 @@ static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
return (ONE - T) / (ONE + T);
}
/* Test program. Apart from printing out the parameters for R(x) it produces
* the following data files for plotting (unless NPLOT is defined):
*
@@ -723,5 +726,5 @@ int main(int argc, char** argv) {
return EXIT_SUCCESS;
}
#endif /* TEST */

View File

@@ -1,13 +1,13 @@
/* -*- Mode: C; comment-column: 22; fill-column: 79; -*- */
#ifdef __cplusplus
namespace Grid {
namespace Approx {
#include <Grid/Namespace.h>
NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Approx);
#endif
#define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
#ifndef ZOLOTAREV_INTERNAL
#ifndef PRECISION
#define PRECISION double
@@ -83,5 +83,6 @@ void zolotarev_free(zolotarev_data *zdata);
#endif
#ifdef __cplusplus
}}
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#endif

View File

@@ -10,10 +10,12 @@
#ifndef INCLUDED_BIGFLOAT_H
#define INCLUDED_BIGFLOAT_H
#define __GMP_WITHIN_CONFIGURE
#include <gmp.h>
#include <mpf2mpfr.h>
#include <mpfr.h>
#undef __GMP_WITHIN_CONFIGURE
class bigfloat {
private:

View File

@@ -90,8 +90,8 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
void operator() (const Field &src, Field &psi){
void operator() (const Field &src, Field &psi){
psi.checkerboard = src.checkerboard;
grid = src._grid;
psi.Checkerboard() = src.Checkerboard();
grid = src.Grid();
RealD f;
RealD rtzp,rtz,a,d,b;

View File

@@ -27,11 +27,9 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H
#define GRID_BLOCK_CONJUGATE_GRADIENT_H
#pragma once
namespace Grid {
NAMESPACE_BEGIN(Grid);
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
@@ -154,12 +152,12 @@ virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Fiel
void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = B._grid->_fdimensions[Orthog];
Nblock = B.Grid()->_fdimensions[Orthog];
/* FAKE */
Nblock=8;
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
X.checkerboard = B.checkerboard;
X.Checkerboard() = B.Checkerboard();
conformable(X, B);
Field tmp(B);
@@ -334,11 +332,11 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
int Orthog = blockDim; // First dimension is block dim
Nblock = Src._grid->_fdimensions[Orthog];
Nblock = Src.Grid()->_fdimensions[Orthog];
std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
Psi.checkerboard = Src.checkerboard;
Psi.Checkerboard() = Src.Checkerboard();
conformable(Psi, Src);
Field P(Src);
@@ -478,7 +476,7 @@ void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] + (scale*m(bp,b))*X[bp];
tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
@@ -488,9 +486,9 @@ void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
for(int b=0;b<Nblock;b++){
AP[b] = zero;
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += (m(bp,b))*X[bp];
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
@@ -517,7 +515,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
for(int b=0;b<Nblock;b++){
X[b].checkerboard = B[b].checkerboard;
X[b].Checkerboard() = B[b].Checkerboard();
conformable(X[b], B[b]);
conformable(X[b], X[0]);
}
@@ -690,9 +688,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
IterationsToComplete = k;
}
};
}
#endif
NAMESPACE_END(Grid);

View File

@@ -34,6 +34,8 @@ namespace Grid {
template<class Field>
class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge,
// defaults to true
@@ -52,10 +54,10 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
Eigen::MatrixXcd H;
std::vector<std::complex<double>> y;
std::vector<std::complex<double>> gamma;
std::vector<std::complex<double>> c;
std::vector<std::complex<double>> s;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
CommunicationAvoidingGeneralisedMinimalResidual(RealD tol,
Integer maxit,
@@ -76,7 +78,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular GMRES" << std::endl;
psi.checkerboard = src.checkerboard;
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
@@ -86,7 +88,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Field r(src._grid);
Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
@@ -142,11 +144,11 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
RealD cp = 0;
Field w(src._grid);
Field r(src._grid);
Field w(src.Grid());
Field r(src.Grid());
// this should probably be made a class member so that it is only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
@@ -157,7 +159,9 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
gamma[0] = sqrt(norm2(r));
v[0] = (1. / gamma[0]) * r;
ComplexD scale = 1.0/gamma[0];
v[0] = scale * r;
LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) {
@@ -168,7 +172,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
qrUpdate(i);
cp = std::norm(gamma[i+1]);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
@@ -194,11 +198,11 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - H(iter, i) * v[i];
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = (1. / H(iter, iter + 1)) * w;
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
@@ -206,13 +210,13 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
H(iter, i) = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
@@ -221,7 +225,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = std::conj(c[iter]) * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
@@ -231,8 +235,8 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - H(k, i) * y[k];
y[i] = y[i] / H(i, i);
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)

View File

@@ -27,11 +27,11 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_H
#define GRID_CONJUGATE_GRADIENT_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
@@ -40,7 +40,10 @@ namespace Grid {
template <class Field>
class ConjugateGradient : public OperatorFunction<Field> {
public:
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
@@ -48,17 +51,18 @@ class ConjugateGradient : public OperatorFunction<Field> {
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv){};
: Tolerance(tol),
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv){};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
psi.Checkerboard() = src.Checkerboard();
psi.checkerboard = src.checkerboard;
conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred;
RealD cp, c, a, d, b, ssq, qq;
//RealD b_pred;
Field p(src);
Field mmp(src);
@@ -68,7 +72,6 @@ class ConjugateGradient : public OperatorFunction<Field> {
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
@@ -127,10 +130,13 @@ class ConjugateGradient : public OperatorFunction<Field> {
b = cp / c;
LinearCombTimer.Start();
parallel_for(int ss=0;ss<src._grid->oSites();ss++){
vstream(psi[ss], a * p[ss] + psi[ss]);
vstream(p [ss], b * p[ss] + r[ss]);
}
auto psi_v = psi.View();
auto p_v = p.View();
auto r_v = r.View();
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
});
LinearCombTimer.Stop();
LinalgTimer.Stop();
@@ -143,22 +149,22 @@ class ConjugateGradient : public OperatorFunction<Field> {
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k
<< "\tComputed residual " << std::sqrt(cp / ssq)
<< "\tTrue residual " << true_residual
<< "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "Time breakdown "<<std::endl;
std::cout << GridLogIterative << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
@@ -174,5 +180,5 @@ class ConjugateGradient : public OperatorFunction<Field> {
}
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,13 +23,12 @@ Author: Christopher Kelly <ckelly@phys.columbia.edu>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF,
@@ -67,98 +66,96 @@ namespace Grid {
guesser = &g;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
TotalInnerIterations = 0;
void operator() (const FieldD &src_d_in, FieldD &sol_d){
TotalInnerIterations = 0;
GridStopWatch TotalTimer;
TotalTimer.Start();
GridStopWatch TotalTimer;
TotalTimer.Start();
int cb = src_d_in.checkerboard;
sol_d.checkerboard = cb;
int cb = src_d_in.Checkerboard();
sol_d.Checkerboard() = cb;
RealD src_norm = norm2(src_d_in);
RealD stop = src_norm * Tolerance*Tolerance;
RealD src_norm = norm2(src_d_in);
RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in._grid;
FieldD tmp_d(DoublePrecGrid);
tmp_d.checkerboard = cb;
GridBase* DoublePrecGrid = src_d_in.Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.checkerboard = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation
FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation
RealD inner_tol = InnerTolerance;
RealD inner_tol = InnerTolerance;
FieldF src_f(SinglePrecGrid);
src_f.checkerboard = cb;
FieldF src_f(SinglePrecGrid);
src_f.Checkerboard() = cb;
FieldF sol_f(SinglePrecGrid);
sol_f.checkerboard = cb;
FieldF sol_f(SinglePrecGrid);
sol_f.Checkerboard() = cb;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
GridStopWatch InnerCGtimer;
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
GridStopWatch PrecChangeTimer;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break;
}
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start();
precisionChange(src_f, src_d);
PrecChangeTimer.Stop();
zeroit(sol_f);
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL)
(*guesser)(src_f, sol_f);
//Inner CG
CG_f.Tolerance = inner_tol;
InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f);
InnerCGtimer.Stop();
TotalInnerIterations += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);
if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break;
}
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
//Final trial CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
PrecChangeTimer.Start();
precisionChange(src_f, src_d);
PrecChangeTimer.Stop();
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete;
sol_f = Zero();
TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL)
(*guesser)(src_f, sol_f);
//Inner CG
CG_f.Tolerance = inner_tol;
InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f);
InnerCGtimer.Stop();
TotalInnerIterations += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);
}
};
//Final trial CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete;
}
TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -24,146 +24,149 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
#define GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class ConjugateGradientMultiShift : public OperatorMultiFunction<Field>,
public OperatorFunction<Field>
{
template<class Field>
class ConjugateGradientMultiShift : public OperatorMultiFunction<Field>,
public OperatorFunction<Field>
{
public:
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
int verbose;
MultiShiftFunction shifts;
ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :
MaxIterations(maxit),
shifts(_shifts)
{
verbose=1;
using OperatorFunction<Field>::operator();
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
int verbose;
MultiShiftFunction shifts;
ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :
MaxIterations(maxit),
shifts(_shifts)
{
verbose=1;
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<Field> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
GridBase *grid = src._grid;
int nshift = shifts.order;
std::vector<Field> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
return;
}
return;
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
{
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
{
GridBase *grid = src.Grid();
GridBase *grid = src._grid;
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
std::vector<Field> ps(nshift,grid);// Search directions
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
std::vector<Field> ps(nshift,grid);// Search directions
assert(psi.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
assert(psi.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
Field r(grid);
Field p(grid);
Field tmp(grid);
Field mmp(grid);
// Matrix mult fields
Field r(grid);
Field p(grid);
Field tmp(grid);
Field mmp(grid);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src);
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
<<" target resid "<<rsq[s]<<std::endl;
ps[s] = src;
}
// r and p for primary
r=src;
p=src;
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src);
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
<<" target resid "<<rsq[s]<<std::endl;
ps[s] = src;
}
// r and p for primary
r=src;
p=src;
//MdagM+m[0]
Linop.HermOpAndNorm(p,mmp,d,qq);
axpy(mmp,mass[0],p,mmp);
RealD rn = norm2(p);
d += rn*mass[0];
//MdagM+m[0]
Linop.HermOpAndNorm(p,mmp,d,qq);
axpy(mmp,mass[0],p,mmp);
RealD rn = norm2(p);
d += rn*mass[0];
// have verified that inner product of
// p and mmp is equal to d after this since
// the d computation is tricky
// qq = real(innerProduct(p,mmp));
// std::cout<<GridLogMessage << "debug equal ? qq "<<qq<<" d "<< d<<std::endl;
// have verified that inner product of
// p and mmp is equal to d after this since
// the d computation is tricky
// qq = real(innerProduct(p,mmp));
// std::cout<<GridLogMessage << "debug equal ? qq "<<qq<<" d "<< d<<std::endl;
b = -cp /d;
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r,b,mmp,r);
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r,b,mmp,r);
for(int s=0;s<nshift;s++) {
axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
}
for(int s=0;s<nshift;s++) {
axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
}
///////////////////////////////////////
// Timers
@@ -175,37 +178,37 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
GridStopWatch SolverTimer;
SolverTimer.Start();
// Iteration loop
int k;
// Iteration loop
int k;
for (k=1;k<=MaxIterations;k++){
for (k=1;k<=MaxIterations;k++){
a = c /cp;
a = c /cp;
AXPYTimer.Start();
axpy(p,a,p,r);
axpy(p,a,p,r);
AXPYTimer.Stop();
// Note to self - direction ps is iterated separately
// for each shift. Does not appear to have any scope
// for avoiding linear algebra in "single" case.
//
// However SAME r is used. Could load "r" and update
// ALL ps[s]. 2/3 Bandwidth saving
// New Kernel: Load r, vector of coeffs, vector of pointers ps
// Note to self - direction ps is iterated separately
// for each shift. Does not appear to have any scope
// for avoiding linear algebra in "single" case.
//
// However SAME r is used. Could load "r" and update
// ALL ps[s]. 2/3 Bandwidth saving
// New Kernel: Load r, vector of coeffs, vector of pointers ps
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps[s],a,ps[s],r);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps[s],z[s][iz],as,r,ps[s]);
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps[s],a,ps[s],r);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps[s],z[s][iz],as,r,ps[s]);
}
}
}
}
AXPYTimer.Stop();
cp=c;
cp=c;
MatrixTimer.Start();
//Linop.HermOpAndNorm(p,mmp,d,qq); // d is used
// The below is faster on KNL
@@ -215,89 +218,89 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
MatrixTimer.Stop();
AXPYTimer.Start();
axpy(mmp,mass[0],p,mmp);
axpy(mmp,mass[0],p,mmp);
AXPYTimer.Stop();
RealD rn = norm2(p);
d += rn*mass[0];
RealD rn = norm2(p);
d += rn*mass[0];
bp=b;
b=-cp/d;
bp=b;
b=-cp/d;
AXPYTimer.Start();
c=axpy_norm(r,b,mmp,r);
c=axpy_norm(r,b,mmp,r);
AXPYTimer.Stop();
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
}
ShiftTimer.Stop();
for(int s=0;s<nshift;s++){
int ss = s;
// Scope for optimisation here in case of "single".
// Could load psi[0] and pull all ps[s] in.
// if ( single ) ss=primary;
// Bandwidth saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving
// Pipelined CG gain:
//
// New Kernel: Load r, vector of coeffs, vector of pointers ps
// New Kernel: Load psi[0], vector of coeffs, vector of pointers ps
// If we can predict the coefficient bs then we can fuse these and avoid the write/reread cycle
// on ps[s].
// Before: 3 x npole + 3 x npole
// After : 2 x npole (ps[s]) => 3x speed up of multishift CG.
for(int s=0;s<nshift;s++){
int ss = s;
// Scope for optimisation here in case of "single".
// Could load psi[0] and pull all ps[s] in.
// if ( single ) ss=primary;
// Bandwidth saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving
// Pipelined CG gain:
//
// New Kernel: Load r, vector of coeffs, vector of pointers ps
// New Kernel: Load psi[0], vector of coeffs, vector of pointers ps
// If we can predict the coefficient bs then we can fuse these and avoid the write/reread cycle
// on ps[s].
// Before: 3 x npole + 3 x npole
// After : 2 x npole (ps[s]) => 3x speed up of multishift CG.
if( (!converged[s]) ) {
axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]);
}
}
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsq[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
if( (!converged[s]) ) {
axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]);
}
}
}
if ( all_converged ){
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsq[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged ){
SolverTimer.Stop();
std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
// Check answers
for(int s=0; s < nshift; s++) {
Linop.HermOpAndNorm(psi[s],mmp,d,qq);
axpy(tmp,mass[s],psi[s],mmp);
axpy(r,-alpha[s],src,tmp);
RealD rn = norm2(r);
RealD cn = norm2(src);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
Linop.HermOpAndNorm(psi[s],mmp,d,qq);
axpy(tmp,mass[s],psi[s],mmp);
axpy(r,-alpha[s],src,tmp);
RealD rn = norm2(r);
RealD cn = norm2(src);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
}
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
@@ -307,16 +310,16 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
IterationsToComplete = k;
return;
return;
}
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
}
};
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,234 +23,236 @@ Author: Christopher Kelly <ckelly@phys.columbia.edu>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
Integer ReliableUpdatesPerformed;
template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
Integer ReliableUpdatesPerformed;
bool DoFinalCleanup; //Final DP cleanup, defaults to true
Integer IterationsToCleanup; //Final DP cleanup step iterations
bool DoFinalCleanup; //Final DP cleanup, defaults to true
Integer IterationsToCleanup; //Final DP cleanup step iterations
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
GridBase* SinglePrecGrid;
RealD Delta; //reliable update parameter
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
GridBase* SinglePrecGrid;
RealD Delta; //reliable update parameter
//Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
LinearOperatorBase<FieldF> *Linop_fallback;
RealD fallback_transition_tol;
//Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
LinearOperatorBase<FieldF> *Linop_fallback;
RealD fallback_transition_tol;
ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
Delta(_delta),
Linop_f(_Linop_f),
Linop_d(_Linop_d),
SinglePrecGrid(_sp_grid),
ErrorOnNoConverge(err_on_no_conv),
DoFinalCleanup(true),
Linop_fallback(NULL)
{};
ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
Delta(_delta),
Linop_f(_Linop_f),
Linop_d(_Linop_d),
SinglePrecGrid(_sp_grid),
ErrorOnNoConverge(err_on_no_conv),
DoFinalCleanup(true),
Linop_fallback(NULL)
{};
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
Linop_fallback = &_Linop_fallback;
fallback_transition_tol = _fallback_transition_tol;
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
Linop_fallback = &_Linop_fallback;
fallback_transition_tol = _fallback_transition_tol;
}
void operator()(const FieldD &src, FieldD &psi) {
LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
bool using_fallback = false;
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred;
FieldD p(src);
FieldD mmp(src);
FieldD r(src);
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop_d.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
cp = a;
ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: p " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
// Check if guess is really REALLY good :)
if (cp <= rsq) {
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
return;
}
void operator()(const FieldD &src, FieldD &psi) {
LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
bool using_fallback = false;
//Single prec initialization
FieldF r_f(SinglePrecGrid);
r_f.Checkerboard() = r.Checkerboard();
precisionChange(r_f, r);
psi.checkerboard = src.checkerboard;
conformable(psi, src);
FieldF psi_f(r_f);
psi_f = Zero();
RealD cp, c, a, d, b, ssq, qq, b_pred;
FieldF p_f(r_f);
FieldF mmp_f(r_f);
FieldD p(src);
FieldD mmp(src);
FieldD r(src);
RealD MaxResidSinceLastRelUp = cp; //initial residual
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
Linop_d.HermOpAndNorm(psi, mmp, d, b);
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
r = src - mmp;
p = r;
SolverTimer.Start();
int k = 0;
int l = 0;
a = norm2(p);
cp = a;
ssq = norm2(src);
for (k = 1; k <= MaxIterations; k++) {
c = cp;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: p " << a << std::endl;
MatrixTimer.Start();
Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
MatrixTimer.Stop();
RealD rsq = Tolerance * Tolerance * ssq;
LinalgTimer.Start();
// Check if guess is really REALLY good :)
a = c / d;
b_pred = a * (a * qq - d) / c;
cp = axpy_norm(r_f, -a, mmp_f, r_f);
b = cp / c;
// Fuse these loops ; should be really easy
psi_f = a * p_f + psi_f;
//p_f = p_f * b + r_f;
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
if(cp > MaxResidSinceLastRelUp){
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
MaxResidSinceLastRelUp = cp;
}
// Stopping condition
if (cp <= rsq) {
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
//Although not written in the paper, I assume that I have to add on the final solution
precisionChange(mmp, psi_f);
psi = psi + mmp;
SolverTimer.Stop();
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
if(DoFinalCleanup){
//Do a final CG to cleanup
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
CG.ErrorOnNoConverge = ErrorOnNoConverge;
CG(Linop_d,src,psi);
IterationsToCleanup = CG.IterationsToComplete;
}
else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
return;
}
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
precisionChange(mmp, psi_f);
psi = psi + mmp;
//Single prec initialization
FieldF r_f(SinglePrecGrid);
r_f.checkerboard = r.checkerboard;
precisionChange(r_f, r);
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
r = src - mmp;
FieldF psi_f(r_f);
psi_f = zero;
psi_f = Zero();
precisionChange(r_f, r);
cp = norm2(r);
MaxResidSinceLastRelUp = cp;
FieldF p_f(r_f);
FieldF mmp_f(r_f);
RealD MaxResidSinceLastRelUp = cp; //initial residual
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k = 0;
int l = 0;
for (k = 1; k <= MaxIterations; k++) {
c = cp;
MatrixTimer.Start();
Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
MatrixTimer.Stop();
LinalgTimer.Start();
a = c / d;
b_pred = a * (a * qq - d) / c;
cp = axpy_norm(r_f, -a, mmp_f, r_f);
b = cp / c;
// Fuse these loops ; should be really easy
psi_f = a * p_f + psi_f;
//p_f = p_f * b + r_f;
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
if(cp > MaxResidSinceLastRelUp){
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
MaxResidSinceLastRelUp = cp;
}
// Stopping condition
if (cp <= rsq) {
//Although not written in the paper, I assume that I have to add on the final solution
precisionChange(mmp, psi_f);
psi = psi + mmp;
SolverTimer.Stop();
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
if(DoFinalCleanup){
//Do a final CG to cleanup
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
CG.ErrorOnNoConverge = ErrorOnNoConverge;
CG(Linop_d,src,psi);
IterationsToCleanup = CG.IterationsToComplete;
}
else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
return;
}
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
precisionChange(mmp, psi_f);
psi = psi + mmp;
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
r = src - mmp;
psi_f = zero;
precisionChange(r_f, r);
cp = norm2(r);
MaxResidSinceLastRelUp = cp;
b = cp/c;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
l = l+1;
}
p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
Linop_f_use = Linop_fallback;
using_fallback = true;
}
b = cp/c;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
l = l+1;
}
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
Linop_f_use = Linop_fallback;
using_fallback = true;
}
}
};
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -24,88 +24,90 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_RESIDUAL_H
#define GRID_CONJUGATE_RESIDUAL_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class ConjugateResidual : public OperatorFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
int verbose;
template<class Field>
class ConjugateResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
ConjugateResidual(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) {
verbose=0;
};
RealD Tolerance;
Integer MaxIterations;
int verbose;
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
ConjugateResidual(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) {
verbose=0;
};
RealD a, b, c, d;
RealD cp, ssq,rsq;
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
RealD rAr, rAAr, rArp;
RealD pAp, pAAp;
RealD a, b; // c, d;
RealD cp, ssq,rsq;
GridBase *grid = src._grid;
psi=zero;
Field r(grid), p(grid), Ap(grid), Ar(grid);
RealD rAr, rAAr, rArp;
RealD pAp, pAAp;
r=src;
p=src;
GridBase *grid = src.Grid();
psi=Zero();
Field r(grid), p(grid), Ap(grid), Ar(grid);
r=src;
p=src;
Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
cp =norm2(r);
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
if (verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
for(int k=1;k<MaxIterations;k++){
a = rAr/pAAp;
axpy(psi,a,p,psi);
cp = axpy_norm(r,-a,Ap,r);
rArp=rAr;
Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
cp =norm2(r);
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
b =rAr/rArp;
if (verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
axpy(p,b,p,r);
pAAp=axpy_norm(Ap,b,Ap,Ar);
for(int k=1;k<MaxIterations;k++){
a = rAr/pAAp;
axpy(psi,a,p,psi);
cp = axpy_norm(r,-a,Ap,r);
rArp=rAr;
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
b =rAr/rArp;
axpy(p,b,p,r);
pAAp=axpy_norm(Ap,b,Ap,Ar);
if(verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
if(cp<rsq) {
Linop.HermOp(psi,Ap);
axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual "<<sqrt(true_resid)
<< " target " <<Tolerance <<std::endl;
return;
}
if(verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
if(cp<rsq) {
Linop.HermOp(psi,Ap);
axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<std::sqrt(cp/ssq)
<< " true residual "<<std::sqrt(true_resid)
<< " target " <<Tolerance <<std::endl;
return;
}
std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
assert(0);
}
};
}
std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -33,7 +33,7 @@ namespace Grid {
template<class Field>
class ZeroGuesser: public LinearFunction<Field> {
public:
virtual void operator()(const Field &src, Field &guess) { guess = zero; };
virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
};
template<class Field>
class DoNothingGuesser: public LinearFunction<Field> {
@@ -60,14 +60,14 @@ public:
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
virtual void operator()(const Field &src,Field &guess) {
guess = zero;
guess = Zero();
assert(evec.size()==eval.size());
auto N = evec.size();
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
}
guess.checkerboard = src.checkerboard;
guess.Checkerboard() = src.Checkerboard();
}
};
@@ -90,15 +90,15 @@ public:
void operator()(const FineField &src,FineField &guess) {
int N = (int)evec_coarse.size();
CoarseField src_coarse(evec_coarse[0]._grid);
CoarseField guess_coarse(evec_coarse[0]._grid); guess_coarse = zero;
CoarseField src_coarse(evec_coarse[0].Grid());
CoarseField guess_coarse(evec_coarse[0].Grid()); guess_coarse = Zero();
blockProject(src_coarse,src,subspace);
for (int i=0;i<N;i++) {
const CoarseField & tmp = evec_coarse[i];
axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
}
blockPromote(guess_coarse,guess,subspace);
guess.checkerboard = src.checkerboard;
guess.Checkerboard() = src.Checkerboard();
};
};

View File

@@ -34,6 +34,8 @@ namespace Grid {
template<class Field>
class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge,
// defaults to true
@@ -53,10 +55,10 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
Eigen::MatrixXcd H;
std::vector<std::complex<double>> y;
std::vector<std::complex<double>> gamma;
std::vector<std::complex<double>> c;
std::vector<std::complex<double>> s;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
LinearFunction<Field> &Preconditioner;
@@ -81,7 +83,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular FGMRES" << std::endl;
psi.checkerboard = src.checkerboard;
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
@@ -91,7 +93,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Field r(src._grid);
Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
@@ -149,12 +151,12 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
RealD cp = 0;
Field w(src._grid);
Field r(src._grid);
Field w(src.Grid());
Field r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
std::vector<Field> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero;
std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<Field> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
@@ -176,7 +178,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
qrUpdate(i);
cp = std::norm(gamma[i+1]);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
@@ -206,11 +208,11 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - H(iter, i) * v[i];
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = (1. / H(iter, iter + 1)) * w;
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
@@ -218,13 +220,13 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
H(iter, i) = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
@@ -233,7 +235,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = std::conj(c[iter]) * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
@@ -243,8 +245,8 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - H(k, i) * y[k];
y[i] = y[i] / H(i, i);
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)

View File

@@ -34,6 +34,8 @@ namespace Grid {
template<class Field>
class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge,
// defaults to true
@@ -53,10 +55,10 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
Eigen::MatrixXcd H;
std::vector<std::complex<double>> y;
std::vector<std::complex<double>> gamma;
std::vector<std::complex<double>> c;
std::vector<std::complex<double>> s;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
LinearFunction<Field> &Preconditioner;
@@ -79,7 +81,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
psi.checkerboard = src.checkerboard;
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
@@ -89,7 +91,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Field r(src._grid);
Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: guess " << guess << std::endl;
@@ -147,12 +149,12 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
RealD cp = 0;
Field w(src._grid);
Field r(src._grid);
Field w(src.Grid());
Field r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
std::vector<Field> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero;
std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<Field> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
@@ -174,7 +176,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
qrUpdate(i);
cp = std::norm(gamma[i+1]);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
@@ -204,11 +206,11 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - H(iter, i) * v[i];
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = (1. / H(iter, iter + 1)) * w;
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
@@ -216,13 +218,13 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
H(iter, i) = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
@@ -231,7 +233,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = std::conj(c[iter]) * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
@@ -241,8 +243,8 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - H(k, i) * y[k];
y[i] = y[i] / H(i, i);
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)

View File

@@ -34,6 +34,8 @@ namespace Grid {
template<class Field>
class GeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge,
// defaults to true
@@ -52,10 +54,10 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
Eigen::MatrixXcd H;
std::vector<std::complex<double>> y;
std::vector<std::complex<double>> gamma;
std::vector<std::complex<double>> c;
std::vector<std::complex<double>> s;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
GeneralisedMinimalResidual(RealD tol,
Integer maxit,
@@ -74,7 +76,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
psi.checkerboard = src.checkerboard;
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
@@ -84,7 +86,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Field r(src._grid);
Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "GeneralisedMinimalResidual: guess " << guess << std::endl;
@@ -140,11 +142,11 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
RealD cp = 0;
Field w(src._grid);
Field r(src._grid);
Field w(src.Grid());
Field r(src.Grid());
// this should probably be made a class member so that it is only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
@@ -166,7 +168,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
qrUpdate(i);
cp = std::norm(gamma[i+1]);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "GeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
@@ -192,11 +194,11 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - H(iter, i) * v[i];
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = (1. / H(iter, iter + 1)) * w;
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
@@ -204,13 +206,13 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
H(iter, i) = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
@@ -219,7 +221,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = std::conj(c[iter]) * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
@@ -229,8 +231,8 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - H(k, i) * y[k];
y[i] = y[i] / H(i, i);
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)

View File

@@ -35,7 +35,7 @@ Author: Christoph Lehner <clehner@bnl.gov>
//#include <zlib.h>
#include <sys/stat.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////
// Move following 100 LOC to lattice/Lattice_basis.h
@@ -43,6 +43,11 @@ namespace Grid {
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
// If we assume basis[j] are already orthonormal, we
// can take all inner products in parallel saving 2x bandwidth
// Save 3x bandwidth on the second line of loop.
// perhaps 2.5x speed up.
// 2x overall in Multigrid Lanczos
for(int j=0; j<k; ++j){
auto ip = innerProduct(basis[j],w);
w = w - ip*basis[j];
@@ -52,44 +57,118 @@ void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
template<class Field>
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
{
typedef decltype(basis[0].View()) View;
auto tmp_v = basis[0].View();
Vector<View> basis_v(basis.size(),tmp_v);
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0]._grid;
GridBase* grid = basis[0].Grid();
parallel_region
for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].View();
}
#if 0
std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
thread_region
{
vobj* B = Bt.data() + Nm * thread_num();
std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
parallel_for_internal(int ss=0;ss < grid->oSites();ss++){
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis[k]._odata[ss];
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis[j]._odata[ss] = B[j];
basis_v[j][ss] = B[j];
}
});
}
#else
int nrot = j1-j0;
uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
// printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock);
Vector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0];
// GPU readable copy of Eigen matrix
Vector<double> Qt_jv(Nm*Nm);
double *Qt_p = & Qt_jv[0];
for(int k=0;k<Nm;++k){
for(int j=0;j<Nm;++j){
Qt_p[j*Nm+k]=Qt(j,k);
}
}
// Block the loop to keep storage footprint down
vobj zz=Zero();
for(uint64_t s=0;s<oSites;s+=siteBlock){
// remaining work in this block
int ssites=MIN(siteBlock,oSites-s);
// zero out the accumulators
accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
auto z=coalescedRead(zz);
coalescedWrite(Bp[ss],z);
});
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
for(int k=k0; k<k1; ++k){
auto tmp = coalescedRead(Bp[ss*nrot+j]);
coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
}
});
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
#endif
}
// Extract a single rotated vector
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
typedef decltype(basis[0].View()) View;
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0]._grid;
GridBase* grid = basis[0].Grid();
result.checkerboard = basis[0].checkerboard;
parallel_for(int ss=0;ss < grid->oSites();ss++){
vobj B = zero;
for(int k=k0; k<k1; ++k){
B +=Qt(j,k) * basis[k]._odata[ss];
}
result._odata[ss] = B;
result.Checkerboard() = basis[0].Checkerboard();
auto result_v=result.View();
Vector<View> basis_v(basis.size(),result_v);
for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].View();
}
vobj zz=Zero();
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
auto B=coalescedRead(zz);
for(int k=k0; k<k1; ++k){
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
}
coalescedWrite(result_v[ss], B);
});
}
template<class Field>
@@ -119,7 +198,7 @@ void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, s
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
std::swap(_v[i]._odata,_v[idx[i]]._odata); // should use vector move constructor, no data copy
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]);
idx[j] = idx[i];
@@ -150,6 +229,19 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
basisReorderInPlace(_v,sort_vals,idx);
}
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
//
// Accumulates into `result` a sum over the vectors in _v, each scaled by its
// inner product with src_orig divided by the matching entry of eval:
//   result = sum_i ( <_v[i], src_orig> / eval[i] ) * _v[i]
// (assumes axpy(r,a,x,y) computes r = a*x + y -- TODO confirm against the
// lattice arithmetic definitions).
//
//   _v       : input vectors (read only); must have exactly one entry per eval[i]
//   eval     : divisor applied to each coefficient; zero entries are not checked here
//   src_orig : field whose inner product with each _v[i] is taken
//   result   : overwritten; set to Zero() first and then accumulated into
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
  result = Zero();
  assert(_v.size()==eval.size());
  int N = (int)_v.size();
  for (int i=0;i<N;i++) {
    // _v is a const vector, so its elements can only be bound as const
    // references; a plain Field& here is ill-formed once the template is
    // instantiated.
    const Field& tmp = _v[i];
    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
  }
}
/////////////////////////////////////////////////////////////
// Implicitly restarted lanczos
/////////////////////////////////////////////////////////////
@@ -259,7 +351,7 @@ public:
RealD _eresid, // resid in lmdue deficit
int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=1, int _orth_period = 1,
int _MinRestart=0, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(Tester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
@@ -275,7 +367,7 @@ public:
RealD _eresid, // resid in lmdue deficit
int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=1, int _orth_period = 1,
int _MinRestart=0, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(SimpleTester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
@@ -289,7 +381,7 @@ public:
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = sqrt(nn);
nn = std::sqrt(nn);
v = v * (1.0/nn);
return nn;
}
@@ -321,10 +413,10 @@ until convergence
*/
void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false)
{
GridBase *grid = src._grid;
assert(grid == evec[0]._grid);
GridBase *grid = src.Grid();
assert(grid == evec[0].Grid());
GridLogIRL.TimingMode(1);
// GridLogIRL.TimingMode(1);
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl;
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
@@ -349,14 +441,17 @@ until convergence
{
auto src_n = src;
auto tmp = src;
std::cout << GridLogIRL << " IRL source norm " << norm2(src) << std::endl;
const int _MAX_ITER_IRL_MEVAPP_ = 50;
for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
normalise(src_n);
_HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
if (fabs(evalMaxApprox/na - 1.0) < 0.05)
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
@@ -446,7 +541,7 @@ until convergence
assert(k2<Nm); assert(k2<Nm); assert(k1>0);
basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
std::cout<<GridLogIRL <<"basisRotated by Qt"<<std::endl;
std::cout<<GridLogIRL <<"basisRotated by Qt *"<<k1-1<<","<<k2+1<<")"<<std::endl;
////////////////////////////////////////////////////
// Compressed vector f and beta(k2)
@@ -454,7 +549,7 @@ until convergence
f *= Qt(k2-1,Nm-1);
f += lme[k2-1] * evec[k2];
beta_k = norm2(f);
beta_k = sqrt(beta_k);
beta_k = std::sqrt(beta_k);
std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
RealD betar = 1.0/beta_k;
@@ -477,7 +572,7 @@ until convergence
std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl;
Field B(grid); B.checkerboard = evec[0].checkerboard;
Field B(grid); B.Checkerboard() = evec[0].Checkerboard();
// power of two search pattern; not every evalue in eval2 is assessed.
int allconv =1;
@@ -515,7 +610,7 @@ until convergence
converged:
{
Field B(grid); B.checkerboard = evec[0].checkerboard;
Field B(grid); B.Checkerboard() = evec[0].Checkerboard();
basisRotate(evec,Qt,0,Nk,0,Nk,Nm);
std::cout << GridLogIRL << " Rotated basis"<<std::endl;
Nconv=0;
@@ -554,11 +649,11 @@ until convergence
/* Saad PP. 195
1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
2. For k = 1,2,...,m Do:
3. wk:=Avk-βkv_{k-1}
4. αk:=(wk,vk) //
5. wk:=wk-αkvk // wk orthog vk
6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
7. vk+1 := wk/βk+1
3. wk:=Avk - b_k v_{k-1}
4. ak:=(wk,vk) //
5. wk:=wk-akvk // wk orthog vk
6. bk+1 := ||wk||_2. If b_k+1 = 0 then Stop
7. vk+1 := wk/b_k+1
8. EndDo
*/
void step(std::vector<RealD>& lmd,
@@ -566,6 +661,7 @@ until convergence
std::vector<Field>& evec,
Field& w,int Nm,int k)
{
std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl;
const RealD tiny = 1.0e-20;
assert( k< Nm );
@@ -577,20 +673,20 @@ until convergence
if(k>0) w -= lme[k-1] * evec[k-1];
ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk)
ComplexD zalph = innerProduct(evec_k,w);
RealD alph = real(zalph);
w = w - alph * evec_k;// 5. wk:=wkαkvk
w = w - alph * evec_k;
RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
// 7. vk+1 := wk/βk+1
RealD beta = normalise(w);
lmd[k] = alph;
lme[k] = beta;
if (k>0 && k % orth_period == 0) {
if ( (k>0) && ( (k % orth_period) == 0 )) {
std::cout<<GridLogIRL << "Orthogonalising " <<k<<std::endl;
orthogonalize(w,evec,k); // orthonormalise
std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
std::cout<<GridLogIRL << "Orthogonalised " <<k<<std::endl;
}
if(k < Nm-1) evec[k+1] = w;
@@ -598,6 +694,8 @@ until convergence
std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
if ( beta < tiny )
std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
std::cout<<GridLogIRL << "Lanczos step complete " <<k<<std::endl;
}
void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,
@@ -807,7 +905,7 @@ void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
// determination of 2x2 leading submatrix
RealD dsub = lmd[kmax-1]-lmd[kmax-2];
RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
RealD dd = std::sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
// (Dsh: shift)
@@ -838,5 +936,6 @@ void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
abort();
}
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -24,16 +24,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LOCAL_COHERENCE_IRL_H
#define GRID_LOCAL_COHERENCE_IRL_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
struct LanczosParams : Serializable {
public:
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
ChebyParams, Cheby,/*Chebyshev*/
int, Nstop, /*Vecs in Lanczos must converge Nstop < Nk < Nm*/
@@ -46,7 +45,7 @@ struct LanczosParams : Serializable {
};
struct LocalCoherenceLanczosParams : Serializable {
public:
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
bool, saveEvecs,
bool, doFine,
@@ -59,7 +58,7 @@ struct LocalCoherenceLanczosParams : Serializable {
RealD , coarse_relax_tol,
std::vector<int>, blockSize,
std::string, config,
std::vector < std::complex<double> >, omega,
std::vector < ComplexD >, omega,
RealD, mass,
RealD, M5);
};
@@ -83,14 +82,14 @@ public:
};
void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
GridBase *FineGrid = subspace[0].Grid();
int checkerboard = subspace[0].Checkerboard();
FineField fin (FineGrid); fin.checkerboard= checkerboard;
FineField fout(FineGrid); fout.checkerboard = checkerboard;
FineField fin (FineGrid); fin.Checkerboard()= checkerboard;
FineField fout(FineGrid); fout.Checkerboard() = checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
}
};
@@ -118,11 +117,11 @@ public:
void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
GridBase *FineGrid = subspace[0].Grid();
int checkerboard = subspace[0].Checkerboard();
FineField fin (FineGrid); fin.checkerboard =checkerboard;
FineField fout(FineGrid);fout.checkerboard =checkerboard;
FineField fin (FineGrid); fin.Checkerboard() =checkerboard;
FineField fout(FineGrid);fout.Checkerboard() =checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
_poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
@@ -133,7 +132,7 @@ public:
template<class Fobj,class CComplex,int nbasis>
class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanczosTester<Lattice<iVector<CComplex,nbasis > > >
{
public:
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@@ -142,7 +141,7 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
LinearFunction<CoarseField> & _Poly;
OperatorFunction<FineField> & _smoother;
LinearOperatorBase<FineField> &_Linop;
RealD _coarse_relax_tol;
RealD _coarse_relax_tol;
std::vector<FineField> &_subspace;
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
@@ -182,10 +181,10 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
}
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
GridBase *FineGrid = _subspace[0]._grid;
int checkerboard = _subspace[0].checkerboard;
FineField fB(FineGrid);fB.checkerboard =checkerboard;
FineField fv(FineGrid);fv.checkerboard =checkerboard;
GridBase *FineGrid = _subspace[0].Grid();
int checkerboard = _subspace[0].Checkerboard();
FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
FineField fv(FineGrid);fv.Checkerboard() =checkerboard;
blockPromote(B,fv,_subspace);
@@ -305,11 +304,11 @@ public:
int Nk = nbasis;
subspace.resize(Nk,_FineGrid);
subspace[0]=1.0;
subspace[0].checkerboard=_checkerboard;
subspace[0].Checkerboard()=_checkerboard;
normalise(subspace[0]);
PlainHermOp<FineField> Op(_FineOp);
for(int k=1;k<Nk;k++){
subspace[k].checkerboard=_checkerboard;
subspace[k].Checkerboard()=_checkerboard;
Op(subspace[k-1],subspace[k]);
normalise(subspace[k]);
}
@@ -360,7 +359,11 @@ public:
ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;
FineField src(_FineGrid);
typedef typename FineField::scalar_type Scalar;
// src=1.0;
src=Scalar(1.0);
src.Checkerboard() = _checkerboard;
int Nconv;
IRL.calc(evals_fine,subspace,src,Nconv,false);
@@ -402,5 +405,5 @@ public:
}
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -33,6 +33,8 @@ namespace Grid {
template<class Field> class MinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the MR fails to converge.
// Defaults true.
RealD Tolerance;
@@ -46,11 +48,11 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
psi.checkerboard = src.checkerboard;
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
Complex a, c;
Real d;
ComplexD a, c;
RealD d;
Field Mr(src);
Field r(src);
@@ -71,7 +73,6 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "MinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "MinimalResidual: src " << ssq << std::endl;
std::cout << GridLogIterative << "MinimalResidual: mp " << d << std::endl;
std::cout << GridLogIterative << "MinimalResidual: cp,r " << cp << std::endl;
if (cp <= rsq) {

View File

@@ -34,6 +34,9 @@ namespace Grid {
template<class FieldD, class FieldF, typename std::enable_if<getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction<FieldD> {
public:
using OperatorFunction<FieldD>::operator();
bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge,
// defaults to true
@@ -54,10 +57,10 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
Eigen::MatrixXcd H;
std::vector<std::complex<double>> y;
std::vector<std::complex<double>> gamma;
std::vector<std::complex<double>> c;
std::vector<std::complex<double>> s;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
GridBase* SinglePrecGrid;
@@ -84,7 +87,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
void operator()(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi) {
psi.checkerboard = src.checkerboard;
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
@@ -94,7 +97,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
FieldD r(src._grid);
FieldD r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "MPFGMRES: guess " << guess << std::endl;
@@ -154,12 +157,12 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
RealD cp = 0;
FieldD w(src._grid);
FieldD r(src._grid);
FieldD w(src.Grid());
FieldD r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart
std::vector<FieldD> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero;
std::vector<FieldD> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero;
std::vector<FieldD> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<FieldD> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
@@ -181,7 +184,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
qrUpdate(i);
cp = std::norm(gamma[i+1]);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "MPFGMRES: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
@@ -223,11 +226,11 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - H(iter, i) * v[i];
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = (1. / H(iter, iter + 1)) * w;
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
@@ -235,13 +238,13 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * H(iter, i) + c[i] * H(iter, i + 1);
H(iter, i) = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1);
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
@@ -250,7 +253,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = std::conj(c[iter]) * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
@@ -260,8 +263,8 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - H(k, i) * y[k];
y[i] = y[i] / H(i, i);
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,38 +23,90 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_NORMAL_EQUATIONS_H
#define GRID_NORMAL_EQUATIONS_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form an NE solver calling a Herm solver
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class NormalEquations : public OperatorFunction<Field>{
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form an NE solver calling a Herm solver
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class NormalEquations {
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver) {};
void operator() (const Field &in, Field &out){
void operator() (const Field &in, Field &out){
Field src(in.Grid());
Field tmp(in.Grid());
Field src(in._grid);
MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix);
_Matrix.Mdag(in,src);
_Guess(src,out);
_HermitianSolver(MdagMOp,src,out); // Mdag M out = Mdag in
_Matrix.Mdag(in,src);
_HermitianSolver(src,out); // Mdag M out = Mdag in
}
};
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// HPDSolver: adapts an OperatorFunction (a solver that takes the operator as
// an argument) into a two-argument callable out = Solve(in) bound to one
// fixed LinearOperatorBase. Unlike NormalEquations, no Mdag is applied to the
// source and no MdagM operator is formed: the operator is handed to the
// solver exactly as supplied.
// NOTE(review): "HPD" presumably means Hermitian positive definite, i.e. the
// caller is expected to pass an operator the Hermitian solver can handle
// directly -- confirm against callers.
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class HPDSolver {
private:
LinearOperatorBase<Field> & _Matrix;          // operator passed unchanged to the solver
OperatorFunction<Field> & _HermitianSolver;   // solver invoked as (operator, src, sol)
LinearFunction<Field> & _Guess;               // invoked as (in, out) before the solve
public:
}
// NOTE(review): the lone brace above looks like a removed line from the old
// side of the interleaved diff rather than part of this class -- confirm.
/////////////////////////////////////////////////////
// Bind the operator, the solver and the guess. All three are held by
// reference, so they must outlive this object.
/////////////////////////////////////////////////////
HPDSolver(LinearOperatorBase<Field> &Matrix,
OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
// Call the guess on (in, out), then call the solver with the bound operator.
void operator() (const Field &in, Field &out){
_Guess(in,out);
_HermitianSolver(_Matrix,in,out); // Matrix out = in (source is NOT multiplied by Mdag)
}
};
template<class Field> class MdagMSolver {
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
MdagMSolver(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix);
_Guess(in,out);
_HermitianSolver(MdagMOp,in,out); // Mdag M out = Mdag in
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -14,7 +14,7 @@ template<class Field> class PowerMethod
RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
{
GridBase *grid = src._grid;
GridBase *grid = src.Grid();
// quickly get an idea of the largest eigenvalue to more properly normalize the residuum
RealD evalMaxApprox = 0.0;
@@ -30,12 +30,12 @@ template<class Field> class PowerMethod
RealD vden = norm2(src_n);
RealD na = vnum/vden;
if ( (fabs(evalMaxApprox/na - 1.0) < 0.01) || (i==_MAX_ITER_EST_-1) ) {
if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
evalMaxApprox = na;
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
return evalMaxApprox;
}
evalMaxApprox = na;
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
src_n = tmp;
}
assert(0);

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,97 +23,97 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PREC_CONJUGATE_RESIDUAL_H
#define GRID_PREC_CONJUGATE_RESIDUAL_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class PrecConjugateResidual : public OperatorFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
int verbose;
LinearFunction<Field> &Preconditioner;
template<class Field>
class PrecConjugateResidual : public OperatorFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
int verbose;
LinearFunction<Field> &Preconditioner;
PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec) : Tolerance(tol), MaxIterations(maxit), Preconditioner(Prec)
{
verbose=1;
};
PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec) : Tolerance(tol), MaxIterations(maxit), Preconditioner(Prec)
{
verbose=1;
};
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
RealD a, b, c, d;
RealD cp, ssq,rsq;
RealD a, b, c, d;
RealD cp, ssq,rsq;
RealD rAr, rAAr, rArp;
RealD pAp, pAAp;
RealD rAr, rAAr, rArp;
RealD pAp, pAAp;
GridBase *grid = src._grid;
Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid);
GridBase *grid = src.Grid();
Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid);
psi=zero;
r = src;
Preconditioner(r,p);
psi=zero;
r = src;
Preconditioner(r,p);
Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
Ar=Ap;
rAr=pAp;
rAAr=pAAp;
Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
Ar=Ap;
rAr=pAp;
rAAr=pAAp;
cp =norm2(r);
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
cp =norm2(r);
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
if (verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
if (verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
for(int k=0;k<MaxIterations;k++){
for(int k=0;k<MaxIterations;k++){
Preconditioner(Ap,z);
RealD rq= real(innerProduct(Ap,z));
Preconditioner(Ap,z);
RealD rq= real(innerProduct(Ap,z));
a = rAr/rq;
a = rAr/rq;
axpy(psi,a,p,psi);
cp = axpy_norm(r,-a,z,r);
axpy(psi,a,p,psi);
cp = axpy_norm(r,-a,z,r);
rArp=rAr;
rArp=rAr;
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
b =rAr/rArp;
b =rAr/rArp;
axpy(p,b,p,r);
pAAp=axpy_norm(Ap,b,Ap,Ar);
axpy(p,b,p,r);
pAAp=axpy_norm(Ap,b,Ap,Ar);
if(verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
if(cp<rsq) {
Linop.HermOp(psi,Ap);
axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"PrecConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual "<<sqrt(true_resid)
<< " target " <<Tolerance <<std::endl;
return;
}
if(verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
if(cp<rsq) {
Linop.HermOp(psi,Ap);
axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"PrecConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual "<<sqrt(true_resid)
<< " target " <<Tolerance <<std::endl;
return;
}
std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
assert(0);
}
};
}
std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -24,8 +24,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PREC_GCR_H
#define GRID_PREC_GCR_H
@@ -36,206 +36,204 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
//NB. Likely not original reference since they are focussing on a preconditioner variant.
// but VPGCR was nicely written up in their paper
///////////////////////////////////////////////////////////////////////////////////////////////////////
namespace Grid {
NAMESPACE_BEGIN(Grid);
template<class Field>
class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
int verbose;
int mmax;
int nstep;
int steps;
GridStopWatch PrecTimer;
GridStopWatch MatTimer;
GridStopWatch LinalgTimer;
#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "
LinearFunction<Field> &Preconditioner;
template<class Field>
class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
public:
PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
Tolerance(tol),
MaxIterations(maxit),
Preconditioner(Prec),
mmax(_mmax),
nstep(_nstep)
{
verbose=1;
};
RealD Tolerance;
Integer MaxIterations;
int verbose;
int mmax;
int nstep;
int steps;
int level;
GridStopWatch PrecTimer;
GridStopWatch MatTimer;
GridStopWatch LinalgTimer;
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
LinearFunction<Field> &Preconditioner;
LinearOperatorBase<Field> &Linop;
psi=zero;
RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
void Level(int lv) { level=lv; };
Field r(src._grid);
PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
Tolerance(tol),
MaxIterations(maxit),
Linop(_Linop),
Preconditioner(Prec),
mmax(_mmax),
nstep(_nstep)
{
level=1;
verbose=1;
};
PrecTimer.Reset();
MatTimer.Reset();
LinalgTimer.Reset();
void operator() (const Field &src, Field &psi){
GridStopWatch SolverTimer;
SolverTimer.Start();
psi=Zero();
RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
steps=0;
for(int k=0;k<MaxIterations;k++){
Field r(src.Grid());
cp=GCRnStep(Linop,src,psi,rsq);
PrecTimer.Reset();
MatTimer.Reset();
LinalgTimer.Reset();
std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
GridStopWatch SolverTimer;
SolverTimer.Start();
if(cp<rsq) {
steps=0;
for(int k=0;k<MaxIterations;k++){
SolverTimer.Stop();
cp=GCRnStep(src,psi,rsq);
Linop.HermOp(psi,r);
axpy(r,-1.0,src,r);
RealD tr = norm2(r);
std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl;
GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
return;
}
if(cp<rsq) {
SolverTimer.Stop();
Linop.HermOp(psi,r);
axpy(r,-1.0,src,r);
RealD tr = norm2(r);
GCRLogLevel<<"PGCR: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
/*
GCRLogLevel<<"PGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
*/
return;
}
std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
assert(0);
}
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// assert(0);
}
RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
RealD cp;
RealD a, b, c, d;
RealD zAz, zAAz;
RealD rAq, rq;
RealD cp;
RealD a, b;
RealD zAz, zAAz;
RealD rq;
GridBase *grid = src._grid;
GridBase *grid = src.Grid();
Field r(grid);
Field z(grid);
Field tmp(grid);
Field ttmp(grid);
Field Az(grid);
Field r(grid);
Field z(grid);
Field tmp(grid);
Field ttmp(grid);
Field Az(grid);
////////////////////////////////
// history for flexible orthog
////////////////////////////////
std::vector<Field> q(mmax,grid);
std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax);
////////////////////////////////
// history for flexible orthog
////////////////////////////////
std::vector<Field> q(mmax,grid);
std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax);
//////////////////////////////////
// initial guess x0 is taken as nonzero.
// r0=src-A x0 = src
//////////////////////////////////
MatTimer.Start();
Linop.HermOpAndNorm(psi,Az,zAz,zAAz);
MatTimer.Stop();
GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
//////////////////////////////////
// initial guess x0 is taken as nonzero.
// r0=src-A x0 = src
//////////////////////////////////
MatTimer.Start();
Linop.HermOpAndNorm(psi,Az,zAz,zAAz);
MatTimer.Stop();
LinalgTimer.Start();
r=src-Az;
LinalgTimer.Stop();
GCRLogLevel<< "PGCR true residual r = src - A psi "<<norm2(r) <<std::endl;
/////////////////////
// p = Prec(r)
/////////////////////
PrecTimer.Start();
Preconditioner(r,z);
PrecTimer.Stop();
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
MatTimer.Stop();
LinalgTimer.Start();
//p[0],q[0],qq[0]
p[0]= z;
q[0]= Az;
qq[0]= zAAz;
cp =norm2(r);
LinalgTimer.Stop();
for(int k=0;k<nstep;k++){
steps++;
int kp = k+1;
int peri_k = k %mmax;
int peri_kp= kp%mmax;
LinalgTimer.Start();
r=src-Az;
rq= real(innerProduct(r,q[peri_k])); // what if rAr not real?
a = rq/qq[peri_k];
axpy(psi,a,p[peri_k],psi);
cp = axpy_norm(r,-a,q[peri_k],r);
LinalgTimer.Stop();
/////////////////////
// p = Prec(r)
/////////////////////
GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
if((k==nstep-1)||(cp<rsq)){
return cp;
}
PrecTimer.Start();
Preconditioner(r,z);
Preconditioner(r,z);// solve Az = r
PrecTimer.Stop();
MatTimer.Start();
Linop.HermOp(z,tmp);
MatTimer.Stop();
LinalgTimer.Start();
ttmp=tmp;
tmp=tmp-r;
LinalgTimer.Stop();
/*
std::cout<<GridLogMessage<<r<<std::endl;
std::cout<<GridLogMessage<<z<<std::endl;
std::cout<<GridLogMessage<<ttmp<<std::endl;
std::cout<<GridLogMessage<<tmp<<std::endl;
*/
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
MatTimer.Stop();
LinalgTimer.Start();
//p[0],q[0],qq[0]
p[0]= z;
q[0]= Az;
qq[0]= zAAz;
cp =norm2(r);
LinalgTimer.Stop();
q[peri_kp]=Az;
p[peri_kp]=z;
for(int k=0;k<nstep;k++){
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){
steps++;
int peri_back=(k-back)%mmax; assert((k-back)>=0);
int kp = k+1;
int peri_k = k %mmax;
int peri_kp= kp%mmax;
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back];
q[peri_kp]=q[peri_kp]+b*q[peri_back];
LinalgTimer.Start();
rq= real(innerProduct(r,q[peri_k])); // what if rAr not real?
a = rq/qq[peri_k];
axpy(psi,a,p[peri_k],psi);
cp = axpy_norm(r,-a,q[peri_k],r);
LinalgTimer.Stop();
if((k==nstep-1)||(cp<rsq)){
return cp;
}
std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"] resid " <<sqrt(cp/rsq)<<std::endl;
PrecTimer.Start();
Preconditioner(r,z);// solve Az = r
PrecTimer.Stop();
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
Linop.HermOp(z,tmp);
MatTimer.Stop();
LinalgTimer.Start();
tmp=tmp-r;
std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl;
q[peri_kp]=Az;
p[peri_kp]=z;
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; assert((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back];
q[peri_kp]=q[peri_kp]+b*q[peri_back];
}
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop();
}
assert(0); // never reached
return cp;
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop();
}
};
}
assert(0); // never reached
return cp;
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,371 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/QuasiMinimalResidual.h
Copyright (C) 2019
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Real part of the inner product of l with G5R5 applied to r.
//
//  l, r    : fields on the same grid; the temporary is allocated on l.Grid().
//  returns : real part of innerProduct(l, G5R5(r)).
//
// NOTE(review): G5R5 is presumably gamma5 combined with a reflection in the
// fifth dimension -- confirm against its definition.
// The inner product is also written to std::cout on every call (debug trace).
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field>
RealD innerG5ProductReal(Field &l, Field &r)
{
  Field tmp(l.Grid());
  G5R5(tmp,r);
  ComplexD ip =innerProduct(l,tmp);
  std::cout << "innerProductRealG5R5 "<<ip<<std::endl;
  return ip.real();
}
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Quasi-Minimal Residual (QMR) solver without look-ahead.
//
// The active (#if 1) implementation is the unpreconditioned two-sided Lanczos
// QMR recurrence: it applies both LinOp.Op and LinOp.AdjOp each iteration.
// Inner products are complex, but only their real parts are used in the
// recurrence.
//
//  Tolerance         : target for the relative residual |b - A x| / |b|
//                      (2-norms, i.e. sqrt(norm2)).
//  MaxIterations     : iteration cap.
//  ErrorOnNoConverge : when true, failing to converge asserts; when false the
//                      routine returns with the best x obtained.
//  IterationCount    : number of iterations performed by the last solve.
//
// x is used as the initial guess and is updated in place.
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field>
class QuasiMinimalResidual : public OperatorFunction<Field> {
public:
  using OperatorFunction<Field>::operator();

  bool ErrorOnNoConverge;
  RealD Tolerance;
  Integer MaxIterations;
  Integer IterationCount;

  // Initialiser order follows declaration order.
  QuasiMinimalResidual(RealD tol,
                       Integer maxit,
                       bool err_on_no_conv = true)
    : ErrorOnNoConverge(err_on_no_conv)
    , Tolerance(tol)
    , MaxIterations(maxit)
    , IterationCount(0)
  {};

#if 1
  void operator()(LinearOperatorBase<Field> &LinOp, const Field &b, Field &x)
  {
    RealD resid;
    IterationCount=0;

    RealD rho, rho_1, xi, gamma, gamma_1, theta, theta_1;
    RealD eta, delta, beta;
    RealD ep = 0.0; // only read for i > 1, after it has been set at i == 1

    GridBase *Grid = b.Grid();
    Field r(Grid), d(Grid), s(Grid);
    Field v(Grid), w(Grid), y(Grid), z(Grid);
    Field v_tld(Grid), w_tld(Grid), y_tld(Grid), z_tld(Grid);
    Field p(Grid), q(Grid), p_tld(Grid);

    // norm2() is the SQUARED 2-norm; QMR needs the norm itself.
    RealD normb = sqrt(norm2(b));
    assert(normb> 0.0);

    LinOp.Op(x,r); r = b - r;

    resid = sqrt(norm2(r))/normb;
    if (resid <= Tolerance) {
      return;
    }

    v_tld = r;
    y = v_tld;
    rho = sqrt(norm2(y));

    // Take Gamma5 conjugate
    //    Gamma G5(Gamma::Algebra::Gamma5);
    //    G5R5(w_tld,r);
    //    w_tld = G5* v_tld;
    w_tld=v_tld;
    z = w_tld;
    xi = sqrt(norm2(z));

    gamma = 1.0;
    eta = -1.0;
    theta = 0.0;

    for (int i = 1; i <= MaxIterations; i++) {

      IterationCount = i;

      // Breakdown tests
      assert( rho != 0.0);
      assert( xi != 0.0);

      // Normalise the Lanczos vectors
      v = (1. / rho) * v_tld;
      y = (1. / rho) * y;

      w = (1. / xi) * w_tld;
      z = (1. / xi) * z;

      ComplexD Zdelta = innerProduct(z, y); // Complex?
      std::cout << "Zdelta "<<Zdelta<<std::endl;
      delta = Zdelta.real();

      y_tld = y;
      z_tld = z;

      if (i > 1) {
        p = y_tld - (xi * delta / ep) * p;
        q = z_tld - (rho * delta / ep) * q;
      } else {
        p = y_tld;
        q = z_tld;
      }

      LinOp.Op(p,p_tld); // p_tld = A * p;
      ComplexD Zep = innerProduct(q, p_tld);
      ep=Zep.real();
      std::cout << "Zep "<<Zep <<std::endl;
      // Complex Audit
      // std::abs: make sure the floating-point overload is selected.
      assert(std::abs(ep)>0);

      beta = ep / delta;
      assert(std::abs(beta)>0);

      v_tld = p_tld - beta * v;
      y = v_tld;

      rho_1 = rho;
      rho = sqrt(norm2(y));
      LinOp.AdjOp(q,w_tld);
      w_tld = w_tld - beta * w;
      z = w_tld;

      xi = sqrt(norm2(z));

      gamma_1 = gamma;
      theta_1 = theta;

      theta = rho / (gamma_1 * beta);
      gamma = 1.0 / sqrt(1.0 + theta * theta);
      std::cout << "theta "<<theta<<std::endl;
      std::cout << "gamma "<<gamma<<std::endl;

      assert(std::abs(gamma)> 0.0);

      eta = -eta * rho_1 * gamma* gamma / (beta * gamma_1 * gamma_1);

      if (i > 1) {
        d = eta * p + (theta_1 * theta_1 * gamma * gamma) * d;
        s = eta * p_tld + (theta_1 * theta_1 * gamma * gamma) * s;
      } else {
        d = eta * p;
        s = eta * p_tld;
      }

      x =x+d; // update approximation vector
      r =r-s; // compute residual

      if ((resid = sqrt(norm2(r)) / normb) <= Tolerance) {
        return;
      }
      std::cout << "Iteration "<<i<<" resid " << resid<<std::endl;
    }

    // No convergence within MaxIterations: fatal only if the caller asked for it.
    std::cout << "QuasiMinimalResidual did NOT converge"<<std::endl;
    if (ErrorOnNoConverge) assert(0);
    return;
  }
#else
  // QMRg5 SMP thesis
  // Disabled alternative: gamma5-Hermitian variant using a three-term
  // recurrence and innerG5ProductReal. Kept unchanged for reference.
  void operator()(LinearOperatorBase<Field> &LinOp, const Field &b, Field &x)
  {
    // Real scalars
    GridBase *grid = b.Grid();

    Field r(grid);
    Field p_m(grid), p_m_minus_1(grid), p_m_minus_2(grid);
    Field v_m(grid), v_m_minus_1(grid), v_m_plus_1(grid);
    Field tmp(grid);

    RealD w;
    RealD z1, z2;
    RealD delta_m, delta_m_minus_1;
    RealD c_m_plus_1, c_m, c_m_minus_1;
    RealD s_m_plus_1, s_m, s_m_minus_1;
    RealD alpha, beta, gamma, epsilon;
    RealD mu, nu, rho, theta, xi, chi;
    RealD mod2r, mod2b;
    RealD tau2, target2;

    mod2b=norm2(b);

    /////////////////////////
    // Initial residual
    /////////////////////////
    LinOp.Op(x,tmp);
    r = b - tmp;

    /////////////////////////
    // \mu = \rho = |r_0|
    /////////////////////////
    mod2r = norm2(r);
    rho = sqrt( mod2r);
    mu=rho;

    std::cout << "QuasiMinimalResidual rho "<< rho<<std::endl;
    /////////////////////////
    // Zero negative history
    /////////////////////////
    v_m_plus_1 = Zero();
    v_m_minus_1 = Zero();
    p_m_minus_1 = Zero();
    p_m_minus_2 = Zero();

    // v0
    v_m = (1.0/rho)*r;

    /////////////////////////
    // Initial coeffs
    /////////////////////////
    delta_m_minus_1 = 1.0;
    c_m_minus_1 = 1.0;
    c_m = 1.0;
    s_m_minus_1 = 0.0;
    s_m = 0.0;

    /////////////////////////
    // Set up convergence check
    /////////////////////////
    tau2 = mod2r;
    target2 = mod2b * Tolerance*Tolerance;

    for(int iter = 0 ; iter < MaxIterations; iter++){

      /////////////////////////
      // \delta_m = (v_m, \gamma_5 v_m)
      /////////////////////////
      delta_m = innerG5ProductReal(v_m,v_m);
      std::cout << "QuasiMinimalResidual delta_m "<< delta_m<<std::endl;

      /////////////////////////
      // tmp = A v_m
      /////////////////////////
      LinOp.Op(v_m,tmp);

      /////////////////////////
      // \alpha = (v_m, \gamma_5 temp) / \delta_m
      /////////////////////////
      alpha = innerG5ProductReal(v_m,tmp);
      alpha = alpha/delta_m ;
      std::cout << "QuasiMinimalResidual alpha "<< alpha<<std::endl;

      /////////////////////////
      // \beta = \rho \delta_m / \delta_{m-1}
      /////////////////////////
      beta = rho * delta_m / delta_m_minus_1;
      std::cout << "QuasiMinimalResidual beta "<< beta<<std::endl;

      /////////////////////////
      // \tilde{v}_{m+1} = temp - \alpha v_m - \beta v_{m-1}
      /////////////////////////
      v_m_plus_1 = tmp - alpha*v_m - beta*v_m_minus_1;

      ///////////////////////////////
      // \rho = || \tilde{v}_{m+1} ||
      ///////////////////////////////
      rho = sqrt( norm2(v_m_plus_1) );
      std::cout << "QuasiMinimalResidual rho "<< rho<<std::endl;

      ///////////////////////////////
      // v_{m+1} = \tilde{v}_{m+1}
      ///////////////////////////////
      v_m_plus_1 = (1.0 / rho) * v_m_plus_1;

      ////////////////////////////////
      // QMR recurrence coefficients.
      ////////////////////////////////
      theta = s_m_minus_1 * beta;
      gamma = c_m_minus_1 * beta;
      epsilon = c_m * gamma + s_m * alpha;
      xi = -s_m * gamma + c_m * alpha;
      nu = sqrt( xi*xi + rho*rho );
      c_m_plus_1 = fabs(xi) / nu;
      if ( xi == 0.0 ) {
        s_m_plus_1 = 1.0;
      } else {
        s_m_plus_1 = c_m_plus_1 * rho / xi;
      }
      chi = c_m_plus_1 * xi + s_m_plus_1 * rho;

      std::cout << "QuasiMinimalResidual coeffs "<< theta <<" "<<gamma<<" "<< epsilon<<" "<< xi<<" "<< nu<<std::endl;
      std::cout << "QuasiMinimalResidual coeffs "<< chi <<std::endl;

      ////////////////////////////////
      //p_m=(v_m - \epsilon p_{m-1} - \theta p_{m-2}) / \chi
      ////////////////////////////////
      p_m = (1.0/chi) * v_m - (epsilon/chi) * p_m_minus_1 - (theta/chi) * p_m_minus_2;

      ////////////////////////////////////////////////////////////////
      // \psi = \psi + c_{m+1} \mu p_m
      ////////////////////////////////////////////////////////////////
      x = x + ( c_m_plus_1 * mu ) * p_m;

      ////////////////////////////////////////
      //
      ////////////////////////////////////////
      mu = -s_m_plus_1 * mu;
      delta_m_minus_1 = delta_m;
      c_m_minus_1 = c_m;
      c_m = c_m_plus_1;
      s_m_minus_1 = s_m;
      s_m = s_m_plus_1;

      ////////////////////////////////////
      // Could use pointer swizzle games.
      ////////////////////////////////////
      v_m_minus_1 = v_m;
      v_m = v_m_plus_1;
      p_m_minus_2 = p_m_minus_1;
      p_m_minus_1 = p_m;

      /////////////////////////////////////
      // Convergence checks
      /////////////////////////////////////
      z1 = RealD(iter+1.0);
      z2 = z1 + 1.0;
      tau2 = tau2 *( z2 / z1 ) * s_m * s_m;
      std::cout << " QuasiMinimumResidual iteration "<< iter<<std::endl;
      std::cout << " QuasiMinimumResidual tau bound "<< tau2<<std::endl;

      // Compute true residual
      mod2r = tau2;
      if ( 1 || (tau2 < (100.0 * target2)) ) {
        LinOp.Op(x,tmp);
        r = b - tmp;
        mod2r = norm2(r);
        std::cout << " QuasiMinimumResidual true residual is "<< mod2r<<std::endl;
      }

      if ( mod2r < target2 ) {
        std::cout << " QuasiMinimumResidual has converged"<<std::endl;
        return;
      }
    }
  }
#endif
};
NAMESPACE_END(Grid);

View File

@@ -297,9 +297,9 @@ namespace Grid {
/////////////////////////////////////////////////////
// src_o = (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
_Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
}
@@ -317,17 +317,17 @@ namespace Grid {
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.checkerboard ==Odd );
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
}
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
@@ -366,13 +366,13 @@ namespace Grid {
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -386,17 +386,17 @@ namespace Grid {
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e_i = src_e-tmp; assert( src_e_i.checkerboard ==Even);
_Matrix.MooeeInv(src_e_i,sol_e); assert( sol_e.checkerboard ==Even);
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e_i = src_e-tmp; assert( src_e_i.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e_i,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.checkerboard ==Odd );
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
}
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
@@ -437,12 +437,12 @@ namespace Grid {
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -463,12 +463,12 @@ namespace Grid {
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i,tmp); assert( tmp.checkerboard ==Even);
tmp = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.checkerboard ==Even);
_Matrix.Meooe(sol_o_i,tmp); assert( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(sol,sol_o_i); assert( sol_o_i.checkerboard ==Odd );
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o_i); assert( sol_o_i.Checkerboard() ==Odd );
};
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)

View File

@@ -1,18 +1,25 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
#ifdef GRID_NVCC
#define SMALL_LIMIT (0)
#else
#define SMALL_LIMIT (4096)
#endif
#ifdef POINTER_CACHE
int PointerCache::victim;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
void *PointerCache::Insert(void *ptr,size_t bytes) {
if (bytes < 4096 ) return ptr;
if (bytes < SMALL_LIMIT ) return ptr;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
@@ -49,9 +56,9 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
void *PointerCache::Lookup(size_t bytes) {
if (bytes < 4096 ) return NULL;
if (bytes < SMALL_LIMIT ) return NULL;
#ifdef _OPENMP
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
@@ -63,7 +70,7 @@ void *PointerCache::Lookup(size_t bytes) {
}
return NULL;
}
#endif
void check_huge_pages(void *Buf,uint64_t BYTES)
{
@@ -90,7 +97,7 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
@@ -106,20 +113,21 @@ std::string sizeString(const size_t bytes)
double count = bytes;
while (count >= 1024 && s < 7)
{
{
s++;
count /= 1024;
}
}
if (count - floor(count) == 0.0)
{
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
}
else
{
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
}
return std::string(buf);
}
}
NAMESPACE_END(Grid);

View File

@@ -24,8 +24,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALIGNED_ALLOCATOR_H
#define GRID_ALIGNED_ALLOCATOR_H
@@ -40,89 +40,99 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <mm_malloc.h>
#endif
namespace Grid {
#define POINTER_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
class PointerCache {
private:
NAMESPACE_BEGIN(Grid);
static const int Ncache=8;
static int victim;
// Move control to configure.ac and Config.h?
#ifdef POINTER_CACHE
class PointerCache {
private:
/*Pinning pages is costly*/
/*Could maintain separate large and small allocation caches*/
#ifdef GRID_NVCC
static const int Ncache=128;
#else
static const int Ncache=8;
#endif
static int victim;
typedef struct {
void *address;
size_t bytes;
int valid;
} PointerCacheEntry;
typedef struct {
void *address;
size_t bytes;
int valid;
} PointerCacheEntry;
static PointerCacheEntry Entries[Ncache];
static PointerCacheEntry Entries[Ncache];
public:
public:
static void *Insert(void *ptr,size_t bytes) ;
static void *Lookup(size_t bytes) ;
static void *Insert(void *ptr,size_t bytes) ;
static void *Lookup(size_t bytes) ;
};
#endif
};
std::string sizeString(size_t bytes);
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
// Byte counters maintained by the profilerAllocate / profilerFree macros.
// All counters start at zero.
struct MemoryStats
{
  size_t totalAllocated{0};     // sum of every allocation size seen
  size_t maxAllocated{0};       // high-water mark of currentlyAllocated
  size_t currentlyAllocated{0}; // allocated minus freed
  size_t totalFreed{0};         // sum of every freed size seen
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
// Global switches for the allocation profiling macros.
class MemoryProfiler
{
public:
// Counter block updated by profilerAllocate / profilerFree; the macros do
// nothing to the counters while this pointer is null.
static MemoryStats *stats;
// When true, the profiling macros also log each allocation / free.
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
}
// Formats a byte count as "<raw number> (<human readable size>)".
// The replacement list is parenthesised so the macro expands to a single
// operand wherever it is used (e.g. next to a higher-precedence operator).
#define memString(bytes) (std::to_string(bytes) + " (" + sizeString(bytes) + ")")
// Dumps the current MemoryProfiler counters to the debug log; does nothing
// when no stats block is installed. Wrapped in do { } while (0) so the macro
// behaves as one statement (safe under an unbraced if/else); invoke it with a
// trailing semicolon.
#define profilerDebugPrint \
  do {\
    if (MemoryProfiler::stats)\
    {\
      auto s = MemoryProfiler::stats;\
      std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl;\
      std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
                << std::endl;\
      std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
                << std::endl;\
      std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
                << std::endl;\
      std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
                << std::endl;\
    }\
  } while (0)
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
// Records an allocation of `bytes` in MemoryProfiler::stats (if installed)
// and, when MemoryProfiler::debug is set, logs it followed by a counter dump.
// The two independent checks are wrapped in do { } while (0) so the macro is
// a single statement: previously, under an unbraced if/for only the first
// `if` was governed by the enclosing statement. Invoke with a trailing
// semicolon.
#define profilerAllocate(bytes)\
  do {\
    if (MemoryProfiler::stats)\
    {\
      auto s = MemoryProfiler::stats;\
      s->totalAllocated += (bytes);\
      s->currentlyAllocated += (bytes);\
      s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated);\
    }\
    if (MemoryProfiler::debug)\
    {\
      std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl;\
      profilerDebugPrint;\
    }\
  } while (0)
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
// Records a release of `bytes` in MemoryProfiler::stats (if installed) and,
// when MemoryProfiler::debug is set, logs it followed by a counter dump.
// Wrapped in do { } while (0) so both checks form one statement and the macro
// is safe under an unbraced if/else; invoke with a trailing semicolon.
#define profilerFree(bytes)\
  do {\
    if (MemoryProfiler::stats)\
    {\
      auto s = MemoryProfiler::stats;\
      s->totalFreed += (bytes);\
      s->currentlyAllocated -= (bytes);\
    }\
    if (MemoryProfiler::debug)\
    {\
      std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl;\
      profilerDebugPrint;\
    }\
  } while (0)
void check_huge_pages(void *Buf,uint64_t BYTES);
void check_huge_pages(void *Buf,uint64_t BYTES);
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
@@ -152,29 +162,46 @@ public:
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
// if ( ptr != NULL )
// std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
//////////////////
// Hack 2MB align; could make option probably doesn't need configurability
//////////////////
//define GRID_ALLOC_ALIGN (128)
#define GRID_ALLOC_ALIGN (2*1024*1024)
#ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#ifdef POINTER_CACHE
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
pointer ptr = nullptr;
#endif
// std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
// First touch optimise in threaded loop
uint8_t *cp = (uint8_t *)ptr;
#ifdef GRID_OMP
#pragma omp parallel for
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
#ifdef GRID_NVCC
////////////////////////////////////
// Unified (managed) memory
////////////////////////////////////
if ( ptr == (_Tp *) NULL ) {
// printf(" alignedAllocater cache miss %ld bytes ",bytes); BACKTRACEFP(stdout);
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (_Tp *) NULL;
std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
assert(0);
}
}
assert( ptr != (_Tp *)NULL);
#else
//////////////////////////////////////////////////////////////////////////////////////////
// 2MB alignment; this could be made an option, but probably doesn't need to be configurable
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
assert( ptr != (_Tp *)NULL);
//////////////////////////////////////////////////
// First touch optimise in threaded loop
//////////////////////////////////////////////////
uint64_t *cp = (uint64_t *)ptr;
thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
cp[n]=0;
});
#endif
return ptr;
}
@@ -183,133 +210,40 @@ public:
profilerFree(bytes);
#ifdef POINTER_CACHE
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#ifdef HAVE_MM_MALLOC_H
if ( __freeme ) _mm_free((void *)__freeme);
#else
pointer __freeme = __p;
#endif
#ifdef GRID_NVCC
if ( __freeme ) cudaFree((void *)__freeme);
#else
#ifdef HAVE_MM_MALLOC_H
if ( __freeme ) _mm_free((void *)__freeme);
#else
if ( __freeme ) free((void *)__freeme);
#endif
#endif
}
void construct(pointer __p, const _Tp& __val) { };
// FIXME: hack for the copy constructor, eventually it must be avoided
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
//void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////////////////
// MPI3 : comms must use shm region
// SHMEM: comms must use symmetric heap
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_SHMEM
extern "C" {
#include <mpp/shmem.h>
extern void * shmem_align(size_t, size_t);
extern void shmem_free(void *);
}
#define PARANOID_SYMMETRIC_HEAP
#endif
template<typename _Tp>
class commAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef commAllocator<_Tp1> other; };
commAllocator() throw() { }
commAllocator(const commAllocator&) throw() { }
template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
~commAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
#ifdef GRID_COMMS_SHMEM
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef CRAY
_Tp *ptr = (_Tp *) shmem_align(bytes,64);
#else
_Tp *ptr = (_Tp *) shmem_align(64,bytes);
#endif
#ifdef PARANOID_SYMMETRIC_HEAP
static void * bcast;
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
bcast = (void *) ptr;
shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
if ( bcast != ptr ) {
std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
// BACKTRACEFILE();
exit(0);
}
assert( bcast == (void *) ptr);
#endif
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
shmem_free((void *)__p);
}
#else
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(bytes, GRID_ALLOC_ALIGN);
#else
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN, bytes);
#endif
uint8_t *cp = (uint8_t *)ptr;
if ( ptr ) {
// One touch per 4k page, static OMP loop to catch same loop order
#ifdef GRID_OMP
#pragma omp parallel for schedule(static)
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
}
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
#ifdef HAVE_MM_MALLOC_H
_mm_free((void *)__p);
#else
free((void *)__p);
#endif
}
#endif
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
template<class T> using commAllocator = alignedAllocator<T>;
template<class T> using Vector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,commAllocator<T> >;
template<class T> using commVector = std::vector<T,alignedAllocator<T> >;
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
}; // namespace Grid
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,8 +23,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CARTESIAN_H
#define GRID_CARTESIAN_H

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -25,268 +25,266 @@
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CARTESIAN_BASE_H
#define GRID_CARTESIAN_BASE_H
NAMESPACE_BEGIN(Grid);
namespace Grid{
//////////////////////////////////////////////////////////////////////
// Commicator provides information on the processor grid
//////////////////////////////////////////////////////////////////////
// unsigned long _ndimension;
// std::vector<int> _processors; // processor grid
// int _processor; // linear processor rank
// std::vector<int> _processor_coor; // linear processor rank
//////////////////////////////////////////////////////////////////////
class GridBase : public CartesianCommunicator , public GridThread {
//////////////////////////////////////////////////////////////////////
// Communicator provides information on the processor grid
//////////////////////////////////////////////////////////////////////
// unsigned long _ndimension;
// Coordinate _processors; // processor grid
// int _processor; // linear processor rank
// Coordinate _processor_coor; // processor coordinate
//////////////////////////////////////////////////////////////////////
class GridBase : public CartesianCommunicator , public GridThread {
public:
int dummy;
// Give Lattice access
template<class object> friend class Lattice;
int dummy;
// Give Lattice access
template<class object> friend class Lattice;
GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
GridBase(const std::vector<int> & processor_grid,
const CartesianCommunicator &parent,
int &split_rank)
: CartesianCommunicator(processor_grid,parent,split_rank) {};
GridBase(const std::vector<int> & processor_grid,
const CartesianCommunicator &parent)
: CartesianCommunicator(processor_grid,parent,dummy) {};
GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) { LocallyPeriodic=0;};
virtual ~GridBase() = default;
GridBase(const Coordinate & processor_grid,
const CartesianCommunicator &parent,
int &split_rank)
: CartesianCommunicator(processor_grid,parent,split_rank) {LocallyPeriodic=0;};
GridBase(const Coordinate & processor_grid,
const CartesianCommunicator &parent)
: CartesianCommunicator(processor_grid,parent,dummy) {LocallyPeriodic=0;};
// Physics Grid information.
std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal
std::vector<int> _gdimensions;// Global dimensions of array after cb removal
std::vector<int> _ldimensions;// local dimensions of array with processor images removed
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
std::vector<int> _ostride; // Outer stride for each dimension
std::vector<int> _istride; // Inner stride i.e. within simd lane
int _osites; // _isites*_osites = product(dimensions).
int _isites;
int _fsites; // _isites*_osites = product(dimensions).
int _gsites;
std::vector<int> _slice_block;// subslice information
std::vector<int> _slice_stride;
std::vector<int> _slice_nblock;
virtual ~GridBase() = default;
std::vector<int> _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
std::vector<int> _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
// Physics Grid information.
Coordinate _simd_layout;// Number of SIMD lanes each dimension is laid out over (>1 means the dimension is split across lanes).
Coordinate _fdimensions;// (full) Global dimensions of array prior to cb removal
Coordinate _gdimensions;// Global dimensions of array after cb removal
Coordinate _ldimensions;// local dimensions of array with processor images removed
Coordinate _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
Coordinate _ostride; // Outer stride for each dimension
Coordinate _istride; // Inner stride i.e. within simd lane
int _osites; // Outer site count; _isites*_osites is the local site count (see lSites()).
int _isites; // Inner (SIMD lane) site count; returned by iSites() and Nsimd().
int _fsites; // NOTE(review): previous comment was copy-pasted from _osites; presumably the full-lattice site count - confirm.
int _gsites; // NOTE(review): presumably the global site count after cb removal - confirm.
Coordinate _slice_block;// subslice information
Coordinate _slice_stride;
Coordinate _slice_nblock;
bool _isCheckerBoarded;
Coordinate _lstart; // local start of array in gcoors: _processor_coor[d]*_ldimensions[d]
Coordinate _lend ; // local end of array in gcoors: _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1
bool _isCheckerBoarded;
int LocallyPeriodic; // set to 0 by every GridBase constructor
public:
////////////////////////////////////////////////////////////////
// Checkerboarding interface is virtual and overridden by
// GridCartesian / GridRedBlackCartesian
////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(const std::vector<int> &site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
virtual int CheckerBoardFromOindex (int Oindex)=0;
virtual int CheckerBoardFromOindexTable (int Oindex)=0;
////////////////////////////////////////////////////////////////
// Checkerboarding interface is virtual and overridden by
// GridCartesian / GridRedBlackCartesian
////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(const Coordinate &site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
virtual int CheckerBoardFromOindex (int Oindex)=0;
virtual int CheckerBoardFromOindexTable (int Oindex)=0;
//////////////////////////////////////////////////////////////////////////////////////////////
// Local layout calculations
//////////////////////////////////////////////////////////////////////////////////////////////
// These routines are key. Subdivide the linearised cartesian index into
// "inner" index identifying which simd lane of object<vFcomplex> is associated with coord
// "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
//////////////////////////////////////////////////////////////////////////////////////////////
// Local layout calculations
//////////////////////////////////////////////////////////////////////////////////////////////
// These routines are key. Subdivide the linearised cartesian index into
// "inner" index identifying which simd lane of object<vFcomplex> is associated with coord
// "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
//
// Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
// stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
// coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
// lanes are operated upon simultaneously.
// Outer index of a site: each coordinate is reduced modulo the reduced
// dimension and weighted by the outer stride of that dimension.
// Works with either global or local coordinates.
virtual int oIndex(Coordinate &coor)
{
  int outer = 0;
  int mu = 0;
  while (mu < _ndimension) {
    outer += _ostride[mu] * (coor[mu] % _rdimensions[mu]);
    ++mu;
  }
  return outer;
}
// Inner (SIMD lane) index of a site: each coordinate is divided by the
// reduced dimension and weighted by the inner stride of that dimension.
virtual int iIndex(Coordinate &lcoor)
{
  int lane = 0;
  int mu = 0;
  while (mu < _ndimension) {
    lane += _istride[mu] * (lcoor[mu] / _rdimensions[mu]);
    ++mu;
  }
  return lane;
}
// Fast outer index for a coordinate that is already reduced: no modulo is
// applied, so the caller must pass coordinates within the reduced dimensions.
inline int oIndexReduced(Coordinate &ocoor)
{
  int outer = 0;
  int mu = 0;
  while (mu < _ndimension) {
    outer += _ostride[mu] * ocoor[mu];
    ++mu;
  }
  return outer;
}
// Inverse of the outer indexing: decodes Oindex into a coordinate using the
// reduced dimensions as the lexicographic extents.
inline void oCoorFromOindex (Coordinate& coor,int Oindex)
{
  Lexicographic::CoorFromIndex(coor, Oindex, _rdimensions);
}
// Combines an outer coordinate and an inner (SIMD lane) coordinate into a
// local coordinate: lcoor = ocoor + _rdimensions * icoor, per dimension.
// lcoor is resized to _ndimension entries.
inline void InOutCoorToLocalCoor (Coordinate &ocoor, Coordinate &icoor, Coordinate &lcoor)
{
  lcoor.resize(_ndimension);
  int mu = 0;
  while (mu < _ndimension) {
    lcoor[mu] = ocoor[mu] + _rdimensions[mu] * icoor[mu];
    ++mu;
  }
}
//////////////////////////////////////////////////////////
// SIMD lane addressing
//////////////////////////////////////////////////////////
// Decodes a SIMD lane number into a per-dimension lane coordinate, using the
// SIMD layout as the lexicographic extents.
inline void iCoorFromIindex(Coordinate &coor,int lane)
{
  Lexicographic::CoorFromIndex(coor, lane, _simd_layout);
}
// Returns 1 when the given dimension is spread over more than one SIMD lane,
// 0 otherwise.
inline int PermuteDim(int dimension)
{
  return (_simd_layout[dimension] > 1) ? 1 : 0;
}
inline int PermuteType(int dimension){
int permute_type=0;
//
// Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
// stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
// coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
// lanes are operated upon simultaneously.
virtual int oIndex(std::vector<int> &coor)
{
int idx=0;
// Works with either global or local coordinates
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
return idx;
}
virtual int iIndex(std::vector<int> &lcoor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
return idx;
}
inline int oIndexReduced(std::vector<int> &ocoor)
{
int idx=0;
// ocoor is already reduced so can eliminate the modulo operation
// for fast indexing and inline the routine
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
return idx;
}
inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
}
inline void InOutCoorToLocalCoor (std::vector<int> &ocoor, std::vector<int> &icoor, std::vector<int> &lcoor) {
lcoor.resize(_ndimension);
for (int d = 0; d < _ndimension; d++)
lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d];
}
//////////////////////////////////////////////////////////
// SIMD lane addressing
//////////////////////////////////////////////////////////
inline void iCoorFromIindex(std::vector<int> &coor,int lane)
{
Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
}
inline int PermuteDim(int dimension){
return _simd_layout[dimension]>1;
}
inline int PermuteType(int dimension){
int permute_type=0;
//
// FIXME:
//
// Best way to encode this would be to present a mask
// for which simd dimensions are rotated, and the rotation
// size. If there is only one simd dimension rotated, this is just
// a permute.
//
// Cases: PermuteType == 1,2,4,8
// Distance should be either 0,1,2..
//
if ( _simd_layout[dimension] > 2 ) {
for(int d=0;d<_ndimension;d++){
if ( d != dimension ) assert ( (_simd_layout[d]==1) );
}
permute_type = RotateBit; // How to specify distance; this is not just direction.
return permute_type;
}
for(int d=_ndimension-1;d>dimension;d--){
if (_simd_layout[d]>1 ) permute_type++;
// Best way to encode this would be to present a mask
// for which simd dimensions are rotated, and the rotation
// size. If there is only one simd dimension rotated, this is just
// a permute.
//
// Cases: PermuteType == 1,2,4,8
// Distance should be either 0,1,2..
//
if ( _simd_layout[dimension] > 2 ) {
for(int d=0;d<_ndimension;d++){
if ( d != dimension ) assert ( (_simd_layout[d]==1) );
}
permute_type = RotateBit; // How to specify distance; this is not just direction.
return permute_type;
}
////////////////////////////////////////////////////////////////
// Array sizing queries
////////////////////////////////////////////////////////////////
inline int iSites(void) const { return _isites; };
inline int Nsimd(void) const { return _isites; };// Synonymous with iSites
inline int oSites(void) const { return _osites; };
inline int lSites(void) const { return _isites*_osites; };
inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
inline int Nd (void) const { return _ndimension;};
for(int d=_ndimension-1;d>dimension;d--){
if (_simd_layout[d]>1 ) permute_type++;
}
return permute_type;
}
////////////////////////////////////////////////////////////////
// Array sizing queries
////////////////////////////////////////////////////////////////
inline const std::vector<int> LocalStarts(void) { return _lstart; };
inline const std::vector<int> &FullDimensions(void) { return _fdimensions;};
inline const std::vector<int> &GlobalDimensions(void) { return _gdimensions;};
inline const std::vector<int> &LocalDimensions(void) { return _ldimensions;};
inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};
// Inner (SIMD lane) site count.
inline int iSites(void) const { return _isites; }
// Synonymous with iSites.
inline int Nsimd(void) const { return iSites(); }
// Outer site count.
inline int oSites(void) const { return _osites; }
// Local site count: inner times outer.
inline int lSites(void) const { return iSites() * oSites(); }
// Local site count times the number of processors.
inline int gSites(void) const { return lSites() * _Nprocessors; }
// Number of dimensions.
inline int Nd (void) const { return _ndimension; }
////////////////////////////////////////////////////////////////
// Utility to print the full decomposition details
////////////////////////////////////////////////////////////////
// Read-only views of the stored dimension arrays. All five accessors return
// a const reference to the member; LocalStarts previously returned a const
// copy, which cost a copy per call and was inconsistent with its siblings.
inline const Coordinate &LocalStarts(void) { return _lstart; }
inline const Coordinate &FullDimensions(void) { return _fdimensions; }
inline const Coordinate &GlobalDimensions(void) { return _gdimensions; }
inline const Coordinate &LocalDimensions(void) { return _ldimensions; }
// Currently identical to LocalDimensions().
inline const Coordinate &VirtualLocalDimensions(void) { return _ldimensions; }
void show_decomposition(){
std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl;
std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl;
std::cout << GridLogMessage << "\toSites : " << _osites << std::endl;
std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl;
}
////////////////////////////////////////////////////////////////
// Utility to print the full decomposition details
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
// Global addressing
////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
assert(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
// Prints the full decomposition of this grid (dimension arrays, strides and
// site counts) to std::cout under the GridLogMessage prefix, one item per
// line. lSites and gSites are computed through their accessors; everything
// else is printed straight from the stored members.
void show_decomposition(){
std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl;
std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl;
std::cout << GridLogMessage << "\toSites : " << _osites << std::endl;
std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl;
}
////////////////////////////////////////////////////////////////
// Global addressing
////////////////////////////////////////////////////////////////
// Decodes a global lexicographic index into a global coordinate, using
// _gdimensions as the extents. Asserts that gidx is below gSites().
void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor)
{
  assert(gidx < gSites());
  Lexicographic::CoorFromIndex(gcoor, gidx, _gdimensions);
}
// Decodes a local lexicographic index into a local coordinate, using
// _ldimensions as the extents. Asserts that lidx is below lSites().
void LocalIndexToLocalCoor(int lidx,Coordinate &lcoor)
{
  assert(lidx < lSites());
  Lexicographic::CoorFromIndex(lcoor, lidx, _ldimensions);
}
void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
gidx=0;
int mult=1;
for(int mu=0;mu<_ndimension;mu++) {
gidx+=mult*gcoor[mu];
mult*=_gdimensions[mu];
}
void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){
assert(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
}
void GlobalCoorToProcessorCoorLocalCoor(Coordinate &pcoor,Coordinate &lcoor,const Coordinate &gcoor)
{
pcoor.resize(_ndimension);
lcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++){
int _fld = _fdimensions[mu]/_processors[mu];
pcoor[mu] = gcoor[mu]/_fld;
lcoor[mu] = gcoor[mu]%_fld;
}
void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
gidx=0;
int mult=1;
for(int mu=0;mu<_ndimension;mu++) {
gidx+=mult*gcoor[mu];
mult*=_gdimensions[mu];
}
}
void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
{
pcoor.resize(_ndimension);
lcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++){
int _fld = _fdimensions[mu]/_processors[mu];
pcoor[mu] = gcoor[mu]/_fld;
lcoor[mu] = gcoor[mu]%_fld;
}
}
void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
{
std::vector<int> pcoor;
std::vector<int> lcoor;
GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
rank = RankFromProcessorCoor(pcoor);
/*
std::vector<int> cblcoor(lcoor);
}
void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const Coordinate &gcoor)
{
Coordinate pcoor;
Coordinate lcoor;
GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
rank = RankFromProcessorCoor(pcoor);
/*
Coordinate cblcoor(lcoor);
for(int d=0;d<cblcoor.size();d++){
if( this->CheckerBoarded(d) ) {
cblcoor[d] = lcoor[d]/2;
}
if( this->CheckerBoarded(d) ) {
cblcoor[d] = lcoor[d]/2;
}
*/
i_idx= iIndex(lcoor);
o_idx= oIndex(lcoor);
}
// Rebuild a coordinate from (rank, outer index, inner index):
//   gcoor = _ldimensions * procCoor(rank)
//         + _rdimensions * iCoor(i_idx)
//         + oCoor(o_idx)
// One scratch vector is reused for the three decoded coordinates, so the
// decode/accumulate steps must stay in this order.
void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx, std::vector<int> &gcoor)
{
  gcoor.resize(_ndimension);
  std::vector<int> scratch(_ndimension);

  ProcessorCoorFromRank(rank, scratch);
  for (int mu = 0; mu < _ndimension; ++mu) {
    gcoor[mu] = _ldimensions[mu] * scratch[mu];
  }

  iCoorFromIindex(scratch, i_idx);
  for (int mu = 0; mu < _ndimension; ++mu) {
    gcoor[mu] += _rdimensions[mu] * scratch[mu];
  }

  oCoorFromOindex(scratch, o_idx);
  for (int mu = 0; mu < _ndimension; ++mu) {
    gcoor[mu] += scratch[mu];
  }
}
void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector<int> &fcoor)
{
RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
if(CheckerBoarded(0)){
fcoor[0] = fcoor[0]*2+cb;
}
*/
i_idx= iIndex(lcoor);
o_idx= oIndex(lcoor);
}
void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , Coordinate &gcoor)
{
gcoor.resize(_ndimension);
Coordinate coor(_ndimension);
ProcessorCoorFromRank(rank,coor);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu];
iCoorFromIindex(coor,i_idx);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]*coor[mu];
oCoorFromOindex (coor,o_idx);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu];
}
void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,Coordinate &fcoor)
{
RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
if(CheckerBoarded(0)){
fcoor[0] = fcoor[0]*2+cb;
}
// Combine a processor coordinate and a local coordinate:
//   gcoor[mu] = Pcoor[mu] * _ldimensions[mu] + Lcoor[mu]
// `gcoor` is resized to _ndimension first.
void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,
                                        std::vector<int> &Lcoor,
                                        std::vector<int> &gcoor)
{
  gcoor.resize(_ndimension);
  for (int mu = 0; mu < _ndimension; ++mu) {
    gcoor[mu] = Pcoor[mu] * _ldimensions[mu] + Lcoor[mu];
  }
}
}
// Combine a processor coordinate and a local coordinate:
//   gcoor[mu] = Pcoor[mu] * _ldimensions[mu] + Lcoor[mu]
// `gcoor` is resized to _ndimension first.
void ProcessorCoorLocalCoorToGlobalCoor(Coordinate &Pcoor,
                                        Coordinate &Lcoor,
                                        Coordinate &gcoor)
{
  gcoor.resize(_ndimension);
  for (int mu = 0; mu < _ndimension; ++mu) {
    gcoor[mu] = Pcoor[mu] * _ldimensions[mu] + Lcoor[mu];
  }
}
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,97 +23,96 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CARTESIAN_FULL_H
#define GRID_CARTESIAN_FULL_H
namespace Grid{
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////
// Grid Support.
/////////////////////////////////////////////////////////////////////////////////////////
class GridCartesian: public GridBase {
public:
int dummy;
// Always returns 0; `Oindex` is ignored.
virtual int CheckerBoardFromOindexTable(int Oindex)
{
  return 0;
}
// Always returns 0; `Oindex` is ignored.
virtual int CheckerBoardFromOindex(int Oindex)
{
  return 0;
}
// Always returns 0 for every `dim`: this grid type reports no
// checkerboarded dimension.
virtual int CheckerBoarded(int dim)
{
  return 0;
}
// Always returns 0; `site` is ignored.
virtual int CheckerBoard(const std::vector<int> &site)
{
  return 0;
}
// Always returns 0; `cb`, `shift` and `dim` are ignored.
virtual int CheckerBoardDestination(int cb, int shift, int dim)
{
  return 0;
}
// Returns `shift` unchanged; `source_cb`, `dim` and `ocb` are ignored.
virtual int CheckerBoardShiftForCB(int source_cb, int dim, int shift, int ocb)
{
  return shift;
}
// Returns `shift` unchanged; `source_cb`, `dim` and `osite` are ignored.
virtual int CheckerBoardShift(int source_cb, int dim, int shift, int osite)
{
  return shift;
}
/////////////////////////////////////////////////////////////////////////
// Constructor takes a parent grid and possibly subdivides communicator.
/////////////////////////////////////////////////////////////////////////
// Construct from a parent grid without reporting the split rank.
// The member `dummy` is handed to GridBase in the position where the sibling
// overload passes `split_rank`, so that value is simply not returned to the
// caller. Layout setup is delegated to Init().
GridCartesian(const std::vector<int> &dimensions,
              const std::vector<int> &simd_layout,
              const std::vector<int> &processor_grid,
              const GridCartesian &parent)
  : GridBase(processor_grid, parent, dummy)
{
  Init(dimensions, simd_layout, processor_grid);
}
// Construct from a parent grid; `split_rank` is forwarded by reference to
// GridBase as its third argument. Layout setup is delegated to Init().
GridCartesian(const std::vector<int> &dimensions,
              const std::vector<int> &simd_layout,
              const std::vector<int> &processor_grid,
              const GridCartesian &parent,
              int &split_rank)
  : GridBase(processor_grid, parent, split_rank)
{
  Init(dimensions, simd_layout, processor_grid);
}
/////////////////////////////////////////////////////////////////////////
// Construct from comm world
/////////////////////////////////////////////////////////////////////////
// Construct without a parent grid: GridBase receives only `processor_grid`.
// Layout setup is delegated to Init().
GridCartesian(const std::vector<int> &dimensions,
              const std::vector<int> &simd_layout,
              const std::vector<int> &processor_grid)
  : GridBase(processor_grid)
{
  Init(dimensions, simd_layout, processor_grid);
}
int dummy;
// Always returns 0; `Oindex` is ignored.
virtual int CheckerBoardFromOindexTable(int Oindex)
{
  return 0;
}
// Always returns 0; `Oindex` is ignored.
virtual int CheckerBoardFromOindex(int Oindex)
{
  return 0;
}
// Always returns 0 for every `dim`: this grid type reports no
// checkerboarded dimension.
virtual int CheckerBoarded(int dim)
{
  return 0;
}
// Always returns 0; `site` is ignored.
virtual int CheckerBoard(const Coordinate &site)
{
  return 0;
}
// Always returns 0; `cb`, `shift` and `dim` are ignored.
virtual int CheckerBoardDestination(int cb, int shift, int dim)
{
  return 0;
}
// Returns `shift` unchanged; `source_cb`, `dim` and `ocb` are ignored.
virtual int CheckerBoardShiftForCB(int source_cb, int dim, int shift, int ocb)
{
  return shift;
}
// Returns `shift` unchanged; `source_cb`, `dim` and `osite` are ignored.
virtual int CheckerBoardShift(int source_cb, int dim, int shift, int osite)
{
  return shift;
}
/////////////////////////////////////////////////////////////////////////
// Constructor takes a parent grid and possibly subdivides communicator.
/////////////////////////////////////////////////////////////////////////
// Construct from a parent grid without reporting the split rank.
// The member `dummy` is handed to GridBase in the position where the sibling
// overload passes `split_rank`, so that value is simply not returned to the
// caller. Layout setup is delegated to Init().
GridCartesian(const Coordinate &dimensions,
              const Coordinate &simd_layout,
              const Coordinate &processor_grid,
              const GridCartesian &parent)
  : GridBase(processor_grid, parent, dummy)
{
  Init(dimensions, simd_layout, processor_grid);
}
// Construct from a parent grid; `split_rank` is forwarded by reference to
// GridBase as its third argument. Layout setup is delegated to Init().
GridCartesian(const Coordinate &dimensions,
              const Coordinate &simd_layout,
              const Coordinate &processor_grid,
              const GridCartesian &parent,
              int &split_rank)
  : GridBase(processor_grid, parent, split_rank)
{
  Init(dimensions, simd_layout, processor_grid);
}
/////////////////////////////////////////////////////////////////////////
// Construct from comm world
/////////////////////////////////////////////////////////////////////////
// Construct without a parent grid: GridBase receives only `processor_grid`.
// Layout setup is delegated to Init().
GridCartesian(const Coordinate &dimensions,
              const Coordinate &simd_layout,
              const Coordinate &processor_grid)
  : GridBase(processor_grid)
{
  Init(dimensions, simd_layout, processor_grid);
}
virtual ~GridCartesian() = default;
virtual ~GridCartesian() = default;
void Init(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid)
{
///////////////////////
// Grid information
///////////////////////
void Init(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid)
{
///////////////////////
// Grid information
///////////////////////
_isCheckerBoarded = false;
_ndimension = dimensions.size();
_ndimension = dimensions.size();
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1;
_fsites = _gsites = _osites = _isites = 1;
for (int d = 0; d < _ndimension; d++)
for (int d = 0; d < _ndimension; d++)
{
_fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions
@@ -136,30 +135,30 @@ public:
// Addressing support
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
}
{
_ostride[d] = 1;
_istride[d] = 1;
}
else
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
}
///////////////////////
// subplane information
///////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
///////////////////////
// subplane information
///////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
for (int d = 0; d < _ndimension; d++)
{
nblock /= _rdimensions[d];
_slice_block[d] = block;
@@ -167,8 +166,9 @@ public:
_slice_nblock[d] = nblock;
block = block * _rdimensions[d];
}
};
};
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -24,178 +24,147 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CARTESIAN_RED_BLACK_H
#define GRID_CARTESIAN_RED_BLACK_H
NAMESPACE_BEGIN(Grid);
namespace Grid {
static const int CbRed =0;
static const int CbBlack=1;
static const int Even =CbRed;
static const int Odd =CbBlack;
static const int CbRed =0;
static const int CbBlack=1;
static const int Even =CbRed;
static const int Odd =CbBlack;
// Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public GridBase
{
public:
std::vector<int> _checker_dim_mask;
int _checker_dim;
std::vector<int> _checker_board;
Coordinate _checker_dim_mask;
int _checker_dim;
std::vector<int> _checker_board;
virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1;
else return 0;
// Returns 1 when `dim` is the checkerboarded dimension (_checker_dim),
// otherwise 0.
virtual int CheckerBoarded(int dim)
{
  return (dim == _checker_dim) ? 1 : 0;
}
virtual int CheckerBoard(const Coordinate &site){
int linear=0;
assert(site.size()==_ndimension);
for(int d=0;d<_ndimension;d++){
if(_checker_dim_mask[d])
linear=linear+site[d];
}
virtual int CheckerBoard(const std::vector<int> &site){
int linear=0;
assert(site.size()==_ndimension);
for(int d=0;d<_ndimension;d++){
if(_checker_dim_mask[d])
linear=linear+site[d];
}
return (linear&0x1);
return (linear&0x1);
}
// Depending on the cb of site, we toggle source cb.
// for block #b, element #e = (b, e)
// we need
// Convert a shift along `dim` into the shift to use on the halved
// checkerboarded dimension.
//  - dim != _checker_dim : `shift` is returned untouched.
//  - otherwise           : `shift` is first wrapped into the range of
//    _fdimensions[dim] via (shift + extent) % extent, then halved; it is
//    rounded down when (source_cb + ocb) is odd and rounded up when even.
virtual int CheckerBoardShiftForCB(int source_cb, int dim, int shift, int ocb)
{
  if (dim != _checker_dim) {
    return shift;
  }

  const int extent = _fdimensions[dim];
  shift = (shift + extent) % extent;

  // Probably faster with a table lookup, or by looping over x,y,z and
  // multiplying rather than computing the checkerboard.
  const bool odd_parity_sum = ((source_cb + ocb) & 1) != 0;
  return odd_parity_sum ? (shift) / 2 : (shift + 1) / 2;
}
// Table lookup: returns the entry of _checker_board at `Oindex`.
// No bounds check is performed here.
virtual int CheckerBoardFromOindexTable(int Oindex)
{
  return _checker_board[Oindex];
}
virtual int CheckerBoardFromOindex (int Oindex)
{
Coordinate ocoor;
oCoorFromOindex(ocoor,Oindex);
return CheckerBoard(ocoor);
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
if(dim != _checker_dim) return shift;
// Depending on the cb of site, we toggle source cb.
// for block #b, element #e = (b, e)
// we need
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int ocb){
if(dim != _checker_dim) return shift;
int ocb=CheckerBoardFromOindex(osite);
int fulldim =_fdimensions[dim];
shift = (shift+fulldim)%fulldim;
return CheckerBoardShiftForCB(source_cb,dim,shift,ocb);
}
// Probably faster with table lookup;
// or by looping over x,y,z and multiply rather than computing checkerboard.
if ( (source_cb+ocb)&1 ) {
return (shift)/2;
} else {
return (shift+1)/2;
}
}
// Table lookup: returns the entry of _checker_board at `Oindex`.
// No bounds check is performed here.
virtual int CheckerBoardFromOindexTable(int Oindex)
{
  return _checker_board[Oindex];
}
// Computes the checkerboard of outer index `Oindex` without the table:
// decodes it with oCoorFromOindex and passes the coordinate to CheckerBoard().
virtual int CheckerBoardFromOindex(int Oindex)
{
  std::vector<int> site;
  oCoorFromOindex(site, Oindex);
  return CheckerBoard(site);
}
// Shift conversion keyed on an outer site rather than a checkerboard value.
// For dim != _checker_dim the shift is returned as-is; otherwise the
// checkerboard of `osite` is computed with CheckerBoardFromOindex and the
// work is delegated to CheckerBoardShiftForCB.
virtual int CheckerBoardShift(int source_cb, int dim, int shift, int osite)
{
  if (dim != _checker_dim) {
    return shift;
  }
  return CheckerBoardShiftForCB(source_cb, dim, shift, CheckerBoardFromOindex(osite));
}
virtual int CheckerBoardDestination(int source_cb,int shift,int dim){
if ( _checker_dim_mask[dim] ) {
// If _fdimensions[checker_dim] is odd, then shifting by 1 in other dims
// does NOT cause a parity hop.
int add=(dim==_checker_dim) ? 0 : _fdimensions[_checker_dim];
if ( (shift+add) &0x1) {
return 1-source_cb;
} else {
return source_cb;
}
virtual int CheckerBoardDestination(int source_cb,int shift,int dim){
if ( _checker_dim_mask[dim] ) {
// If _fdimensions[checker_dim] is odd, then shifting by 1 in other dims
// does NOT cause a parity hop.
int add=(dim==_checker_dim) ? 0 : _fdimensions[_checker_dim];
if ( (shift+add) &0x1) {
return 1-source_cb;
} else {
return source_cb;
}
};
} else {
return source_cb;
////////////////////////////////////////////////////////////
// Create Redblack from original grid; require full grid pointer ?
////////////////////////////////////////////////////////////
// Build a red-black grid from `base`, reusing its _fdimensions, _simd_layout
// and _processors. Every dimension is flagged in the checker mask (all ones)
// and dimension 0 is the checkerboarded dimension.
GridRedBlackCartesian(const GridBase *base)
  : GridBase(base->_processors, *base)
{
  int ndim = base->_ndimension;
  std::vector<int> all_dims_mask(ndim, 1);
  Init(base->_fdimensions, base->_simd_layout, base->_processors, all_dims_mask, 0);
};
////////////////////////////////////////////////////////////
// Create redblack from original grid, with non-trivial checker dim mask
////////////////////////////////////////////////////////////
// Build a red-black grid from `base` with a caller-chosen checker mask and
// checkerboarded dimension. _fdimensions, _simd_layout and _processors are
// taken from `base`; everything is forwarded to Init().
GridRedBlackCartesian(const GridBase *base,
                      const std::vector<int> &checker_dim_mask,
                      int checker_dim)
  : GridBase(base->_processors, *base)
{
  Init(base->_fdimensions,
       base->_simd_layout,
       base->_processors,
       checker_dim_mask,
       checker_dim);
}
};
virtual ~GridRedBlackCartesian() = default;
#if 0
////////////////////////////////////////////////////////////
// Create redblack grid ;; deprecate these. Should not
// need direct creation of redblack without a full grid to base on
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask,
int checker_dim
) : GridBase(processor_grid,*base)
{
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
}
////////////////////////////////////////////////////////////
// Create Redblack from original grid; require full grid pointer ?
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base)
{
int dims = base->_ndimension;
Coordinate checker_dim_mask(dims,1);
int checker_dim = 0;
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim);
};
////////////////////////////////////////////////////////////
// Create redblack grid
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid,*base)
{
std::vector<int> checker_dim_mask(dimensions.size(),1);
int checker_dim = 0;
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
}
#endif
////////////////////////////////////////////////////////////
// Create redblack from original grid, with non-trivial checker dim mask
////////////////////////////////////////////////////////////
// Build a red-black grid from `base` with a caller-chosen checker mask and
// checkerboarded dimension. _fdimensions, _simd_layout and _processors are
// taken from `base`; everything is forwarded to Init().
GridRedBlackCartesian(const GridBase *base,
                      const Coordinate &checker_dim_mask,
                      int checker_dim)
  : GridBase(base->_processors, *base)
{
  Init(base->_fdimensions,
       base->_simd_layout,
       base->_processors,
       checker_dim_mask,
       checker_dim);
}
void Init(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask,
int checker_dim)
{
virtual ~GridRedBlackCartesian() = default;
void Init(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid,
const Coordinate &checker_dim_mask,
int checker_dim)
{
_isCheckerBoarded = true;
_checker_dim = checker_dim;
assert(checker_dim_mask[checker_dim] == 1);
_ndimension = dimensions.size();
assert(checker_dim_mask.size() == _ndimension);
assert(processor_grid.size() == _ndimension);
assert(simd_layout.size() == _ndimension);
_checker_dim = checker_dim;
assert(checker_dim_mask[checker_dim] == 1);
_ndimension = dimensions.size();
assert(checker_dim_mask.size() == _ndimension);
assert(processor_grid.size() == _ndimension);
assert(simd_layout.size() == _ndimension);
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1;
_fsites = _gsites = _osites = _isites = 1;
_checker_dim_mask = checker_dim_mask;
_checker_dim_mask = checker_dim_mask;
for (int d = 0; d < _ndimension; d++)
for (int d = 0; d < _ndimension; d++)
{
_fdimensions[d] = dimensions[d];
_gdimensions[d] = _fdimensions[d];
@@ -203,11 +172,11 @@ public:
_gsites = _gsites * _gdimensions[d];
if (d == _checker_dim)
{
assert((_gdimensions[d] & 0x1) == 0);
_gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
_gsites /= 2;
}
{
assert((_gdimensions[d] & 0x1) == 0);
_gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
_gsites /= 2;
}
_ldimensions[d] = _gdimensions[d] / _processors[d];
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
@@ -222,42 +191,42 @@ public:
// all elements of a simd vector must have same checkerboard.
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
if (_simd_layout[d] > 1)
{
if (checker_dim_mask[d])
{
assert((_rdimensions[d] & 0x1) == 0);
}
}
{
if (checker_dim_mask[d])
{
assert((_rdimensions[d] & 0x1) == 0);
}
}
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
}
{
_ostride[d] = 1;
_istride[d] = 1;
}
else
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
}
////////////////////////////////////////////////////////////////////////////////////////////
// subplane information
////////////////////////////////////////////////////////////////////////////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
////////////////////////////////////////////////////////////////////////////////////////////
// subplane information
////////////////////////////////////////////////////////////////////////////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
for (int d = 0; d < _ndimension; d++)
{
nblock /= _rdimensions[d];
_slice_block[d] = block;
@@ -266,55 +235,55 @@ public:
block = block * _rdimensions[d];
}
////////////////////////////////////////////////
// Create a checkerboard lookup table
////////////////////////////////////////////////
int rvol = 1;
for (int d = 0; d < _ndimension; d++)
////////////////////////////////////////////////
// Create a checkerboard lookup table
////////////////////////////////////////////////
int rvol = 1;
for (int d = 0; d < _ndimension; d++)
{
rvol = rvol * _rdimensions[d];
}
_checker_board.resize(rvol);
for (int osite = 0; osite < _osites; osite++)
_checker_board.resize(rvol);
for (int osite = 0; osite < _osites; osite++)
{
_checker_board[osite] = CheckerBoardFromOindex(osite);
}
};
};
protected:
virtual int oIndex(std::vector<int> &coor)
{
int idx = 0;
for (int d = 0; d < _ndimension; d++)
protected:
virtual int oIndex(Coordinate &coor)
{
int idx = 0;
for (int d = 0; d < _ndimension; d++)
{
if (d == _checker_dim)
{
idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
}
{
idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
}
else
{
idx += _ostride[d] * (coor[d] % _rdimensions[d]);
}
{
idx += _ostride[d] * (coor[d] % _rdimensions[d]);
}
}
return idx;
};
return idx;
};
virtual int iIndex(std::vector<int> &lcoor)
{
int idx = 0;
for (int d = 0; d < _ndimension; d++)
virtual int iIndex(Coordinate &lcoor)
{
int idx = 0;
for (int d = 0; d < _ndimension; d++)
{
if (d == _checker_dim)
{
idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
}
{
idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
}
else
{
idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
}
{
idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
}
}
return idx;
}
return idx;
}
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,11 +23,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNICATOR_H
#include <Grid/util/Coordinate.h>
#include <Grid/communicator/SharedMemory.h>
#include <Grid/communicator/Communicator_base.h>

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,15 +23,15 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/mman.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
@@ -47,8 +47,8 @@ int CartesianCommunicator::Dimensions(void) { return
// Non-zero exactly when this process's linear rank (_processor) is 0.
int CartesianCommunicator::IsBoss(void)
{
  return _processor == 0;
};
// The boss rank is fixed at 0 (the same value IsBoss() compares against).
int CartesianCommunicator::BossRank(void)
{
  return 0;
};
// Returns the stored linear rank _processor.
int CartesianCommunicator::ThisRank(void)
{
  return _processor;
};
// Read-only reference to the stored _processor_coor.
const std::vector<int> &CartesianCommunicator::ThisProcessorCoor(void)
{
  return _processor_coor;
};
// Read-only reference to the stored _processors.
const std::vector<int> &CartesianCommunicator::ProcessorGrid(void)
{
  return _processors;
};
// Read-only reference to the stored _processor_coor.
const Coordinate &CartesianCommunicator::ThisProcessorCoor(void)
{
  return _processor_coor;
};
// Read-only reference to the stored _processors.
const Coordinate &CartesianCommunicator::ProcessorGrid(void)
{
  return _processors;
};
// Returns the stored process count _Nprocessors.
int CartesianCommunicator::ProcessorCount(void)
{
  return _Nprocessors;
};
////////////////////////////////////////////////////////////////////////////////
@@ -72,5 +72,6 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
GlobalSumVector((double *)c,2*N);
}
}
NAMESPACE_END(Grid);

View File

@@ -1,5 +1,5 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -24,8 +24,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_COMMUNICATOR_BASE_H
#define GRID_COMMUNICATOR_BASE_H
@@ -34,7 +34,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
///////////////////////////////////
#include <Grid/communicator/SharedMemory.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
class CartesianCommunicator : public SharedMemory {
@@ -52,9 +52,9 @@ public:
// Communicator should know nothing of the physics grid, only processor grid.
////////////////////////////////////////////
int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes.
Coordinate _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate
Coordinate _processor_coor; // linear processor coordinate
unsigned long _ndimension;
static Grid_MPI_Comm communicator_world;
Grid_MPI_Comm communicator;
@@ -69,34 +69,34 @@ public:
// Constructors to sub-divide a parent communicator
// and default to comm world
////////////////////////////////////////////////
CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank);
CartesianCommunicator(const std::vector<int> &pdimensions_in);
CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank);
CartesianCommunicator(const Coordinate &pdimensions_in);
virtual ~CartesianCommunicator();
private:
private:
////////////////////////////////////////////////
// Private initialise from an MPI communicator
// Can use after an MPI_Comm_split, but hidden from user so private
////////////////////////////////////////////////
void InitFromMPICommunicator(const std::vector<int> &processors, Grid_MPI_Comm communicator_base);
void InitFromMPICommunicator(const Coordinate &processors, Grid_MPI_Comm communicator_base);
public:
public:
////////////////////////////////////////////////////////////////////////////////////////
// Wraps MPI_Cart routines, or implements equivalent on other impls
////////////////////////////////////////////////////////////////////////////////////////
void ShiftedRanks(int dim,int shift,int & source, int & dest);
int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
int RankFromProcessorCoor(Coordinate &coor);
void ProcessorCoorFromRank(int rank,Coordinate &coor);
int Dimensions(void) ;
int IsBoss(void) ;
int BossRank(void) ;
int ThisRank(void) ;
const std::vector<int> & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ;
const Coordinate & ThisProcessorCoor(void) ;
const Coordinate & ProcessorGrid(void) ;
int ProcessorCount(void) ;
////////////////////////////////////////////////////////////////////////////////
@@ -197,11 +197,12 @@ public:
void AllToAll(void *in,void *out,uint64_t words ,uint64_t bytes);
template<class obj> void Broadcast(int root,obj &data)
{
Broadcast(root,(void *)&data,sizeof(data));
};
{
Broadcast(root,(void *)&data,sizeof(data));
}
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -23,12 +23,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/communicator/SharedMemory.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world;
@@ -44,10 +44,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
(nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) )
if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
assert(0);
}
if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) {
assert(0);
}
}
// Never clean up as done once.
@@ -69,14 +74,14 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
// Look up the rank at processor coordinate `coor` with MPI_Cart_rank on
// `communicator`. The address of coor[0] is passed as the coordinate array.
// A non-zero MPI return code trips the assert.
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor)
{
  int rank;
  int status = MPI_Cart_rank(communicator, &coor[0], &rank);
  assert(status == 0);
  return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
{
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
@@ -86,7 +91,7 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
////////////////////////////////////////////////////////////////////////////////////////////////////////
// Initialises from communicator_world
////////////////////////////////////////////////////////////////////////////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{
MPI_Comm optimal_comm;
////////////////////////////////////////////////////
@@ -105,12 +110,12 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
//////////////////////////////////
// Try to subdivide communicator
//////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
{
_ndimension = processors.size(); assert(_ndimension>=1);
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
std::vector<int> parent_processor_coor(_ndimension,0);
std::vector<int> parent_processors (_ndimension,1);
Coordinate parent_processor_coor(_ndimension,0);
Coordinate parent_processors (_ndimension,1);
// Can make 5d grid from 4d etc...
int pad = _ndimension-parent_ndimension;
@@ -133,9 +138,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent);
std::vector<int> ccoor(_ndimension); // coor within subcommunicator
std::vector<int> scoor(_ndimension); // coor of split within parent
std::vector<int> ssize(_ndimension); // coor of split within parent
Coordinate ccoor(_ndimension); // coor within subcommunicator
Coordinate scoor(_ndimension); // coor of split within parent
Coordinate ssize(_ndimension); // coor of split within parent
for(int d=0;d<_ndimension;d++){
ccoor[d] = parent_processor_coor[d] % processors[d];
@@ -152,36 +157,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
MPI_Comm comm_split;
if ( Nchild > 1 ) {
if(0){
std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" child grid["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processor_coor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << scoor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << ccoor[d] << " ";
std::cout<<std::endl;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Declare victory
//////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
std::cout << " Split communicator " <<comm_split <<std::endl;
}
////////////////////////////////////////////////////////////////
// Split the communicator
////////////////////////////////////////////////////////////////
@@ -220,7 +195,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
}
}
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors, MPI_Comm communicator_base)
{
////////////////////////////////////////////////////
// Creates communicator, and the communicator_halo
@@ -237,7 +212,7 @@ void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &proc
_Nprocessors*=_processors[i];
}
std::vector<int> periodic(_ndimension,1);
Coordinate periodic(_ndimension,1);
MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
@@ -474,7 +449,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
std::vector<int> row(_ndimension,1);
Coordinate row(_ndimension,1);
assert(dim>=0 && dim<_ndimension);
// Split the communicator
@@ -503,7 +478,6 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
MPI_Type_free(&object);
}
NAMESPACE_END(Grid);
}

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,11 +23,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and is independent of the cartesian layout
@@ -38,18 +38,18 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
{
GlobalSharedMemory::Init(communicator_world);
GlobalSharedMemory::SharedMemoryAllocate(
GlobalSharedMemory::MAX_MPI_SHM_BYTES,
GlobalSharedMemory::Hugepages);
GlobalSharedMemory::MAX_MPI_SHM_BYTES,
GlobalSharedMemory::Hugepages);
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
: CartesianCommunicator(processors)
{
srank=0;
SetCommunicator(communicator_world);
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{
_processors = processors;
_ndimension = processors.size(); assert(_ndimension>=1);
@@ -122,8 +122,8 @@ int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor; }
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; }
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
source =0;
@@ -160,6 +160,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
void CartesianCommunicator::StencilBarrier(void){};
NAMESPACE_END(Grid);
}

View File

@@ -28,10 +28,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
// static data
int GlobalSharedMemory::HPEhypercube = 1;
uint64_t GlobalSharedMemory::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
int GlobalSharedMemory::Hugepages = 0;
int GlobalSharedMemory::_ShmSetup;
@@ -76,6 +77,7 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
assert(heap_bytes<heap_size);
}
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
return ptr;
}
void SharedMemory::ShmBufferFreeAll(void) {
@@ -84,9 +86,9 @@ void SharedMemory::ShmBufferFreeAll(void) {
}
void *SharedMemory::ShmBufferSelf(void)
{
//std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
return ShmCommBufs[ShmRank];
}
NAMESPACE_END(Grid);
}

View File

@@ -25,18 +25,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// TODO
// 1) move includes into SharedMemory.cc
//
// 2) split shared memory into a) optimal communicator creation from comm world
//
// b) shared memory buffers container
// -- static globally shared; init once
// -- per instance set of buffers.
//
#pragma once
#include <Grid/GridCore.h>
@@ -53,30 +41,33 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <sys/shm.h>
#include <sys/mman.h>
#include <zlib.h>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif
namespace Grid {
NAMESPACE_BEGIN(Grid);
#if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request CommsRequest_t;
typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request CommsRequest_t;
#else
typedef int CommsRequest_t;
typedef int Grid_MPI_Comm;
typedef int CommsRequest_t;
typedef int Grid_MPI_Comm;
#endif
class GlobalSharedMemory {
private:
private:
static const int MAXLOG2RANKSPERNODE = 16;
// Init once lock on the buffer allocation
static int _ShmSetup;
static int _ShmAlloc;
static uint64_t _ShmAllocBytes;
public:
public:
///////////////////////////////////////
// HPE 8600 hypercube optimisation
///////////////////////////////////////
static int HPEhypercube;
static int ShmSetup(void) { return _ShmSetup; }
static int ShmAlloc(void) { return _ShmAlloc; }
static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; }
@@ -102,14 +93,17 @@ class GlobalSharedMemory {
// Create an optimal reordered communicator that makes MPI_Cart_create get it right
//////////////////////////////////////////////////////////////////////////////////////
static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorHypercube(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorSharedMemory(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
///////////////////////////////////////////////////
// Provide shared memory facilities off comm world
///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,const void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes);
};
@@ -118,14 +112,14 @@ class GlobalSharedMemory {
//////////////////////////////
class SharedMemory
{
private:
private:
static const int MAXLOG2RANKSPERNODE = 16;
size_t heap_top;
size_t heap_bytes;
size_t heap_size;
protected:
protected:
Grid_MPI_Comm ShmComm; // for barriers
int ShmRank;
@@ -133,7 +127,7 @@ class SharedMemory
std::vector<void *> ShmCommBufs;
std::vector<int> ShmRanks;// Mapping comm ranks to Shm ranks
public:
public:
SharedMemory() {};
~SharedMemory();
///////////////////////////////////////////////////////////////////////////////////////
@@ -150,6 +144,7 @@ class SharedMemory
// Call on any instance
///////////////////////////////////////////////////
void SharedMemoryTest(void);
void *ShmBufferSelf(void);
void *ShmBuffer (int rank);
void *ShmBufferTranslate(int rank,void * local_p);
@@ -164,4 +159,5 @@ class SharedMemory
};
}
NAMESPACE_END(Grid);

View File

@@ -29,8 +29,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h>
#include <pwd.h>
namespace Grid {
#ifdef GRID_NVCC
#include <cuda_runtime_api.h>
#endif
NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: "
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
@@ -46,6 +50,11 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
if ( WorldRank == 0) {
std::cout << header " World communicator of size " <<WorldSize << std::endl;
std::cout << header " Node communicator of size " <<WorldShmSize << std::endl;
}
// WorldShmComm, WorldShmSize, WorldShmRank
// WorldNodes
@@ -130,7 +139,7 @@ int Log2Size(int TwoToPower,int MAXLOG2)
}
return log2size;
}
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
//////////////////////////////////////////////////////////////////////////////
// Look and see if it looks like an HPE 8600 based on hostname conventions
@@ -143,10 +152,39 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,
gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
if(nscan==3) OptimalCommunicatorHypercube(processors,optimal_comm);
else OptimalCommunicatorSharedMemory(processors,optimal_comm);
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
else OptimalCommunicatorSharedMemory(processors,optimal_comm);
}
void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
// Returns 1 when b is an exact integer multiple of a, and 0 otherwise.
// a must be non-zero.
static inline int divides(int a,int b)
{
  const int remainder = b % a;
  return ( remainder == 0 ) ? 1 : 0;
}
////////////////////////////////////////////////////////////////
// Distribute the WorldShmSize ranks of a node over the dimensions of
// WorldDims, writing the per-dimension node-local extent to ShmDims.
//
// WorldDims : processor grid extent in each dimension (input)
// ShmDims   : overwritten; on return the product of its entries equals
//             WorldShmSize and each entry divides the matching WorldDims entry
//
// Dimensions are visited round-robin; at each visit at most one prime
// factor is moved into that dimension. If a complete sweep over all
// dimensions makes no progress the factorisation is impossible and the
// routine reports the problem and exits instead of spinning forever.
////////////////////////////////////////////////////////////////
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
  ////////////////////////////////////////////////////////////////
  // Powers of 2,3,5 only in prime decomposition for now
  ////////////////////////////////////////////////////////////////
  int ndimension = WorldDims.size();
  ShmDims=Coordinate(ndimension,1);

  std::vector<int> primes({2,3,5});

  int dim = 0;
  int AutoShmSize = 1;
  int stalled = 0; // consecutive dimensions visited without placing a factor
  while(AutoShmSize != WorldShmSize) {

    // Every dimension has been tried against the current state without success
    // (also covers ndimension==0): no further progress is possible.
    if ( stalled >= ndimension ) {
      std::cerr << "GlobalSharedMemory::GetShmDims failed: cannot distribute "
                << WorldShmSize << " ranks per node over the processor grid"
                << " using prime factors 2,3,5 (placed " << AutoShmSize << ")" << std::endl;
      exit(EXIT_FAILURE);
    }

    bool placed = false;
    for(int prime : primes) {
      // The prime must divide both what is left of this dimension
      // and what is left of the node size still to be placed.
      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
        && divides(prime,WorldShmSize/AutoShmSize)  ) {
        AutoShmSize*=prime;
        ShmDims[dim]*=prime;
        placed = true;
        break;
      }
    }
    stalled = placed ? 0 : stalled+1;

    dim=(dim+1) %ndimension;
  }
}
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
@@ -188,9 +226,9 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &pr
}
std::string hname(name);
std::cout << "hostname "<<hname<<std::endl;
std::cout << "R " << R << " I " << I << " N "<< N
<< " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
// std::cout << "hostname "<<hname<<std::endl;
// std::cout << "R " << R << " I " << I << " N "<< N
// << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
//////////////////////////////////////////////////////////////////
// broadcast node 0's base coordinate for this partition.
@@ -212,16 +250,13 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &pr
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
int ndimension = processors.size();
std::vector<int> processor_coor(ndimension);
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
std::vector<int> HyperCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
Coordinate processor_coor(ndimension);
Coordinate WorldDims = processors;
Coordinate ShmDims (ndimension); Coordinate NodeDims (ndimension);
Coordinate ShmCoor (ndimension); Coordinate NodeCoor (ndimension); Coordinate WorldCoor(ndimension);
Coordinate HyperCoor(ndimension);
GetShmDims(WorldDims,ShmDims);
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
@@ -269,29 +304,18 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &pr
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
}
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1);
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
int ndimension = processors.size();
std::vector<int> processor_coor(ndimension);
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
Coordinate processor_coor(ndimension);
Coordinate WorldDims = processors; Coordinate ShmDims(ndimension); Coordinate NodeDims (ndimension);
Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension);
GetShmDims(WorldDims,ShmDims);
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
@@ -330,7 +354,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const std::vector<int>
#ifdef GRID_MPI3_SHMGET
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
@@ -389,10 +413,104 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_NVCC
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// GPU (GRID_NVCC) implementation of the node-wide communication buffer allocation.
//
// bytes : size of the device buffer each rank allocates
// flags : not used by this implementation
//
// Each rank cudaMalloc's and zeroes its own buffer, then the ranks of WorldShmComm take turns
// broadcasting a CUDA IPC handle for their buffer. Every other rank opens that handle, and the
// resulting pointers are recorded in WorldShmCommBufs[r]. Any CUDA failure on the allocation or
// IPC path is fatal: a message is written to std::cerr and the process exits.
// Must be called once, after Init (asserts _ShmSetup==1 and _ShmAlloc==0).
//////////////////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
  void * ShmCommBuf = nullptr; // initialised so the NULL check after cudaMalloc is meaningful
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);

  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the pointer array for shared windows for our group
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  MPI_Barrier(WorldShmComm);
  WorldShmCommBufs.resize(WorldShmSize);

  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // TODO/FIXME : NOT ALL NVLINK BOARDS have full Peer to peer connectivity.
  // The annoyance is that they have partial peer 2 peer. This occurs on the 8 GPU blades.
  // e.g. DGX1, supermicro board,
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  //  cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);

#ifdef GRID_IBM_SUMMIT
  // IBM Jsrun makes cuda Device numbering screwy and not match rank
  std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
#else
  std::cout << "setting device to WorldShmRank"<<std::endl;
  {
    // Report, but do not abort on, a failed device selection: the run then
    // continues on whatever device is current, as it did when the result was ignored.
    auto devErr = cudaSetDevice(WorldShmRank);
    if ( devErr != cudaSuccess ) {
      std::cerr << " SharedMemoryMPI.cc cudaSetDevice failed for device " << WorldShmRank <<" "<<cudaGetErrorString(devErr)<< std::endl;
    }
  }
#endif
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  auto err =  cudaMalloc(&ShmCommBuf, bytes);
  if ( err !=  cudaSuccess) {
    std::cerr << " SharedMemoryMPI.cc cudaMalloc failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
    exit(EXIT_FAILURE);
  }
  if (ShmCommBuf == nullptr ) {
    std::cerr << " SharedMemoryMPI.cc cudaMalloc failed NULL pointer for " << bytes<<" bytes " << std::endl;
    exit(EXIT_FAILURE);
  }
  if ( WorldRank == 0 ){
    std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);

  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Loop over ranks/gpu's on our node
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  for(int r=0;r<WorldShmSize;r++){

    //////////////////////////////////////////////////
    // If it is me, pass around the IPC access key
    //////////////////////////////////////////////////
    cudaIpcMemHandle_t handle;

    if ( r==WorldShmRank ) {
      err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
      if ( err !=  cudaSuccess) {
        std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
        exit(EXIT_FAILURE);
      }
    }
    //////////////////////////////////////////////////
    // Share this IPC handle across the Shm Comm
    //////////////////////////////////////////////////
    {
      // Rank r is the broadcast root; all other ranks receive its handle as raw bytes.
      int ierr=MPI_Bcast(&handle,
                         sizeof(handle),
                         MPI_BYTE,
                         r,
                         WorldShmComm);
      assert(ierr==0);
    }

    ///////////////////////////////////////////////////////////////
    // If I am not the source, overwrite thisBuf with remote buffer
    ///////////////////////////////////////////////////////////////
    void * thisBuf = ShmCommBuf;
    if ( r!=WorldShmRank ) {
      err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
      if ( err !=  cudaSuccess) {
        std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
        exit(EXIT_FAILURE);
      }
    }
    ///////////////////////////////////////////////////////////////
    // Save a copy of the device buffers
    ///////////////////////////////////////////////////////////////
    WorldShmCommBufs[r] = thisBuf;
  }

  _ShmAllocBytes=bytes;
  _ShmAlloc=1;
}
#else
#ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -429,7 +547,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
// std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
@@ -439,7 +557,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -486,7 +604,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm);
@@ -552,14 +670,31 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes = bytes;
}
#endif
#endif // End NVCC case for GPU device buffers
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////
// Routines accessing shared memory should route through these helpers for GPU safety
/////////////////////////////////////////////////////////////////////////
// Set `bytes` bytes starting at `dest` to zero.
// With GRID_NVCC the buffer is cleared through the CUDA runtime (cudaMemset);
// a CUDA failure is reported on std::cerr and the process exits, matching the
// handling of allocation failures. Otherwise the host routine bzero is used.
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
#ifdef GRID_NVCC
  auto err = cudaMemset(dest,0,bytes);
  if ( err != cudaSuccess ) {
    std::cerr << " SharedMemoryMPI.cc cudaMemset failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
    exit(EXIT_FAILURE);
  }
#else
  bzero(dest,bytes);
#endif
}
// Copy `bytes` bytes from `src` to `dest`.
// With GRID_NVCC the copy goes through the CUDA runtime using cudaMemcpyDefault,
// so the same call serves host and device pointers; a CUDA failure is reported on
// std::cerr and the process exits. Otherwise the host routine bcopy is used
// (note its source-first argument order).
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
{
#ifdef GRID_NVCC
  auto err = cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
  if ( err != cudaSuccess ) {
    std::cerr << " SharedMemoryMPI.cc cudaMemcpy failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
    exit(EXIT_FAILURE);
  }
#else
  bcopy(src,dest,bytes);
#endif
}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{
int rank, size;
@@ -587,7 +722,6 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< " wsr = "<<wsr<<std::endl;
}
ShmBufferFreeAll();
@@ -600,6 +734,26 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
#ifdef GRID_IBM_SUMMIT
// Hide the shared memory path between sockets
// if even number of nodes
if ( (ShmSize & 0x1)==0 ) {
int SocketSize = ShmSize/2;
int mySocket = ShmRank/SocketSize;
for(int r=0;r<size;r++){
int hisRank=ShmRanks[r];
if ( hisRank!= MPI_UNDEFINED ) {
int hisSocket=hisRank/SocketSize;
if ( hisSocket != mySocket ) {
ShmRanks[r] = MPI_UNDEFINED;
}
}
}
}
#endif
SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier
@@ -614,24 +768,26 @@ void SharedMemory::ShmBarrier(void)
void SharedMemory::SharedMemoryTest(void)
{
ShmBarrier();
uint64_t check[3];
uint64_t magic = 0x5A5A5A;
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
check[0] = GlobalSharedMemory::WorldNode;
check[1] = r;
check[2] = 0x5A5A5A;
for(uint64_t r=0;r<ShmSize;r++){
check[0]=GlobalSharedMemory::WorldNode;
check[1]=r;
check[2]=magic;
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
}
}
ShmBarrier();
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
for(uint64_t r=0;r<ShmSize;r++){
ShmBarrier();
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==0x5A5A5A);
assert(check[2]==magic);
ShmBarrier();
}
ShmBarrier();
}
void *SharedMemory::ShmBuffer(int rank)
@@ -645,7 +801,6 @@ void *SharedMemory::ShmBuffer(int rank)
}
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
static int count =0;
int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self
if (gpeer == MPI_UNDEFINED){
@@ -664,4 +819,5 @@ SharedMemory::~SharedMemory()
}
};
}
NAMESPACE_END(Grid);

View File

@@ -28,7 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
@@ -47,7 +47,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
_ShmSetup=1;
}
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
optimal_comm = WorldComm;
}
@@ -84,10 +84,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAlloc=1;
};
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{
assert(GlobalSharedMemory::ShmAlloc()==1);
@@ -125,4 +125,5 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
SharedMemory::~SharedMemory()
{};
}
NAMESPACE_END(Grid);

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,8 +23,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef _GRID_CSHIFT_H_
#define _GRID_CSHIFT_H_

View File

@@ -25,10 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef _GRID_CSHIFT_COMMON_H_
#define _GRID_CSHIFT_COMMON_H_
#pragma once
namespace Grid {
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
@@ -36,20 +35,21 @@ namespace Grid {
template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{
int rd = rhs._grid->_rdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask = 0x3;
}
int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int so=plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs.Grid()->_slice_block[dimension];
int ent = 0;
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
static Vector<std::pair<int,int> > table; table.resize(e1*e2);
int stride=rhs.Grid()->_slice_stride[dimension];
int stride=rhs._grid->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
@@ -63,66 +63,68 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb &cbmask ) {
table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
}
}
}
}
parallel_for(int i=0;i<ent;i++){
buffer[table[i].first]=rhs._odata[table[i].second];
}
thread_for(i,ent,{
buffer[table[i].first]=rhs_v[table[i].second];
});
}
///////////////////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split
///////////////////////////////////////////////////////////////////
template<class vobj> void
Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
Gather_plane_extract(const Lattice<vobj> &rhs,
ExtractPointerArray<typename vobj::scalar_object> pointers,
int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask = 0x3;
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int n1=rhs._grid->_slice_stride[dimension];
int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs.Grid()->_slice_block[dimension];
int n1=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask ==0x3){
parallel_for_nest2(int n=0;n<e1;n++){
thread_for_collapse(2,n,e1,{
for(int b=0;b<e2;b++){
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs._odata[so+o+b];
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
}
});
} else {
// Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code.
std::cout << " Dense packed buffer WARNING " <<std::endl;
parallel_for_nest2(int n=0;n<e1;n++){
thread_for_collapse(2,n,e1,{
for(int b=0;b<e2;b++){
int o=n*n1;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs._odata[so+o+b];
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
}
}
});
}
}
@@ -131,17 +133,17 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_
//////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3;
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int stride=rhs._grid->_slice_stride[dimension];
int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs.Grid()->_slice_block[dimension];
int stride=rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent =0;
@@ -150,8 +152,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
int o =n*rhs.Grid()->_slice_stride[dimension];
int bo =n*rhs.Grid()->_slice_block[dimension];
table[ent++] = std::pair<int,int>(so+o+b,bo+b);
}
}
@@ -160,8 +162,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
int o =n*rhs.Grid()->_slice_stride[dimension];
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
table[ent++]=std::pair<int,int> (so+o+b,bo++);
}
@@ -169,48 +171,51 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
}
}
parallel_for(int i=0;i<ent;i++){
rhs._odata[table[i].first]=buffer[table[i].second];
}
auto rhs_v = rhs.View();
thread_for(i,ent,{
rhs_v[table[i].first]=buffer[table[i].second];
});
}
//////////////////////////////////////////////////////
// Scatter for when there *is* need to SIMD split
//////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerArray<typename vobj::scalar_object> pointers,int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3;
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs.Grid()->_slice_block[dimension];
if(cbmask ==0x3 ) {
parallel_for_nest2(int n=0;n<e1;n++){
auto rhs_v = rhs.View();
thread_for_collapse(2,n,e1,{
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
merge(rhs._odata[so+o+b],pointers,offset);
int o = n*rhs.Grid()->_slice_stride[dimension];
int offset = b+n*rhs.Grid()->_slice_block[dimension];
merge(rhs_v[so+o+b],pointers,offset);
}
}
});
} else {
// Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code.
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
auto rhs_v = rhs.View();
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
int o = n*rhs.Grid()->_slice_stride[dimension];
int offset = b+n*rhs.Grid()->_slice_block[dimension];
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
merge(rhs._odata[so+o+b],pointers,offset);
merge(rhs_v[so+o+b],pointers,offset);
}
}
}
@@ -222,18 +227,18 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typ
//////////////////////////////////////////////////////
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3;
}
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
int ro = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs._grid->_slice_block[dimension];
int stride = rhs._grid->_slice_stride[dimension];
int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs.Grid()->_slice_block[dimension];
int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent=0;
@@ -248,7 +253,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
@@ -256,32 +261,33 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
}
}
parallel_for(int i=0;i<ent;i++){
lhs._odata[table[i].first]=rhs._odata[table[i].second];
}
auto rhs_v = rhs.View();
auto lhs_v = lhs.View();
thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
}
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{
int rd = rhs._grid->_rdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3;
}
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
int ro = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block [dimension];
int stride = rhs._grid->_slice_stride[dimension];
int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs.Grid()->_slice_block [dimension];
int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent=0;
double t_tab,t_perm;
if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
@@ -292,14 +298,16 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
}
parallel_for(int i=0;i<ent;i++){
permute(lhs._odata[table[i].first],rhs._odata[table[i].second],permute_type);
}
auto rhs_v = rhs.View();
auto lhs_v = lhs.View();
thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
}
//////////////////////////////////////////////////////
@@ -309,10 +317,8 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
double t_local;
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Cshift_local(ret,rhs,dimension,shift,0x3);
@@ -324,7 +330,7 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid = rhs._grid;
GridBase *grid = rhs.Grid();
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
@@ -335,18 +341,18 @@ template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &r
shift = (shift+fd)%fd;
// the permute type
ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
ret.Checkerboard() = grid->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
int permute_dim =grid->PermuteDim(dimension);
int permute_type=grid->PermuteType(dimension);
int permute_type_dist;
for(int x=0;x<rd;x++){
int o = 0;
// int o = 0;
int bo = x * grid->_ostride[dimension];
int cb= (cbmask==0x2)? Odd : Even;
int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
int sshift = grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
int sx = (x+sshift)%rd;
// wrap is whether sshift > rd.
@@ -387,5 +393,5 @@ template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &r
}
}
}
#endif
NAMESPACE_END(Grid);

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -24,33 +24,33 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef _GRID_CSHIFT_MPI_H_
#define _GRID_CSHIFT_MPI_H_
namespace Grid {
NAMESPACE_BEGIN(Grid);
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
Lattice<vobj> ret(rhs._grid);
Lattice<vobj> ret(rhs.Grid());
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
// the permute type
int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ;
int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
int splice_dim = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim);
if ( !comm_dim ) {
@@ -70,10 +70,10 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
// std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
// std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
// std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift,0x3);
@@ -88,8 +88,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
@@ -107,25 +107,25 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs._grid;
Lattice<vobj> temp(rhs._grid);
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
int pd = rhs._grid->_processors[dimension];
int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ;
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
commVector<vobj> send_buf(buffer_size);
commVector<vobj> recv_buf(buffer_size);
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
@@ -145,7 +145,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
int rank = grid->_processor;
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
@@ -165,7 +165,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs._grid;
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
@@ -193,21 +193,21 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
// int words = sizeof(vobj)/sizeof(vector_type);
std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
int bytes = buffer_size*sizeof(scalar_object);
std::vector<scalar_object *> pointers(Nsimd); //
std::vector<scalar_object *> rpointers(Nsimd); // received pointers
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
@@ -257,6 +257,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
}
}
}
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,17 +23,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef _GRID_CSHIFT_NONE_H_
#define _GRID_CSHIFT_NONE_H_
namespace Grid {
NAMESPACE_BEGIN(Grid);
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
Lattice<vobj> ret(rhs.Grid());
ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
Cshift_local(ret,rhs,dimension,shift);
return ret;
}
}
NAMESPACE_END(Grid);
#endif

View File

@@ -1,3 +1,4 @@
#ifndef __NVCC__
/*
__ _____ _____ _____
__| | __| | | | JSON for Modern C++
@@ -18918,3 +18919,4 @@ inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std
#endif
#endif

View File

@@ -25,9 +25,22 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_H
#define GRID_LATTICE_H
#pragma once
#include <Grid/lattice/Lattice_base.h>
#include <Grid/lattice/Lattice_conformable.h>
#include <Grid/lattice/Lattice_ET.h>
#include <Grid/lattice/Lattice_arith.h>
#include <Grid/lattice/Lattice_trace.h>
#include <Grid/lattice/Lattice_transpose.h>
#include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_peekpoke.h>
#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_comparison_utils.h>
#include <Grid/lattice/Lattice_comparison.h>
#include <Grid/lattice/Lattice_coordinate.h>
//#include <Grid/lattice/Lattice_where.h>
#include <Grid/lattice/Lattice_rng.h>
#include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h>
#endif

View File

@@ -27,7 +27,7 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
/* END LEGAL */
#ifndef GRID_LATTICE_ET_H
#define GRID_LATTICE_ET_H
@@ -36,13 +36,13 @@ directory
#include <typeinfo>
#include <vector>
namespace Grid {
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////
// Predicated where support
////////////////////////////////////////////////////
template <class iobj, class vobj, class robj>
inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
const robj &iffalse) {
typename std::remove_const<vobj>::type ret;
@@ -51,11 +51,10 @@ inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd();
const int words = sizeof(vobj) / sizeof(vector_type);
std::vector<Integer> mask(Nsimd);
std::vector<scalar_object> truevals(Nsimd);
std::vector<scalar_object> falsevals(Nsimd);
ExtractBuffer<Integer> mask(Nsimd);
ExtractBuffer<scalar_object> truevals(Nsimd);
ExtractBuffer<scalar_object> falsevals(Nsimd);
extract(iftrue, truevals);
extract(iffalse, falsevals);
@@ -69,158 +68,148 @@ inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
return ret;
}
////////////////////////////////////////////
// recursive evaluation of expressions; Could
// switch to generic approach with variadics, a la
// Antonin's Lat Sim but the repack to variadic with popped
// from tuple is hideous; C++14 introduces std::make_index_sequence for this
////////////////////////////////////////////
// leaf eval of lattice ; should enable if protect using traits
template <typename T>
using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T>
using is_lattice_expr = std::is_base_of<LatticeExpressionBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
/////////////////////////////////////////////////////
//Specialization of getVectorType for lattices
/////////////////////////////////////////////////////
template<typename T>
struct getVectorType<Lattice<T> >{
typedef typename Lattice<T>::vector_object type;
};
template<class sobj>
inline sobj eval(const unsigned int ss, const sobj &arg)
////////////////////////////////////////////
//-- recursive evaluation of expressions; --
// handle leaves of syntax tree
///////////////////////////////////////////////////
template<class sobj> accelerator_inline
sobj eval(const uint64_t ss, const sobj &arg)
{
return arg;
}
template <class lobj>
inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) {
return arg._odata[ss];
template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
{
return arg[ss];
}
template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
{
auto view = arg.View();
return view[ss];
}
// handle nodes in syntax tree
template <typename Op, typename T1>
auto inline eval(
const unsigned int ss,
const LatticeUnaryExpression<Op, T1> &expr) // eval one operand
-> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) {
return expr.first.func(eval(ss, std::get<0>(expr.second)));
///////////////////////////////////////////////////
// handle nodes in syntax tree- eval one operand
///////////////////////////////////////////////////
template <typename Op, typename T1> accelerator_inline
auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
-> decltype(expr.op.func( eval(ss, expr.arg1)))
{
return expr.op.func( eval(ss, expr.arg1) );
}
template <typename Op, typename T1, typename T2>
auto inline eval(
const unsigned int ss,
const LatticeBinaryExpression<Op, T1, T2> &expr) // eval two operands
-> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)))) {
return expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)));
///////////////////////
// eval two operands
///////////////////////
template <typename Op, typename T1, typename T2> accelerator_inline
auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
-> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
{
return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
}
template <typename Op, typename T1, typename T2, typename T3>
auto inline eval(const unsigned int ss,
const LatticeTrinaryExpression<Op, T1, T2, T3>
&expr) // eval three operands
-> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)),
eval(ss, std::get<2>(expr.second)))) {
return expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)),
eval(ss, std::get<2>(expr.second)));
///////////////////////
// eval three operands
///////////////////////
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)))
{
return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3));
}
//////////////////////////////////////////////////////////////////////////
// Obtain the grid from an expression, ensuring conformable. This must follow a
// tree recursion
// tree recursion; must retain grid pointer in the LatticeView class which sucks
// Use a different method, and make it void *.
// Perhaps a conformable method.
//////////////////////////////////////////////////////////////////////////
template <class T1,
typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void GridFromExpression(GridBase *&grid, const T1 &lat) // Lattice leaf
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
accelerator_inline void GridFromExpression(GridBase *&grid, const T1 &lat) // Lattice leaf
{
if (grid) {
conformable(grid, lat._grid);
}
grid = lat._grid;
lat.Conformable(grid);
}
template <class T1,
typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void GridFromExpression(GridBase *&grid,
const T1 &notlat) // non-lattice leaf
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
accelerator_inline
void GridFromExpression(GridBase *&grid,const T1 &notlat) // non-lattice leaf
{}
template <typename Op, typename T1>
inline void GridFromExpression(GridBase *&grid,
const LatticeUnaryExpression<Op, T1> &expr) {
GridFromExpression(grid, std::get<0>(expr.second)); // recurse
accelerator_inline
void GridFromExpression(GridBase *&grid,const LatticeUnaryExpression<Op, T1> &expr)
{
GridFromExpression(grid, expr.arg1); // recurse
}
template <typename Op, typename T1, typename T2>
inline void GridFromExpression(
GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) {
GridFromExpression(grid, std::get<0>(expr.second)); // recurse
GridFromExpression(grid, std::get<1>(expr.second));
accelerator_inline
void GridFromExpression(GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr)
{
GridFromExpression(grid, expr.arg1); // recurse
GridFromExpression(grid, expr.arg2);
}
template <typename Op, typename T1, typename T2, typename T3>
inline void GridFromExpression(
GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
GridFromExpression(grid, std::get<0>(expr.second)); // recurse
GridFromExpression(grid, std::get<1>(expr.second));
GridFromExpression(grid, std::get<2>(expr.second));
accelerator_inline
void GridFromExpression(GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
{
GridFromExpression(grid, expr.arg1); // recurse
GridFromExpression(grid, expr.arg2); // recurse
GridFromExpression(grid, expr.arg3); // recurse
}
//////////////////////////////////////////////////////////////////////////
// Obtain the CB from an expression, ensuring conformable. This must follow a
// tree recursion
//////////////////////////////////////////////////////////////////////////
template <class T1,
typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
{
if ((cb == Odd) || (cb == Even)) {
assert(cb == lat.checkerboard);
assert(cb == lat.Checkerboard());
}
cb = lat.checkerboard;
// std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
cb = lat.Checkerboard();
}
template <class T1,
typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf
{
// std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
}
template <typename Op, typename T1>
inline void CBFromExpression(int &cb,
const LatticeUnaryExpression<Op, T1> &expr) {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse
// std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
}
template <typename Op, typename T1, typename T2>
inline void CBFromExpression(int &cb,
const LatticeBinaryExpression<Op, T1, T2> &expr) {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse
CBFromExpression(cb, std::get<1>(expr.second));
// std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
template <typename Op, typename T1> inline
void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
{
CBFromExpression(cb, expr.arg1); // recurse AST
}
template <typename Op, typename T1, typename T2> inline
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
{
CBFromExpression(cb, expr.arg1); // recurse AST
CBFromExpression(cb, expr.arg2); // recurse AST
}
template <typename Op, typename T1, typename T2, typename T3>
inline void CBFromExpression(
int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse
CBFromExpression(cb, std::get<1>(expr.second));
CBFromExpression(cb, std::get<2>(expr.second));
// std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl;
inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
{
CBFromExpression(cb, expr.arg1); // recurse AST
CBFromExpression(cb, expr.arg2); // recurse AST
CBFromExpression(cb, expr.arg3); // recurse AST
}
////////////////////////////////////////////
// Unary operators and funcs
////////////////////////////////////////////
#define GridUnopClass(name, ret) \
template <class arg> \
struct name { \
static auto inline func(const arg a) -> decltype(ret) { return ret; } \
#define GridUnopClass(name, ret) \
template <class arg> \
struct name { \
static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \
};
GridUnopClass(UnarySub, -a);
@@ -250,19 +239,21 @@ GridUnopClass(UnaryExp, exp(a));
////////////////////////////////////////////
// Binary operators
////////////////////////////////////////////
#define GridBinOpClass(name, combination) \
template <class left, class right> \
struct name { \
static auto inline func(const left &lhs, const right &rhs) \
-> decltype(combination) const { \
return combination; \
} \
}
#define GridBinOpClass(name, combination) \
template <class left, class right> \
struct name { \
static auto accelerator_inline \
func(const left &lhs, const right &rhs) \
-> decltype(combination) const \
{ \
return combination; \
} \
};
GridBinOpClass(BinaryAdd, lhs + rhs);
GridBinOpClass(BinarySub, lhs - rhs);
GridBinOpClass(BinaryMul, lhs *rhs);
GridBinOpClass(BinaryDiv, lhs /rhs);
GridBinOpClass(BinaryAnd, lhs &rhs);
GridBinOpClass(BinaryOr, lhs | rhs);
GridBinOpClass(BinaryAndAnd, lhs &&rhs);
@@ -271,92 +262,71 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
////////////////////////////////////////////////////
// Trinary conditional op
////////////////////////////////////////////////////
#define GridTrinOpClass(name, combination) \
template <class predicate, class left, class right> \
struct name { \
static auto inline func(const predicate &pred, const left &lhs, \
const right &rhs) -> decltype(combination) const { \
return combination; \
} \
}
#define GridTrinOpClass(name, combination) \
template <class predicate, class left, class right> \
struct name { \
static auto accelerator_inline \
func(const predicate &pred, const left &lhs, const right &rhs) \
-> decltype(combination) const \
{ \
return combination; \
} \
};
GridTrinOpClass(
TrinaryWhere,
(predicatedWhere<predicate, typename std::remove_reference<left>::type,
typename std::remove_reference<right>::type>(pred, lhs,
rhs)));
GridTrinOpClass(TrinaryWhere,
(predicatedWhere<predicate,
typename std::remove_reference<left>::type,
typename std::remove_reference<right>::type>(pred, lhs,rhs)));
////////////////////////////////////////////
// Operator syntactical glue
////////////////////////////////////////////
#define GRID_UNOP(name) name<decltype(eval(0, arg))>
#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_TRINOP(name) \
name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_UNOP(name) name<decltype(eval(0, arg))>
#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_DEF_UNOP(op, name) \
template <typename T1, \
typename std::enable_if<is_lattice<T1>::value || \
is_lattice_expr<T1>::value, \
T1>::type * = nullptr> \
inline auto op(const T1 &arg) \
->decltype(LatticeUnaryExpression<GRID_UNOP(name), const T1 &>( \
std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \
return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>( \
std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg))); \
#define GRID_DEF_UNOP(op, name) \
template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
inline auto op(const T1 &arg) ->decltype(LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg)) \
{ \
return LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg); \
}
#define GRID_BINOP_LEFT(op, name) \
template <typename T1, typename T2, \
typename std::enable_if<is_lattice<T1>::value || \
is_lattice_expr<T1>::value, \
T1>::type * = nullptr> \
inline auto op(const T1 &lhs, const T2 &rhs) \
->decltype( \
LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), \
std::forward_as_tuple(lhs, rhs)))) { \
return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
#define GRID_BINOP_LEFT(op, name) \
template <typename T1, typename T2, \
typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
inline auto op(const T1 &lhs, const T2 &rhs) \
->decltype(LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs,rhs)) \
{ \
return LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs,rhs);\
}
#define GRID_BINOP_RIGHT(op, name) \
template <typename T1, typename T2, \
typename std::enable_if<!is_lattice<T1>::value && \
!is_lattice_expr<T1>::value, \
T1>::type * = nullptr, \
typename std::enable_if<is_lattice<T2>::value || \
is_lattice_expr<T2>::value, \
T2>::type * = nullptr> \
inline auto op(const T1 &lhs, const T2 &rhs) \
->decltype( \
LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), \
std::forward_as_tuple(lhs, rhs)))) { \
return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
#define GRID_BINOP_RIGHT(op, name) \
template <typename T1, typename T2, \
typename std::enable_if<!is_lattice<T1>::value&&!is_lattice_expr<T1>::value,T1>::type * = nullptr, \
typename std::enable_if< is_lattice<T2>::value|| is_lattice_expr<T2>::value,T2>::type * = nullptr> \
inline auto op(const T1 &lhs, const T2 &rhs) \
->decltype(LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs, rhs)) \
{ \
return LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs, rhs); \
}
#define GRID_DEF_BINOP(op, name) \
GRID_BINOP_LEFT(op, name); \
#define GRID_DEF_BINOP(op, name) \
GRID_BINOP_LEFT(op, name); \
GRID_BINOP_RIGHT(op, name);
#define GRID_DEF_TRINOP(op, name) \
template <typename T1, typename T2, typename T3> \
inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs) \
->decltype( \
LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
const T3 &>(std::make_pair( \
GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) { \
return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
const T3 &>(std::make_pair( \
GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs))); \
#define GRID_DEF_TRINOP(op, name) \
template <typename T1, typename T2, typename T3> \
inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs) \
->decltype(LatticeTrinaryExpression<GRID_TRINOP(name),T1,T2,T3>(GRID_TRINOP(name)(),pred, lhs, rhs)) \
{ \
return LatticeTrinaryExpression<GRID_TRINOP(name),T1,T2,T3>(GRID_TRINOP(name)(),pred, lhs, rhs); \
}
////////////////////////
// Operator definitions
////////////////////////
GRID_DEF_UNOP(operator-, UnarySub);
GRID_DEF_UNOP(Not, UnaryNot);
GRID_DEF_UNOP(operator!, UnaryNot);
@@ -400,29 +370,27 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
/////////////////////////////////////////////////////////////
template <class Op, class T1>
auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> {
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> ret(
expr);
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
{
Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr);
return ret;
}
template <class Op, class T1, class T2>
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0, std::get<1>(expr.second))))> {
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0, std::get<1>(expr.second))))>
ret(expr);
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
{
Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr);
return ret;
}
template <class Op, class T1, class T2, class T3>
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0, std::get<1>(expr.second)),
eval(0, std::get<2>(expr.second))))> {
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0, std::get<1>(expr.second)),
eval(0, std::get<2>(expr.second))))>
ret(expr);
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
eval(0, expr.arg2),
eval(0, expr.arg3)))>
{
Lattice<decltype(expr.op.func(eval(0, expr.arg1),
eval(0, expr.arg2),
eval(0, expr.arg3)))> ret(expr);
return ret;
}
@@ -433,34 +401,7 @@ auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
#undef GRID_DEF_UNOP
#undef GRID_DEF_BINOP
#undef GRID_DEF_TRINOP
}
#if 0
using namespace Grid;
int main(int argc,char **argv){
Lattice<double> v1(16);
Lattice<double> v2(16);
Lattice<double> v3(16);
BinaryAdd<double,double> tmp;
LatticeBinaryExpression<BinaryAdd<double,double>,Lattice<double> &,Lattice<double> &>
expr(std::make_pair(tmp,
std::forward_as_tuple(v1,v2)));
tmp.func(eval(0,v1),eval(0,v2));
auto var = v1+v2;
std::cout<<GridLogMessage<<typeid(var).name()<<std::endl;
v3=v1+v2;
v3=v1+v2+v1*v2;
};
void testit(Lattice<double> &v1,Lattice<double> &v2,Lattice<double> &v3)
{
v3=v1+v2+v1*v2;
}
#endif
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,233 +23,235 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_ARITH_H
#define GRID_LATTICE_ARITH_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
// ret = lhs (op mult) rhs, applied outer-site by outer-site directly into ret.
// ret adopts lhs's checkerboard before the conformability checks are made.
template<class obj1,class obj2,class obj3> strong_inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.checkerboard = lhs.checkerboard;
  conformable(ret,rhs);
  conformable(lhs,rhs);
  parallel_for(int site=0;site<lhs._grid->oSites();site++){
#ifndef STREAMING_STORES
    mult(&ret._odata[site],&lhs._odata[site],&rhs._odata[site]);
#else
    // Form the site result in a local and hand it to vstream for the store.
    obj1 site_result;
    mult(&site_result,&lhs._odata[site],&rhs._odata[site]);
    vstream(ret._odata[site],site_result);
#endif
  }
}
// Multiply-accumulate over all outer sites: the site-level mac() is applied
// with ret's site as the accumulator.
// ret adopts lhs's checkerboard before the conformability checks are made.
//
// Fix: the streaming-store branch used to accumulate into a freshly declared
// local, so ret's previous contents never entered the result, unlike the
// non-streaming branch which accumulates in place. The local is now seeded
// from ret so both branches compute the same thing.
template<class obj1,class obj2,class obj3> strong_inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.checkerboard = lhs.checkerboard;
  conformable(ret,rhs);
  conformable(lhs,rhs);
  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
    obj1 tmp = ret._odata[ss];   // start from the current accumulator value
    mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
    vstream(ret._odata[ss],tmp);
#else
    mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
#endif
  }
}
// ret = lhs (op sub) rhs, applied outer-site by outer-site directly into ret.
// ret adopts lhs's checkerboard before the conformability checks are made.
template<class obj1,class obj2,class obj3> strong_inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.checkerboard = lhs.checkerboard;
  conformable(ret,rhs);
  conformable(lhs,rhs);
  parallel_for(int site=0;site<lhs._grid->oSites();site++){
#ifndef STREAMING_STORES
    sub(&ret._odata[site],&lhs._odata[site],&rhs._odata[site]);
#else
    // Form the site result in a local and hand it to vstream for the store.
    obj1 site_result;
    sub(&site_result,&lhs._odata[site],&rhs._odata[site]);
    vstream(ret._odata[site],site_result);
#endif
  }
}
// ret = lhs (op add) rhs, applied outer-site by outer-site directly into ret.
// ret adopts lhs's checkerboard before the conformability checks are made.
template<class obj1,class obj2,class obj3> strong_inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.checkerboard = lhs.checkerboard;
  conformable(ret,rhs);
  conformable(lhs,rhs);
  parallel_for(int site=0;site<lhs._grid->oSites();site++){
#ifndef STREAMING_STORES
    add(&ret._odata[site],&lhs._odata[site],&rhs._odata[site]);
#else
    // Form the site result in a local and hand it to vstream for the store.
    obj1 site_result;
    add(&site_result,&lhs._odata[site],&rhs._odata[site]);
    vstream(ret._odata[site],site_result);
#endif
  }
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Lattice (op mult) single object: the same rhs object is combined with every
// outer site of lhs. The result is always built in a local and stored with
// vstream, regardless of STREAMING_STORES.
template<class obj1,class obj2,class obj3> strong_inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.checkerboard = lhs.checkerboard;
  conformable(lhs,ret);
  parallel_for(int site=0;site<lhs._grid->oSites();site++){
    obj1 site_result;
    mult(&site_result,&lhs._odata[site],&rhs);
    vstream(ret._odata[site],site_result);
  }
}
// Multiply-accumulate of a lattice with a single object: the site-level mac()
// is applied for every outer site with ret's site as the accumulator.
//
// Fix: the accumulator local used to be declared without being seeded from
// ret, so ret's previous contents never entered the result. It now starts
// from ret's current site value before mac() is applied.
template<class obj1,class obj2,class obj3> strong_inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.checkerboard = lhs.checkerboard;
  conformable(ret,lhs);
  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
    obj1 tmp = ret._odata[ss];   // start from the current accumulator value
    mac(&tmp,&lhs._odata[ss],&rhs);
    vstream(ret._odata[ss],tmp);
  }
}
// Lattice (op sub) single object: the same rhs object is combined with every
// outer site of lhs; ret adopts lhs's checkerboard.
template<class obj1,class obj2,class obj3> strong_inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.checkerboard = lhs.checkerboard;
  conformable(ret,lhs);
  parallel_for(int site=0;site<lhs._grid->oSites();site++){
#ifndef STREAMING_STORES
    sub(&ret._odata[site],&lhs._odata[site],&rhs);
#else
    // Form the site result in a local and hand it to vstream for the store.
    obj1 site_result;
    sub(&site_result,&lhs._odata[site],&rhs);
    vstream(ret._odata[site],site_result);
#endif
  }
}
// Lattice (op add) single object: the same rhs object is combined with every
// outer site of lhs; ret adopts lhs's checkerboard.
template<class obj1,class obj2,class obj3> strong_inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.checkerboard = lhs.checkerboard;
  conformable(lhs,ret);
  parallel_for(int site=0;site<lhs._grid->oSites();site++){
#ifndef STREAMING_STORES
    add(&ret._odata[site],&lhs._odata[site],&rhs);
#else
    // Form the site result in a local and hand it to vstream for the store.
    obj1 site_result;
    add(&site_result,&lhs._odata[site],&rhs);
    vstream(ret._odata[site],site_result);
#endif
  }
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Single object (op mult) lattice: the same lhs object is combined with every
// outer site of rhs; ret adopts rhs's checkerboard.
template<class obj1,class obj2,class obj3> strong_inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.checkerboard = rhs.checkerboard;
  conformable(ret,rhs);
  parallel_for(int site=0;site<rhs._grid->oSites();site++){
#ifndef STREAMING_STORES
    mult(&ret._odata[site],&lhs,&rhs._odata[site]);
#else
    // Form the site result in a local and hand it to vstream for the store.
    obj1 site_result;
    mult(&site_result,&lhs,&rhs._odata[site]);
    vstream(ret._odata[site],site_result);
#endif
  }
}
// Multiply-accumulate of a single object with a lattice: the site-level mac()
// is applied for every outer site of rhs with ret's site as the accumulator.
// ret adopts rhs's checkerboard.
//
// Fix: the streaming-store branch used to accumulate into a freshly declared
// local, so ret's previous contents never entered the result, unlike the
// non-streaming branch which accumulates in place. The local is now seeded
// from ret so both branches compute the same thing.
template<class obj1,class obj2,class obj3> strong_inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.checkerboard = rhs.checkerboard;
  conformable(ret,rhs);
  parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
    obj1 tmp = ret._odata[ss];   // start from the current accumulator value
    mac(&tmp,&lhs,&rhs._odata[ss]);
    vstream(ret._odata[ss],tmp);
#else
    mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
#endif
  }
}
// Single object (op sub) lattice: the same lhs object is combined with every
// outer site of rhs; ret adopts rhs's checkerboard.
template<class obj1,class obj2,class obj3> strong_inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.checkerboard = rhs.checkerboard;
  conformable(ret,rhs);
  parallel_for(int site=0;site<rhs._grid->oSites();site++){
#ifndef STREAMING_STORES
    sub(&ret._odata[site],&lhs,&rhs._odata[site]);
#else
    // Form the site result in a local and hand it to vstream for the store.
    obj1 site_result;
    sub(&site_result,&lhs,&rhs._odata[site]);
    vstream(ret._odata[site],site_result);
#endif
  }
}
// Single object (op add) lattice: the same lhs object is combined with every
// outer site of rhs; ret adopts rhs's checkerboard.
template<class obj1,class obj2,class obj3> strong_inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.checkerboard = rhs.checkerboard;
  conformable(ret,rhs);
  parallel_for(int site=0;site<rhs._grid->oSites();site++){
#ifndef STREAMING_STORES
    add(&ret._odata[site],&lhs,&rhs._odata[site]);
#else
    // Form the site result in a local and hand it to vstream for the store.
    obj1 site_result;
    add(&site_result,&lhs,&rhs._odata[site]);
    vstream(ret._odata[site],site_result);
#endif
  }
}
// ret = a*x + y evaluated per outer site; ret adopts x's checkerboard.
template<class sobj,class vobj> strong_inline
void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
  ret.checkerboard = x.checkerboard;
  conformable(ret,x);
  conformable(x,y);
  parallel_for(int site=0;site<x._grid->oSites();site++){
#ifndef STREAMING_STORES
    ret._odata[site]=a*x._odata[site]+y._odata[site];
#else
    // Evaluate into a local first, then store through vstream.
    vobj site_result = a*x._odata[site]+y._odata[site];
    vstream(ret._odata[site],site_result);
#endif
  }
}
// ret = a*x + b*y evaluated per outer site; ret adopts x's checkerboard.
template<class sobj,class vobj> strong_inline
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
  ret.checkerboard = x.checkerboard;
  conformable(ret,x);
  conformable(x,y);
  parallel_for(int site=0;site<x._grid->oSites();site++){
#ifndef STREAMING_STORES
    ret._odata[site]=a*x._odata[site]+b*y._odata[site];
#else
    // Evaluate into a local first, then store through vstream.
    vobj site_result = a*x._odata[site]+b*y._odata[site];
    vstream(ret._odata[site],site_result);
#endif
  }
}
// Forwards to axpy_norm_fast with the same arguments and returns its result.
template<class sobj,class vobj> strong_inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
  const RealD nrm = axpy_norm_fast(ret,a,x,y);
  return nrm;
}
// Forwards to axpby_norm_fast with the same arguments and returns its result.
template<class sobj,class vobj> strong_inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
  const RealD nrm = axpby_norm_fast(ret,a,b,x,y);
  return nrm;
}
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Lattice (op mult) lattice through views: each site is read with the view's
// operator(), combined with the site-level mult(), and stored with
// coalescedWrite. ret adopts lhs's checkerboard before the conformability
// checks are made.
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
  auto out_view   = ret.View();
  auto left_view  = lhs.View();
  auto right_view = rhs.View();
  accelerator_for(site,left_view.size(),obj1::Nsimd(),{
    auto left_site  = left_view(site);
    auto right_site = right_view(site);
    decltype(coalescedRead(obj1())) site_result;
    mult(&site_result,&left_site,&right_site);
    coalescedWrite(out_view[site],site_result);
  });
}
// Multiply-accumulate through views: the site-level mac() is applied for
// every site with ret's site value as the accumulator, and the result is
// stored with coalescedWrite. ret adopts lhs's checkerboard before the
// conformability checks are made.
//
// Fix: the accumulator local used to be declared without being seeded from
// ret, so ret's previous contents never entered the result. It is now read
// from ret's view before mac() is applied.
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
  auto ret_v = ret.View();
  auto lhs_v = lhs.View();
  auto rhs_v = rhs.View();
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    auto lhs_t=lhs_v(ss);
    auto rhs_t=rhs_v(ss);
    auto tmp  =ret_v(ss);   // start from the current accumulator value
    mac(&tmp,&lhs_t,&rhs_t);
    coalescedWrite(ret_v[ss],tmp);
  });
}
// Lattice (op sub) lattice through views: each site is read with the view's
// operator(), combined with the site-level sub(), and stored with
// coalescedWrite. ret adopts lhs's checkerboard before the conformability
// checks are made.
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
  auto out_view   = ret.View();
  auto left_view  = lhs.View();
  auto right_view = rhs.View();
  accelerator_for(site,left_view.size(),obj1::Nsimd(),{
    auto left_site  = left_view(site);
    auto right_site = right_view(site);
    decltype(coalescedRead(obj1())) site_result;
    sub(&site_result,&left_site,&right_site);
    coalescedWrite(out_view[site],site_result);
  });
}
// Lattice (op add) lattice through views: each site is read with the view's
// operator(), combined with the site-level add(), and stored with
// coalescedWrite. ret adopts lhs's checkerboard before the conformability
// checks are made.
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
  auto out_view   = ret.View();
  auto left_view  = lhs.View();
  auto right_view = rhs.View();
  accelerator_for(site,left_view.size(),obj1::Nsimd(),{
    auto left_site  = left_view(site);
    auto right_site = right_view(site);
    decltype(coalescedRead(obj1())) site_result;
    add(&site_result,&left_site,&right_site);
    coalescedWrite(out_view[site],site_result);
  });
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Lattice (op mult) single object through views: the same rhs object is
// combined with every site of lhs and the result is stored with
// coalescedWrite. ret adopts lhs's checkerboard.
//
// Fix: the site value is now copied into a named local before its address is
// taken, as the sibling routines do. The view's operator() may return by
// value, in which case &lhs_v(ss) would be the address of a temporary.
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
  auto ret_v = ret.View();
  auto lhs_v = lhs.View();
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto lhs_t=lhs_v(ss);
    mult(&tmp,&lhs_t,&rhs);
    coalescedWrite(ret_v[ss],tmp);
  });
}
// Multiply-accumulate of a lattice with a single object through views: the
// site-level mac() is applied for every site with ret's site value as the
// accumulator. ret adopts lhs's checkerboard.
//
// Fix: the accumulator local used to be declared without being seeded from
// ret, so ret's previous contents never entered the result. It is now read
// from ret's view before mac() is applied.
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
  auto ret_v = ret.View();
  auto lhs_v = lhs.View();
  accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
    auto lhs_t=lhs_v(ss);
    auto tmp  =ret_v(ss);   // start from the current accumulator value
    mac(&tmp,&lhs_t,&rhs);
    coalescedWrite(ret_v[ss],tmp);
  });
}
// Lattice (op sub) single object through views: the same rhs object is
// combined with every site of lhs and the result is stored with
// coalescedWrite. ret adopts lhs's checkerboard.
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
  auto out_view  = ret.View();
  auto left_view = lhs.View();
  accelerator_for(site,left_view.size(),obj1::Nsimd(),{
    auto left_site = left_view(site);
    decltype(coalescedRead(obj1())) site_result;
    sub(&site_result,&left_site,&rhs);
    coalescedWrite(out_view[site],site_result);
  });
}
// Lattice (op add) single object through views: the same rhs object is
// combined with every site of lhs and the result is stored with
// coalescedWrite. ret adopts lhs's checkerboard.
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
  auto out_view  = ret.View();
  auto left_view = lhs.View();
  accelerator_for(site,left_view.size(),obj1::Nsimd(),{
    auto left_site = left_view(site);
    decltype(coalescedRead(obj1())) site_result;
    add(&site_result,&left_site,&rhs);
    coalescedWrite(out_view[site],site_result);
  });
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Single object (op mult) lattice through views: the same lhs object is
// combined with every site of rhs and the result is stored with
// coalescedWrite. ret adopts rhs's checkerboard.
//
// Fix: the right-hand view was obtained from lhs, which in this overload is
// the single (non-lattice) object. It is now taken from rhs, the lattice
// operand.
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  auto ret_v = ret.View();
  auto rhs_v = rhs.View();
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto rhs_t=rhs_v(ss);
    mult(&tmp,&lhs,&rhs_t);
    coalescedWrite(ret_v[ss],tmp);
  });
}
// Multiply-accumulate of a single object with a lattice through views: the
// site-level mac() is applied for every site of rhs with ret's site value as
// the accumulator. ret adopts rhs's checkerboard.
//
// Fixes:
//  * the right-hand view was obtained from lhs, which in this overload is the
//    single (non-lattice) object; it is now taken from rhs.
//  * the accumulator local was never seeded from ret, so ret's previous
//    contents never entered the result; it is now read from ret's view.
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  auto ret_v = ret.View();
  auto rhs_v = rhs.View();
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
    auto rhs_t=rhs_v(ss);
    auto tmp  =ret_v(ss);   // start from the current accumulator value
    mac(&tmp,&lhs,&rhs_t);
    coalescedWrite(ret_v[ss],tmp);
  });
}
// Single object (op sub) lattice through views: the same lhs object is
// combined with every site of rhs and the result is stored with
// coalescedWrite. ret adopts rhs's checkerboard.
//
// Fix: the right-hand view was obtained from lhs, which in this overload is
// the single (non-lattice) object. It is now taken from rhs, the lattice
// operand.
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  auto ret_v = ret.View();
  auto rhs_v = rhs.View();
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto rhs_t=rhs_v(ss);
    sub(&tmp,&lhs,&rhs_t);
    coalescedWrite(ret_v[ss],tmp);
  });
}
// Single object (op add) lattice through views: the same lhs object is
// combined with every site of rhs and the result is stored with
// coalescedWrite. ret adopts rhs's checkerboard.
//
// Fix: the right-hand view was obtained from lhs, which in this overload is
// the single (non-lattice) object. It is now taken from rhs, the lattice
// operand.
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  auto ret_v = ret.View();
  auto rhs_v = rhs.View();
  accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
    decltype(coalescedRead(obj1())) tmp;
    auto rhs_t=rhs_v(ss);
    add(&tmp,&lhs,&rhs_t);
    coalescedWrite(ret_v[ss],tmp);
  });
}
// ret = a*x + y evaluated per site through views and stored with
// coalescedWrite; ret adopts x's checkerboard.
template<class sobj,class vobj> inline
void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
  auto out_view = ret.View();
  auto x_view   = x.View();
  auto y_view   = y.View();
  accelerator_for(site,x_view.size(),vobj::Nsimd(),{
    auto site_result = a*x_view(site)+y_view(site);
    coalescedWrite(out_view[site],site_result);
  });
}
// ret = a*x + b*y evaluated per site through views and stored with
// coalescedWrite; ret adopts x's checkerboard.
template<class sobj,class vobj> inline
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
  auto out_view = ret.View();
  auto x_view   = x.View();
  auto y_view   = y.View();
  accelerator_for(site,x_view.size(),vobj::Nsimd(),{
    auto site_result = a*x_view(site)+b*y_view(site);
    coalescedWrite(out_view[site],site_result);
  });
}
// Forwards to axpy_norm_fast with the same arguments and returns its result.
template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
  const RealD nrm = axpy_norm_fast(ret,a,x,y);
  return nrm;
}
// Forwards to axpby_norm_fast with the same arguments and returns its result.
template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
  const RealD nrm = axpby_norm_fast(ret,a,b,x,y);
  return nrm;
}
NAMESPACE_END(Grid);
#endif

View File

@@ -27,349 +27,443 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_BASE_H
#define GRID_LATTICE_BASE_H
/* END LEGAL */
#pragma once
#define STREAMING_STORES
namespace Grid {
// TODO:
// mac,real,imag
// Functionality:
// -=,+=,*=,()
// add,+,sub,-,mult,mac,*
// adj,conjugate
// real,imag
// transpose,transposeIndex
// trace,traceIndex
// peekIndex
// innerProduct,outerProduct,
// localNorm2
// localInnerProduct
NAMESPACE_BEGIN(Grid);
extern int GridCshiftPermuteMap[4][16];
////////////////////////////////////////////////
// Basic expressions used in Expression Template
////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour
///////////////////////////////////////////////////////////////////
class LatticeBase {};
class LatticeBase
/////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
// Asserts that both pointers refer to the very same GridBase object; the
// comparison is by address, not by grid contents.
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to the lattice through a view object.
// This prevents writing code that will not offload to GPU, but is perhaps annoyingly
// strict, since host code could in principle access the data directly through the lattice object.
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
// Minimal lattice state: grid pointer, checkerboard tag, data pointer and
// element count. LatticeView and Lattice both derive from this class.
//
// Fix: the constructor's initialiser list now follows the member declaration
// order. Members are always initialised in declaration order, so the old
// list was misleading and triggers -Wreorder.
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
  GridBase *_grid;
  int checkerboard;
  vobj     *_odata;    // A managed pointer
  uint64_t _odata_size;
public:
  // Empty container: no grid, checkerboard 0, no data.
  accelerator_inline LatticeAccelerator() : _grid(nullptr), checkerboard(0), _odata(nullptr), _odata_size(0) { };

  // Number of vobj elements addressed by _odata.
  accelerator_inline uint64_t oSites(void) const { return _odata_size; };

  // Checkerboard tag: read-only accessor and assignable reference.
  accelerator_inline int  Checkerboard(void) const { return checkerboard; };
  accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view

  // If grid is already set, assert it is the same object as this lattice's
  // grid; otherwise hand back this lattice's grid through the reference.
  accelerator_inline void Conformable(GridBase * &grid) const
  {
    if (grid) conformable(grid, _grid);
    else      grid = _grid;
  };
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{
public:
virtual ~LatticeBase(void) = default;
GridBase *_grid;
// Rvalue
#ifdef __CUDA_ARCH__
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
{
}
};
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
// Unary AST node stored as a pair: `first` is the operator object, `second`
// is a one-element tuple holding the operand.
template <typename Op, typename T1>
class LatticeUnaryExpression
  : public std::pair<Op,std::tuple<T1> > ,
    public LatticeExpressionBase
{
public:
  LatticeUnaryExpression(const std::pair<Op,std::tuple<T1> > &node)
    : std::pair<Op,std::tuple<T1> >(node)
  {};
};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
// Binary AST node stored as a pair: `first` is the operator object, `second`
// is a two-element tuple holding the operands.
template <typename Op, typename T1, typename T2>
class LatticeBinaryExpression
  : public std::pair<Op,std::tuple<T1,T2> > ,
    public LatticeExpressionBase
{
public:
  LatticeBinaryExpression(const std::pair<Op,std::tuple<T1,T2> > &node)
    : std::pair<Op,std::tuple<T1,T2> >(node)
  {};
};
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
// Trinary AST node stored as a pair: `first` is the operator object, `second`
// is a three-element tuple holding the operands.
template <typename Op, typename T1, typename T2, typename T3>
class LatticeTrinaryExpression
  : public std::pair<Op,std::tuple<T1,T2,T3> >,
    public LatticeExpressionBase
{
public:
  LatticeTrinaryExpression(const std::pair<Op,std::tuple<T1,T2,T3> > &node)
    : std::pair<Op,std::tuple<T1,T2,T3> >(node)
  {};
};
void inline conformable(GridBase *lhs,GridBase *rhs)
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
assert((lhs == rhs) && " conformable check pointers mismatch ");
}
public:
typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
/////////////////////////////////////////////////////////////////////////////////////////
// The real lattice class, with normal copy and assignment semantics.
// This contains extra (host resident) grid pointer data that may be accessed by host code
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class Lattice : public LatticeBase
class Lattice : public LatticeAccelerator<vobj>
{
public:
int checkerboard;
Vector<vobj> _odata;
GridBase *Grid(void) const { return this->_grid; }
///////////////////////////////////////////////////
// Member types
///////////////////////////////////////////////////
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef vobj vector_object;
// to pthread need a computable loop where loop induction is not required
int begin(void) { return 0;};
int end(void) { return _odata.size(); }
vobj & operator[](int i) { return _odata[i]; };
const vobj & operator[](int i) const { return _odata[i]; };
private:
void dealloc(void)
{
if( this->_odata_size ) {
alignedAllocator<vobj> alloc;
alloc.deallocate(this->_odata,this->_odata_size);
this->_odata=nullptr;
this->_odata_size=0;
}
}
void resize(uint64_t size)
{
if ( this->_odata_size != size ) {
alignedAllocator<vobj> alloc;
dealloc();
this->_odata_size = size;
if ( size )
this->_odata = alloc.allocate(this->_odata_size);
else
this->_odata = nullptr;
}
}
public:
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef vobj vector_object;
/////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device
// in device lambdas
/////////////////////////////////////////////////////////////////////////////////
LatticeView<vobj> View (void) const
{
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
return accessor;
}
~Lattice() {
if ( this->_odata_size ) {
dealloc();
}
}
////////////////////////////////////////////////////////////////////////////////
// Expression Template closure support
////////////////////////////////////////////////////////////////////////////////
template <typename Op, typename T1> strong_inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
{
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
conformable(_grid,egrid);
conformable(this->_grid,egrid);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
checkerboard=cb;
this->checkerboard=cb;
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp);
#else
_odata[ss]=eval(ss,expr);
#endif
}
auto me = View();
accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr);
vstream(me[ss],tmp);
});
return *this;
}
template <typename Op, typename T1,typename T2> strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
{
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
conformable(_grid,egrid);
conformable(this->_grid,egrid);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
checkerboard=cb;
this->checkerboard=cb;
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp);
#else
_odata[ss]=eval(ss,expr);
#endif
}
auto me = View();
accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr);
vstream(me[ss],tmp);
});
return *this;
}
template <typename Op, typename T1,typename T2,typename T3> strong_inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
{
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
conformable(_grid,egrid);
conformable(this->_grid,egrid);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
checkerboard=cb;
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES
//vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,eval(ss,expr));
#else
_odata[ss] = eval(ss,expr);
#endif
}
this->checkerboard=cb;
auto me = View();
accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr);
vstream(me[ss],tmp);
});
return *this;
}
//GridFromExpression is tricky to do
template<class Op,class T1>
Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
_grid = nullptr;
GridFromExpression(_grid,expr);
assert(_grid!=nullptr);
Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
checkerboard=cb;
this->checkerboard=cb;
_odata.resize(_grid->oSites());
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp);
#else
_odata[ss]=eval(ss,expr);
#endif
}
};
resize(this->_grid->oSites());
*this = expr;
}
template<class Op,class T1, class T2>
Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
_grid = nullptr;
GridFromExpression(_grid,expr);
assert(_grid!=nullptr);
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
checkerboard=cb;
this->checkerboard=cb;
_odata.resize(_grid->oSites());
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp);
#else
_odata[ss]=eval(ss,expr);
#endif
}
};
resize(this->_grid->oSites());
*this = expr;
}
template<class Op,class T1, class T2, class T3>
Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
_grid = nullptr;
GridFromExpression(_grid,expr);
assert(_grid!=nullptr);
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
checkerboard=cb;
this->checkerboard=cb;
_odata.resize(_grid->oSites());
parallel_for(int ss=0;ss<_grid->oSites();ss++){
vstream(_odata[ss] ,eval(ss,expr));
}
};
resize(this->_grid->oSites());
*this = expr;
}
// Broadcast assignment: store the single object `r` into every element of
// this lattice's view, then return *this. The loop runs through thread_for.
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
  auto self_v = View();
  thread_for(site,self_v.size(),{
    self_v[site] = r;
  });
  return *this;
}
//////////////////////////////////////////////////////////////////
// Constructor requires "grid" passed.
// what about a default grid?
//////////////////////////////////////////////////////////////////
Lattice(GridBase *grid) : _odata(grid->oSites()) {
_grid = grid;
// _odata.reserve(_grid->oSites());
// _odata.resize(_grid->oSites());
// std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
assert((((uint64_t)&_odata[0])&0xF) ==0);
checkerboard=0;
// Follow rule of five, with Constructor requires "grid" passed
// to user defined constructor
///////////////////////////////////////////
// user defined constructor
///////////////////////////////////////////
Lattice(GridBase *grid) {
this->_grid = grid;
resize(this->_grid->oSites());
assert((((uint64_t)&this->_odata[0])&0xF) ==0);
this->checkerboard=0;
}
Lattice(const Lattice& r){ // copy constructor
_grid = r._grid;
checkerboard = r.checkerboard;
_odata.resize(_grid->oSites());// essential
parallel_for(int ss=0;ss<_grid->oSites();ss++){
_odata[ss]=r._odata[ss];
}
}
Lattice(Lattice&& r){ // move constructor
_grid = r._grid;
checkerboard = r.checkerboard;
_odata=std::move(r._odata);
}
inline Lattice<vobj> & operator = (Lattice<vobj> && r)
{
_grid = r._grid;
checkerboard = r.checkerboard;
_odata =std::move(r._odata);
return *this;
}
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
_grid = r._grid;
checkerboard = r.checkerboard;
_odata.resize(_grid->oSites());// essential
parallel_for(int ss=0;ss<_grid->oSites();ss++){
_odata[ss]=r._odata[ss];
}
return *this;
}
template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
this->checkerboard = r.checkerboard;
conformable(*this,r);
parallel_for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r._odata[ss];
}
return *this;
}
virtual ~Lattice(void) = default;
// virtual ~Lattice(void) = default;
void reset(GridBase* grid) {
if (_grid != grid) {
_grid = grid;
_odata.resize(grid->oSites());
checkerboard = 0;
if (this->_grid != grid) {
this->_grid = grid;
this->resize(grid->oSites());
this->checkerboard = 0;
}
}
template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
parallel_for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r;
}
///////////////////////////////////////////
// copy constructor
///////////////////////////////////////////
Lattice(const Lattice& r){
  // Adopt the source grid and size our own storage for it first; the site
  // data (and checkerboard) are then filled in by the copy assignment.
  this->_grid = r.Grid();
  resize(r.Grid()->oSites());
  *this = r;
}
///////////////////////////////////////////
// move constructor
///////////////////////////////////////////
Lattice(Lattice && r){
  // Take over the source's grid, checkerboard and storage (pointer + size).
  this->_grid        = r.Grid();
  this->checkerboard = r.Checkerboard();
  this->_odata_size  = r._odata_size;
  this->_odata       = r._odata;
  // Leave the source empty so that its destructor has nothing to free.
  r._odata_size = 0;
  r._odata      = nullptr;
}
///////////////////////////////////////////
// assignment template
///////////////////////////////////////////
// Converting assignment from a lattice with a different site type `robj`.
// Requires the two lattices to be conformable *before* anything is modified,
// then adopts r's checkerboard and copies every site from r's view into this
// lattice's view. No storage is allocated here; the destination must already
// be sized. Returns *this.
template<class robj> inline Lattice<vobj> & operator = (const Lattice<robj> & r){
// This local only compiles when robj differs from vobj (enable_if has no
// ::type otherwise), so instantiating this template for the same site type is
// a compile error. NOTE(review): presumably same-type assignment is meant to
// go through the non-template copy assignment instead — confirm.
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
auto me = View();
auto him= r.View();
// Site loop expressed with the accelerator_for / coalescedWrite helpers
// (defined elsewhere); each iteration writes him(ss) into me[ss].
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
return *this;
}
///////////////////////////////////////////
// Copy assignment
///////////////////////////////////////////
// Copy assignment from a lattice of the same site type.
// Copies r's checkerboard and every site of r's view into this lattice's
// view, and returns *this. No storage is allocated here; the destination must
// already be sized.
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
// The checkerboard is overwritten first, so by the time conformable() runs
// its checkerboard comparison is between equal values; only a grid mismatch
// can still be caught by that call.
this->checkerboard = r.Checkerboard();
conformable(*this,r);
auto me = View();
auto him= r.View();
// Site loop expressed with the accelerator_for / coalescedWrite helpers
// (defined elsewhere); each iteration writes him(ss) into me[ss].
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
return *this;
}
///////////////////////////////////////////
// Move assignment possible if same type
///////////////////////////////////////////
// Move assignment: release our own storage and take over r's grid,
// checkerboard and storage (pointer + size). The source is left empty so its
// destructor has nothing to free. Returns *this.
inline Lattice<vobj> & operator = (Lattice<vobj> && r){
  // Self-move guard: without it, resize(0) below would free the very buffer
  // we are about to "steal", leaving this lattice with a grid but no data.
  if ( this == &r ) return *this;

  resize(0); // deletes if appropriate
  this->_grid       = r.Grid();
  this->_odata      = r._odata;
  this->_odata_size = r._odata_size;
  this->checkerboard= r.Checkerboard();
  r._odata      = nullptr;
  r._odata_size = 0;
  return *this;
}
/////////////////////////////////////////////////////////////////////////////
// *=,+=,-= operators inherit behaviour from the corresponding */+/- operation
template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {
/////////////////////////////////////////////////////////////////////////////
template<class T> inline Lattice<vobj> &operator *=(const T &r) {
*this = (*this)*r;
return *this;
}
template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) {
template<class T> inline Lattice<vobj> &operator -=(const T &r) {
*this = (*this)-r;
return *this;
}
template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) {
template<class T> inline Lattice<vobj> &operator +=(const T &r) {
*this = (*this)+r;
return *this;
}
}; // class Lattice
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
std::vector<int> gcoor;
typedef typename vobj::scalar_object sobj;
sobj ss;
for(int g=0;g<o._grid->_gsites;g++){
o._grid->GlobalIndexToGlobalCoor(g,gcoor);
peekSite(ss,o,gcoor);
stream<<"[";
for(int d=0;d<gcoor.size();d++){
stream<<gcoor[d];
if(d!=gcoor.size()-1) stream<<",";
}
stream<<"]\t";
stream<<ss<<std::endl;
}
return stream;
// Exchange two lattices, which must be conformable (asserted first).
// Only the LatticeAccelerator<vobj> base sub-objects are exchanged, by
// assignment through a temporary of that base type.
// NOTE(review): presumably the base holds the grid pointer, checkerboard and
// data pointer/size, which would make this a shallow exchange that does not
// touch site data — confirm against the LatticeAccelerator definition.
friend inline void swap(Lattice &l, Lattice &r) {
conformable(l,r);
LatticeAccelerator<vobj> tmp;
LatticeAccelerator<vobj> *lp = (LatticeAccelerator<vobj> *)&l;
LatticeAccelerator<vobj> *rp = (LatticeAccelerator<vobj> *)&r;
// classic three-step exchange on the base sub-objects
tmp = *lp; *lp=*rp; *rp=tmp;
}
}; // class Lattice
// Stream every global site of a lattice, one line per site, in the form
//   [c0,c1,...]<TAB><site value>
// Sites are visited in global-index order; the coordinate of each index comes
// from GlobalIndexToGlobalCoor and the value is fetched with peekSite.
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
  typedef typename vobj::scalar_object sobj;
  auto grid = o.Grid();
  for(int g=0;g<grid->_gsites;g++){
    Coordinate gcoor;
    grid->GlobalIndexToGlobalCoor(g,gcoor);
    sobj ss;
    peekSite(ss,o,gcoor);
    stream<<"[";
    for(int d=0;d<gcoor.size();d++){
      if(d!=0) stream<<",";   // separator goes between components only
      stream<<gcoor[d];
    }
    stream<<"]\t"<<ss<<std::endl;
  }
  return stream;
}
NAMESPACE_END(Grid);
#include "Lattice_conformable.h"
#define GRID_LATTICE_EXPRESSION_TEMPLATES
#ifdef GRID_LATTICE_EXPRESSION_TEMPLATES
#include "Lattice_ET.h"
#else
#include "Lattice_overload.h"
#endif
#include "Lattice_arith.h"
#include "Lattice_trace.h"
#include "Lattice_transpose.h"
#include "Lattice_local.h"
#include "Lattice_reduction.h"
#include "Lattice_peekpoke.h"
#include "Lattice_reality.h"
#include "Lattice_comparison_utils.h"
#include "Lattice_comparison.h"
#include "Lattice_coordinate.h"
#include "Lattice_where.h"
#include "Lattice_rng.h"
#include "Lattice_unary.h"
#include "Lattice_transfer.h"
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -24,146 +24,184 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_COMPARISON_H
#define GRID_LATTICE_COMPARISON_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////
// relational operators
//
// Support <,>,<=,>=,==,!=
//
//Query supporting bitwise &, |, ^, !
//Query supporting logical &&, ||,
//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
// relational operators
//
// Support <,>,<=,>=,==,!=
//
//Query supporting bitwise &, |, ^, !
//Query supporting logical &&, ||,
//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
// compare lattice to lattice
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{
Lattice<vInteger> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
//////////////////////////////////////////////////////////////////////////
// compare lattice to scalar
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{
Lattice<vInteger> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs);
}
return ret;
}
//////////////////////////////////////////////////////////////////////////
// compare scalar to lattice
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{
Lattice<vInteger> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs);
}
return ret;
typedef iScalar<vInteger> vPredicate ;
/*
template <class iobj, class vobj, class robj> accelerator_inline
vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse)
{
typename std::remove_const<vobj>::type ret;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd();
ExtractBuffer<Integer> mask(Nsimd);
ExtractBuffer<scalar_object> truevals(Nsimd);
ExtractBuffer<scalar_object> falsevals(Nsimd);
extract(iftrue, truevals);
extract(iffalse, falsevals);
extract<vInteger, Integer>(TensorRemove(predicate), mask);
for (int s = 0; s < Nsimd; s++) {
if (mask[s]) falsevals[s] = truevals[s];
}
//////////////////////////////////////////////////////////////////////////
// Map to functors
//////////////////////////////////////////////////////////////////////////
// Less than
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vlt<lobj,robj>(),lhs,rhs);
}
// Less than equal
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vle<lobj,robj>(),lhs,rhs);
}
// Greater than
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vgt<lobj,robj>(),lhs,rhs);
}
// Greater than equal
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vge<lobj,robj>(),lhs,rhs);
}
// equal
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(veq<lobj,robj>(),lhs,rhs);
}
// not equal
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vne<lobj,robj>(),lhs,rhs);
}
merge(ret, falsevals);
return ret;
}
*/
//////////////////////////////////////////////////////////////////////////
// compare lattice to lattice
//////////////////////////////////////////////////////////////////////////
// Apply the binary functor `op` site by site to two lattices and collect the
// outcomes in a predicate lattice built on the right-hand operand's grid.
// The loop bound is the size of the right-hand view.
template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{
  Lattice<vPredicate> result(rhs.Grid());
  auto left_v   = lhs.View();
  auto right_v  = rhs.View();
  auto result_v = result.View();
  thread_for( site, right_v.size(), {
    result_v[site]=op(left_v[site],right_v[site]);
  });
  return result;
}
//////////////////////////////////////////////////////////////////////////
// compare lattice to scalar
//////////////////////////////////////////////////////////////////////////
// Apply the binary functor `op` to every site of `lhs` paired with the single
// right-hand value `rhs`; the outcomes are collected in a predicate lattice
// built on the grid of `lhs`.
template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{
  Lattice<vPredicate> result(lhs.Grid());
  auto left_v   = lhs.View();
  auto result_v = result.View();
  thread_for( site, left_v.size(), {
    result_v[site]=op(left_v[site],rhs);
  });
  return result;
}
//////////////////////////////////////////////////////////////////////////
// compare scalar to lattice
//////////////////////////////////////////////////////////////////////////
// Apply the binary functor `op` to the single left-hand value `lhs` paired
// with every site of `rhs`; the outcomes are collected in a predicate lattice
// built on the grid of `rhs`.
template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{
  Lattice<vPredicate> result(rhs.Grid());
  auto right_v  = rhs.View();
  auto result_v = result.View();
  thread_for( site, right_v.size(), {
    result_v[site]=op(lhs,right_v[site]);
  });
  return result;
}
//////////////////////////////////////////////////////////////////////////
// Map to functors
//////////////////////////////////////////////////////////////////////////
// Less than: forwards to the vlt functor, one overload per operand mix
// (lattice < lattice, lattice < scalar, scalar < lattice).
template<class lobj,class robj>
inline Lattice<vPredicate> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
  vlt<lobj,robj> less_than;
  return LLComparison(less_than,lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
  vlt<lobj,robj> less_than;
  return LSComparison(less_than,lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
  vlt<lobj,robj> less_than;
  return SLComparison(less_than,lhs,rhs);
}
// Less than or equal: forwards to the vle functor, one overload per operand
// mix (lattice <= lattice, lattice <= scalar, scalar <= lattice).
template<class lobj,class robj>
inline Lattice<vPredicate> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
  vle<lobj,robj> less_equal;
  return LLComparison(less_equal,lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
  vle<lobj,robj> less_equal;
  return LSComparison(less_equal,lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
  vle<lobj,robj> less_equal;
  return SLComparison(less_equal,lhs,rhs);
}
// Greater than: forwards to the vgt functor, one overload per operand mix
// (lattice > lattice, lattice > scalar, scalar > lattice).
template<class lobj,class robj>
inline Lattice<vPredicate> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
  vgt<lobj,robj> greater_than;
  return LLComparison(greater_than,lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
  vgt<lobj,robj> greater_than;
  return LSComparison(greater_than,lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
  vgt<lobj,robj> greater_than;
  return SLComparison(greater_than,lhs,rhs);
}
// Greater than or equal: forwards to the vge functor, one overload per operand
// mix (lattice >= lattice, lattice >= scalar, scalar >= lattice).
template<class lobj,class robj>
inline Lattice<vPredicate> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
  vge<lobj,robj> greater_equal;
  return LLComparison(greater_equal,lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator >= (const Lattice<lobj> & lhs, const robj & rhs) {
  vge<lobj,robj> greater_equal;
  return LSComparison(greater_equal,lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
  vge<lobj,robj> greater_equal;
  return SLComparison(greater_equal,lhs,rhs);
}
// Equal: forwards to the veq functor, one overload per operand mix
// (lattice == lattice, lattice == scalar, scalar == lattice).
template<class lobj,class robj>
inline Lattice<vPredicate> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
  veq<lobj,robj> equal_to;
  return LLComparison(equal_to,lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
  veq<lobj,robj> equal_to;
  return LSComparison(equal_to,lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
  veq<lobj,robj> equal_to;
  return SLComparison(equal_to,lhs,rhs);
}
// Not equal: forwards to the vne functor, one overload per operand mix
// (lattice != lattice, lattice != scalar, scalar != lattice).
template<class lobj,class robj>
inline Lattice<vPredicate> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
  vne<lobj,robj> not_equal;
  return LLComparison(not_equal,lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
  vne<lobj,robj> not_equal;
  return LSComparison(not_equal,lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
  vne<lobj,robj> not_equal;
  return SLComparison(not_equal,lhs,rhs);
}
NAMESPACE_END(Grid);
#endif

View File

@@ -26,10 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_COMPARISON_H
#define GRID_COMPARISON_H
namespace Grid {
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////
// This implementation is a bit poor.
@@ -44,42 +44,42 @@ namespace Grid {
//
template<class lobj,class robj> class veq {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) == (rhs);
}
};
template<class lobj,class robj> class vne {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) != (rhs);
}
};
template<class lobj,class robj> class vlt {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) < (rhs);
}
};
template<class lobj,class robj> class vle {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) <= (rhs);
}
};
template<class lobj,class robj> class vgt {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) > (rhs);
}
};
template<class lobj,class robj> class vge {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) >= (rhs);
}
@@ -88,42 +88,42 @@ namespace Grid {
// Generic list of functors
template<class lobj,class robj> class seq {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) == (rhs);
}
};
template<class lobj,class robj> class sne {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) != (rhs);
}
};
template<class lobj,class robj> class slt {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) < (rhs);
}
};
template<class lobj,class robj> class sle {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) <= (rhs);
}
};
template<class lobj,class robj> class sgt {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) > (rhs);
}
};
template<class lobj,class robj> class sge {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) >= (rhs);
}
@@ -133,12 +133,12 @@ namespace Grid {
// Integer and real get extra relational functions.
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
{
typedef typename vsimd::scalar_type scalar;
std::vector<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
std::vector<scalar> vrhs(vsimd::Nsimd());
std::vector<Integer> vpred(vsimd::Nsimd());
ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
ExtractBuffer<scalar> vrhs(vsimd::Nsimd());
ExtractBuffer<Integer> vpred(vsimd::Nsimd());
vInteger ret;
extract<vsimd,scalar>(lhs,vlhs);
extract<vsimd,scalar>(rhs,vrhs);
@@ -150,11 +150,11 @@ namespace Grid {
}
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
{
typedef typename vsimd::scalar_type scalar;
std::vector<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
std::vector<Integer> vpred(vsimd::Nsimd());
ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
ExtractBuffer<Integer> vpred(vsimd::Nsimd());
vInteger ret;
extract<vsimd,scalar>(lhs,vlhs);
for(int s=0;s<vsimd::Nsimd();s++){
@@ -165,11 +165,11 @@ namespace Grid {
}
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
{
typedef typename vsimd::scalar_type scalar;
std::vector<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
std::vector<Integer> vpred(vsimd::Nsimd());
ExtractBuffer<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
ExtractBuffer<Integer> vpred(vsimd::Nsimd());
vInteger ret;
extract<vsimd,scalar>(rhs,vrhs);
for(int s=0;s<vsimd::Nsimd();s++){
@@ -181,30 +181,30 @@ namespace Grid {
#define DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
accelerator_inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
{\
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
accelerator_inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
{\
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
accelerator_inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
{\
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
{ \
return lhs._internal op rhs; \
} \
template<class vsimd>\
inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
accelerator_inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
{ \
return lhs op rhs._internal; \
} \
@@ -212,7 +212,7 @@ namespace Grid {
#define DECLARE_RELATIONAL(op,functor) \
DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
{ \
return lhs._internal op rhs._internal; \
}
@@ -226,7 +226,7 @@ DECLARE_RELATIONAL(!=,sne);
#undef DECLARE_RELATIONAL
}
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,18 +23,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_CONFORMABLE_H
#define GRID_LATTICE_CONFORMABLE_H
namespace Grid {
template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
{
assert(lhs._grid == rhs._grid);
assert(lhs.checkerboard == rhs.checkerboard);
}
NAMESPACE_BEGIN(Grid);
// Assert that two lattices (possibly of different site types) may be combined:
// they must report the same grid pointer from Grid() and the same value from
// Checkerboard(). Returns nothing; a mismatch trips the corresponding assert,
// so the checks are compiled out when assert is disabled (NDEBUG).
template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
{
assert(lhs.Grid() == rhs.Grid());
assert(lhs.Checkerboard() == rhs.Checkerboard());
}
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,34 +23,51 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_COORDINATE_H
#define GRID_LATTICE_COORDINATE_H
*************************************************************************************/
/* END LEGAL */
#pragma once
namespace Grid {
NAMESPACE_BEGIN(Grid);
template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
{
typedef typename iobj::scalar_type scalar_type;
typedef typename iobj::vector_type vector_type;
template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
{
typedef typename iobj::scalar_type scalar_type;
typedef typename iobj::vector_type vector_type;
GridBase *grid = l._grid;
int Nsimd = grid->iSites();
GridBase *grid = l.Grid();
int Nsimd = grid->iSites();
std::vector<int> gcoor;
std::vector<scalar_type> mergebuf(Nsimd);
auto l_v = l.View();
thread_for( o, grid->oSites(), {
vector_type vI;
Coordinate gcoor;
ExtractBuffer<scalar_type> mergebuf(Nsimd);
for(int i=0;i<grid->iSites();i++){
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
mergebuf[i]=(Integer)gcoor[mu];
}
merge<vector_type,scalar_type>(vI,mergebuf);
l_v[o]=vI;
});
};
vector_type vI;
for(int o=0;o<grid->oSites();o++){
for(int i=0;i<grid->iSites();i++){
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
mergebuf[i]=(Integer)gcoor[mu];
}
merge<vector_type,scalar_type>(vI,mergebuf);
l._odata[o]=vI;
// LatticeCoordinate();
// FIXME for debug; deprecate this; made obsolete by
template<class vobj> void lex_sites(Lattice<vobj> &l){
auto l_v = l.View();
Real *v_ptr = (Real *)&l_v[0];
size_t o_len = l.Grid()->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
}
};
}}
}
#endif
NAMESPACE_END(Grid);

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,8 +23,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_LOCALREDUCTION_H
#define GRID_LATTICE_LOCALREDUCTION_H
@@ -32,44 +32,56 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
// localInner, localNorm, outerProduct
///////////////////////////////////////////////
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////
// Non site, reduced locally reduced routines
/////////////////////////////////////////////////////
/////////////////////////////////////////////////////
// Non site, reduced locally reduced routines
/////////////////////////////////////////////////////
// localNorm2,
// Site-local squared norm: for every outer site the result holds
// innerProduct(x,x) of the corresponding site of rhs.  No reduction over
// sites is performed; the result is a lattice on the same grid as rhs.
template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{
  typedef Lattice<typename vobj::tensor_reduced> reduced_field;
  reduced_field site_norm(rhs._grid);
  parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
    const vobj &site = rhs._odata[ss];
    site_norm._odata[ss]=innerProduct(site,site);
  }
  return site_norm;
}
// localInnerProduct
// Site-local inner product: each outer site of the result holds
// innerProduct(lhs site, rhs site).  The result lives on rhs's grid and the
// loop runs over rhs's outer sites; no sum over sites is taken.
template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{
  typedef Lattice<typename vobj::tensor_reduced> reduced_field;
  reduced_field site_inner(rhs._grid);
  parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
    const vobj &left_site  = lhs._odata[ss];
    const vobj &right_site = rhs._odata[ss];
    site_inner._odata[ss]=innerProduct(left_site,right_site);
  }
  return site_inner;
}
// outerProduct Scalar x Scalar -> Scalar
// Vector x Vector -> Matrix
// Site-local outer product of two lattices.  The per-site result type is
// whatever the site-level outerProduct(ll,rr) overload returns.  The result
// lives on rhs's grid and the loop runs over rhs's outer sites.
template<class ll,class rr>
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
{
  typedef decltype(outerProduct(lhs._odata[0],rhs._odata[0])) site_product;
  Lattice<site_product> product(rhs._grid);
  parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
    const ll &left_site  = lhs._odata[ss];
    const rr &right_site = rhs._odata[ss];
    product._odata[ss]=outerProduct(left_site,right_site);
  }
  return product;
}
// localNorm2,
// Site-local squared norm: for every site index of rhs's view the result
// holds innerProduct(x,x) of that site.  No reduction over sites is done;
// the result is a lattice built on rhs.Grid().
template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{
  typedef Lattice<typename vobj::tensor_reduced> reduced_field;
  reduced_field site_norm(rhs.Grid());
  auto in_v  = rhs.View();
  auto out_v = site_norm.View();
  // Reads go through the view's operator(), writes through coalescedWrite.
  accelerator_for(ss,in_v.size(),vobj::Nsimd(),{
    coalescedWrite(out_v[ss],innerProduct(in_v(ss),in_v(ss)));
  });
  return site_norm;
}
// localInnerProduct
// Site-local inner product: each site of the result holds
// innerProduct(lhs site, rhs site).  The result is built on rhs.Grid() and
// the loop length is taken from rhs's view; no sum over sites is taken.
template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{
  typedef Lattice<typename vobj::tensor_reduced> reduced_field;
  reduced_field site_inner(rhs.Grid());
  auto left_v  = lhs.View();
  auto right_v = rhs.View();
  auto out_v   = site_inner.View();
  accelerator_for(ss,right_v.size(),vobj::Nsimd(),{
    coalescedWrite(out_v[ss],innerProduct(left_v(ss),right_v(ss)));
  });
  return site_inner;
}
// outerProduct Scalar x Scalar -> Scalar
// Vector x Vector -> Matrix
// Site-local outer product of two lattices.
//   lhs, rhs : input lattices; the loop length is taken from rhs's view.
//   returns  : lattice on rhs.Grid() whose site type is the result type of
//              the site-level outerProduct(ll,rr).
// The previously declared (and never used) coalescedRead typedefs have been
// dropped: this loop deliberately does not use coalesced reads.
template<class ll,class rr>
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(ll(),rr()))>
{
  Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
  auto lhs_v = lhs.View();
  auto rhs_v = rhs.View();
  auto ret_v = ret.View();
  // nsimd argument is 1 and sites are accessed with the view's operator[].
  accelerator_for(ss,rhs_v.size(),1,{
    // FIXME had issues with scalar version of outer
    // Use vector [] operator and don't read coalesce this loop
    ret_v[ss]=outerProduct(lhs_v[ss],rhs_v[ss]);
  });
  return ret;
}
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,202 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_reduction.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/Grid_Eigen_Dense.h>
#ifdef GRID_WARN_SUBOPTIMAL
#warning "Optimisation alert all these reduction loops are NOT threaded "
#endif
NAMESPACE_BEGIN(Grid);
// Slice-wise matrix multiply-add along direction Orthog.
// For every site position o in the plane orthogonal to Orthog and every
// slice index i in [0,Nblock):
//     R[o + i*ostride] = Y[o + i*ostride] + sum_j X[o + j*ostride] * (scale * aa(j,i))
//   R      : output lattice
//   aa     : Nblock x Nblock complex matrix of coefficients, indexed aa(j,i)
//   X, Y   : input lattices
//   Orthog : direction whose slices are mixed; must have SIMD layout 1
//   scale  : real factor applied to every matrix element (default 1.0)
// NOTE(review): Nblock is the *global* extent in Orthog while the indexing
// uses local strides; presumably Orthog is not decomposed across ranks -
// confirm against callers.
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
// The slice index must not be spread across SIMD lanes.
assert( FullGrid->_simd_layout[Orthog]==1);
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View();
auto Y_v = Y.View();
auto R_v = R.View();
thread_region
{
// Scratch copy of the Nblock X values at the current plane position;
// declared inside thread_region so each executing thread has its own.
std::vector<vobj> s_x(Nblock);
thread_loop_collapse2( (int n=0;n<nblock;n++),{
for(int b=0;b<block;b++){
// o is the outer-site offset of this plane position at slice 0.
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
// Start from Y and accumulate the column i of aa applied to X.
dot = Y_v[o+i*ostride];
for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
};
// Slice-wise matrix multiply along direction Orthog.
// For every site position o in the plane orthogonal to Orthog and every
// slice index i in [0,Nblock):
//     R[o + i*ostride] = sum_j X[o + j*ostride] * (scale * aa(j,i))
//   R      : output lattice
//   aa     : Nblock x Nblock complex matrix of coefficients, indexed aa(j,i)
//   X      : input lattice
//   Orthog : direction whose slices are mixed; must have SIMD layout 1
//   scale  : real factor applied to every matrix element (default 1.0)
// NOTE(review): Nblock is the *global* extent in Orthog while the indexing
// uses local strides; presumably Orthog is not decomposed across ranks -
// confirm against callers.
template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// The slice index must not be spread across SIMD lanes.
assert( FullGrid->_simd_layout[Orthog]==1);
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View();
auto R_v = R.View();
thread_region
{
// Scratch copy of the Nblock X values at the current plane position;
// declared inside thread_region so each executing thread has its own.
std::vector<vobj> s_x(Nblock);
thread_loop_collapse2( (int n=0;n<nblock;n++),{
for(int b=0;b<block;b++){
// o is the outer-site offset of this plane position at slice 0.
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
// The j=0 term initialises dot, so no explicit zeroing is needed.
dot = s_x[0]*(scale*aa(0,i));
for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
};
// Matrix of inner products between slices along direction Orthog.
// On return mat is Nblock x Nblock with
//     mat(i,j) = sum over plane positions o of
//                Reduce(TensorRemove(innerProduct(lhs[o+i*ostride], rhs[o+j*ostride])))
// followed by a GlobalSum of every element over the grid's ranks.
//   mat    : output; overwritten (resized/zeroed) by this routine
//   lhs    : left operand, supplies the grid and row index i
//   rhs    : right operand, supplies column index j
//   Orthog : direction whose slices are paired; must have SIMD layout 1
// NOTE(review): Nblock is the *global* extent in Orthog while the indexing
// uses local strides; presumably Orthog is not decomposed across ranks -
// confirm against callers.
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog];
// Lattice<vobj> Lslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
// The slice index must not be spread across SIMD lanes.
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
thread_region {
// Per-thread scratch: the Nblock slice values of each operand at the
// current plane position, and a private partial result matrix.
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
thread_loop_collapse2((int n=0;n<nblock;n++),{
for(int b=0;b<block;b++){
// o is the outer-site offset of this plane position at slice 0.
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
Left [i] = lhs_v[o+i*ostride];
Right[i] = rhs_v[o+i*ostride];
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
// Site inner product, stripped of tensor structure and reduced to one
// complex number before being accumulated in double precision.
auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp);
ComplexD z = Reduce(rtmp);
mat_thread(i,j) += std::complex<double>(real(z),imag(z));
}}
}});
// Fold this thread's partial matrix into the shared result.
thread_critical {
mat += mat_thread;
}
}
// Element-by-element sum across ranks.
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
}
NAMESPACE_END(Grid);

View File

@@ -1,138 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_overload.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_OVERLOAD_H
#define GRID_LATTICE_OVERLOAD_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////////
// unary negation
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Unary negation: returns a new lattice on r's grid whose every outer site
// is the negative of the corresponding site of r.
template<class vobj>
inline Lattice<vobj> operator -(const Lattice<vobj> &r)
{
  Lattice<vobj> negated(r._grid);
  parallel_for(int ss=0;ss<r._grid->oSites();ss++){
    vstream(negated._odata[ss], -r._odata[ss]);
  }
  return negated;
}
/////////////////////////////////////////////////////////////////////////////////////
// Lattice BinOp Lattice,
//NB mult performs conformable check. Do not reapply here for performance.
/////////////////////////////////////////////////////////////////////////////////////
// Lattice * Lattice.  The site type of the result is the type of the
// site-level product; the work (and the conformability check) is delegated
// to mult().
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
  typedef decltype(lhs._odata[0]*rhs._odata[0]) product_site;
  Lattice<product_site> product(rhs._grid);
  mult(product,lhs,rhs);
  return product;
}
// Lattice + Lattice.  The site type of the result is the type of the
// site-level sum; the work is delegated to add().
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
{
  typedef decltype(lhs._odata[0]+rhs._odata[0]) sum_site;
  Lattice<sum_site> total(rhs._grid);
  add(total,lhs,rhs);
  return total;
}
// Lattice - Lattice.  The site type of the result is the type of the
// site-level difference; the work is delegated to sub().
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
{
  typedef decltype(lhs._odata[0]-rhs._odata[0]) diff_site;
  Lattice<diff_site> difference(rhs._grid);
  sub(difference,lhs,rhs);
  return difference;
}
// Scalar BinOp Lattice ;generate return type
// scalar * Lattice: every outer site of the result is lhs * (site of rhs).
// The result lives on rhs's grid.
template<class left,class right>
inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
  typedef decltype(lhs*rhs._odata[0]) product_site;
  Lattice<product_site> product(rhs._grid);
  parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
    // Evaluate into a named temporary, then store it with vstream.
    product_site scaled=lhs*rhs._odata[ss];
    vstream(product._odata[ss],scaled);
  }
  return product;
}
// scalar + Lattice: every outer site of the result is lhs + (site of rhs).
//   lhs     : site-level (non-lattice) left operand
//   rhs     : lattice right operand; also supplies the grid of the result
//   returns : lattice whose site type is decltype(lhs + site)
// Fix: the loop body previously evaluated lhs - site although both the
// operator and the declared temporary type are those of a sum.
template<class left,class right>
inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
{
  Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
  parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
    decltype(lhs+rhs._odata[0]) tmp =lhs+rhs._odata[ss];
    vstream(ret._odata[ss],tmp);
  }
  return ret;
}
// scalar - Lattice: every outer site of the result is lhs - (site of rhs).
// The result lives on rhs's grid.
template<class left,class right>
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
{
  typedef decltype(lhs-rhs._odata[0]) diff_site;
  Lattice<diff_site> difference(rhs._grid);
  parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
    // Evaluate into a named temporary, then store it with vstream.
    diff_site delta=lhs-rhs._odata[ss];
    vstream(difference._odata[ss],delta);
  }
  return difference;
}
// Lattice * scalar: every outer site of the result is (site of lhs) * rhs.
// The result lives on lhs's grid.
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
  typedef decltype(lhs._odata[0]*rhs) product_site;
  Lattice<product_site> product(lhs._grid);
  parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
    // Evaluate into a named temporary, then store it with vstream.
    product_site scaled =lhs._odata[ss]*rhs;
    vstream(product._odata[ss],scaled);
  }
  return product;
}
// Lattice + scalar: every outer site of the result is (site of lhs) + rhs.
//   lhs     : lattice left operand; also supplies the grid of the result
//   rhs     : site-level (non-lattice) right operand
//   returns : lattice whose site type is decltype(site + rhs)
// Fix: the loop bound previously read rhs._grid, but rhs is the scalar
// operand here; the site count must come from the lattice operand lhs.
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
{
  Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
  parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
    decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
    vstream(ret._odata[ss],tmp);
  }
  return ret;
}
// Lattice - scalar: every outer site of the result is (site of lhs) - rhs.
//   lhs     : lattice left operand; also supplies the grid of the result
//   rhs     : site-level (non-lattice) right operand
//   returns : lattice whose site type is decltype(site - rhs)
// Fix: the loop bound previously read rhs._grid, but rhs is the scalar
// operand here; the site count must come from the lattice operand lhs.
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
{
  Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
  parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
    decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
    vstream(ret._odata[ss],tmp);
  }
  return ret;
}
}
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -25,8 +25,8 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_PEEK_H
#define GRID_LATTICE_PEEK_H
@@ -34,172 +34,184 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
// Peeking and poking around
///////////////////////////////////////////////
namespace Grid {
////////////////////////////////////////////////////////////////////////////////////////////////////
// Peek internal indices of a Lattice object
////////////////////////////////////////////////////////////////////////////////////////////////////
// Extract component i of tensor level Index from every outer site of lhs.
// The result lattice shares lhs's grid and checkerboard.
template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))>
{
  typedef decltype(peekIndex<Index>(lhs._odata[0],i)) peeked_site;
  Lattice<peeked_site> component(lhs._grid);
  component.checkerboard=lhs.checkerboard;
  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
    component._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
  }
  return component;
};
// Extract component (i,j) of tensor level Index from every outer site of
// lhs.  The result lattice shares lhs's grid and checkerboard.
template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
{
  typedef decltype(peekIndex<Index>(lhs._odata[0],i,j)) peeked_site;
  Lattice<peeked_site> component(lhs._grid);
  component.checkerboard=lhs.checkerboard;
  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
    component._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
  }
  return component;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Poke internal indices of a Lattice object
////////////////////////////////////////////////////////////////////////////////////////////////////
// Insert rhs as component i of tensor level Index into every outer site of
// lhs.  lhs is modified in place; the loop runs over lhs's outer sites.
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
{
  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
    vobj &target = lhs._odata[ss];
    pokeIndex<Index>(target,rhs._odata[ss],i);
  }
}
// Insert rhs as component (i,j) of tensor level Index into every outer site
// of lhs.  lhs is modified in place; the loop runs over lhs's outer sites.
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
{
  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
    vobj &target = lhs._odata[ss];
    pokeIndex<Index>(target,rhs._odata[ss],i,j);
  }
}
//////////////////////////////////////////////////////
// Poke a scalar object into the SIMD array
//////////////////////////////////////////////////////
// Write the scalar object s into lattice l at global coordinate site.
// s is first broadcast from the boss rank; only the rank that owns the
// site then updates its local data.
template<class vobj,class sobj>
void pokeSite(const sobj &s,Lattice<vobj> &l,const std::vector<int> &site){
  GridBase *grid=l._grid;

  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  int Nsimd = grid->Nsimd();

  assert( l.checkerboard== l._grid->CheckerBoard(site));
  assert( sizeof(sobj)*Nsimd == sizeof(vobj));

  // Locate the owning rank plus the outer/inner index of the site there.
  int owner_rank;
  int outer_idx;
  int inner_idx;
  grid->GlobalCoorToRankIndex(owner_rank,outer_idx,inner_idx,site);

  // Optional to broadcast from node 0.
  grid->Broadcast(grid->BossRank(),s);

  std::vector<sobj> lanes(Nsimd);

  if ( owner_rank != grid->ThisRank() ) {
    return;
  }

  // extract-modify-merge cycle is easiest way and this is not perf critical
  extract(l._odata[outer_idx],lanes);
  lanes[inner_idx] = s;
  merge(l._odata[outer_idx],lanes);
};
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
// Read the scalar object at global coordinate site of lattice l into s.
// Every rank extracts from its local data at the computed outer index and
// the value is then broadcast from the rank that owns the site.
template<class vobj,class sobj>
void peekSite(sobj &s,const Lattice<vobj> &l,const std::vector<int> &site){
  GridBase *grid=l._grid;

  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  int Nsimd = grid->Nsimd();

  assert( l.checkerboard == l._grid->CheckerBoard(site));

  int owner_rank;
  int outer_idx;
  int inner_idx;
  grid->GlobalCoorToRankIndex(owner_rank,outer_idx,inner_idx,site);

  // Unpack all SIMD lanes of the outer site and pick the requested one.
  std::vector<sobj> lanes(Nsimd);
  extract(l._odata[outer_idx],lanes);
  s = lanes[inner_idx];

  grid->Broadcast(owner_rank,s);
};
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
// Read the scalar object at local coordinate site of lattice l into s
// without any communication.  The site's outer and inner (SIMD lane) index
// are computed from the grid, and word w of the scalar object is read from
// position idx + w*Nsimd of the outer site viewed as scalar_type words.
template<class vobj,class sobj>
void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
  GridBase *grid = l._grid;

  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  int Nsimd = grid->Nsimd();

  assert( l.checkerboard== l._grid->CheckerBoard(site));
  assert( sizeof(sobj)*Nsimd == sizeof(vobj));

  // Number of vector_type words making up one site object.
  static const int words=sizeof(vobj)/sizeof(vector_type);

  const int idx = grid->iIndex(site);
  const int odx = grid->oIndex(site);

  const scalar_type *packed = (const scalar_type *)&l._odata[odx];
  scalar_type *unpacked     = (scalar_type *)&s;

  for(int w=0;w<words;w++){
    unpacked[w] = packed[idx+w*Nsimd];
  }
};
// Write the scalar object s into lattice l at local coordinate site without
// any communication.  Word w of s is stored at position idx + w*Nsimd of
// the outer site viewed as scalar_type words.
template<class vobj,class sobj>
void pokeLocalSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
  GridBase *grid=l._grid;

  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  int Nsimd = grid->Nsimd();

  assert( l.checkerboard== l._grid->CheckerBoard(site));
  assert( sizeof(sobj)*Nsimd == sizeof(vobj));

  // Number of vector_type words making up one site object.
  static const int words=sizeof(vobj)/sizeof(vector_type);

  const int idx = grid->iIndex(site);
  const int odx = grid->oIndex(site);

  scalar_type *packed         = (scalar_type *)&l._odata[odx];
  const scalar_type *unpacked = (const scalar_type *)&s;

  for(int w=0;w<words;w++){
    packed[idx+w*Nsimd] = unpacked[w];
  }
};
// FIXME accelerator_loop and accelerator_inline these
////////////////////////////////////////////////////////////////////////////////////////////////////
// Peek internal indices of a Lattice object
////////////////////////////////////////////////////////////////////////////////////////////////////
// Extract component i of tensor level Index from every site of lhs.
// The result lattice is built on lhs.Grid() and inherits lhs's checkerboard.
template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(vobj(),i))>
{
  typedef decltype(peekIndex<Index>(vobj(),i)) peeked_site;
  Lattice<peeked_site> component(lhs.Grid());
  component.Checkerboard()=lhs.Checkerboard();
  auto out_v = component.View();
  auto in_v  = lhs.View();
  thread_for( ss, in_v.size(), {
    out_v[ss] = peekIndex<Index>(in_v[ss],i);
  });
  return component;
};
// Extract component (i,j) of tensor level Index from every site of lhs.
// The result lattice is built on lhs.Grid() and inherits lhs's checkerboard.
template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(vobj(),i,j))>
{
  typedef decltype(peekIndex<Index>(vobj(),i,j)) peeked_site;
  Lattice<peeked_site> component(lhs.Grid());
  component.Checkerboard()=lhs.Checkerboard();
  auto out_v = component.View();
  auto in_v  = lhs.View();
  thread_for( ss, in_v.size(), {
    out_v[ss] = peekIndex<Index>(in_v[ss],i,j);
  });
  return component;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Poke internal indices of a Lattice object
////////////////////////////////////////////////////////////////////////////////////////////////////
// Insert rhs as component i of tensor level Index into every site of lhs.
// lhs is modified in place through its view; the loop length is the size of
// lhs's view.
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
{
  auto source_v = rhs.View();
  auto target_v = lhs.View();
  thread_for( ss, target_v.size(), {
    pokeIndex<Index>(target_v[ss],source_v[ss],i);
  });
}
// Insert rhs as component (i,j) of tensor level Index into every site of
// lhs.  lhs is modified in place through its view; the loop length is the
// size of lhs's view.
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
{
  auto source_v = rhs.View();
  auto target_v = lhs.View();
  thread_for( ss, target_v.size(), {
    pokeIndex<Index>(target_v[ss],source_v[ss],i,j);
  });
}
//////////////////////////////////////////////////////
// Poke a scalar object into the SIMD array
//////////////////////////////////////////////////////
template<class vobj,class sobj>
void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
GridBase *grid=l.Grid();
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
int rank,odx,idx;
// Optional to broadcast from node 0.
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
grid->Broadcast(grid->BossRank(),s);
// extract-modify-merge cycle is easiest way and this is not perf critical
ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View();
if ( rank == grid->ThisRank() ) {
extract(l_v[odx],buf);
buf[idx] = s;
merge(l_v[odx],buf);
}
return;
};
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj,class sobj>
void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
GridBase *grid=l.Grid();
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
int rank,odx,idx;
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View();
extract(l_v[odx],buf);
s = buf[idx];
grid->Broadcast(rank,s);
return;
};
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
// Read the scalar object at local coordinate site of lattice l into s
// without any communication.  The site's outer and inner (SIMD lane) index
// are computed from the grid, and word w of the scalar object is read from
// position idx + w*Nsimd of the outer site viewed as scalar_type words.
template<class vobj,class sobj>
accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
  GridBase *grid = l.Grid();

  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  int Nsimd = grid->Nsimd();

  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
  assert( sizeof(sobj)*Nsimd == sizeof(vobj));

  // Number of vector_type words making up one site object.
  static const int words=sizeof(vobj)/sizeof(vector_type);

  const int idx = grid->iIndex(site);
  const int odx = grid->oIndex(site);

  auto site_v = l.View();
  const scalar_type *packed = (const scalar_type *)&site_v[odx];
  scalar_type *unpacked     = (scalar_type *)&s;

  for(int w=0;w<words;w++){
    unpacked[w] = packed[idx+w*Nsimd];
  }
};
// Write the scalar object s into lattice l at local coordinate site without
// any communication.  Word w of s is stored at position idx + w*Nsimd of
// the outer site viewed as scalar_type words.
template<class vobj,class sobj>
accelerator_inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
  GridBase *grid=l.Grid();

  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;

  int Nsimd = grid->Nsimd();

  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
  assert( sizeof(sobj)*Nsimd == sizeof(vobj));

  // Number of vector_type words making up one site object.
  static const int words=sizeof(vobj)/sizeof(vector_type);

  const int idx = grid->iIndex(site);
  const int odx = grid->oIndex(site);

  auto site_v = l.View();
  scalar_type *packed         = (scalar_type *)&site_v[odx];
  const scalar_type *unpacked = (const scalar_type *)&s;

  for(int w=0;w<words;w++){
    packed[idx+w*Nsimd] = unpacked[w];
  }
};
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -25,8 +25,8 @@ Author: neo <cossu@post.kek.jp>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_REALITY_H
#define GRID_LATTICE_REALITY_H
@@ -36,22 +36,28 @@ Author: neo <cossu@post.kek.jp>
// The choice of burying complex in the SIMD
// is making the use of "real" and "imag" very cumbersome
namespace Grid {
NAMESPACE_BEGIN(Grid);
// Site-wise adjoint: returns a new lattice on lhs's grid whose every outer
// site is adj() of the corresponding site of lhs.
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
  Lattice<vobj> adjoint(lhs._grid);
  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
    const vobj &site = lhs._odata[ss];
    adjoint._odata[ss] = adj(site);
  }
  return adjoint;
};
// Site-wise adjoint: returns a new lattice on lhs.Grid() whose every site is
// adj() of the corresponding site of lhs.
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
  Lattice<vobj> adjoint(lhs.Grid());
  auto in_v  = lhs.View();
  auto out_v = adjoint.View();
  accelerator_for( ss, in_v.size(), vobj::Nsimd(), {
    coalescedWrite(out_v[ss], adj(in_v(ss)));
  });
  return adjoint;
};
// Site-wise complex conjugate: returns a new lattice on lhs.Grid() whose
// every site is conjugate() of the corresponding site of lhs.
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
  Lattice<vobj> conjugated(lhs.Grid());
  auto in_v  = lhs.View();
  auto out_v = conjugated.View();
  accelerator_for( ss, in_v.size(), vobj::Nsimd(), {
    coalescedWrite( out_v[ss] , conjugate(in_v(ss)));
  });
  return conjugated;
};
NAMESPACE_END(Grid);
// Site-wise complex conjugate: returns a new lattice on lhs's grid whose
// every outer site is conjugate() of the corresponding site of lhs.
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
  Lattice<vobj> conjugated(lhs._grid);
  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
    const vobj &site = lhs._odata[ss];
    conjugated._odata[ss] = conjugate(site);
  }
  return conjugated;
};
}
#endif

View File

@@ -19,22 +19,76 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_REDUCTION_H
#define GRID_LATTICE_REDUCTION_H
#pragma once
#include <Grid/Grid_Eigen_Dense.h>
namespace Grid {
#ifdef GRID_WARN_SUBOPTIMAL
#warning "Optimisation alert all these reduction loops are NOT threaded "
#ifdef GRID_NVCC
#include <Grid/lattice/Lattice_reduction_gpu.h>
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////
// FIXME this should promote to double and accumulate
//////////////////////////////////////////////////////
template<class vobj>
inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
{
typedef typename vobj::scalar_object sobj;
const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
thread_for(thr,nthread, {
int nwork, mywork, myoff;
nwork = osites;
GridThread::GetWork(nwork,thr,mywork,myoff);
vobj vvsum=Zero();
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg[ss];
}
sumarray[thr]=Reduce(vvsum);
});
sobj ssum=Zero(); // sum across threads
for(int i=0;i<nthread;i++){
ssum = ssum+sumarray[i];
}
return ssum;
}
// Rank-local sum of osites vector objects starting at arg.  Dispatches at
// compile time: sum_gpu when GRID_NVCC is defined, sum_cpu otherwise.
template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{
#ifndef GRID_NVCC
  return sum_cpu(arg,osites);
#else
  return sum_gpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
auto arg_v = arg.View();
Integer osites = arg.Grid()->oSites();
auto ssum= sum(&arg_v[0],osites);
arg.Grid()->GlobalSum(ssum);
return ssum;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
auto nrm = innerProduct(arg,arg);
return std::real(nrm);
ComplexD nrm = innerProduct(arg,arg);
return real(nrm);
}
// Double inner product
@@ -43,32 +97,49 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
{
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
GridBase *grid = left._grid;
const int pad = 8;
ComplexD nrm;
ComplexD inner;
Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
GridBase *grid = left.Grid();
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
// Might make all code paths go this way.
auto left_v = left.View();
auto right_v=right.View();
decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation
for(int ss=myoff;ss<mywork+myoff; ss++){
vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
}
// All threads sum across SIMD; reduce serial work at end
// one write per cacheline with streaming store
ComplexD tmp = Reduce(TensorRemove(vinner)) ;
vstream(sumarray[thr*pad],tmp);
}
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
inner=0.0;
for(int i=0;i<grid->SumArraySize();i++){
inner = inner+sumarray[i*pad];
}
right._grid->GlobalSum(inner);
return inner;
#ifdef GRID_NVCC
// GPU - SIMT lane compliance...
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
})
// This is in single precision and fails some tests
// Need a sumD that sums in double
nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
#else
// CPU
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
})
nrm = TensorRemove(sum(inner_tmp_v,sites));
#endif
grid->GlobalSum(nrm);
return nrm;
}
/////////////////////////
@@ -86,8 +157,7 @@ axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj
template<class sobj,class vobj> strong_inline RealD
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
const int pad = 8;
z.checkerboard = x.checkerboard;
z.Checkerboard() = x.Checkerboard();
conformable(z,x);
conformable(x,y);
@@ -95,43 +165,57 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
typedef typename vobj::vector_typeD vector_type;
RealD nrm;
GridBase *grid = x._grid;
GridBase *grid = x.Grid();
Vector<RealD> sumarray(grid->SumArraySize()*pad);
auto x_v=x.View();
auto y_v=y.View();
auto z_v=z.View();
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff);
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
// private to thread; sub summation
decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vobj tmp = a*x._odata[ss]+b*y._odata[ss];
vnrm = vnrm + innerProductD(tmp,tmp);
vstream(z._odata[ss],tmp);
}
vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
}
#ifdef GRID_NVCC
// GPU
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
nrm = 0.0; // sum across threads; linear in thread count but fast
for(int i=0;i<grid->SumArraySize();i++){
nrm = nrm+sumarray[i*pad];
}
z._grid->GlobalSum(nrm);
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
#else
// CPU
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
inner_tmp_v[ss]=innerProductD(tmp,tmp);
z_v[ss]=tmp;
});
// Already promoted to double
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#endif
grid->GlobalSum(nrm);
return nrm;
}
template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object
{
return sum(closure(expr));
}
template<class Op,class T1,class T2>
inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
->typename decltype(expr.op.func(eval(0,expr.arg1),eval(0,expr.arg2)))::scalar_object
{
return sum(closure(expr));
}
@@ -139,54 +223,14 @@ inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
template<class Op,class T1,class T2,class T3>
inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
eval(0,std::get<1>(expr.second)),
eval(0,std::get<2>(expr.second))
->typename decltype(expr.op.func(eval(0,expr.arg1),
eval(0,expr.arg2),
eval(0,expr.arg3)
))::scalar_object
{
return sum(closure(expr));
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
GridBase *grid=arg._grid;
int Nsimd = grid->Nsimd();
std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
for(int i=0;i<grid->SumArraySize();i++){
sumarray[i]=zero;
}
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
vobj vvsum=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg._odata[ss];
}
sumarray[thr]=vvsum;
}
vobj vsum=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vsum = vsum+sumarray[i];
}
typedef typename vobj::scalar_object sobj;
sobj ssum=zero;
std::vector<sobj> buf(Nsimd);
extract(vsum,buf);
for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
arg._grid->GlobalSum(ssum);
return ssum;
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -199,7 +243,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// But easily avoided by using double precision fields
///////////////////////////////////////////////////////
typedef typename vobj::scalar_object sobj;
GridBase *grid = Data._grid;
GridBase *grid = Data.Grid();
assert(grid!=NULL);
const int Nd = grid->_ndimension;
@@ -212,13 +256,13 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
std::vector<sobj> lsSum(ld,zero); // sum across these down to scalars
std::vector<sobj> extracted(Nsimd); // splitting the SIMD
Vector<vobj> lvSum(rd); // will locally sum vectors first
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node
for(int r=0;r<rd;r++){
lvSum[r]=zero;
lvSum[r]=Zero();
}
int e1= grid->_slice_nblock[orthogdim];
@@ -227,20 +271,19 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
parallel_for(int r=0;r<rd;r++){
auto Data_v=Data.View();
thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data._odata[ss];
lvSum[r]=lvSum[r]+Data_v[ss];
}
}
}
});
// Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd);
Coordinate icoor(Nd);
for(int rt=0;rt<rd;rt++){
@@ -265,7 +308,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
if ( pt == grid->_processor_coor[orthogdim] ) {
gsum=lsSum[lt];
} else {
gsum=zero;
gsum=Zero();
}
grid->GlobalSum(gsum);
@@ -274,123 +317,14 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
}
}
template<class vobj>
static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{
// std::cout << GridLogMessage << "Start mySliceInnerProductVector" << std::endl;
typedef typename vobj::scalar_type scalar_type;
std::vector<scalar_type> lsSum;
localSliceInnerProductVector(result, lhs, rhs, lsSum, orthogdim);
globalSliceInnerProductVector(result, lhs, lsSum, orthogdim);
// std::cout << GridLogMessage << "End mySliceInnerProductVector" << std::endl;
}
template <class vobj>
static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
{
// std::cout << GridLogMessage << "Start prep" << std::endl;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid;
assert(grid!=NULL);
conformable(grid,rhs._grid);
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
// std::cout << GridLogMessage << "Start alloc" << std::endl;
std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
lsSum.resize(ld,scalar_type(0.0)); // sum across these down to scalars
std::vector<iScalar<scalar_type>> extracted(Nsimd); // splitting the SIMD
// std::cout << GridLogMessage << "End alloc" << std::endl;
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){
lvSum[r]=zero;
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
// std::cout << GridLogMessage << "End prep" << std::endl;
// std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl;
vector_type vv;
parallel_for(int r=0;r<rd;r++)
{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss = so + n * stride + b;
vv = TensorRemove(innerProduct(lhs._odata[ss], rhs._odata[ss]));
lvSum[r] = lvSum[r] + vv;
}
}
}
// std::cout << GridLogMessage << "End parallel inner product" << std::endl;
// Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd);
for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp;
temp._internal = lvSum[rt];
extract(temp,extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
}
}
// std::cout << GridLogMessage << "End sum over simd lanes" << std::endl;
}
template <class vobj>
static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
{
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid;
int fd = result.size();
int ld = lsSum.size();
// sum over nodes.
std::vector<scalar_type> gsum;
gsum.resize(fd, scalar_type(0.0));
// std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl;
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
gsum[t]=lsSum[lt];
}
}
// std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl;
// std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl;
grid->GlobalSumVector(&gsum[0], fd);
// std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl;
result = gsum;
}
template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid;
GridBase *grid = lhs.Grid();
assert(grid!=NULL);
conformable(grid,rhs._grid);
conformable(grid,rhs.Grid());
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
@@ -402,34 +336,36 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
std::vector<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
Vector<vector_type> lvSum(rd); // will locally sum vectors first
Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){
lvSum[r]=zero;
lvSum[r]=Zero();
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
parallel_for(int r=0;r<rd;r++){
auto lhv=lhs.View();
auto rhv=rhs.View();
thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
vector_type vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss]));
vector_type vv = TensorRemove(innerProduct(lhv[ss],rhv[ss]));
lvSum[r]=lvSum[r]+vv;
}
}
}
});
// Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd);
Coordinate icoor(Nd);
for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp;
@@ -470,7 +406,7 @@ static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Ortho
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = rhs._grid->GlobalDimensions()[Orthog];
int Nblock = rhs.Grid()->GlobalDimensions()[Orthog];
std::vector<ComplexD> ip(Nblock);
sn.resize(Nblock);
@@ -492,7 +428,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
scalar_type zscale(scale);
GridBase *grid = X._grid;
GridBase *grid = X.Grid();
int Nsimd =grid->Nsimd();
int Nblock =grid->GlobalDimensions()[orthogdim];
@@ -505,8 +441,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
int e2 =grid->_slice_block [orthogdim];
int stride =grid->_slice_stride[orthogdim];
std::vector<int> icoor;
Coordinate icoor;
for(int r=0;r<rd;r++){
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
@@ -522,12 +457,15 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
tensor_reduced at; at=av;
parallel_for_nest2(int n=0;n<e1;n++){
auto Rv=R.View();
auto Xv=X.View();
auto Yv=Y.View();
thread_for_collapse(2, n, e1, {
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
R._odata[ss] = at*X._odata[ss]+Y._odata[ss];
Rv[ss] = at*Xv[ss]+Yv[ss];
}
}
});
}
};
@@ -559,18 +497,18 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog];
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl = nh-1;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@@ -578,28 +516,31 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
#pragma omp parallel
{
std::vector<vobj> s_x(Nblock);
#pragma omp for collapse(2)
for(int n=0;n<nblock;n++){
for(int b=0;b<block;b++){
auto X_v=X.View();
auto Y_v=Y.View();
auto R_v=R.View();
thread_region
{
Vector<vobj> s_x(Nblock);
thread_for_collapse_in_region(2, n,nblock, {
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X[o+i*ostride];
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = Y[o+i*ostride];
dot = Y_v[o+i*ostride];
for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R[o+i*ostride]=dot;
R_v[o+i*ostride]=dot;
}
}}
}});
}
};
@@ -610,17 +551,17 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog];
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl=1;
// int nl=1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@@ -628,17 +569,19 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
#pragma omp parallel
auto R_v = R.View();
auto X_v = X.View();
thread_region
{
std::vector<vobj> s_x(Nblock);
#pragma omp for collapse(2)
for(int n=0;n<nblock;n++){
thread_for_collapse_in_region( 2 ,n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X[o+i*ostride];
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
@@ -647,11 +590,10 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R[o+i*ostride]=dot;
R_v[o+i*ostride]=dot;
}
}}
}});
}
};
@@ -662,7 +604,7 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs._grid;
GridBase *FullGrid = lhs.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog];
@@ -673,9 +615,9 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl = nh-1;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@@ -686,31 +628,33 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::vector_typeD vector_typeD;
#pragma omp parallel
auto lhs_v=lhs.View();
auto rhs_v=rhs.View();
thread_region
{
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
#pragma omp for collapse(2)
for(int n=0;n<nblock;n++){
thread_for_collapse_in_region( 2, n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
Left [i] = lhs[o+i*ostride];
Right[i] = rhs[o+i*ostride];
Left [i] = lhs_v[o+i*ostride];
Right[i] = rhs_v[o+i*ostride];
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp);
mat_thread(i,j) += Reduce(rtmp);
auto red = Reduce(rtmp);
mat_thread(i,j) += std::complex<double>(real(red),imag(red));
}}
}}
#pragma omp critical
}});
thread_critical
{
mat += mat_thread;
}
@@ -726,8 +670,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
return;
}
} /*END NAMESPACE GRID*/
#endif
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,226 @@
NAMESPACE_BEGIN(Grid);
// Warp width assumed at compile time; getNumBlocksAndThreads() exits if the
// device in use reports a different warpSize.
#define WARP_SIZE 32
// Table of device properties defined elsewhere; getNumBlocksAndThreads()
// indexes it with the id returned by cudaGetDevice().
extern cudaDeviceProp *gpu_props;
// Device-global ticket counter used by reduceKernel(): thread 0 of each block
// bumps it with atomicInc, and the last block to finish resets it to 0.
__device__ unsigned int retirementCount = 0;
// Round x up to the next power of two (values that already are a power of two
// are returned unchanged). The highest set bit of x-1 is smeared into every
// lower position using shifts of 1, 2, 4, 8 and 16, so only the low 32 bits
// are covered; an unsigned input of 0 wraps around and yields 0.
template <class Iterator>
unsigned int nextPow2(Iterator x) {
  --x;
  for (int shift = 1; shift <= 16; shift <<= 1) {
    x |= x >> shift;
  }
  return ++x;
}
// Choose the launch geometry for the reduction kernel on the current device.
//
//   n          : number of elements to reduce (currently not used in the choice)
//   sizeofsobj : size in bytes of one per-thread accumulator held in shared memory
//   threads    : [out] threads per block
//   blocks     : [out] number of blocks
//
// Exits the program if the device warp size differs from the compiled-in WARP_SIZE.
template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {

  int device;
  cudaGetDevice(&device);

  Iterator warpSize            = gpu_props[device].warpSize;
  Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
  Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock;
  Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;

  // One line per property. A former fifth line printed warpSize a second time
  // under the "maxThreadsPerBlock" label and has been removed.
  std::cout << GridLogDebug << "GPU has:" << std::endl;
  std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;

  if (warpSize != WARP_SIZE) {
    std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
    exit(EXIT_FAILURE);
  }

  // let the number of threads in a block be a multiple of 2, starting from warpSize
  // (doubling stops before the per-thread accumulators would reach the shared
  //  memory size or the thread count would exceed maxThreadsPerBlock)
  threads = warpSize;
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
  // keep all the streaming multiprocessors busy
  blocks = nextPow2(multiProcessorCount);
}
// Reduce one sobj per thread across the whole thread block via shared memory.
//   sdata : shared scratch array, one sobj slot per thread of the block
//   mySum : this thread's partial sum
//   tid   : this thread's index within the block
// On return sdata[0] holds the block total (written by thread 0, followed by a
// block-wide barrier).
// The only launch visible in this file (sumD_gpu via getNumBlocksAndThreads)
// uses a thread count that is WARP_SIZE times a power of two.
template <class sobj, class Iterator>
__device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid) {
Iterator blockSize = blockDim.x;
// cannot use overloaded operators for sobj as they are not volatile-qualified
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
__syncwarp();
const Iterator VEC = WARP_SIZE;
// position of this thread inside its group of VEC consecutive threads
const Iterator vid = tid & (VEC-1);
sobj beta, temp;
memcpy((void *)&beta, (void *)&mySum, sizeof(sobj));
// Halving-stride reduction inside each group of VEC threads: since vid < i,
// the slot tid+i read here stays inside the same group. After the loop the
// slot with vid == 0 holds the group's sum.
for (int i = VEC/2; i > 0; i>>=1) {
if (vid < i) {
memcpy((void *)&temp, (void *)&sdata[tid+i], sizeof(sobj));
beta += temp;
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
}
__syncwarp();
}
__syncthreads();
// Thread 0 serially adds the per-group sums stored at sdata[0], sdata[VEC], ...
if (threadIdx.x == 0) {
beta = Zero();
for (Iterator i = 0; i < blockSize; i += VEC) {
memcpy((void *)&temp, (void *)&sdata[i], sizeof(sobj));
beta += temp;
}
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
}
__syncthreads();
}
// First reduction stage: every block accumulates part of the input and writes
// one partial sum per block.
//   g_idata : input array of vector objects
//   g_odata : output array, entry blockIdx.x receives this block's partial sum
//   n       : number of scalar elements, counted per SIMD lane (index i maps to
//             lane i % nsimd of object i / nsimd)
// Each extracted lane is assigned into an sobj before being accumulated, so the
// summation is carried out in the sobj type chosen by the caller.
template <class vobj, class sobj, class Iterator>
__device__ void reduceBlocks(const vobj *g_idata, sobj *g_odata, Iterator n)
{
constexpr Iterator nsimd = vobj::Nsimd();
Iterator blockSize = blockDim.x;
// force shared memory alignment
extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
// it's not possible to have two extern __shared__ arrays with same name
// but different types in different scopes -- need to cast each time
sobj *sdata = (sobj *)shmem_pointer;
// first level of reduction,
// each thread writes result in mySum
Iterator tid = threadIdx.x;
// each pass of the loop below consumes elements i and i+blockSize, then
// advances i by 2*blockSize*gridDim.x
Iterator i = blockIdx.x*(blockSize*2) + threadIdx.x;
Iterator gridSize = blockSize*2*gridDim.x;
sobj mySum = Zero();
while (i < n) {
Iterator lane = i % nsimd;
Iterator ss = i / nsimd;
auto tmp = extractLane(lane,g_idata[ss]);
sobj tmpD;
tmpD=tmp;
mySum +=tmpD;
// second element of the pair, guarded against running past n
if (i + blockSize < n) {
lane = (i+blockSize) % nsimd;
ss = (i+blockSize) / nsimd;
tmp = extractLane(lane,g_idata[ss]);
tmpD = tmp;
mySum += tmpD;
}
i += gridSize;
}
// copy mySum to shared memory and perform
// reduction for all threads in this block
reduceBlock(sdata, mySum, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Reduction kernel: sums the n lane-elements of `lat` and leaves the total in
// buffer[0]. `buffer` needs one entry per block.
// With a single block, reduceBlocks() has already written buffer[0]. With more
// blocks, each block takes a ticket from the device-global retirementCount; the
// block drawing the final ticket sums buffer[0..gridDim.x-1] into buffer[0] and
// resets the counter.
// NOTE(review): retirementCount is one global shared by every instantiation and
// launch of this kernel -- presumably launches are never concurrent; confirm.
template <class vobj, class sobj,class Iterator>
__global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
Iterator blockSize = blockDim.x;
// perform reduction for this block and
// write result to global memory buffer
reduceBlocks(lat, buffer, n);
if (gridDim.x > 1) {
const Iterator tid = threadIdx.x;
__shared__ bool amLast;
// force shared memory alignment
extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
// it's not possible to have two extern __shared__ arrays with same name
// but different types in different scopes -- need to cast each time
sobj *smem = (sobj *)shmem_pointer;
// wait until all outstanding memory instructions in this thread are finished
__threadfence();
if (tid==0) {
// atomicInc returns the previous counter value
unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
// true if this block is the last block to be done
amLast = (ticket == gridDim.x-1);
}
// each thread must read the correct value of amLast
__syncthreads();
if (amLast) {
// reduce buffer[0], ..., buffer[gridDim.x-1]
Iterator i = tid;
sobj mySum = Zero();
while (i < gridDim.x) {
mySum += buffer[i];
i += blockSize;
}
reduceBlock(smem, mySum, tid);
if (tid==0) {
buffer[0] = smem[0];
// reset count variable
retirementCount = 0;
}
}
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Sum `osites` vector objects on the GPU and return the total as a
// vobj::scalar_objectD.
//   lat    : pointer to the first vector object (dereferenced inside the kernel)
//   osites : number of vector objects; the kernel reduces osites*Nsimd lanes
// Launches reduceKernel with the geometry from getNumBlocksAndThreads(), using
// one sobj of dynamic shared memory per thread, waits for it to finish and
// reads the result from the first entry of the per-block buffer.
// On a CUDA error the message is printed and the process exits with a failure
// status.
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
  typedef typename vobj::scalar_objectD sobj;

  Integer nsimd= vobj::Nsimd();
  Integer size = osites*nsimd;

  Integer numThreads, numBlocks;
  getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
  Integer smemSize = numThreads * sizeof(sobj);

  // one partial sum per block; the kernel leaves the grand total in entry 0
  // NOTE(review): buffer_v[0] is read on the host below, so Vector presumably
  // allocates memory visible to both host and device -- confirm.
  Vector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];

  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
  cudaDeviceSynchronize();

  cudaError err = cudaGetLastError();
  if ( cudaSuccess != err ) {
    printf("Cuda error %s\n",cudaGetErrorString( err ));
    // a failed reduction must not report success to the calling environment
    exit(EXIT_FAILURE);
  }
  auto result = buffer_v[0];
  return result;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// GPU sum returning the input's own scalar_object type: the reduction itself is
// delegated to sumD_gpu() and its result is then assigned into the
// input-precision object.
template <class vobj>
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
{
  using result_t = typename vobj::scalar_object;
  result_t narrowed;
  // assignment (not construction) from the scalar_objectD result, as before
  narrowed = sumD_gpu(lat,osites);
  return narrowed;
}
NAMESPACE_END(Grid);

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -24,8 +24,8 @@
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_RNG_H
#define GRID_LATTICE_RNG_H
@@ -41,282 +41,289 @@
#undef RNG_FAST_DISCARD
#endif
namespace Grid {
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////
// Allow the RNG state to be less dense than the fine grid
//////////////////////////////////////////////////////////////
inline int RNGfillable(GridBase *coarse,GridBase *fine)
{
//////////////////////////////////////////////////////////////
// Allow the RNG state to be less dense than the fine grid
//////////////////////////////////////////////////////////////
inline int RNGfillable(GridBase *coarse,GridBase *fine)
{
int rngdims = coarse->_ndimension;
int rngdims = coarse->_ndimension;
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension;
assert(lowerdims >= 0);
for(int d=0;d<lowerdims;d++){
assert(fine->_simd_layout[d]==1);
assert(fine->_processors[d]==1);
}
int multiplicity=1;
for(int d=0;d<lowerdims;d++){
multiplicity=multiplicity*fine->_rdimensions[d];
}
// local and global volumes subdivide cleanly after SIMDization
for(int d=0;d<rngdims;d++){
int fd= d+lowerdims;
assert(coarse->_processors[d] == fine->_processors[fd]);
assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
}
return multiplicity;
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension;
assert(lowerdims >= 0);
for(int d=0;d<lowerdims;d++){
assert(fine->_simd_layout[d]==1);
assert(fine->_processors[d]==1);
}
int multiplicity=1;
for(int d=0;d<lowerdims;d++){
multiplicity=multiplicity*fine->_rdimensions[d];
}
// local and global volumes subdivide cleanly after SIMDization
for(int d=0;d<rngdims;d++){
int fd= d+lowerdims;
assert(coarse->_processors[d] == fine->_processors[fd]);
assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
}
return multiplicity;
}
// merge of April 11 2017
// this function is necessary for the LS vectorised field
inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
{
int rngdims = coarse->_ndimension;
// this function is necessary for the LS vectorised field
inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
{
int rngdims = coarse->_ndimension;
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension; assert(lowerdims >= 0);
// assumes that the higher dimensions are not using more processors
// all further divisions are local
for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1);
for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension; assert(lowerdims >= 0);
// assumes that the higher dimensions are not using more processors
// all further divisions are local
for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1);
for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
// then divide the number of local sites
// check that the total number of sims agree, meanse the iSites are the same
assert(fine->Nsimd() == coarse->Nsimd());
// then divide the number of local sites
// check that the total number of sims agree, meanse the iSites are the same
assert(fine->Nsimd() == coarse->Nsimd());
// check that the two grids divide cleanly
assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
// check that the two grids divide cleanly
assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
return fine->lSites() / coarse->lSites();
}
return fine->lSites() / coarse->lSites();
}
// real scalars are one component
template<class scalar,class distribution,class generator>
void fillScalar(scalar &s,distribution &dist,generator & gen)
{
s=dist(gen);
}
template<class distribution,class generator>
void fillScalar(ComplexF &s,distribution &dist, generator &gen)
{
s=ComplexF(dist(gen),dist(gen));
}
template<class distribution,class generator>
void fillScalar(ComplexD &s,distribution &dist,generator &gen)
{
s=ComplexD(dist(gen),dist(gen));
}
// real scalars are one component
template<class scalar,class distribution,class generator>
void fillScalar(scalar &s,distribution &dist,generator & gen)
{
s=dist(gen);
}
template<class distribution,class generator>
void fillScalar(ComplexF &s,distribution &dist, generator &gen)
{
// s=ComplexF(dist(gen),dist(gen));
s.real(dist(gen));
s.imag(dist(gen));
}
template<class distribution,class generator>
void fillScalar(ComplexD &s,distribution &dist,generator &gen)
{
// s=ComplexD(dist(gen),dist(gen));
s.real(dist(gen));
s.imag(dist(gen));
}
class GridRNGbase {
public:
// One generator per site.
// Uniform and Gaussian distributions from these generators.
class GridRNGbase {
public:
// One generator per site.
// Uniform and Gaussian distributions from these generators.
#ifdef RNG_RANLUX
typedef std::ranlux48 RngEngine;
typedef uint64_t RngStateType;
static const int RngStateCount = 15;
typedef std::ranlux48 RngEngine;
typedef uint64_t RngStateType;
static const int RngStateCount = 15;
#endif
#ifdef RNG_MT19937
typedef std::mt19937 RngEngine;
typedef uint32_t RngStateType;
static const int RngStateCount = std::mt19937::state_size;
typedef std::mt19937 RngEngine;
typedef uint32_t RngStateType;
static const int RngStateCount = std::mt19937::state_size;
#endif
#ifdef RNG_SITMO
typedef sitmo::prng_engine RngEngine;
typedef uint64_t RngStateType;
static const int RngStateCount = 13;
typedef sitmo::prng_engine RngEngine;
typedef uint64_t RngStateType;
static const int RngStateCount = 13;
#endif
std::vector<RngEngine> _generators;
std::vector<std::uniform_real_distribution<RealD> > _uniform;
std::vector<std::normal_distribution<RealD> > _gaussian;
std::vector<std::discrete_distribution<int32_t> > _bernoulli;
std::vector<std::uniform_int_distribution<uint32_t> > _uid;
std::vector<RngEngine> _generators;
std::vector<std::uniform_real_distribution<RealD> > _uniform;
std::vector<std::normal_distribution<RealD> > _gaussian;
std::vector<std::discrete_distribution<int32_t> > _bernoulli;
std::vector<std::uniform_int_distribution<uint32_t> > _uid;
///////////////////////
// support for parallel init
///////////////////////
///////////////////////
// support for parallel init
///////////////////////
#ifdef RNG_FAST_DISCARD
static void Skip(RngEngine &eng,uint64_t site)
{
/////////////////////////////////////////////////////////////////////////////////////
// Skip by 2^40 elements between successive lattice sites
// This goes by 10^12.
// Consider quenched updating; likely never exceeding rate of 1000 sweeps
// per second on any machine. This gives us of order 10^9 seconds, or 100 years
// skip ahead.
// For HMC unlikely to go at faster than a solve per second, and
// tens of seconds per trajectory so this is clean in all reasonable cases,
// and margin of safety is orders of magnitude.
// We could hack Sitmo to skip in the higher order words of state if necessary
static void Skip(RngEngine &eng,uint64_t site)
{
/////////////////////////////////////////////////////////////////////////////////////
// Skip by 2^40 elements between successive lattice sites
// This goes by 10^12.
// Consider quenched updating; likely never exceeding rate of 1000 sweeps
// per second on any machine. This gives us of order 10^9 seconds, or 100 years
// skip ahead.
// For HMC unlikely to go at faster than a solve per second, and
// tens of seconds per trajectory so this is clean in all reasonable cases,
// and margin of safety is orders of magnitude.
// We could hack Sitmo to skip in the higher order words of state if necessary
//
// Replace with 2^30 ; avoid problem on large volumes
//
/////////////////////////////////////////////////////////////////////////////////////
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
const int shift = 30;
/////////////////////////////////////////////////////////////////////////////////////
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
const int shift = 30;
uint64_t skip = site;
////////////////////////////////////////////////////////////////////
// Weird compiler bug in Intel 2018.1 under O3 was generating 32bit and not 64 bit left shift.
////////////////////////////////////////////////////////////////////
volatile uint64_t skip = site;
skip = skip<<shift;
skip = skip<<shift;
assert((skip >> shift)==site); // check for overflow
assert((skip >> shift)==site); // check for overflow
eng.discard(skip);
// std::cout << " Engine " <<site << " state " <<eng<<std::endl;
}
eng.discard(skip);
// std::cout << " Engine " <<site << " state " <<eng<<std::endl;
}
#endif
// Convenience overload: supplies a scratch seed buffer and a
// default-constructed uint32_t distribution, then forwards to the
// three-argument Reseed and returns the engine it builds.
static RngEngine Reseed(RngEngine &eng)
{
  std::uniform_int_distribution<uint32_t> draw32;
  std::vector<uint32_t> scratch;
  return Reseed(eng,scratch,draw32);
}
static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed,
std::uniform_int_distribution<uint32_t> &uid)
{
const int reseeds=4;
static RngEngine Reseed(RngEngine &eng)
{
std::vector<uint32_t> newseed;
std::uniform_int_distribution<uint32_t> uid;
return Reseed(eng,newseed,uid);
}
// Build a new engine seeded from the output of an existing one.
//   eng     - source engine; advanced by the draws made here
//   newseed - caller-provided buffer, resized to 4 and overwritten with the
//             drawn 32-bit words
//   uid     - distribution used to draw those words from eng
// Returns an RngEngine constructed from a std::seed_seq over newseed.
// The body appears exactly once: a second copy would redeclare `sseq` in the
// same scope and could never run because it follows the return.
static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed,
std::uniform_int_distribution<uint32_t> &uid)
{
  const int reseeds=4;
  newseed.resize(reseeds);
  for(int i=0;i<reseeds;i++){
    newseed[i] = uid(eng);
  }
  std::seed_seq sseq(newseed.begin(),newseed.end());
  return RngEngine(sseq);
}
void GetState(std::vector<RngStateType> & saved,RngEngine &eng) {
saved.resize(RngStateCount);
std::stringstream ss;
ss<<eng;
ss.seekg(0,ss.beg);
for(int i=0;i<RngStateCount;i++){
ss>>saved[i];
}
}
void GetState(std::vector<RngStateType> & saved,int gen) {
GetState(saved,_generators[gen]);
}
// Restore `eng` from RngStateCount integer words.
// The words are written space-separated into a text stream, which is then
// read back into the engine with operator>>. Asserts that `saved` holds
// exactly RngStateCount entries.
void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
  assert(saved.size()==RngStateCount);
  std::stringstream text;
  int word = 0;
  while(word<RngStateCount){
    text << saved[word] << " ";
    ++word;
  }
  text.seekg(0,text.beg);
  text >> eng;
}
void SetState(std::vector<RngStateType> & saved,int gen){
SetState(saved,_generators[gen]);
}
// Overwrite the engine at index `gen` of _generators with a copy of Eng.
void SetEngine(RngEngine &Eng, int gen){
  RngEngine &slot = _generators[gen];
  slot = Eng;
}
// Copy the engine at index `gen` of _generators into Eng.
void GetEngine(RngEngine &Eng, int gen){
  const RngEngine &held = _generators[gen];
  Eng = held;
}
template<class source> void Seed(source &src, int gen)
{
_generators[gen] = RngEngine(src);
}
};
class GridSerialRNG : public GridRNGbase {
public:
// The serial RNG holds a single engine and a single instance of each
// distribution: uniform real constructed with {0,1}, normal with mean 0 and
// standard deviation 1, a discrete distribution with weights {1,1}, and a
// default-constructed uint32_t uniform integer distribution.
GridSerialRNG() : GridRNGbase() {
  const int nstream = 1;
  _generators.resize(nstream);
  _uniform.resize(nstream,std::uniform_real_distribution<RealD>{0,1});
  _gaussian.resize(nstream,std::normal_distribution<RealD>(0.0,1.0) );
  _bernoulli.resize(nstream,std::discrete_distribution<int32_t>{1,1});
  _uid.resize(nstream,std::uniform_int_distribution<uint32_t>() );
}
template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){
typedef typename sobj::scalar_type scalar_type;
int words = sizeof(sobj)/sizeof(scalar_type);
scalar_type *buf = (scalar_type *) & l;
dist[0].reset();
for(int idx=0;idx<words;idx++){
fillScalar(buf[idx],dist[0],_generators[0]);
}
void GetState(std::vector<RngStateType> & saved,RngEngine &eng) {
saved.resize(RngStateCount);
std::stringstream ss;
ss<<eng;
ss.seekg(0,ss.beg);
for(int i=0;i<RngStateCount;i++){
ss>>saved[i];
}
}
void GetState(std::vector<RngStateType> & saved,int gen) {
GetState(saved,_generators[gen]);
}
void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
assert(saved.size()==RngStateCount);
std::stringstream ss;
for(int i=0;i<RngStateCount;i++){
ss<< saved[i]<<" ";
}
ss.seekg(0,ss.beg);
ss>>eng;
}
void SetState(std::vector<RngStateType> & saved,int gen){
SetState(saved,_generators[gen]);
}
void SetEngine(RngEngine &Eng, int gen){
_generators[gen]=Eng;
}
void GetEngine(RngEngine &Eng, int gen){
Eng=_generators[gen];
}
template<class source> void Seed(source &src, int gen)
{
_generators[gen] = RngEngine(src);
}
};
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
class GridSerialRNG : public GridRNGbase {
public:
}
GridSerialRNG() : GridRNGbase() {
_generators.resize(1);
_uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
_uid.resize(1,std::uniform_int_distribution<uint32_t>() );
template <class distribution> inline void fill(ComplexF &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(ComplexD &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(RealF &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(RealD &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
// vector fill
template <class distribution> inline void fill(vComplexF &l,std::vector<distribution> &dist){
RealF *pointer=(RealF *)&l;
dist[0].reset();
for(int i=0;i<2*vComplexF::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vComplexD &l,std::vector<distribution> &dist){
RealD *pointer=(RealD *)&l;
dist[0].reset();
for(int i=0;i<2*vComplexD::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vRealF &l,std::vector<distribution> &dist){
RealF *pointer=(RealF *)&l;
dist[0].reset();
for(int i=0;i<vRealF::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vRealD &l,std::vector<distribution> &dist){
RealD *pointer=(RealD *)&l;
dist[0].reset();
for(int i=0;i<vRealD::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
// Generic serial fill: treats `l` as a flat array of
// sizeof(sobj)/sizeof(scalar_type) scalars and fills each one from dist[0]
// using generator 0, then passes the object to BroadcastWorld with root 0
// (presumably so that every rank ends up with the same value - confirm
// against CartesianCommunicator).
template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){
  typedef typename sobj::scalar_type scalar_type;
  scalar_type *component = (scalar_type *) & l;
  scalar_type *const last = component + sizeof(sobj)/sizeof(scalar_type);
  distribution &d = dist[0];
  d.reset(); // clear any state cached by the distribution before drawing
  for( ; component!=last; ++component){
    fillScalar(*component,d,_generators[0]);
  }
  CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
};
// Serial fill of one single-precision complex number from dist[0] and
// generator 0; the result is then passed to BroadcastWorld with root 0.
template <class distribution> inline void fill(ComplexF &l,std::vector<distribution> &dist){
  distribution &d = dist[0];
  d.reset(); // clear any state cached by the distribution before drawing
  fillScalar(l,d,_generators[0]);
  CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
// Serial fill of one double-precision complex number from dist[0] and
// generator 0; the result is then passed to BroadcastWorld with root 0.
template <class distribution> inline void fill(ComplexD &l,std::vector<distribution> &dist){
  distribution &d = dist[0];
  d.reset(); // clear any state cached by the distribution before drawing
  fillScalar(l,d,_generators[0]);
  CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
// Serial fill of one single-precision real number from dist[0] and
// generator 0; the result is then passed to BroadcastWorld with root 0.
template <class distribution> inline void fill(RealF &l,std::vector<distribution> &dist){
  distribution &d = dist[0];
  d.reset(); // clear any state cached by the distribution before drawing
  fillScalar(l,d,_generators[0]);
  CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
// Serial fill of one double-precision real number from dist[0] and
// generator 0; the result is then passed to BroadcastWorld with root 0.
template <class distribution> inline void fill(RealD &l,std::vector<distribution> &dist){
  distribution &d = dist[0];
  d.reset(); // clear any state cached by the distribution before drawing
  fillScalar(l,d,_generators[0]);
  CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
// vector fill
// Serial fill of a SIMD vector of single-precision complex numbers.
// The vector is addressed as 2*Nsimd() RealF values, each drawn in turn from
// dist[0] with generator 0; the result is then passed to BroadcastWorld
// with root 0.
template <class distribution> inline void fill(vComplexF &l,std::vector<distribution> &dist){
  RealF *lanes = reinterpret_cast<RealF *>(&l);
  distribution &d = dist[0];
  d.reset(); // clear any state cached by the distribution before drawing
  const auto nreal = 2*vComplexF::Nsimd();
  for(int k=0;k<nreal;k++){
    fillScalar(lanes[k],d,_generators[0]);
  }
  CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
// Serial fill of a SIMD vector of double-precision complex numbers.
// The vector is addressed as 2*Nsimd() RealD values, each drawn in turn from
// dist[0] with generator 0; the result is then passed to BroadcastWorld
// with root 0.
template <class distribution> inline void fill(vComplexD &l,std::vector<distribution> &dist){
  RealD *lanes = reinterpret_cast<RealD *>(&l);
  distribution &d = dist[0];
  d.reset(); // clear any state cached by the distribution before drawing
  const auto nreal = 2*vComplexD::Nsimd();
  for(int k=0;k<nreal;k++){
    fillScalar(lanes[k],d,_generators[0]);
  }
  CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
// Serial fill of a SIMD vector of single-precision reals.
// The vector is addressed as Nsimd() RealF values, each drawn in turn from
// dist[0] with generator 0; the result is then passed to BroadcastWorld
// with root 0.
template <class distribution> inline void fill(vRealF &l,std::vector<distribution> &dist){
  RealF *lanes = reinterpret_cast<RealF *>(&l);
  distribution &d = dist[0];
  d.reset(); // clear any state cached by the distribution before drawing
  const auto nreal = vRealF::Nsimd();
  for(int k=0;k<nreal;k++){
    fillScalar(lanes[k],d,_generators[0]);
  }
  CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
// Serial fill of a SIMD vector of double-precision reals.
// The vector is addressed as Nsimd() RealD values, each drawn in turn from
// dist[0] with generator 0; the result is then passed to BroadcastWorld
// with root 0.
template <class distribution> inline void fill(vRealD &l,std::vector<distribution> &dist){
  RealD *lanes = reinterpret_cast<RealD *>(&l);
  distribution &d = dist[0];
  d.reset(); // clear any state cached by the distribution before drawing
  const auto nreal = vRealD::Nsimd();
  for(int k=0;k<nreal;k++){
    fillScalar(lanes[k],d,_generators[0]);
  }
  CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
// Seed generator 0 from a list of integers.
// The seed buffer is first passed to BroadcastWorld with root 0 (presumably
// so that all ranks seed from rank 0's values - confirm against
// CartesianCommunicator), then wrapped in a std::seed_seq and handed to Seed.
void SeedFixedIntegers(const std::vector<int> &seeds){
  // data() is well defined for an empty vector, whereas &seeds[0] is not.
  // The const is cast away because BroadcastWorld takes a writable buffer.
  void *buffer = const_cast<int *>(seeds.data());
  CartesianCommunicator::BroadcastWorld(0,buffer,sizeof(int)*seeds.size());
  std::seed_seq src(seeds.begin(),seeds.end());
  Seed(src,0);
}
// Seed generator 0 from a list of integers.
// The seed buffer is first passed to BroadcastWorld with root 0 (presumably
// so that all ranks seed from rank 0's values - confirm against
// CartesianCommunicator), then wrapped in a std::seed_seq and handed to Seed.
void SeedFixedIntegers(const std::vector<int> &seeds){
  // data() is well defined for an empty vector, whereas &seeds[0] is not.
  // The const is cast away because BroadcastWorld takes a writable buffer.
  void *buffer = const_cast<int *>(seeds.data());
  CartesianCommunicator::BroadcastWorld(0,buffer,sizeof(int)*seeds.size());
  std::seed_seq src(seeds.begin(),seeds.end());
  Seed(src,0);
}
void SeedUniqueString(const std::string &s){
std::vector<int> seeds;
@@ -330,65 +337,67 @@ namespace Grid {
std::cout << GridLogMessage << "Seed SHA256: " << sha.str() << std::endl;
SeedFixedIntegers(seeds);
}
};
};
class GridParallelRNG : public GridRNGbase {
class GridParallelRNG : public GridRNGbase {
private:
double _time_counter;
GridBase *_grid;
unsigned int _vol;
double _time_counter;
public:
GridBase *Grid(void) const { return _grid; }
int generator_idx(int os,int is) {
return is*_grid->oSites()+os;
}
public:
GridBase *_grid;
unsigned int _vol;
GridParallelRNG(GridBase *grid) : GridRNGbase() {
_grid = grid;
_vol =_grid->iSites()*_grid->oSites();
int generator_idx(int os,int is) {
return is*_grid->oSites()+os;
}
_generators.resize(_vol);
_uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
_uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
}
GridParallelRNG(GridBase *grid) : GridRNGbase() {
_grid = grid;
_vol =_grid->iSites()*_grid->oSites();
template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
_generators.resize(_vol);
_uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
_uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
}
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
double inner_time_counter = usecond();
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int multiplicity = RNGfillable_general(_grid, l.Grid()); // l has finer or same grid
int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l.Grid() too
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type);
double inner_time_counter = usecond();
auto l_v = l.View();
thread_for( ss, osites, {
ExtractBuffer<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid
int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l._grid too
int osites = _grid->oSites(); // guaranteed to be <= l._grid->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type);
int sm = multiplicity * ss + m; // Maps the generator site to the fine site
parallel_for(int ss=0;ss<osites;ss++){
std::vector<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
for (int si = 0; si < Nsimd; si++) {
int sm = multiplicity * ss + m; // Maps the generator site to the fine site
for (int si = 0; si < Nsimd; si++) {
int gdx = generator_idx(ss, si); // index of generator state
scalar_type *pointer = (scalar_type *)&buf[si];
dist[gdx].reset();
for (int idx = 0; idx < words; idx++)
fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
}
// merge into SIMD lanes, FIXME suboptimal implementation
merge(l._odata[sm], buf);
}
int gdx = generator_idx(ss, si); // index of generator state
scalar_type *pointer = (scalar_type *)&buf[si];
dist[gdx].reset();
for (int idx = 0; idx < words; idx++)
fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
}
// merge into SIMD lanes, FIXME suboptimal implementation
merge(l_v[sm], buf);
}
});
// });
_time_counter += usecond()- inner_time_counter;
};
_time_counter += usecond()- inner_time_counter;
}
void SeedUniqueString(const std::string &s){
std::vector<int> seeds;
@@ -398,31 +407,32 @@ namespace Grid {
std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
SeedFixedIntegers(seeds);
}
void SeedFixedIntegers(const std::vector<int> &seeds){
void SeedFixedIntegers(const std::vector<int> &seeds){
// Everyone generates the same seed_seq based on input seeds
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
// Everyone generates the same seed_seq based on input seeds
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
std::seed_seq source(seeds.begin(),seeds.end());
std::seed_seq source(seeds.begin(),seeds.end());
RngEngine master_engine(source);
RngEngine master_engine(source);
#ifdef RNG_FAST_DISCARD
////////////////////////////////////////////////
// Skip ahead through a single stream.
// Applicable to SITMO and other hash-based/crypto RNGs
// Should be applicable to Mersenne Twister, but the C++11
// MT implementation does not implement fast discard even though
// in principle this is possible
////////////////////////////////////////////////
// Everybody loops over global volume.
parallel_for(int gidx=0;gidx<_grid->_gsites;gidx++){
////////////////////////////////////////////////
// Skip ahead through a single stream.
// Applicable to SITMO and other hash-based/crypto RNGs
// Should be applicable to Mersenne Twister, but the C++11
// MT implementation does not implement fast discard even though
// in principle this is possible
////////////////////////////////////////////////
// Everybody loops over global volume.
thread_for( gidx, _grid->_gsites, {
// Where is it?
int rank,o_idx,i_idx;
std::vector<int> gcoor;
int rank;
int o_idx;
int i_idx;
Coordinate gcoor;
_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
@@ -432,85 +442,84 @@ namespace Grid {
_generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
}
}
});
#else
////////////////////////////////////////////////////////////////
// Machine and thread decomposition dependent seeding is efficient
// and maximally parallel; but NOT reproducible from machine to machine.
// Not ideal, but fastest way to reseed all nodes.
////////////////////////////////////////////////////////////////
{
// Obtain one Reseed per processor
int Nproc = _grid->ProcessorCount();
std::vector<RngEngine> seeders(Nproc);
int me= _grid->ThisRank();
for(int p=0;p<Nproc;p++){
seeders[p] = Reseed(master_engine);
}
master_engine = seeders[me];
////////////////////////////////////////////////////////////////
// Machine and thread decomposition dependent seeding is efficient
// and maximally parallel; but NOT reproducible from machine to machine.
// Not ideal, but fastest way to reseed all nodes.
////////////////////////////////////////////////////////////////
{
// Obtain one Reseed per processor
int Nproc = _grid->ProcessorCount();
std::vector<RngEngine> seeders(Nproc);
int me= _grid->ThisRank();
for(int p=0;p<Nproc;p++){
seeders[p] = Reseed(master_engine);
}
master_engine = seeders[me];
}
{
// Obtain one reseeded generator per thread
int Nthread = GridThread::GetThreads();
std::vector<RngEngine> seeders(Nthread);
for(int t=0;t<Nthread;t++){
seeders[t] = Reseed(master_engine);
}
{
// Obtain one reseeded generator per thread
int Nthread = GridThread::GetThreads();
std::vector<RngEngine> seeders(Nthread);
for(int t=0;t<Nthread;t++){
seeders[t] = Reseed(master_engine);
}
parallel_for(int t=0;t<Nthread;t++) {
// set up one per local site in threaded fashion
std::vector<uint32_t> newseeds;
std::uniform_int_distribution<uint32_t> uid;
for(int l=0;l<_grid->lSites();l++) {
if ( (l%Nthread)==t ) {
_generators[l] = Reseed(seeders[t],newseeds,uid);
}
thread_for( t, Nthread, {
// set up one per local site in threaded fashion
std::vector<uint32_t> newseeds;
std::uniform_int_distribution<uint32_t> uid;
for(int l=0;l<_grid->lSites();l++) {
if ( (l%Nthread)==t ) {
_generators[l] = Reseed(seeders[t],newseeds,uid);
}
}
}
});
}
#endif
}
// Print the accumulated time spent in fill().
// _time_counter is accumulated from usecond() differences, so dividing by
// 1e3 gives milliseconds.
// NOTE(review): no constructor visible in this file initialises
// _time_counter before fill() adds to it - confirm it starts at zero.
void Report(){
  const double fill_ms = _time_counter/1e3;
  std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< fill_ms << " ms" << std::endl;
}
////////////////////////////////////////////////////////////////////////
// Support for rigorous test of RNG's
// Return uniform random uint32_t from requested site generator
////////////////////////////////////////////////////////////////////////
uint32_t GlobalU01(int gsite){
uint32_t the_number;
// who
int rank,o_idx,i_idx;
Coordinate gcoor;
_grid->GlobalIndexToGlobalCoor(gsite,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
// draw
int l_idx=generator_idx(o_idx,i_idx);
if( rank == _grid->ThisRank() ){
the_number = _uid[l_idx](_generators[l_idx]);
}
void Report(){
std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl;
}
// share & return
_grid->Broadcast(rank,(void *)&the_number,sizeof(the_number));
return the_number;
}
};
////////////////////////////////////////////////////////////////////////
// Support for rigorous test of RNG's
// Return uniform random uint32_t from requested site generator
////////////////////////////////////////////////////////////////////////
uint32_t GlobalU01(int gsite){
template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._uniform); }
template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
uint32_t the_number;
// who
std::vector<int> gcoor;
int rank,o_idx,i_idx;
_grid->GlobalIndexToGlobalCoor(gsite,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
template <class sobj> inline void random(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._uniform ); }
template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
// draw
int l_idx=generator_idx(o_idx,i_idx);
if( rank == _grid->ThisRank() ){
the_number = _uid[l_idx](_generators[l_idx]);
}
// share & return
_grid->Broadcast(rank,(void *)&the_number,sizeof(the_number));
return the_number;
}
};
// Fill lattice l from the uniform real distributions held in rng._uniform.
template <class vobj>
inline void random(GridParallelRNG &rng,Lattice<vobj> &l)
{
  auto &dist = rng._uniform;
  rng.fill(l,dist);
}
// Fill lattice l from the normal distributions held in rng._gaussian.
template <class vobj>
inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l)
{
  auto &dist = rng._gaussian;
  rng.fill(l,dist);
}
// Fill lattice l from the discrete distributions held in rng._bernoulli.
template <class vobj>
inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l)
{
  auto &dist = rng._bernoulli;
  rng.fill(l,dist);
}
// Fill the single object l from the uniform real distribution in rng._uniform.
template <class sobj>
inline void random(GridSerialRNG &rng,sobj &l)
{
  auto &dist = rng._uniform;
  rng.fill(l,dist);
}
// Fill the single object l from the normal distribution in rng._gaussian.
template <class sobj>
inline void gaussian(GridSerialRNG &rng,sobj &l)
{
  auto &dist = rng._gaussian;
  rng.fill(l,dist);
}
// Fill the single object l from the discrete distribution in rng._bernoulli.
template <class sobj>
inline void bernoulli(GridSerialRNG &rng,sobj &l)
{
  auto &dist = rng._bernoulli;
  rng.fill(l,dist);
}
}
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,8 +23,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_TRACE_H
#define GRID_LATTICE_TRACE_H
@@ -32,36 +32,38 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
// Tracing, transposing, peeking, poking
///////////////////////////////////////////////
namespace Grid {
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace
////////////////////////////////////////////////////////////////////////////////////////////////////
// Site-wise trace: builds a lattice on lhs's grid whose element at every
// outer site is trace() of the corresponding element of lhs.
template<class vobj>
inline auto trace(const Lattice<vobj> &lhs)
-> Lattice<decltype(trace(lhs._odata[0]))>
{
  typedef decltype(trace(lhs._odata[0])) traced_obj;
  Lattice<traced_obj> result(lhs._grid);
  const auto nsite = lhs._grid->oSites();
  parallel_for(int site=0;site<nsite;site++){
    result._odata[site] = trace(lhs._odata[site]);
  }
  return result;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace
////////////////////////////////////////////////////////////////////////////////////////////////////
// Site-wise trace: builds a lattice on lhs's grid and, for each index of the
// input view, stores trace() of the input element through coalescedWrite.
template<class vobj>
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
{
  typedef decltype(trace(vobj())) traced_obj;
  Lattice<traced_obj> result(lhs.Grid());
  auto out_v = result.View();
  auto in_v  = lhs.View();
  accelerator_for( site, in_v.size(), vobj::Nsimd(), {
    coalescedWrite(out_v[site], trace(in_v(site)));
  });
  return result;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace Index level dependent operation
////////////////////////////////////////////////////////////////////////////////////////////////////
// Site-wise trace over the tensor level selected by Index: each outer site
// of the result holds traceIndex<Index>() of the matching element of lhs.
template<int Index,class vobj>
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
{
  typedef decltype(traceIndex<Index>(lhs._odata[0])) traced_obj;
  Lattice<traced_obj> result(lhs._grid);
  const auto nsite = lhs._grid->oSites();
  parallel_for(int site=0;site<nsite;site++){
    result._odata[site] = traceIndex<Index>(lhs._odata[site]);
  }
  return result;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace Index level dependent operation
////////////////////////////////////////////////////////////////////////////////////////////////////
// Site-wise trace over the tensor level selected by Index: for each index of
// the input view, stores traceIndex<Index>() of the input element through
// coalescedWrite.
template<int Index,class vobj>
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
{
  typedef decltype(traceIndex<Index>(vobj())) traced_obj;
  Lattice<traced_obj> result(lhs.Grid());
  auto out_v = result.View();
  auto in_v  = lhs.View();
  accelerator_for( site, in_v.size(), vobj::Nsimd(), {
    coalescedWrite(out_v[site], traceIndex<Index>(in_v(site)));
  });
  return result;
};
}
NAMESPACE_END(Grid);
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -24,8 +24,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_TRANSPOSE_H
#define GRID_LATTICE_TRANSPOSE_H
@@ -33,31 +33,36 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
// Transpose
///////////////////////////////////////////////
namespace Grid {
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////////////////////////////////////////
// Transpose
////////////////////////////////////////////////////////////////////////////////////////////////////
// Site-wise transpose: each outer site of the result holds transpose() of
// the matching element of lhs.
template<class vobj>
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
  Lattice<vobj> result(lhs._grid);
  const auto nsite = lhs._grid->oSites();
  parallel_for(int site=0;site<nsite;site++){
    result._odata[site] = transpose(lhs._odata[site]);
  }
  return result;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Transpose
////////////////////////////////////////////////////////////////////////////////////////////////////
// Site-wise transpose: for each index of the input view, stores transpose()
// of the input element through coalescedWrite.
template<class vobj>
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
  Lattice<vobj> result(lhs.Grid());
  auto out_v = result.View();
  auto in_v  = lhs.View();
  accelerator_for(site,in_v.size(),vobj::Nsimd(),{
    coalescedWrite(out_v[site], transpose(in_v(site)));
  });
  return result;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Index level dependent transpose
////////////////////////////////////////////////////////////////////////////////////////////////////
// Site-wise transpose on the tensor level selected by Index: each outer site
// of the result holds transposeIndex<Index>() of the matching element of lhs.
template<int Index,class vobj>
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
{
  typedef decltype(transposeIndex<Index>(lhs._odata[0])) transposed_obj;
  Lattice<transposed_obj> result(lhs._grid);
  const auto nsite = lhs._grid->oSites();
  parallel_for(int site=0;site<nsite;site++){
    result._odata[site] = transposeIndex<Index>(lhs._odata[site]);
  }
  return result;
};
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Index level dependent transpose
////////////////////////////////////////////////////////////////////////////////////////////////////
// Site-wise transpose on the tensor level selected by Index: for each index
// of the input view, stores transposeIndex<Index>() of the input element
// through coalescedWrite.
template<int Index,class vobj>
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
{
  typedef decltype(transposeIndex<Index>(vobj())) transposed_obj;
  Lattice<transposed_obj> result(lhs.Grid());
  auto out_v = result.View();
  auto in_v  = lhs.View();
  accelerator_for(site,in_v.size(),vobj::Nsimd(),{
    coalescedWrite(out_v[site] , transposeIndex<Index>(in_v(site)));
  });
  return result;
};
NAMESPACE_END(Grid);
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -26,59 +26,55 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_UNARY_H
#define GRID_LATTICE_UNARY_H
namespace Grid {
// Site-wise power: result[site] = pow(rhs[site], y).
// The result lives on rhs's grid and copies its checkerboard.
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs,RealD y){
  Lattice<obj> result(rhs._grid);
  result.checkerboard = rhs.checkerboard;
  conformable(result,rhs);
  const auto nsite = rhs._grid->oSites();
  parallel_for(int site=0;site<nsite;site++){
    result._odata[site]=pow(rhs._odata[site],y);
  }
  return result;
}
// Site-wise modulus: result[site] = mod(rhs[site], y).
// The result lives on rhs's grid and copies its checkerboard.
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs,Integer y){
  Lattice<obj> result(rhs._grid);
  result.checkerboard = rhs.checkerboard;
  conformable(result,rhs);
  const auto nsite = rhs._grid->oSites();
  parallel_for(int site=0;site<nsite;site++){
    result._odata[site]=mod(rhs._odata[site],y);
  }
  return result;
}
// Site-wise integer division: result[site] = div(rhs[site], y).
// The result lives on rhs's grid and copies its checkerboard.
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs,Integer y){
  Lattice<obj> result(rhs._grid);
  result.checkerboard = rhs.checkerboard;
  conformable(result,rhs);
  const auto nsite = rhs._grid->oSites();
  parallel_for(int site=0;site<nsite;site++){
    result._odata[site]=div(rhs._odata[site],y);
  }
  return result;
}
// Site-wise matrix exponential: result[site] = Exponentiate(rhs[site], alpha, Nexp).
// The result lives on rhs's grid and copies its checkerboard.
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
  Lattice<obj> result(rhs._grid);
  result.checkerboard = rhs.checkerboard;
  conformable(result,rhs);
  const auto nsite = rhs._grid->oSites();
  parallel_for(int site=0;site<nsite;site++){
    result._odata[site]=Exponentiate(rhs._odata[site],alpha, Nexp);
  }
  return result;
}
NAMESPACE_BEGIN(Grid);
// Site-wise power: out[site] = pow(in[site], y) on a new lattice built on
// rhs_i's grid. Unlike mod/div/expMat in this file, the loop is launched
// with a SIMD width argument of 1 and writes the element directly.
// NOTE(review): the checkerboard is assigned through the view of the result,
// not through ret_i itself - confirm it reaches the returned lattice.
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
  Lattice<obj> ret_i(rhs_i.Grid());
  auto in_v  = rhs_i.View();
  auto out_v = ret_i.View();
  out_v.Checkerboard() = in_v.Checkerboard();
  accelerator_for(site,in_v.size(),1,{
    out_v[site]=pow(in_v[site],y);
  });
  return ret_i;
}
// Site-by-site modulus of a lattice field by the integer y.
//
// The checkerboard tag is assigned on the returned container (ret_i) itself, and
// before any view is created, instead of on the temporary view object. That way
// the tag is carried by the returned lattice regardless of whether a view shares
// its metadata with, or merely copies it from, the container it was taken from.
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
  Lattice<obj> ret_i(rhs_i.Grid());
  ret_i.Checkerboard() = rhs_i.Checkerboard();
  auto rhs = rhs_i.View();
  auto ret = ret_i.View();
  // Entries are read with rhs(ss) and stored with coalescedWrite, as in the original.
  accelerator_for(ss,rhs.size(),obj::Nsimd(),{
    coalescedWrite(ret[ss],mod(rhs(ss),y));
  });
  return ret_i;
}
// Site-by-site division of a lattice field by the integer y.
//
// The checkerboard tag is assigned on the returned container (ret_i) itself, and
// before any view is created, instead of on the temporary view object. That way
// the tag is carried by the returned lattice regardless of whether a view shares
// its metadata with, or merely copies it from, the container it was taken from.
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
  Lattice<obj> ret_i(rhs_i.Grid());
  ret_i.Checkerboard() = rhs_i.Checkerboard();
  auto ret = ret_i.View();
  auto rhs = rhs_i.View();
  // Entries are read with rhs(ss) and stored with coalescedWrite, as in the original.
  accelerator_for(ss,rhs.size(),obj::Nsimd(),{
    coalescedWrite(ret[ss],div(rhs(ss),y));
  });
  return ret_i;
}
// Site-by-site matrix exponential: every entry of the result is
// Exponentiate(entry, alpha, Nexp), with Nexp defaulting to DEFAULT_MAT_EXP.
//
// The checkerboard tag is assigned on the returned container (ret_i) itself, and
// before any view is created, instead of on the temporary view object. That way
// the tag is carried by the returned lattice regardless of whether a view shares
// its metadata with, or merely copies it from, the container it was taken from.
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
  Lattice<obj> ret_i(rhs_i.Grid());
  ret_i.Checkerboard() = rhs_i.Checkerboard();
  auto rhs = rhs_i.View();
  auto ret = ret_i.View();
  // Entries are read with rhs(ss) and stored with coalescedWrite, as in the original.
  accelerator_for(ss,rhs.size(),obj::Nsimd(),{
    coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));
  });
  return ret_i;
}
NAMESPACE_END(Grid);
#endif

View File

@@ -28,27 +28,27 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/util/CompilerCompatible.h>
#include <cxxabi.h>
#include <memory>
namespace Grid {
NAMESPACE_BEGIN(Grid);
std::string demangle(const char* name) {
std::string demangle(const char* name) {
int status = -4; // some arbitrary value to eliminate the compiler warning
int status = -4; // some arbitrary value to eliminate the compiler warning
// enable c++11 by passing the flag -std=c++11 to g++
std::unique_ptr<char, void(*)(void*)> res {
abi::__cxa_demangle(name, NULL, NULL, &status),
std::free
};
// enable c++11 by passing the flag -std=c++11 to g++
std::unique_ptr<char, void(*)(void*)> res {
abi::__cxa_demangle(name, NULL, NULL, &status),
std::free
};
return (status==0) ? res.get() : name ;
}
return (status==0) ? res.get() : name ;
}
GridStopWatch Logger::GlobalStopWatch;
int Logger::timestamp;
@@ -109,8 +109,9 @@ void Grid_quiesce_nodes(void) {
}
void Grid_unquiesce_nodes(void) {
#ifdef GRID_COMMS_MPI
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
std::cout.clear();
#endif
}
}
NAMESPACE_END(Grid);

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -25,8 +25,8 @@
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#include <cstdlib>
#include <map>
@@ -37,13 +37,12 @@
#include <execinfo.h>
#endif
namespace Grid {
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dress the output; use std::chrono for time stamping via the StopWatch class
//////////////////////////////////////////////////////////////////////////////////////////////////
class Colours{
protected:
bool is_active;
@@ -57,15 +56,15 @@ public:
void Active(bool activate){
is_active=activate;
if (is_active){
colour["BLACK"] ="\033[30m";
colour["RED"] ="\033[31m";
colour["GREEN"] ="\033[32m";
colour["YELLOW"] ="\033[33m";
colour["BLUE"] ="\033[34m";
colour["PURPLE"] ="\033[35m";
colour["CYAN"] ="\033[36m";
colour["WHITE"] ="\033[37m";
colour["NORMAL"] ="\033[0;39m";
colour["BLACK"] ="\033[30m";
colour["RED"] ="\033[31m";
colour["GREEN"] ="\033[32m";
colour["YELLOW"] ="\033[33m";
colour["BLUE"] ="\033[34m";
colour["PURPLE"] ="\033[35m";
colour["CYAN"] ="\033[36m";
colour["WHITE"] ="\033[37m";
colour["NORMAL"] ="\033[0;39m";
} else {
colour["BLACK"] ="";
colour["RED"] ="";
@@ -102,14 +101,14 @@ public:
std::string colour() {return Painter.colour[COLOUR];}
Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col) : active(on),
name(nm),
topName(topNm),
Painter(col_class),
timing_mode(0),
COLOUR(col)
{
StopWatch = & GlobalStopWatch;
};
name(nm),
topName(topNm),
Painter(col_class),
timing_mode(0),
COLOUR(col)
{
StopWatch = & GlobalStopWatch;
};
void Active(int on) {active = on;};
int isActive(void) {return active;};
@@ -164,7 +163,7 @@ public:
class GridLogger: public Logger {
public:
GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL"):
Logger("Grid", on, nm, col_class, col_key){};
Logger("Grid", on, nm, col_class, col_key){};
};
void GridLogConfigure(std::vector<std::string> &logstreams);
@@ -181,39 +180,39 @@ extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
extern Colours GridLogColours;
std::string demangle(const char* name) ;
std::string demangle(const char* name) ;
#define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE];
// Dump a backtrace (via BACKTRACEFP) into the file "backtrace.<world rank>".
// The file name is formatted with a bounded snprintf into a buffer large enough
// for any int rank, and nothing is written or closed if the file cannot be opened.
#define BACKTRACEFILE() { \
    char bt_name[32]; \
    std::snprintf(bt_name,sizeof(bt_name),"backtrace.%d",CartesianCommunicator::RankWorld()); \
    std::FILE * fp = std::fopen(bt_name,"w"); \
    if ( fp != nullptr ) { \
      BACKTRACEFP(fp) \
      std::fclose(fp); \
    } \
  }
// Dump a backtrace (via BACKTRACEFP) into the file "backtrace.<world rank>".
// The file name is formatted with a bounded snprintf into a buffer large enough
// for any int rank, and nothing is written or closed if the file cannot be opened.
#define BACKTRACEFILE() { \
    char bt_name[32]; \
    std::snprintf(bt_name,sizeof(bt_name),"backtrace.%d",CartesianCommunicator::RankWorld()); \
    std::FILE * fp = std::fopen(bt_name,"w"); \
    if ( fp != nullptr ) { \
      BACKTRACEFP(fp) \
      std::fclose(fp); \
    } \
  }
#ifdef HAVE_EXECINFO_H
// Print up to _NBACKTRACE stack frames, one demangled symbol string per line, to
// the stdio stream fp, flushing after each line.
// backtrace_symbols() returns a malloc'ed array (or NULL on failure): the array is
// checked before use and released afterwards.
#define BACKTRACEFP(fp) { \
    int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE); \
    char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols); \
    if ( strings != nullptr ) { \
      for (int i = 0; i < symbols; i++){ \
        std::fprintf (fp,"BackTrace Strings: %d %s\n",i, demangle(strings[i]).c_str()); std::fflush(fp); \
      } \
      std::free(strings); \
    } \
  }
// Print up to _NBACKTRACE stack frames, one demangled symbol string per line, to
// the stdio stream fp, flushing after each line.
// backtrace_symbols() returns a malloc'ed array (or NULL on failure): the array is
// checked before use and released afterwards.
#define BACKTRACEFP(fp) { \
    int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE); \
    char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols); \
    if ( strings != nullptr ) { \
      for (int i = 0; i < symbols; i++){ \
        std::fprintf (fp,"BackTrace Strings: %d %s\n",i, demangle(strings[i]).c_str()); std::fflush(fp); \
      } \
      std::free(strings); \
    } \
  }
#else
// Fallback used when HAVE_EXECINFO_H is not defined: print the return addresses of
// the four innermost frames to the stdio stream fp, flushing after each line.
// __builtin_return_address yields a void*, so the address is printed with %p
// (the previous %lx did not match the argument type).
// NOTE(review): GCC documents non-zero levels of __builtin_return_address as
// potentially unreliable; levels 1-3 are kept as in the original.
#define BACKTRACEFP(fp) { \
    std::fprintf (fp,"BT %d %p\n",0, __builtin_return_address(0)); std::fflush(fp); \
    std::fprintf (fp,"BT %d %p\n",1, __builtin_return_address(1)); std::fflush(fp); \
    std::fprintf (fp,"BT %d %p\n",2, __builtin_return_address(2)); std::fflush(fp); \
    std::fprintf (fp,"BT %d %p\n",3, __builtin_return_address(3)); std::fflush(fp); \
  }
// Fallback used when HAVE_EXECINFO_H is not defined: print the return addresses of
// the four innermost frames to the stdio stream fp, flushing after each line.
// __builtin_return_address yields a void*, so the address is printed with %p
// (the previous %lx did not match the argument type).
// NOTE(review): GCC documents non-zero levels of __builtin_return_address as
// potentially unreliable; levels 1-3 are kept as in the original.
#define BACKTRACEFP(fp) { \
    std::fprintf (fp,"BT %d %p\n",0, __builtin_return_address(0)); std::fflush(fp); \
    std::fprintf (fp,"BT %d %p\n",1, __builtin_return_address(1)); std::fflush(fp); \
    std::fprintf (fp,"BT %d %p\n",2, __builtin_return_address(2)); std::fflush(fp); \
    std::fprintf (fp,"BT %d %p\n",3, __builtin_return_address(3)); std::fflush(fp); \
  }
#endif
// Convenience form of BACKTRACEFP that sends the trace to stdout.
#define BACKTRACE() BACKTRACEFP(stdout)
NAMESPACE_END(Grid);
}
#endif

View File

@@ -26,8 +26,7 @@
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BINARY_IO_H
#define GRID_BINARY_IO_H
#pragma once
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
#define USE_MPI_IO
@@ -42,8 +41,7 @@
#include <arpa/inet.h>
#include <algorithm>
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////
// Byte reversal garbage
@@ -91,7 +89,7 @@ class BinaryIO {
{
typedef typename vobj::scalar_object sobj;
GridBase *grid = lat._grid;
GridBase *grid = lat.Grid();
uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites);
@@ -111,21 +109,20 @@ class BinaryIO {
lsites = 1;
}
PARALLEL_REGION
thread_region
{
uint32_t nersc_csum_thr = 0;
PARALLEL_FOR_LOOP_INTERN
for (uint64_t local_site = 0; local_site < lsites; local_site++)
thread_for_in_region( local_site, lsites,
{
uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
for (uint64_t j = 0; j < size32; j++)
{
nersc_csum_thr = nersc_csum_thr + site_buf[j];
}
}
});
PARALLEL_CRITICAL
thread_critical
{
nersc_csum += nersc_csum_thr;
}
@@ -134,28 +131,25 @@ PARALLEL_CRITICAL
template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
{
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
int nd = grid->_ndimension;
uint64_t lsites =grid->lSites();
if (fbuf.size()==1) {
lsites=1;
}
std::vector<int> local_vol =grid->LocalDimensions();
std::vector<int> local_start =grid->LocalStarts();
std::vector<int> global_vol =grid->FullDimensions();
Coordinate local_vol =grid->LocalDimensions();
Coordinate local_start =grid->LocalStarts();
Coordinate global_vol =grid->FullDimensions();
PARALLEL_REGION
thread_region
{
std::vector<int> coor(nd);
Coordinate coor(nd);
uint32_t scidac_csuma_thr=0;
uint32_t scidac_csumb_thr=0;
uint32_t site_crc=0;
PARALLEL_FOR_LOOP_INTERN
for(uint64_t local_site=0;local_site<lsites;local_site++){
thread_for_in_region( local_site, lsites,
{
uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
@@ -182,9 +176,9 @@ PARALLEL_FOR_LOOP_INTERN
// std::cout << "Site "<<local_site << std::hex<<site_buf[0] <<site_buf[1]<<std::dec <<std::endl;
scidac_csuma_thr ^= site_crc<<gsite29 | site_crc>>(32-gsite29);
scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31);
}
});
PARALLEL_CRITICAL
thread_critical
{
scidac_csuma^= scidac_csuma_thr;
scidac_csumb^= scidac_csumb_thr;
@@ -202,9 +196,9 @@ PARALLEL_CRITICAL
{
uint32_t * f = (uint32_t *)file_object;
uint64_t count = bytes/sizeof(uint32_t);
parallel_for(uint64_t i=0;i<count;i++){
thread_for( i, count, {
f[i] = ntohl(f[i]);
}
});
}
// LE must Swap and switch to host
static inline void le32toh_v(void *file_object,uint64_t bytes)
@@ -212,13 +206,13 @@ PARALLEL_CRITICAL
uint32_t *fp = (uint32_t *)file_object;
uint64_t count = bytes/sizeof(uint32_t);
parallel_for(uint64_t i=0;i<count;i++){
thread_for(i,count,{
uint32_t f;
f = fp[i];
// got network order and the network to host
f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = ntohl(f);
}
});
}
// BE is same as network
@@ -226,9 +220,9 @@ PARALLEL_CRITICAL
{
uint64_t * f = (uint64_t *)file_object;
uint64_t count = bytes/sizeof(uint64_t);
parallel_for(uint64_t i=0;i<count;i++){
thread_for( i, count, {
f[i] = Grid_ntohll(f[i]);
}
});
}
// LE must swap and switch;
@@ -236,7 +230,7 @@ PARALLEL_CRITICAL
{
uint64_t *fp = (uint64_t *)file_object;
uint64_t count = bytes/sizeof(uint64_t);
parallel_for(uint64_t i=0;i<count;i++){
thread_for( i, count, {
uint64_t f,g;
f = fp[i];
// got network order and the network to host
@@ -245,7 +239,7 @@ PARALLEL_CRITICAL
f = f >> 32;
g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = Grid_ntohll(g);
}
});
}
/////////////////////////////////////////////////////////////////////////////
// Real action:
@@ -281,13 +275,13 @@ PARALLEL_CRITICAL
int nrank = grid->ProcessorCount();
int myrank = grid->ThisRank();
std::vector<int> psizes = grid->ProcessorGrid();
std::vector<int> pcoor = grid->ThisProcessorCoor();
std::vector<int> gLattice= grid->GlobalDimensions();
std::vector<int> lLattice= grid->LocalDimensions();
Coordinate psizes = grid->ProcessorGrid();
Coordinate pcoor = grid->ThisProcessorCoor();
Coordinate gLattice= grid->GlobalDimensions();
Coordinate lLattice= grid->LocalDimensions();
std::vector<int> lStart(ndim);
std::vector<int> gStart(ndim);
Coordinate lStart(ndim);
Coordinate gStart(ndim);
// Flatten the file
uint64_t lsites = grid->lSites();
@@ -546,7 +540,7 @@ PARALLEL_CRITICAL
typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid;
GridBase *grid = Umu.Grid();
uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites);
@@ -558,7 +552,7 @@ PARALLEL_CRITICAL
GridStopWatch timer;
timer.Start();
parallel_for(uint64_t x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);
thread_for(x,lsites, { munge(iodata[x], scalardata[x]); });
vectorizeFromLexOrdArray(scalardata,Umu);
grid->Barrier();
@@ -582,7 +576,7 @@ PARALLEL_CRITICAL
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid;
GridBase *grid = Umu.Grid();
uint64_t lsites = grid->lSites(), offsetCopy = offset;
int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry);
bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0);
@@ -596,7 +590,7 @@ PARALLEL_CRITICAL
GridStopWatch timer; timer.Start();
unvectorizeToLexOrdArray(scalardata,Umu);
parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
thread_for(x, lsites, { munge(scalardata[x],iodata[x]); });
grid->Barrier();
timer.Stop();
@@ -619,7 +613,7 @@ PARALLEL_CRITICAL
{
std::cout << GridLogMessage << "writeLatticeObject: read test checksum failure, re-writing (" << attemptsLeft << " attempt(s) remaining)" << std::endl;
offset = offsetCopy;
parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
thread_for(x,lsites, { munge(scalardata[x],iodata[x]); });
}
else
{
@@ -637,8 +631,8 @@ PARALLEL_CRITICAL
/////////////////////////////////////////////////////////////////////////////
// Read a RNG; use IOobject and lexico map to an array of state
//////////////////////////////////////////////////////////////////////////////////////
static inline void readRNG(GridSerialRNG &serial,
GridParallelRNG &parallel,
static inline void readRNG(GridSerialRNG &serial_rng,
GridParallelRNG &parallel_rng,
std::string file,
uint64_t offset,
uint32_t &nersc_csum,
@@ -652,7 +646,7 @@ PARALLEL_CRITICAL
std::string format = "IEEE32BIG";
GridBase *grid = parallel._grid;
GridBase *grid = parallel_rng.Grid();
uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites();
@@ -669,11 +663,11 @@ PARALLEL_CRITICAL
nersc_csum,scidac_csuma,scidac_csumb);
timer.Start();
parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
thread_for(lidx,lsites,{
std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
parallel.SetState(tmp,lidx);
}
parallel_rng.SetState(tmp,lidx);
});
timer.Stop();
iodata.resize(1);
@@ -683,7 +677,7 @@ PARALLEL_CRITICAL
{
std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[0].begin(),iodata[0].end(),tmp.begin());
serial.SetState(tmp,0);
serial_rng.SetState(tmp,0);
}
nersc_csum = nersc_csum + nersc_csum_tmp;
@@ -699,8 +693,8 @@ PARALLEL_CRITICAL
/////////////////////////////////////////////////////////////////////////////
// Write a RNG; lexico map to an array of state and use IOobject
//////////////////////////////////////////////////////////////////////////////////////
static inline void writeRNG(GridSerialRNG &serial,
GridParallelRNG &parallel,
static inline void writeRNG(GridSerialRNG &serial_rng,
GridParallelRNG &parallel_rng,
std::string file,
uint64_t offset,
uint32_t &nersc_csum,
@@ -712,7 +706,7 @@ PARALLEL_CRITICAL
const int RngStateCount = GridSerialRNG::RngStateCount;
typedef std::array<RngStateType,RngStateCount> RNGstate;
GridBase *grid = parallel._grid;
GridBase *grid = parallel_rng.Grid();
uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites();
@@ -727,11 +721,11 @@ PARALLEL_CRITICAL
timer.Start();
std::vector<RNGstate> iodata(lsites);
parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
thread_for(lidx,lsites,{
std::vector<RngStateType> tmp(RngStateCount);
parallel.GetState(tmp,lidx);
parallel_rng.GetState(tmp,lidx);
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
}
});
timer.Stop();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
@@ -739,7 +733,7 @@ PARALLEL_CRITICAL
iodata.resize(1);
{
std::vector<RngStateType> tmp(RngStateCount);
serial.GetState(tmp,0);
serial_rng.GetState(tmp,0);
std::copy(tmp.begin(),tmp.end(),iodata[0].begin());
}
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND,
@@ -756,5 +750,4 @@ PARALLEL_CRITICAL
}
};
}
#endif
NAMESPACE_END(Grid);

View File

@@ -24,8 +24,7 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ILDG_IO_H
#define GRID_ILDG_IO_H
#pragma once
#ifdef HAVE_LIME
#include <algorithm>
@@ -43,8 +42,7 @@ extern "C" {
#include "lime.h"
}
namespace Grid {
namespace QCD {
NAMESPACE_BEGIN(Grid);
#define GRID_FIELD_NORM "FieldNormMetaData"
#define GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) \
@@ -140,7 +138,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5);
/////////////////////////////////////
// Scidac Private File structure
/////////////////////////////////////
_scidacFile = scidacFile(field._grid);
_scidacFile = scidacFile(field.Grid());
/////////////////////////////////////
// Scidac Private Record structure
@@ -227,10 +225,10 @@ class GridLimeReader : public BinaryIO {
// std::cout << GridLogMessage<< " readLimeLatticeBinaryObject matches ! " <<std::endl;
uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
uint64_t PayloadSize = sizeof(sobj) * field.Grid()->_gsites;
// std::cout << "R sizeof(sobj)= " <<sizeof(sobj)<<std::endl;
// std::cout << "R Gsites " <<field._grid->_gsites<<std::endl;
// std::cout << "R Gsites " <<field.Grid()->_gsites<<std::endl;
// std::cout << "R Payload expected " <<PayloadSize<<std::endl;
// std::cout << "R file size " <<file_bytes <<std::endl;
@@ -406,7 +404,7 @@ class GridLimeWriter : public BinaryIO
////////////////////////////////////////////////////
// Write a generic lattice field and csum
// This routine is Collectively called by all nodes
// in communicator used by the field._grid
// in communicator used by the field.Grid()
////////////////////////////////////////////////////
template<class vobj>
void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
@@ -425,8 +423,8 @@ class GridLimeWriter : public BinaryIO
// v) Continue writing scidac record.
////////////////////////////////////////////////////////////////////
GridBase *grid = field._grid;
assert(boss_node == field._grid->IsBoss() );
GridBase *grid = field.Grid();
assert(boss_node == field.Grid()->IsBoss() );
FieldNormMetaData FNMD; FNMD.norm2 = norm2(field);
@@ -443,7 +441,7 @@ class GridLimeWriter : public BinaryIO
}
// std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl;
// std::cout << "W Gsites " <<field._grid->_gsites<<std::endl;
// std::cout << "W Gsites " <<field.Grid()->_gsites<<std::endl;
// std::cout << "W Payload expected " <<PayloadSize<<std::endl;
////////////////////////////////////////////////
@@ -515,7 +513,7 @@ class ScidacWriter : public GridLimeWriter {
void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
const unsigned int recordScientificPrec = 0)
{
GridBase * grid = field._grid;
GridBase * grid = field.Grid();
////////////////////////////////////////
// fill the Grid header
@@ -557,7 +555,7 @@ class ScidacReader : public GridLimeReader {
void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord)
{
typedef typename vobj::scalar_object sobj;
GridBase * grid = field._grid;
GridBase * grid = field.Grid();
////////////////////////////////////////
// fill the Grid header
@@ -624,7 +622,7 @@ class IldgWriter : public ScidacWriter {
template <class vsimd>
void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description)
{
GridBase * grid = Umu._grid;
GridBase * grid = Umu.Grid();
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj;
@@ -717,9 +715,9 @@ class IldgReader : public GridLimeReader {
typedef LorentzColourMatrixF fobj;
typedef LorentzColourMatrixD dobj;
GridBase *grid = Umu._grid;
GridBase *grid = Umu.Grid();
std::vector<int> dims = Umu._grid->FullDimensions();
Coordinate dims = Umu.Grid()->FullDimensions();
assert(dims.size()==4);
@@ -853,6 +851,7 @@ class IldgReader : public GridLimeReader {
// Minimally must find binary segment and checksum
// Since this is an ILDG reader require ILDG format
//////////////////////////////////////////////////////
assert(found_ildgLFN);
assert(found_ildgBinary);
assert(found_ildgFormat);
assert(found_scidacChecksum);
@@ -930,9 +929,9 @@ class IldgReader : public GridLimeReader {
}
};
}}
NAMESPACE_END(Grid);
//HAVE_LIME
#endif
#endif

View File

@@ -23,7 +23,7 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
/* END LEGAL */
#ifndef GRID_ILDGTYPES_IO_H
#define GRID_ILDGTYPES_IO_H
@@ -32,7 +32,7 @@ extern "C" { // for linkage
#include "lime.h"
}
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////
// Data representation of records that enter ILDG and SciDac formats
@@ -51,12 +51,12 @@ namespace Grid {
// Unused SCIDAC records names; could move to support this functionality
#define SCIDAC_SITELIST "scidac-sitelist"
////////////////////////////////////////////////////////////
const int GRID_IO_SINGLEFILE = 0; // hardcode lift from QIO compat
const int GRID_IO_MULTIFILE = 1; // hardcode lift from QIO compat
const int GRID_IO_FIELD = 0; // hardcode lift from QIO compat
const int GRID_IO_GLOBAL = 1; // hardcode lift from QIO compat
////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////
const int GRID_IO_SINGLEFILE = 0; // hardcode lift from QIO compat
const int GRID_IO_MULTIFILE = 1; // hardcode lift from QIO compat
const int GRID_IO_FIELD = 0; // hardcode lift from QIO compat
const int GRID_IO_GLOBAL = 1; // hardcode lift from QIO compat
////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////
// QIO uses mandatory "private" records fixed format
@@ -74,7 +74,7 @@ struct emptyUserRecord : Serializable {
// <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>
////////////////////////
struct scidacFile : Serializable {
public:
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(scidacFile,
double, version,
int, spacetime,
@@ -91,7 +91,7 @@ struct scidacFile : Serializable {
return dimensions;
}
void setDimensions(std::vector<int> dimensions) {
void setDimensions(Coordinate dimensions) {
char delimiter = ' ';
std::stringstream stream;
for(int i=0;i<dimensions.size();i++){
@@ -124,7 +124,7 @@ struct scidacFile : Serializable {
///////////////////////////////////////////////////////////////////////
struct scidacRecord : Serializable {
public:
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(scidacRecord,
double, version,
std::string, date,
@@ -160,7 +160,7 @@ public:
// USQCD info
////////////////////////
struct usqcdInfo : Serializable {
public:
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdInfo,
double, version,
double, plaq,
@@ -174,7 +174,7 @@ struct usqcdInfo : Serializable {
// Scidac Checksum
////////////////////////
struct scidacChecksum : Serializable {
public:
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum,
double, version,
std::string, suma,
@@ -201,7 +201,7 @@ struct scidacChecksum : Serializable {
// From http://www.physics.utah.edu/~detar/scidac/qio_2p3.pdf
////////////////////////////////////////////////////////////////////////////////////////
struct usqcdPropFile : Serializable {
public:
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropFile,
double, version,
std::string, type,
@@ -211,7 +211,7 @@ struct usqcdPropFile : Serializable {
};
};
struct usqcdSourceInfo : Serializable {
public:
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdSourceInfo,
double, version,
std::string, info);
@@ -220,7 +220,7 @@ struct usqcdSourceInfo : Serializable {
};
};
struct usqcdPropInfo : Serializable {
public:
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropInfo,
double, version,
int, spin,
@@ -232,6 +232,6 @@ struct usqcdPropInfo : Serializable {
};
#endif
}
NAMESPACE_END(Grid);
#endif
#endif

View File

@@ -36,23 +36,24 @@
#include <sys/utsname.h>
#include <pwd.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////
// Precision mapping
///////////////////////////////////////////////////////
template<class vobj> static std::string getFormatString (void)
{
std::string format;
typedef typename getPrecision<vobj>::real_scalar_type stype;
if ( sizeof(stype) == sizeof(float) ) {
format = std::string("IEEE32BIG");
}
if ( sizeof(stype) == sizeof(double) ) {
format = std::string("IEEE64BIG");
}
return format;
///////////////////////////////////////////////////////
// Precision mapping
///////////////////////////////////////////////////////
// Map the real scalar type underlying vobj to a format tag:
//   same size as double -> "IEEE64BIG"
//   same size as float  -> "IEEE32BIG"
//   anything else       -> empty string
template<class vobj> static std::string getFormatString (void)
{
  typedef typename getPrecision<vobj>::real_scalar_type stype;
  const size_t width = sizeof(stype);
  // double is tested first so that it takes precedence, exactly as the later
  // assignment did in the two-step form.
  if ( width == sizeof(double) ) return std::string("IEEE64BIG");
  if ( width == sizeof(float)  ) return std::string("IEEE32BIG");
  return std::string();
};
////////////////////////////////////////////////////////////////////////////////
// header specification/interpretation
////////////////////////////////////////////////////////////////////////////////
@@ -93,146 +94,145 @@ namespace Grid {
link_trace(0.), plaquette(0.), checksum(0),
scidac_checksuma(0), scidac_checksumb(0), sequence_number(0)
{}
};
};
namespace QCD {
// PB: "using namespace" is disabled here - this is a header, and it would force namespace
// visibility on all including files
//using namespace Grid;
using namespace Grid;
//////////////////////////////////////////////////////////////////////
// Bit and Physical Checksumming and QA of data
//////////////////////////////////////////////////////////////////////
// Fill the geometry fields of a metadata header from a grid:
// nd and dimension[] come from the grid's _ndimension and _fdimensions[],
// every boundary entry is set to "PERIODIC", and data_start is reset to 0.
inline void GridMetaData(GridBase *grid,FieldMetaData &header)
{
  const int ndim = grid->_ndimension;
  header.nd         = ndim;
  header.data_start = 0;
  header.dimension.resize(ndim);
  header.boundary.resize(ndim);
  for(int mu=0;mu<ndim;mu++) {
    header.dimension[mu] = grid->_fdimensions[mu];
    header.boundary[mu]  = std::string("PERIODIC");
  }
}
inline void MachineCharacteristics(FieldMetaData &header)
{
// Who
struct passwd *pw = getpwuid (getuid());
if (pw) header.creator = std::string(pw->pw_name);
//////////////////////////////////////////////////////////////////////
// Bit and Physical Checksumming and QA of data
//////////////////////////////////////////////////////////////////////
// Fill the geometry fields of a metadata header from a grid:
// nd and dimension[] come from the grid's _ndimension and _fdimensions[],
// every boundary entry is set to "PERIODIC", and data_start is reset to 0.
inline void GridMetaData(GridBase *grid,FieldMetaData &header)
{
  const int ndim = grid->_ndimension;
  header.nd         = ndim;
  header.data_start = 0;
  header.dimension.resize(ndim);
  header.boundary.resize(ndim);
  for(int mu=0;mu<ndim;mu++) {
    header.dimension[mu] = grid->_fdimensions[mu];
    header.boundary[mu]  = std::string("PERIODIC");
  }
}
// When
std::time_t t = std::time(nullptr);
std::tm tm_ = *std::localtime(&t);
std::ostringstream oss;
// oss << std::put_time(&tm_, "%c %Z");
header.creation_date = oss.str();
header.archive_date = header.creation_date;
inline void MachineCharacteristics(FieldMetaData &header)
{
// Who
struct passwd *pw = getpwuid (getuid());
if (pw) header.creator = std::string(pw->pw_name);
// When
std::time_t t = std::time(nullptr);
std::tm tm_ = *std::localtime(&t);
std::ostringstream oss;
// oss << std::put_time(&tm_, "%c %Z");
header.creation_date = oss.str();
header.archive_date = header.creation_date;
// What
struct utsname name; uname(&name);
header.creator_hardware = std::string(name.nodename)+"-";
header.creator_hardware+= std::string(name.machine)+"-";
header.creator_hardware+= std::string(name.sysname)+"-";
header.creator_hardware+= std::string(name.release);
}
// What
struct utsname name; uname(&name);
header.creator_hardware = std::string(name.nodename)+"-";
header.creator_hardware+= std::string(name.machine)+"-";
header.creator_hardware+= std::string(name.sysname)+"-";
header.creator_hardware+= std::string(name.release);
}
#define dump_meta_data(field, s) \
s << "BEGIN_HEADER" << std::endl; \
s << "HDR_VERSION = " << field.hdr_version << std::endl; \
s << "DATATYPE = " << field.data_type << std::endl; \
s << "STORAGE_FORMAT = " << field.storage_format << std::endl; \
for(int i=0;i<4;i++){ \
s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \
} \
s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \
s << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl; \
for(int i=0;i<4;i++){ \
s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl; \
} \
s << "BEGIN_HEADER" << std::endl; \
s << "HDR_VERSION = " << field.hdr_version << std::endl; \
s << "DATATYPE = " << field.data_type << std::endl; \
s << "STORAGE_FORMAT = " << field.storage_format << std::endl; \
for(int i=0;i<4;i++){ \
s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \
} \
s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \
s << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl; \
for(int i=0;i<4;i++){ \
s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl; \
} \
\
s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \
s << "SCIDAC_CHECKSUMA = "<< std::hex << std::setw(10) << field.scidac_checksuma << std::dec<<std::endl; \
s << "SCIDAC_CHECKSUMB = "<< std::hex << std::setw(10) << field.scidac_checksumb << std::dec<<std::endl; \
s << "ENSEMBLE_ID = " << field.ensemble_id << std::endl; \
s << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl; \
s << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl; \
s << "CREATOR = " << field.creator << std::endl; \
s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl; \
s << "CREATION_DATE = " << field.creation_date << std::endl; \
s << "ARCHIVE_DATE = " << field.archive_date << std::endl; \
s << "FLOATING_POINT = " << field.floating_point << std::endl; \
s << "END_HEADER" << std::endl;
s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \
s << "SCIDAC_CHECKSUMA = "<< std::hex << std::setw(10) << field.scidac_checksuma << std::dec<<std::endl; \
s << "SCIDAC_CHECKSUMB = "<< std::hex << std::setw(10) << field.scidac_checksumb << std::dec<<std::endl; \
s << "ENSEMBLE_ID = " << field.ensemble_id << std::endl; \
s << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl; \
s << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl; \
s << "CREATOR = " << field.creator << std::endl; \
s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl; \
s << "CREATION_DATE = " << field.creation_date << std::endl; \
s << "ARCHIVE_DATE = " << field.archive_date << std::endl; \
s << "FLOATING_POINT = " << field.floating_point << std::endl; \
s << "END_HEADER" << std::endl;
template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header)
{
GridBase *grid = field._grid;
GridBase *grid = field.Grid();
std::string format = getFormatString<vobj>();
header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header);
MachineCharacteristics(header);
}
// Store the link trace and average plaquette of a vLorentzColourMatrixF gauge
// field in the header, using the PeriodicGimplF Wilson-loop routines.
inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
{
  typedef Grid::QCD::WilsonLoops<PeriodicGimplF> Loops;
  header.link_trace = Loops::linkTrace(data);
  header.plaquette  = Loops::avgPlaquette(data);
}
// Store the link trace and average plaquette of a vLorentzColourMatrixD gauge
// field in the header, using the PeriodicGimplD Wilson-loop routines.
inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{
  typedef Grid::QCD::WilsonLoops<PeriodicGimplD> Loops;
  header.link_trace = Loops::linkTrace(data);
  header.plaquette  = Loops::avgPlaquette(data);
}
template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
{
header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header);
MachineCharacteristics(header);
}
inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
{
// How to convert data precision etc...
header.link_trace=WilsonLoops<PeriodicGimplF>::linkTrace(data);
header.plaquette =WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
}
inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{
// How to convert data precision etc...
header.link_trace=WilsonLoops<PeriodicGimplD>::linkTrace(data);
header.plaquette =WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
}
// Gauge-field specialisation (single precision).
// Same steps as the generic PrepareMetaData, plus GaugeStatistics to record the
// link trace and plaquette of the field in the header.
template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
{
  GridBase *grid = field.Grid();
  std::string format = getFormatString<vLorentzColourMatrixF>();
  header.floating_point = format;
  header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
  GridMetaData(grid,header);
  GaugeStatistics(field,header);
  MachineCharacteristics(header);
}
// Gauge-field specialisation (double precision).
template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
{
  GridBase *grid = field.Grid();
  std::string format = getFormatString<vLorentzColourMatrixD>();
  header.floating_point = format;
  header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
  GridMetaData(grid,header);
  GaugeStatistics(field,header);
  MachineCharacteristics(header);
}
//////////////////////////////////////////////////////////////////////
// Utilities ; these are QCD aware
//////////////////////////////////////////////////////////////////////
// Rebuild the third colour row of every link from the first two rows.
// For each direction mu, row 2 is set to adj() of the cross product of row 0
// and row 1 (component pattern noted per line); rows 0 and 1 are only read.
// NOTE(review): adj() on a single element is presumably complex conjugation - confirm.
inline void reconstruct3(LorentzColourMatrix & cm)
{
  const int x=0;
  const int y=1;
  const int z=2;
  for(int mu=0;mu<Nd;mu++){
    cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
    cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
    cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
  }
}
////////////////////////////////////////////////////////////////////////////////
// Some data types for intermediate storage
////////////////////////////////////////////////////////////////////////////////
// Two-row storage of a gauge link: Nd directions x 2 rows x Nc colour entries.
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;

// Scalar instantiations in default, single and double precision.
typedef iLorentzColour2x3<Complex> LorentzColour2x3;
typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;
/////////////////////////////////////////////////////////////////////////////////
// Simple classes for precision conversion
@@ -276,56 +276,55 @@ struct BinarySimpleMunger {
};
// Converts a full 3x3 file object (fobj) into the in-memory scalar object (sobj)
// by assigning every (mu; i, j) element; any precision change happens in the
// element assignment itself.
template<class fobj,class sobj>
struct GaugeSimpleMunger{
  void operator()(fobj &in, sobj &out) {
    for (int mu = 0; mu < Nd; mu++) {
      for (int i = 0; i < Nc; i++) {
        for (int j = 0; j < Nc; j++) {
          out(mu)()(i, j) = in(mu)()(i, j);
        }
      }
    }
  }
};
// Inverse of GaugeSimpleMunger: copies every (mu; i, j) element of the in-memory
// scalar object (sobj) into the full 3x3 file object (fobj).
template <class fobj, class sobj>
struct GaugeSimpleUnmunger {
  void operator()(sobj &in, fobj &out) {
    for (int mu = 0; mu < Nd; mu++) {
      for (int i = 0; i < Nc; i++) {
        for (int j = 0; j < Nc; j++) {
          out(mu)()(i, j) = in(mu)()(i, j);
        }
      }
    }
  }
};
// Two-row file object -> full matrix: copies rows 0 and 1 of every direction
// into the output, then calls reconstruct3 to fill row 2.
template<class fobj,class sobj>
struct Gauge3x2munger{
  void operator() (fobj &in,sobj &out){
    for(int mu=0;mu<Nd;mu++){
      for(int i=0;i<2;i++){
        for(int j=0;j<3;j++){
          out(mu)()(i,j) = in(mu)(i)(j);
        }
      }
    }
    reconstruct3(out);
  }
};

// Full matrix -> two-row file object: keeps rows 0 and 1 of every direction and
// drops row 2.
template<class fobj,class sobj>
struct Gauge3x2unmunger{
  void operator() (sobj &in,fobj &out){
    for(int mu=0;mu<Nd;mu++){
      for(int i=0;i<2;i++){
        for(int j=0;j<3;j++){
          out(mu)(i)(j) = in(mu)()(i,j);
        }
      }
    }
  }
};
NAMESPACE_END(Grid);
}

View File

@@ -30,334 +30,330 @@
#ifndef GRID_NERSC_IO_H
#define GRID_NERSC_IO_H
namespace Grid {
namespace QCD {
NAMESPACE_BEGIN(Grid);
using namespace Grid;
using namespace Grid;
////////////////////////////////////////////////////////////////////////////////
// Write and read from fstream; compute header offset for payload
////////////////////////////////////////////////////////////////////////////////
class NerscIO : public BinaryIO {
public:
////////////////////////////////////////////////////////////////////////////////
// Write and read from fstream; comput header offset for payload
////////////////////////////////////////////////////////////////////////////////
class NerscIO : public BinaryIO {
public:
static inline void truncate(std::string file){
std::ofstream fout(file,std::ios::out);
}
static inline void truncate(std::string file){
std::ofstream fout(file,std::ios::out);
}
static inline unsigned int writeHeader(FieldMetaData &field,std::string file)
{
std::ofstream fout(file,std::ios::out|std::ios::in);
fout.seekp(0,std::ios::beg);
dump_meta_data(field, fout);
field.data_start = fout.tellp();
return field.data_start;
}
static inline unsigned int writeHeader(FieldMetaData &field,std::string file)
{
std::ofstream fout(file,std::ios::out|std::ios::in);
fout.seekp(0,std::ios::beg);
dump_meta_data(field, fout);
field.data_start = fout.tellp();
return field.data_start;
}
// for the header-reader
static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field)
{
uint64_t offset=0;
std::map<std::string,std::string> header;
std::string line;
// for the header-reader
static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field)
{
std::map<std::string,std::string> header;
std::string line;
//////////////////////////////////////////////////
// read the header
//////////////////////////////////////////////////
std::ifstream fin(file);
//////////////////////////////////////////////////
// read the header
//////////////////////////////////////////////////
std::ifstream fin(file);
getline(fin,line); // read one line and insist is
getline(fin,line); // read one line and insist is
removeWhitespace(line);
std::cout << GridLogMessage << "* " << line << std::endl;
removeWhitespace(line);
std::cout << GridLogMessage << "* " << line << std::endl;
assert(line==std::string("BEGIN_HEADER"));
assert(line==std::string("BEGIN_HEADER"));
do {
do {
getline(fin,line); // read one line
std::cout << GridLogMessage << "* "<<line<< std::endl;
int eq = line.find("=");
if(eq >0) {
std::string key=line.substr(0,eq);
std::string val=line.substr(eq+1);
removeWhitespace(key);
removeWhitespace(val);
std::string key=line.substr(0,eq);
std::string val=line.substr(eq+1);
removeWhitespace(key);
removeWhitespace(val);
header[key] = val;
}
header[key] = val;
}
} while( line.find("END_HEADER") == std::string::npos );
field.data_start = fin.tellg();
field.data_start = fin.tellg();
//////////////////////////////////////////////////
// chomp the values
//////////////////////////////////////////////////
field.hdr_version = header["HDR_VERSION"];
field.data_type = header["DATATYPE"];
field.storage_format = header["STORAGE_FORMAT"];
//////////////////////////////////////////////////
// chomp the values
//////////////////////////////////////////////////
field.hdr_version = header["HDR_VERSION"];
field.data_type = header["DATATYPE"];
field.storage_format = header["STORAGE_FORMAT"];
field.dimension[0] = std::stol(header["DIMENSION_1"]);
field.dimension[1] = std::stol(header["DIMENSION_2"]);
field.dimension[2] = std::stol(header["DIMENSION_3"]);
field.dimension[3] = std::stol(header["DIMENSION_4"]);
field.dimension[0] = std::stol(header["DIMENSION_1"]);
field.dimension[1] = std::stol(header["DIMENSION_2"]);
field.dimension[2] = std::stol(header["DIMENSION_3"]);
field.dimension[3] = std::stol(header["DIMENSION_4"]);
assert(grid->_ndimension == 4);
for(int d=0;d<4;d++){
assert(grid->_ndimension == 4);
for(int d=0;d<4;d++){
assert(grid->_fdimensions[d]==field.dimension[d]);
}
field.link_trace = std::stod(header["LINK_TRACE"]);
field.plaquette = std::stod(header["PLAQUETTE"]);
field.link_trace = std::stod(header["LINK_TRACE"]);
field.plaquette = std::stod(header["PLAQUETTE"]);
field.boundary[0] = header["BOUNDARY_1"];
field.boundary[1] = header["BOUNDARY_2"];
field.boundary[2] = header["BOUNDARY_3"];
field.boundary[3] = header["BOUNDARY_4"];
field.boundary[0] = header["BOUNDARY_1"];
field.boundary[1] = header["BOUNDARY_2"];
field.boundary[2] = header["BOUNDARY_3"];
field.boundary[3] = header["BOUNDARY_4"];
field.checksum = std::stoul(header["CHECKSUM"],0,16);
field.ensemble_id = header["ENSEMBLE_ID"];
field.ensemble_label = header["ENSEMBLE_LABEL"];
field.sequence_number = std::stol(header["SEQUENCE_NUMBER"]);
field.creator = header["CREATOR"];
field.creator_hardware = header["CREATOR_HARDWARE"];
field.creation_date = header["CREATION_DATE"];
field.archive_date = header["ARCHIVE_DATE"];
field.floating_point = header["FLOATING_POINT"];
field.checksum = std::stoul(header["CHECKSUM"],0,16);
field.ensemble_id = header["ENSEMBLE_ID"];
field.ensemble_label = header["ENSEMBLE_LABEL"];
field.sequence_number = std::stol(header["SEQUENCE_NUMBER"]);
field.creator = header["CREATOR"];
field.creator_hardware = header["CREATOR_HARDWARE"];
field.creation_date = header["CREATION_DATE"];
field.archive_date = header["ARCHIVE_DATE"];
field.floating_point = header["FLOATING_POINT"];
return field.data_start;
return field.data_start;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Now the meat: the object readers
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vsimd>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
FieldMetaData& header,
std::string file)
{
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
GridBase *grid = Umu.Grid();
uint64_t offset = readHeader(file,Umu.Grid(),header);
FieldMetaData clone(header);
std::string format(header.floating_point);
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64"));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else {
assert(0);
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Now the meat: the object readers
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
GaugeStatistics(Umu,clone);
template<class vsimd>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
FieldMetaData& header,
std::string file)
{
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
<<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" plaquette "<<clone.plaquette
<<" header "<<header.plaquette<<std::endl;
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace
<<" header "<<header.link_trace<<std::endl;
GridBase *grid = Umu._grid;
uint64_t offset = readHeader(file,Umu._grid,header);
FieldMetaData clone(header);
std::string format(header.floating_point);
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64"));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else {
assert(0);
}
GaugeStatistics(Umu,clone);
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
<<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" plaquette "<<clone.plaquette
<<" header "<<header.plaquette<<std::endl;
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace
<<" header "<<header.link_trace<<std::endl;
if ( fabs(clone.plaquette -header.plaquette ) >= 1.0e-5 ) {
std::cout << " Plaquette mismatch "<<std::endl;
std::cout << Umu[0]<<std::endl;
std::cout << Umu[1]<<std::endl;
}
if ( nersc_csum != header.checksum ) {
std::cerr << " checksum mismatch " << std::endl;
std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
exit(0);
}
assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
assert(nersc_csum == header.checksum );
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
if ( fabs(clone.plaquette -header.plaquette ) >= 1.0e-5 ) {
std::cout << " Plaquette mismatch "<<std::endl;
}
if ( nersc_csum != header.checksum ) {
std::cerr << " checksum mismatch " << std::endl;
std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
exit(0);
}
assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
assert(nersc_csum == header.checksum );
template<class vsimd>
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
std::string file,
int two_row,
int bits32)
{
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
}
typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj;
template<class vsimd>
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
std::string file,
int two_row,
int bits32)
{
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
FieldMetaData header;
///////////////////////////////////////////
// Following should become arguments
///////////////////////////////////////////
header.sequence_number = 1;
header.ensemble_id = "UKQCD";
header.ensemble_label = "DWF";
typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj;
typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D;
FieldMetaData header;
///////////////////////////////////////////
// Following should become arguments
///////////////////////////////////////////
header.sequence_number = 1;
header.ensemble_id = "UKQCD";
header.ensemble_label = "DWF";
GridBase *grid = Umu._grid;
typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D;
GridMetaData(grid,header);
assert(header.nd==4);
GaugeStatistics(Umu,header);
MachineCharacteristics(header);
GridBase *grid = Umu.Grid();
GridMetaData(grid,header);
assert(header.nd==4);
GaugeStatistics(Umu,header);
MachineCharacteristics(header);
uint64_t offset;
// Sod it -- always write 3x3 double
header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3x3");
GaugeSimpleUnmunger<fobj3D,sobj> munge;
// Sod it -- always write 3x3 double
header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3x3");
GaugeSimpleUnmunger<fobj3D,sobj> munge;
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
offset = writeHeader(header,file);
}
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum;
if ( grid->IsBoss() ) {
writeHeader(header,file);
writeHeader(header,file);
}
std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
<<std::hex<<header.checksum
<<std::dec<<" plaq "<< header.plaquette <<std::endl;
std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
<<std::hex<<header.checksum
<<std::dec<<" plaq "<< header.plaquette <<std::endl;
}
///////////////////////////////
// RNG state
///////////////////////////////
static inline void writeRNGState(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file)
{
typedef typename GridParallelRNG::RngStateType RngStateType;
}
///////////////////////////////
// RNG state
///////////////////////////////
static inline void writeRNGState(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file)
{
typedef typename GridParallelRNG::RngStateType RngStateType;
// Following should become arguments
FieldMetaData header;
header.sequence_number = 1;
header.ensemble_id = "UKQCD";
header.ensemble_label = "DWF";
// Following should become arguments
FieldMetaData header;
header.sequence_number = 1;
header.ensemble_id = "UKQCD";
header.ensemble_label = "DWF";
GridBase *grid = parallel._grid;
GridBase *grid = parallel.Grid();
GridMetaData(grid,header);
assert(header.nd==4);
header.link_trace=0.0;
header.plaquette=0.0;
MachineCharacteristics(header);
GridMetaData(grid,header);
assert(header.nd==4);
header.link_trace=0.0;
header.plaquette=0.0;
MachineCharacteristics(header);
uint64_t offset;
#ifdef RNG_RANLUX
header.floating_point = std::string("UINT64");
header.data_type = std::string("RANLUX48");
header.floating_point = std::string("UINT64");
header.data_type = std::string("RANLUX48");
#endif
#ifdef RNG_MT19937
header.floating_point = std::string("UINT32");
header.data_type = std::string("MT19937");
header.floating_point = std::string("UINT32");
header.data_type = std::string("MT19937");
#endif
#ifdef RNG_SITMO
header.floating_point = std::string("UINT64");
header.data_type = std::string("SITMO");
header.floating_point = std::string("UINT64");
header.data_type = std::string("SITMO");
#endif
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
truncate(file);
offset = writeHeader(header,file);
}
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum;
if ( grid->IsBoss() ) {
offset = writeHeader(header,file);
offset = writeHeader(header,file);
}
std::cout<<GridLogMessage
<<"Written NERSC RNG STATE "<<file<< " checksum "
<<std::hex<<header.checksum
<<std::dec<<std::endl;
std::cout<<GridLogMessage
<<"Written NERSC RNG STATE "<<file<< " checksum "
<<std::hex<<header.checksum
<<std::dec<<std::endl;
}
}
static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel,FieldMetaData& header,std::string file)
{
typedef typename GridParallelRNG::RngStateType RngStateType;
static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel,FieldMetaData& header,std::string file)
{
typedef typename GridParallelRNG::RngStateType RngStateType;
GridBase *grid = parallel._grid;
GridBase *grid = parallel.Grid();
uint64_t offset = readHeader(file,grid,header);
FieldMetaData clone(header);
FieldMetaData clone(header);
std::string format(header.floating_point);
std::string data_type(header.data_type);
std::string format(header.floating_point);
std::string data_type(header.data_type);
#ifdef RNG_RANLUX
assert(format == std::string("UINT64"));
assert(data_type == std::string("RANLUX48"));
assert(format == std::string("UINT64"));
assert(data_type == std::string("RANLUX48"));
#endif
#ifdef RNG_MT19937
assert(format == std::string("UINT32"));
assert(data_type == std::string("MT19937"));
assert(format == std::string("UINT32"));
assert(data_type == std::string("MT19937"));
#endif
#ifdef RNG_SITMO
assert(format == std::string("UINT64"));
assert(data_type == std::string("SITMO"));
assert(format == std::string("UINT64"));
assert(data_type == std::string("SITMO"));
#endif
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
if ( nersc_csum != header.checksum ) {
std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
exit(0);
}
assert(nersc_csum == header.checksum );
if ( nersc_csum != header.checksum ) {
std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
exit(0);
}
assert(nersc_csum == header.checksum );
std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
}
std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
}
};
};
NAMESPACE_END(Grid);
}}
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -23,13 +23,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/perfmon/PerfCount.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
#define RawConfig(A,B) (A<<8|B)
@@ -39,16 +39,16 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." , CACHE_REFERENCES},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
// 4
// 4
#ifdef KNL
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
{ PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
// 11
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
{ PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
// 11
#else
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS....",INSTRUCTIONS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......",L1D_READ_ACCESS},
@@ -57,19 +57,20 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS..",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
// 11
// 11
#endif
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS.......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS.....",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS....",L1D_READ_ACCESS},
//15
//15
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS...",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS.",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......",INSTRUCTIONS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS....",INSTRUCTIONS}
//19
//19
// { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
#endif
};
}
NAMESPACE_END(Grid);

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -25,8 +25,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PERFCOUNT_H
#define GRID_PERFCOUNT_H
@@ -44,10 +44,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <sys/syscall.h>
#endif
#ifdef __x86_64__
#ifdef GRID_NVCC
accelerator_inline uint64_t __rdtsc(void) { return 0; }
accelerator_inline uint64_t __rdpmc(int ) { return 0; }
#else
#include <x86intrin.h>
#endif
#endif
namespace Grid {
NAMESPACE_BEGIN(Grid);
#ifdef __linux__
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
@@ -84,20 +89,18 @@ inline uint64_t cyclecount(void){
// Platform-selected cycle counter.
//  - __bgq__    : reads special purpose register 0x10C via mfspr.
//  - __x86_64__ : returns __rdtsc().
//  - otherwise  : no counter available, always returns 0.
#ifdef __bgq__
inline uint64_t cyclecount(void){
  uint64_t tmp;
  asm volatile ("mfspr %0,0x10C" : "=&r" (tmp) );
  return tmp;
}
#elif defined __x86_64__
inline uint64_t cyclecount(void){
  return __rdtsc();
  // unsigned int dummy;
  // return __rdtscp(&dummy);
}
#else
inline uint64_t cyclecount(void){
  return 0;
}
#endif
@@ -212,7 +215,7 @@ public:
::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
ign=::read(fd, &count, sizeof(long long));
ign+=::read(cyclefd, &cycles, sizeof(long long));
assert(ign=2*sizeof(long long));
assert(ign==2*sizeof(long long));
}
elapsed = cyclecount() - begin;
#else
@@ -225,8 +228,8 @@ public:
int N = PerformanceCounterConfigs[PCT].normalisation;
const char * sn = PerformanceCounterConfigs[N].name ;
const char * sc = PerformanceCounterConfigs[PCT].name;
std::printf("tsc = %llu %s = %llu %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles,
sc, count, sc,sn, (double)count/(double)cycles);
std::printf("tsc = %llu %s = %llu %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles,
sc, count, sc,sn, (double)count/(double)cycles);
#else
std::printf("%llu cycles \n", elapsed );
#endif
@@ -241,5 +244,6 @@ public:
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -2,7 +2,7 @@
#include <Grid/perfmon/PerfCount.h>
#include <Grid/perfmon/Stat.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
bool PmuStat::pmu_initialized=false;
@@ -175,39 +175,39 @@ void PmuStat::KNLevsetup(const char *ename, int &fd, int event, int umask)
}
void PmuStat::KNLsetup(void){
void PmuStat::KNLsetup(void){
int ret;
char fname[1024];
int ret;
char fname[1024];
// MC RPQ inserts and WPQ inserts (reads & writes)
for (int mc = 0; mc < NMC; ++mc)
{
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_imc_%d",mc);
// RPQ Inserts
KNLevsetup(fname, gbl.mc_rd[mc], 0x1, 0x1);
// WPQ Inserts
KNLevsetup(fname, gbl.mc_wr[mc], 0x2, 0x1);
}
// EDC RPQ inserts and WPQ inserts
for (int edc=0; edc < NEDC; ++edc)
{
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_eclk_%d",edc);
// RPQ inserts
KNLevsetup(fname, gbl.edc_rd[edc], 0x1, 0x1);
// WPQ inserts
KNLevsetup(fname, gbl.edc_wr[edc], 0x2, 0x1);
}
// EDC HitE, HitM, MissE, MissM
for (int edc=0; edc < NEDC; ++edc)
{
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_uclk_%d", edc);
KNLevsetup(fname, gbl.edc_hite[edc], 0x2, 0x1);
KNLevsetup(fname, gbl.edc_hitm[edc], 0x2, 0x2);
KNLevsetup(fname, gbl.edc_misse[edc], 0x2, 0x4);
KNLevsetup(fname, gbl.edc_missm[edc], 0x2, 0x8);
}
}
// MC RPQ inserts and WPQ inserts (reads & writes)
for (int mc = 0; mc < NMC; ++mc)
{
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_imc_%d",mc);
// RPQ Inserts
KNLevsetup(fname, gbl.mc_rd[mc], 0x1, 0x1);
// WPQ Inserts
KNLevsetup(fname, gbl.mc_wr[mc], 0x2, 0x1);
}
// EDC RPQ inserts and WPQ inserts
for (int edc=0; edc < NEDC; ++edc)
{
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_eclk_%d",edc);
// RPQ inserts
KNLevsetup(fname, gbl.edc_rd[edc], 0x1, 0x1);
// WPQ inserts
KNLevsetup(fname, gbl.edc_wr[edc], 0x2, 0x1);
}
// EDC HitE, HitM, MissE, MissM
for (int edc=0; edc < NEDC; ++edc)
{
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_uclk_%d", edc);
KNLevsetup(fname, gbl.edc_hite[edc], 0x2, 0x1);
KNLevsetup(fname, gbl.edc_hitm[edc], 0x2, 0x2);
KNLevsetup(fname, gbl.edc_misse[edc], 0x2, 0x4);
KNLevsetup(fname, gbl.edc_missm[edc], 0x2, 0x8);
}
}
uint64_t PmuStat::KNLreadctr(int fd)
{
@@ -242,4 +242,5 @@ void PmuStat::KNLreadctrs(ctrs &c)
}
#endif
}
NAMESPACE_END(Grid);

View File

@@ -5,7 +5,7 @@
#define _KNIGHTS_LANDING_ROOTONLY
#endif
namespace Grid {
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////
// Extra KNL counters from MCDRAM
@@ -15,14 +15,14 @@ namespace Grid {
#define NEDC 8
struct ctrs
{
uint64_t mcrd[NMC];
uint64_t mcwr[NMC];
uint64_t edcrd[NEDC];
uint64_t edcwr[NEDC];
uint64_t edchite[NEDC];
uint64_t edchitm[NEDC];
uint64_t edcmisse[NEDC];
uint64_t edcmissm[NEDC];
uint64_t mcrd[NMC];
uint64_t mcwr[NMC];
uint64_t edcrd[NEDC];
uint64_t edcwr[NEDC];
uint64_t edchite[NEDC];
uint64_t edchitm[NEDC];
uint64_t edcmisse[NEDC];
uint64_t edcmissm[NEDC];
};
// Peter/Azusa:
// Our modification of a code provided by Larry Meadows from Intel
@@ -44,61 +44,62 @@ struct knl_gbl_
class PmuStat
{
uint64_t counters[8][256];
uint64_t counters[8][256];
#ifdef _KNIGHTS_LANDING_
static struct knl_gbl_ gbl;
static struct knl_gbl_ gbl;
#endif
const char *name;
const char *name;
uint64_t reads; // memory reads
uint64_t writes; // memory writes
uint64_t mrstart; // memory read counter at start of parallel region
uint64_t mrend; // memory read counter at end of parallel region
uint64_t mwstart; // memory write counter at start of parallel region
uint64_t mwend; // memory write counter at end of parallel region
uint64_t reads; // memory reads
uint64_t writes; // memory writes
uint64_t mrstart; // memory read counter at start of parallel region
uint64_t mrend; // memory read counter at end of parallel region
uint64_t mwstart; // memory write counter at start of parallel region
uint64_t mwend; // memory write counter at end of parallel region
// cumulative counters
uint64_t count; // number of invocations
uint64_t tregion; // total time in parallel region (from thread 0)
uint64_t tcycles; // total cycles inside parallel region
uint64_t inst, ref, cyc; // fixed counters
uint64_t pmc0, pmc1;// pmu
// add memory counters here
// temp variables
uint64_t tstart; // tsc at start of parallel region
uint64_t tend; // tsc at end of parallel region
// map for ctrs values
// 0 pmc0 start
// 1 pmc0 end
// 2 pmc1 start
// 3 pmc1 end
// 4 tsc start
// 5 tsc end
static bool pmu_initialized;
// cumulative counters
uint64_t count; // number of invocations
uint64_t tregion; // total time in parallel region (from thread 0)
uint64_t tcycles; // total cycles inside parallel region
uint64_t inst, ref, cyc; // fixed counters
uint64_t pmc0, pmc1;// pmu
// add memory counters here
// temp variables
uint64_t tstart; // tsc at start of parallel region
uint64_t tend; // tsc at end of parallel region
// map for ctrs values
// 0 pmc0 start
// 1 pmc0 end
// 2 pmc1 start
// 3 pmc1 end
// 4 tsc start
// 5 tsc end
static bool pmu_initialized;
public:
static bool is_init(void){ return pmu_initialized;}
static void pmu_init(void);
static void pmu_fini(void);
static void pmu_start(void);
static void pmu_stop(void);
void accum(int nthreads);
static void xmemctrs(uint64_t *mr, uint64_t *mw);
void start(void);
void enter(int t);
void exit(int t);
void print(void);
void init(const char *regname);
void clear(void);
static bool is_init(void){ return pmu_initialized;}
static void pmu_init(void);
static void pmu_fini(void);
static void pmu_start(void);
static void pmu_stop(void);
void accum(int nthreads);
static void xmemctrs(uint64_t *mr, uint64_t *mw);
void start(void);
void enter(int t);
void exit(int t);
void print(void);
void init(const char *regname);
void clear(void);
#ifdef _KNIGHTS_LANDING_
static void KNLsetup(void);
static uint64_t KNLreadctr(int fd);
static void KNLreadctrs(ctrs &c);
static void KNLevsetup(const char *ename, int &fd, int event, int umask);
static void KNLsetup(void);
static uint64_t KNLreadctr(int fd);
static void KNLreadctrs(ctrs &c);
static void KNLevsetup(const char *ename, int &fd, int event, int umask);
#endif
};
};
NAMESPACE_END(Grid);
}
#endif

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -24,8 +24,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_TIME_H
#define GRID_TIME_H
@@ -33,11 +33,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <ctime>
#include <chrono>
namespace Grid {
// Dress the output; use std::chrono
NAMESPACE_BEGIN(Grid)
// Dress the output; use std::chrono
// C++11 time facilities better?
inline double usecond(void) {
struct timeval tv;
@@ -125,5 +123,6 @@ public:
}
};
}
NAMESPACE_END(Grid)
#endif

View File

@@ -14,7 +14,12 @@
#ifndef SOURCE_PUGIXML_CPP
#define SOURCE_PUGIXML_CPP
#include <Grid/pugixml/pugixml.h>
#ifdef __NVCC__
#pragma push
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#endif
#include "pugixml.h"
#include <stdlib.h>
#include <stdio.h>
@@ -202,7 +207,7 @@ PUGI__NS_BEGIN
// Without a template<> we'll get multiple definitions of the same static
template <typename T> allocation_function xml_memory_management_function_storage<T>::allocate = default_allocate;
template <typename T> deallocation_function xml_memory_management_function_storage<T>::deallocate = default_deallocate;
template struct xml_memory_management_function_storage<int>;
typedef xml_memory_management_function_storage<int> xml_memory;
PUGI__NS_END
@@ -12768,6 +12773,10 @@ namespace pugi
#undef PUGI__THROW_ERROR
#undef PUGI__CHECK_ERROR
// Restore the diagnostic state saved by the "#pragma push" earlier in this
// file. The guard has to be the same macro that guards the push (__NVCC__),
// otherwise the push/pop pair can become unbalanced.
#ifdef __NVCC__
#pragma pop
#endif
#endif
/**

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -27,113 +27,111 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_BASE_H
#define GRID_QCD_BASE_H
namespace Grid{
namespace QCD {
*************************************************************************************/
/* END LEGAL */
#pragma once
static const int Xdir = 0;
static const int Ydir = 1;
static const int Zdir = 2;
static const int Tdir = 3;
NAMESPACE_BEGIN(Grid);
static constexpr int Xdir = 0;
static constexpr int Ydir = 1;
static constexpr int Zdir = 2;
static constexpr int Tdir = 3;
static const int Xp = 0;
static const int Yp = 1;
static const int Zp = 2;
static const int Tp = 3;
static const int Xm = 4;
static const int Ym = 5;
static const int Zm = 6;
static const int Tm = 7;
static constexpr int Xp = 0;
static constexpr int Yp = 1;
static constexpr int Zp = 2;
static constexpr int Tp = 3;
static constexpr int Xm = 4;
static constexpr int Ym = 5;
static constexpr int Zm = 6;
static constexpr int Tm = 7;
static const int Nc=3;
static const int Ns=4;
static const int Nd=4;
static const int Nhs=2; // half spinor
static const int Nds=8; // double stored gauge field
static const int Ngp=2; // gparity index range
static constexpr int Nc=3;
static constexpr int Ns=4;
static constexpr int Nd=4;
static constexpr int Nhs=2; // half spinor
static constexpr int Nds=8; // double stored gauge field
static constexpr int Ngp=2; // gparity index range
//////////////////////////////////////////////////////////////////////////////
// QCD iMatrix types
// Index conventions: Lorentz x Spin x Colour
// note: static const int or constexpr will work for type deductions
// with the intel compiler (up to version 17)
//////////////////////////////////////////////////////////////////////////////
#define ColourIndex 2
#define SpinIndex 1
#define LorentzIndex 0
//////////////////////////////////////////////////////////////////////////////
// QCD iMatrix types
// Index conventions: Lorentz x Spin x Colour
// note: either static const int or static constexpr int will work for type deductions
// with the intel compiler (up to version 17)
//////////////////////////////////////////////////////////////////////////////
#define ColourIndex (2)
#define SpinIndex (1)
#define LorentzIndex (0)
// Also should make these a named enum type
static const int DaggerNo=0;
static const int DaggerYes=1;
static const int InverseNo=0;
static const int InverseYes=1;
// Also should make these a named enum type
static constexpr int DaggerNo=0;
static constexpr int DaggerYes=1;
static constexpr int InverseNo=0;
static constexpr int InverseYes=1;
// Useful traits is this a spin index
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
// Useful trait: is this a spin index?
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
const int SpinorIndex = 2;
template<typename T> struct isSpinor {
static const bool value = (SpinorIndex==T::TensorLevel);
};
template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
const int SpinorIndex = 2;
template<typename T> struct isSpinor {
static constexpr bool value = (SpinorIndex==T::TensorLevel);
};
template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
// ChrisK very keen to add extra space for Gparity doubling.
//
// Also add domain wall index, in a way where Wilson operator
// naturally distributes across the 5th dimensions.
//
// That probably makes for GridRedBlack4dCartesian grid.
// ChrisK very keen to add extra space for Gparity doubling.
//
// Also add domain wall index, in a way where Wilson operator
// naturally distributes across the 5th dimensions.
//
// That probably makes for GridRedBlack4dCartesian grid.
// s,sp,c,spc,lc
// s,sp,c,spc,lc
template<typename vtype> using iSinglet = iScalar<iScalar<iScalar<vtype> > >;
template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
template<typename vtype> using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
template<typename vtype> using iSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
template<typename vtype> using iHalfSpinVector = iScalar<iVector<iScalar<vtype>, Nhs> >;
template<typename vtype> using iHalfSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
template<typename vtype> using iSinglet = iScalar<iScalar<iScalar<vtype> > >;
template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
template<typename vtype> using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
template<typename vtype> using iSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
template<typename vtype> using iHalfSpinVector = iScalar<iVector<iScalar<vtype>, Nhs> >;
template<typename vtype> using iHalfSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
template<typename vtype> using iSpinColourSpinColourMatrix = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
// Spin matrix
typedef iSpinMatrix<Complex > SpinMatrix;
typedef iSpinMatrix<ComplexF > SpinMatrixF;
typedef iSpinMatrix<ComplexD > SpinMatrixD;
// Spin matrix
typedef iSpinMatrix<Complex > SpinMatrix;
typedef iSpinMatrix<ComplexF > SpinMatrixF;
typedef iSpinMatrix<ComplexD > SpinMatrixD;
typedef iSpinMatrix<vComplex > vSpinMatrix;
typedef iSpinMatrix<vComplexF> vSpinMatrixF;
typedef iSpinMatrix<vComplexD> vSpinMatrixD;
typedef iSpinMatrix<vComplex > vSpinMatrix;
typedef iSpinMatrix<vComplexF> vSpinMatrixF;
typedef iSpinMatrix<vComplexD> vSpinMatrixD;
// Colour Matrix
typedef iColourMatrix<Complex > ColourMatrix;
typedef iColourMatrix<ComplexF > ColourMatrixF;
typedef iColourMatrix<ComplexD > ColourMatrixD;
// Colour Matrix
typedef iColourMatrix<Complex > ColourMatrix;
typedef iColourMatrix<ComplexF > ColourMatrixF;
typedef iColourMatrix<ComplexD > ColourMatrixD;
typedef iColourMatrix<vComplex > vColourMatrix;
typedef iColourMatrix<vComplexF> vColourMatrixF;
typedef iColourMatrix<vComplexD> vColourMatrixD;
typedef iColourMatrix<vComplex > vColourMatrix;
typedef iColourMatrix<vComplexF> vColourMatrixF;
typedef iColourMatrix<vComplexD> vColourMatrixD;
// SpinColour matrix
typedef iSpinColourMatrix<Complex > SpinColourMatrix;
typedef iSpinColourMatrix<ComplexF > SpinColourMatrixF;
typedef iSpinColourMatrix<ComplexD > SpinColourMatrixD;
// SpinColour matrix
typedef iSpinColourMatrix<Complex > SpinColourMatrix;
typedef iSpinColourMatrix<ComplexF > SpinColourMatrixF;
typedef iSpinColourMatrix<ComplexD > SpinColourMatrixD;
typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
typedef iSpinColourMatrix<vComplexF> vSpinColourMatrixF;
typedef iSpinColourMatrix<vComplexD> vSpinColourMatrixD;
typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
typedef iSpinColourMatrix<vComplexF> vSpinColourMatrixF;
typedef iSpinColourMatrix<vComplexD> vSpinColourMatrixD;
// SpinColourSpinColour matrix
typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix;
@@ -153,383 +151,379 @@ namespace QCD {
typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF;
typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD;
// LorentzColour
typedef iLorentzColourMatrix<Complex > LorentzColourMatrix;
typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
// LorentzColour
typedef iLorentzColourMatrix<Complex > LorentzColourMatrix;
typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
// DoubleStored gauge field
typedef iDoubleStoredColourMatrix<Complex > DoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD;
// DoubleStored gauge field
typedef iDoubleStoredColourMatrix<Complex > DoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD;
typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
// Spin vector
typedef iSpinVector<Complex > SpinVector;
typedef iSpinVector<ComplexF> SpinVectorF;
typedef iSpinVector<ComplexD> SpinVectorD;
// Spin vector
typedef iSpinVector<Complex > SpinVector;
typedef iSpinVector<ComplexF> SpinVectorF;
typedef iSpinVector<ComplexD> SpinVectorD;
typedef iSpinVector<vComplex > vSpinVector;
typedef iSpinVector<vComplexF> vSpinVectorF;
typedef iSpinVector<vComplexD> vSpinVectorD;
typedef iSpinVector<vComplex > vSpinVector;
typedef iSpinVector<vComplexF> vSpinVectorF;
typedef iSpinVector<vComplexD> vSpinVectorD;
// Colour vector
typedef iColourVector<Complex > ColourVector;
typedef iColourVector<ComplexF> ColourVectorF;
typedef iColourVector<ComplexD> ColourVectorD;
// Colour vector
typedef iColourVector<Complex > ColourVector;
typedef iColourVector<ComplexF> ColourVectorF;
typedef iColourVector<ComplexD> ColourVectorD;
typedef iColourVector<vComplex > vColourVector;
typedef iColourVector<vComplexF> vColourVectorF;
typedef iColourVector<vComplexD> vColourVectorD;
typedef iColourVector<vComplex > vColourVector;
typedef iColourVector<vComplexF> vColourVectorF;
typedef iColourVector<vComplexD> vColourVectorD;
// SpinColourVector
typedef iSpinColourVector<Complex > SpinColourVector;
typedef iSpinColourVector<ComplexF> SpinColourVectorF;
typedef iSpinColourVector<ComplexD> SpinColourVectorD;
// SpinColourVector
typedef iSpinColourVector<Complex > SpinColourVector;
typedef iSpinColourVector<ComplexF> SpinColourVectorF;
typedef iSpinColourVector<ComplexD> SpinColourVectorD;
typedef iSpinColourVector<vComplex > vSpinColourVector;
typedef iSpinColourVector<vComplexF> vSpinColourVectorF;
typedef iSpinColourVector<vComplexD> vSpinColourVectorD;
typedef iSpinColourVector<vComplex > vSpinColourVector;
typedef iSpinColourVector<vComplexF> vSpinColourVectorF;
typedef iSpinColourVector<vComplexD> vSpinColourVectorD;
// HalfSpin vector
typedef iHalfSpinVector<Complex > HalfSpinVector;
typedef iHalfSpinVector<ComplexF> HalfSpinVectorF;
typedef iHalfSpinVector<ComplexD> HalfSpinVectorD;
// HalfSpin vector
typedef iHalfSpinVector<Complex > HalfSpinVector;
typedef iHalfSpinVector<ComplexF> HalfSpinVectorF;
typedef iHalfSpinVector<ComplexD> HalfSpinVectorD;
typedef iHalfSpinVector<vComplex > vHalfSpinVector;
typedef iHalfSpinVector<vComplexF> vHalfSpinVectorF;
typedef iHalfSpinVector<vComplexD> vHalfSpinVectorD;
typedef iHalfSpinVector<vComplex > vHalfSpinVector;
typedef iHalfSpinVector<vComplexF> vHalfSpinVectorF;
typedef iHalfSpinVector<vComplexD> vHalfSpinVectorD;
// HalfSpinColour vector
typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF;
typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
// HalfSpinColour vector
typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF;
typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
// singlets
typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<ComplexF> TComplexF; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<ComplexD> TComplexD; // FIXME This is painful. Tensor singlet complex type.
// singlets
typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<ComplexF> TComplexF; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<ComplexD> TComplexD; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<vComplex > vTComplex ; // what if we don't know the tensor structure
typedef iSinglet<vComplexF> vTComplexF; // what if we don't know the tensor structure
typedef iSinglet<vComplexD> vTComplexD; // what if we don't know the tensor structure
typedef iSinglet<vComplex > vTComplex ; // what if we don't know the tensor structure
typedef iSinglet<vComplexF> vTComplexF; // what if we don't know the tensor structure
typedef iSinglet<vComplexD> vTComplexD; // what if we don't know the tensor structure
typedef iSinglet<Real > TReal; // Shouldn't need these; can I make it work without?
typedef iSinglet<RealF> TRealF; // Shouldn't need these; can I make it work without?
typedef iSinglet<RealD> TRealD; // Shouldn't need these; can I make it work without?
typedef iSinglet<Real > TReal; // Shouldn't need these; can I make it work without?
typedef iSinglet<RealF> TRealF; // Shouldn't need these; can I make it work without?
typedef iSinglet<RealD> TRealD; // Shouldn't need these; can I make it work without?
typedef iSinglet<vReal > vTReal;
typedef iSinglet<vRealF> vTRealF;
typedef iSinglet<vRealD> vTRealD;
typedef iSinglet<vReal > vTReal;
typedef iSinglet<vRealF> vTRealF;
typedef iSinglet<vRealD> vTRealD;
typedef iSinglet<vInteger> vTInteger;
typedef iSinglet<Integer > TInteger;
typedef iSinglet<vInteger> vTInteger;
typedef iSinglet<Integer > TInteger;
// Lattices of these
typedef Lattice<vColourMatrix> LatticeColourMatrix;
typedef Lattice<vColourMatrixF> LatticeColourMatrixF;
typedef Lattice<vColourMatrixD> LatticeColourMatrixD;
// Lattices of these
typedef Lattice<vColourMatrix> LatticeColourMatrix;
typedef Lattice<vColourMatrixF> LatticeColourMatrixF;
typedef Lattice<vColourMatrixD> LatticeColourMatrixD;
typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
typedef Lattice<vSpinMatrixF> LatticeSpinMatrixF;
typedef Lattice<vSpinMatrixD> LatticeSpinMatrixD;
typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
typedef Lattice<vSpinMatrixF> LatticeSpinMatrixF;
typedef Lattice<vSpinMatrixD> LatticeSpinMatrixD;
typedef Lattice<vSpinColourMatrix> LatticeSpinColourMatrix;
typedef Lattice<vSpinColourMatrixF> LatticeSpinColourMatrixF;
typedef Lattice<vSpinColourMatrixD> LatticeSpinColourMatrixD;
typedef Lattice<vSpinColourMatrix> LatticeSpinColourMatrix;
typedef Lattice<vSpinColourMatrixF> LatticeSpinColourMatrixF;
typedef Lattice<vSpinColourMatrixD> LatticeSpinColourMatrixD;
typedef Lattice<vSpinColourSpinColourMatrix> LatticeSpinColourSpinColourMatrix;
typedef Lattice<vSpinColourSpinColourMatrixF> LatticeSpinColourSpinColourMatrixF;
typedef Lattice<vSpinColourSpinColourMatrixD> LatticeSpinColourSpinColourMatrixD;
typedef Lattice<vSpinColourSpinColourMatrix> LatticeSpinColourSpinColourMatrix;
typedef Lattice<vSpinColourSpinColourMatrixF> LatticeSpinColourSpinColourMatrixF;
typedef Lattice<vSpinColourSpinColourMatrixD> LatticeSpinColourSpinColourMatrixD;
typedef Lattice<vLorentzColourMatrix> LatticeLorentzColourMatrix;
typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
typedef Lattice<vLorentzColourMatrix> LatticeLorentzColourMatrix;
typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
// DoubleStored gauge field
typedef Lattice<vDoubleStoredColourMatrix> LatticeDoubleStoredColourMatrix;
typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD;
// DoubleStored gauge field
typedef Lattice<vDoubleStoredColourMatrix> LatticeDoubleStoredColourMatrix;
typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD;
typedef Lattice<vSpinVector> LatticeSpinVector;
typedef Lattice<vSpinVectorF> LatticeSpinVectorF;
typedef Lattice<vSpinVectorD> LatticeSpinVectorD;
typedef Lattice<vSpinVector> LatticeSpinVector;
typedef Lattice<vSpinVectorF> LatticeSpinVectorF;
typedef Lattice<vSpinVectorD> LatticeSpinVectorD;
typedef Lattice<vColourVector> LatticeColourVector;
typedef Lattice<vColourVectorF> LatticeColourVectorF;
typedef Lattice<vColourVectorD> LatticeColourVectorD;
typedef Lattice<vColourVector> LatticeColourVector;
typedef Lattice<vColourVectorF> LatticeColourVectorF;
typedef Lattice<vColourVectorD> LatticeColourVectorD;
typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
typedef Lattice<vSpinColourVectorF> LatticeSpinColourVectorF;
typedef Lattice<vSpinColourVectorD> LatticeSpinColourVectorD;
typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
typedef Lattice<vSpinColourVectorF> LatticeSpinColourVectorF;
typedef Lattice<vSpinColourVectorD> LatticeSpinColourVectorD;
typedef Lattice<vHalfSpinVector> LatticeHalfSpinVector;
typedef Lattice<vHalfSpinVectorF> LatticeHalfSpinVectorF;
typedef Lattice<vHalfSpinVectorD> LatticeHalfSpinVectorD;
typedef Lattice<vHalfSpinVector> LatticeHalfSpinVector;
typedef Lattice<vHalfSpinVectorF> LatticeHalfSpinVectorF;
typedef Lattice<vHalfSpinVectorD> LatticeHalfSpinVectorD;
typedef Lattice<vHalfSpinColourVector> LatticeHalfSpinColourVector;
typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF;
typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD;
typedef Lattice<vHalfSpinColourVector> LatticeHalfSpinColourVector;
typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF;
typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD;
typedef Lattice<vTReal> LatticeReal;
typedef Lattice<vTRealF> LatticeRealF;
typedef Lattice<vTRealD> LatticeRealD;
typedef Lattice<vTReal> LatticeReal;
typedef Lattice<vTRealF> LatticeRealF;
typedef Lattice<vTRealD> LatticeRealD;
typedef Lattice<vTComplex> LatticeComplex;
typedef Lattice<vTComplexF> LatticeComplexF;
typedef Lattice<vTComplexD> LatticeComplexD;
typedef Lattice<vTComplex> LatticeComplex;
typedef Lattice<vTComplexF> LatticeComplexF;
typedef Lattice<vTComplexD> LatticeComplexD;
typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where"
typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where"
///////////////////////////////////////////
// Physical names for things
///////////////////////////////////////////
typedef LatticeHalfSpinColourVector LatticeHalfFermion;
typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF;
typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD;
///////////////////////////////////////////
// Physical names for things
///////////////////////////////////////////
typedef LatticeHalfSpinColourVector LatticeHalfFermion;
typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF;
typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD;
typedef LatticeSpinColourVector LatticeFermion;
typedef LatticeSpinColourVectorF LatticeFermionF;
typedef LatticeSpinColourVectorD LatticeFermionD;
typedef LatticeSpinColourVector LatticeFermion;
typedef LatticeSpinColourVectorF LatticeFermionF;
typedef LatticeSpinColourVectorD LatticeFermionD;
typedef LatticeSpinColourMatrix LatticePropagator;
typedef LatticeSpinColourMatrixF LatticePropagatorF;
typedef LatticeSpinColourMatrixD LatticePropagatorD;
typedef LatticeSpinColourMatrix LatticePropagator;
typedef LatticeSpinColourMatrixF LatticePropagatorF;
typedef LatticeSpinColourMatrixD LatticePropagatorD;
typedef LatticeLorentzColourMatrix LatticeGaugeField;
typedef LatticeLorentzColourMatrixF LatticeGaugeFieldF;
typedef LatticeLorentzColourMatrixD LatticeGaugeFieldD;
typedef LatticeLorentzColourMatrix LatticeGaugeField;
typedef LatticeLorentzColourMatrixF LatticeGaugeFieldF;
typedef LatticeLorentzColourMatrixD LatticeGaugeFieldD;
typedef LatticeDoubleStoredColourMatrix LatticeDoubledGaugeField;
typedef LatticeDoubleStoredColourMatrixF LatticeDoubledGaugeFieldF;
typedef LatticeDoubleStoredColourMatrixD LatticeDoubledGaugeFieldD;
typedef LatticeDoubleStoredColourMatrix LatticeDoubledGaugeField;
typedef LatticeDoubleStoredColourMatrixF LatticeDoubledGaugeFieldF;
typedef LatticeDoubleStoredColourMatrixD LatticeDoubledGaugeFieldD;
// Lattice of iScalar-wrapped elements, where the element type is taken from
// GF::vector_object::element. Declared once only: the alias template was
// previously repeated verbatim.
template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >;
// Uhgg... typing this hurt ;)
// (my keyboard got burning hot when I typed this, must be the anti-Fermion)
// Staggered fermion fields: lattices of colour vectors.
using LatticeStaggeredFermion  = Lattice<vColourVector>;
using LatticeStaggeredFermionF = Lattice<vColourVectorF>;
using LatticeStaggeredFermionD = Lattice<vColourVectorD>;

// Staggered propagators: lattices of colour matrices.
using LatticeStaggeredPropagator  = Lattice<vColourMatrix>;
using LatticeStaggeredPropagatorF = Lattice<vColourMatrixF>;
using LatticeStaggeredPropagatorD = Lattice<vColourMatrixD>;
//////////////////////////////////////////////////////////////////////////////
// Peek and Poke named after physics attributes
//////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
// Peek and Poke named after physics attributes
//////////////////////////////////////////////////////////////////////////////
//spin
template<class vobj> auto peekSpin(const vobj &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
{
return PeekIndex<SpinIndex>(rhs,i);
}
template<class vobj> auto peekSpin(const vobj &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
{
return PeekIndex<SpinIndex>(rhs,i,j);
}
template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
{
return PeekIndex<SpinIndex>(rhs,i);
}
template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
{
return PeekIndex<SpinIndex>(rhs,i,j);
}
//colour
template<class vobj> auto peekColour(const vobj &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
{
return PeekIndex<ColourIndex>(rhs,i);
}
template<class vobj> auto peekColour(const vobj &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
{
return PeekIndex<ColourIndex>(rhs,i,j);
}
template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
{
return PeekIndex<ColourIndex>(rhs,i);
}
template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
{
return PeekIndex<ColourIndex>(rhs,i,j);
}
//lorentz
template<class vobj> auto peekLorentz(const vobj &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
{
return PeekIndex<LorentzIndex>(rhs,i);
}
template<class vobj> auto peekLorentz(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
{
return PeekIndex<LorentzIndex>(rhs,i);
}
//spin
template<class vobj> auto peekSpin(const vobj &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
{
return PeekIndex<SpinIndex>(rhs,i);
}
template<class vobj> auto peekSpin(const vobj &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
{
return PeekIndex<SpinIndex>(rhs,i,j);
}
template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
{
return PeekIndex<SpinIndex>(rhs,i);
}
template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
{
return PeekIndex<SpinIndex>(rhs,i,j);
}
//colour
template<class vobj> auto peekColour(const vobj &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
{
return PeekIndex<ColourIndex>(rhs,i);
}
template<class vobj> auto peekColour(const vobj &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
{
return PeekIndex<ColourIndex>(rhs,i,j);
}
template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
{
return PeekIndex<ColourIndex>(rhs,i);
}
template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
{
return PeekIndex<ColourIndex>(rhs,i,j);
}
//lorentz
template<class vobj> auto peekLorentz(const vobj &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
{
return PeekIndex<LorentzIndex>(rhs,i);
}
template<class vobj> auto peekLorentz(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
{
return PeekIndex<LorentzIndex>(rhs,i);
}
//////////////////////////////////////////////
// Poke lattice
//////////////////////////////////////////////
template<class vobj>
void pokeColour(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0))> & rhs,
//////////////////////////////////////////////
// Poke lattice
//////////////////////////////////////////////
template<class vobj>
void pokeColour(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<ColourIndex>(vobj(),0))> & rhs,
int i)
{
PokeIndex<ColourIndex>(lhs,rhs,i);
}
template<class vobj>
void pokeColour(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<ColourIndex>(vobj(),0,0))> & rhs,
int i,int j)
{
PokeIndex<ColourIndex>(lhs,rhs,i,j);
}
template<class vobj>
void pokeSpin(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<SpinIndex>(vobj(),0))> & rhs,
int i)
{
PokeIndex<ColourIndex>(lhs,rhs,i);
}
template<class vobj>
void pokeColour(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0,0))> & rhs,
{
PokeIndex<SpinIndex>(lhs,rhs,i);
}
template<class vobj>
void pokeSpin(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<SpinIndex>(vobj(),0,0))> & rhs,
int i,int j)
{
PokeIndex<ColourIndex>(lhs,rhs,i,j);
}
template<class vobj>
void pokeSpin(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0))> & rhs,
int i)
{
PokeIndex<SpinIndex>(lhs,rhs,i);
}
template<class vobj>
void pokeSpin(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0,0))> & rhs,
int i,int j)
{
PokeIndex<SpinIndex>(lhs,rhs,i,j);
}
template<class vobj>
void pokeLorentz(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<LorentzIndex>(lhs._odata[0],0))> & rhs,
int i)
{
PokeIndex<LorentzIndex>(lhs,rhs,i);
}
{
PokeIndex<SpinIndex>(lhs,rhs,i,j);
}
template<class vobj>
void pokeLorentz(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<LorentzIndex>(vobj(),0))> & rhs,
int i)
{
PokeIndex<LorentzIndex>(lhs,rhs,i);
}
//////////////////////////////////////////////
// Poke scalars
//////////////////////////////////////////////
template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
{
pokeIndex<SpinIndex>(lhs,rhs,i);
}
template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0,0)) & rhs,int i,int j)
{
pokeIndex<SpinIndex>(lhs,rhs,i,j);
}
//////////////////////////////////////////////
// Poke scalars
//////////////////////////////////////////////
template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
{
pokeIndex<SpinIndex>(lhs,rhs,i);
}
template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0,0)) & rhs,int i,int j)
{
pokeIndex<SpinIndex>(lhs,rhs,i,j);
}
template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0)) & rhs,int i)
{
pokeIndex<ColourIndex>(lhs,rhs,i);
}
template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0,0)) & rhs,int i,int j)
{
pokeIndex<ColourIndex>(lhs,rhs,i,j);
}
template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0)) & rhs,int i)
{
pokeIndex<ColourIndex>(lhs,rhs,i);
}
template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0,0)) & rhs,int i,int j)
{
pokeIndex<ColourIndex>(lhs,rhs,i,j);
}
template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<LorentzIndex>(lhs,0)) & rhs,int i)
{
pokeIndex<LorentzIndex>(lhs,rhs,i);
}
template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<LorentzIndex>(lhs,0)) & rhs,int i)
{
pokeIndex<LorentzIndex>(lhs,rhs,i);
}
//////////////////////////////////////////////
// Fermion <-> propagator assignments
//////////////////////////////////////////////
//////////////////////////////////////////////
// Fermion <-> propagator assignments
//////////////////////////////////////////////
//template <class Prop, class Ferm>
// Writes the fermion field f into one (spin, colour) slot of the propagator p.
//
// For every spin index j in [0, Ns) and colour index i in
// [0, Fimpl::Dimension), colour component i of spin component j of f is poked
// into colour component (i, c) of spin component (j, s) of p.
//
//   p : propagator field, updated in place
//   f : fermion field that is read
//   s : second spin index of the slot written in p
//   c : second colour index of the slot written in p
//
// The loop header and its locals used to appear twice, so pjs / fj were
// redeclared in the same scope; the body is now a single loop.
template <class Fimpl>
void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
{
  for(int j = 0; j < Ns; ++j)
  {
    auto pjs = peekSpin(p, j, s);   // spin component (j,s) of p
    auto fj  = peekSpin(f, j);      // spin component j of f

    for(int i = 0; i < Fimpl::Dimension; ++i)
    {
      pokeColour(pjs, peekColour(fj, i), i, c);
    }
    // pjs is a local copy, so write it back into p.
    pokeSpin(p, pjs, j, s);
  }
}
//template <class Prop, class Ferm>
// Reads one (spin, colour) slot of the propagator p into the fermion field f.
//
// For every spin index j in [0, Ns) and colour index i in
// [0, Fimpl::Dimension), colour component (i, c) of spin component (j, s) of p
// is poked into colour component i of spin component j of f.
//
//   f : fermion field, updated in place
//   p : propagator field that is read
//   s : second spin index of the slot read from p
//   c : second colour index of the slot read from p
//
// The loop header and its locals used to appear twice, so pjs / fj were
// redeclared in the same scope; the body is now a single loop.
template <class Fimpl>
void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
{
  for(int j = 0; j < Ns; ++j)
  {
    auto pjs = peekSpin(p, j, s);   // spin component (j,s) of p
    auto fj  = peekSpin(f, j);      // spin component j of f

    for(int i = 0; i < Fimpl::Dimension; ++i)
    {
      pokeColour(fj, peekColour(pjs, i, c), i);
    }
    // fj is a local copy, so write it back into f.
    pokeSpin(f, fj, j);
  }
}
//////////////////////////////////////////////
// transpose array and scalar
//////////////////////////////////////////////
// Transpose helpers: forward to transposeIndex with the index tag named by
// the function, for a Lattice of objects and for a plain object.
// Each overload is defined exactly once; the set was previously repeated with
// identical signatures.
// NOTE(review): the leading `int Index` template parameter is not used in any
// signature or body, so it cannot be deduced and callers must supply it
// explicitly. It is kept to leave the interface unchanged; confirm whether it
// is intentional.
template<int Index,class vobj>
inline Lattice<vobj> transposeSpin(const Lattice<vobj> &lhs)
{
  return transposeIndex<SpinIndex>(lhs);
}
template<int Index,class vobj>
inline Lattice<vobj> transposeColour(const Lattice<vobj> &lhs)
{
  return transposeIndex<ColourIndex>(lhs);
}
template<int Index,class vobj>
inline vobj transposeSpin(const vobj &lhs)
{
  return transposeIndex<SpinIndex>(lhs);
}
template<int Index,class vobj>
inline vobj transposeColour(const vobj &lhs)
{
  return transposeIndex<ColourIndex>(lhs);
}
//////////////////////////////////////////
// Trace lattice and non-lattice
//////////////////////////////////////////
// Trace helpers: forward to traceIndex with the index tag named by the
// function, for a Lattice of objects and for a plain object.
// Each overload is defined exactly once. The set was previously repeated, with
// the Lattice return types spelled once via lhs._odata[0] and once via vobj();
// the vobj() spelling is kept because it does not depend on a Lattice member.
// NOTE(review): the leading `int Index` template parameter is unused and not
// deducible, so callers must supply it explicitly; kept for interface
// compatibility.
// NOTE(review): the plain-object overloads declare a Lattice<...> return type
// although they return traceIndex applied to a non-lattice object. That is
// preserved as found; confirm whether it is intended.
template<int Index,class vobj>
inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(vobj()))>
{
  return traceIndex<SpinIndex>(lhs);
}
template<int Index,class vobj>
inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(vobj()))>
{
  return traceIndex<ColourIndex>(lhs);
}
template<int Index,class vobj>
inline auto traceSpin(const vobj &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs))>
{
  return traceIndex<SpinIndex>(lhs);
}
template<int Index,class vobj>
inline auto traceColour(const vobj &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs))>
{
  return traceIndex<ColourIndex>(lhs);
}
//////////////////////////////////////////
// Current types
//////////////////////////////////////////
// Current type enumeration, generated by the GRID_SERIALIZABLE_ENUM macro:
// Vector = 0, Axial = 1, Tadpole = 2. `undef` is passed as the macro's second
// argument (presumably the name of the unset state; check the macro
// definition). The macro is invoked once only; it was previously repeated
// verbatim.
GRID_SERIALIZABLE_ENUM(Current, undef,
                       Vector,  0,
                       Axial,   1,
                       Tadpole, 2);
} //namespace QCD
} // Grid
NAMESPACE_END(Grid);
#endif

View File

@@ -37,14 +37,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// Abstract base interface
////////////////////////////////////////////
#include <Grid/qcd/action/ActionCore.h>
NAMESPACE_CHECK(ActionCore);
////////////////////////////////////////////////////////////////////////
// Fermion actions; prevent coupling fermion.cc files to other headers
////////////////////////////////////////////////////////////////////////
#include <Grid/qcd/action/fermion/FermionCore.h>
NAMESPACE_CHECK(FermionCore);
#include <Grid/qcd/action/fermion/Fermion.h>
NAMESPACE_CHECK(Fermion);
////////////////////////////////////////
// Pseudo fermion combinations for HMC
////////////////////////////////////////
#include <Grid/qcd/action/pseudofermion/PseudoFermion.h>
NAMESPACE_CHECK(PseudoFermion);
#endif

View File

@@ -27,19 +27,18 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
/* END LEGAL */
#ifndef ACTION_BASE_H
#define ACTION_BASE_H
namespace Grid {
namespace QCD {
NAMESPACE_BEGIN(Grid);
template <class GaugeField >
class Action
{
public:
public:
bool is_smeared = false;
// Heatbath?
virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
@@ -50,7 +49,6 @@ class Action
virtual ~Action(){}
};
}
}
NAMESPACE_END(Grid);
#endif // ACTION_BASE_H

Some files were not shown because too many files have changed in this diff Show More