1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-14 13:57:07 +01:00

Compare commits

..

462 Commits

Author SHA1 Message Date
c4435e6beb Merge branch 'release/v0.7.0' 2017-05-12 01:15:59 +01:00
7a8f6af5f8 Drop verbose compiler predefine check 2017-05-11 12:48:40 +01:00
49a5d9bac7 Clang major, minor trailing underscore 2017-05-11 12:25:02 +01:00
2b3fdd4a58 Print CXX predefines 2017-05-11 12:05:50 +01:00
34502ec471 4.8 dropped as buggy. 2017-05-11 11:43:39 +01:00
8a43e88b4f Compiler check early in build 2017-05-11 11:43:06 +01:00
238df20370 Still working on the compiler compat checks 2017-05-11 11:30:14 +01:00
97a32a6145 Add 4.8 test 2017-05-11 11:24:21 +01:00
655492a443 Compiler detection 2017-05-11 11:21:11 +01:00
1cab06f6bd Compat checks for compilers 2017-05-11 10:20:24 +01:00
f8024c262b Update Eigen 2017-05-10 13:30:09 +01:00
4cc5f01f4a Small change in the readme about the intel compiler 2017-05-09 15:38:59 +01:00
9c12c37aaf Confirming the fix on the complex boundary conditions 2017-05-09 08:41:29 +01:00
806eaa0530 Adding back the IO tests in the list 2017-05-08 22:26:44 +01:00
01d0e54594 Merge branch 'release/v0.7.0' into develop 2017-05-08 22:02:51 +01:00
5aafa335fe Fixing JSON error for complex numbers 2017-05-08 21:56:44 +01:00
8ba0494485 Fixing JSON for complex numbers 2017-05-08 21:41:39 +01:00
d99d98d9fd Merge branch 'release/v0.7.0' of https://github.com/paboyle/Grid into release/v0.7.0 2017-05-08 15:08:20 -04:00
95a017a4ae Relax force constraints to pass in single precision. 2017-05-08 15:06:41 -04:00
92f92379e6 Adding olivers test version 2017-05-08 18:42:19 +01:00
529e78d43f Restart the v0.7.0 release 2017-05-08 18:20:04 +01:00
4ec746d262 Merge branch 'release/v0.7.0' into develop 2017-05-06 18:43:03 +01:00
51bf1501fc Merge branch 'release/v0.7.0' 2017-05-06 18:42:50 +01:00
66d819c054 More info on gcc bug 2017-05-06 18:42:11 +01:00
3f3686f869 formatting 2017-05-06 18:41:27 +01:00
26bb829f8c Formatting 2017-05-06 18:40:55 +01:00
67cb04fc66 README update 2017-05-06 18:39:54 +01:00
a40bd68aed Version update 2017-05-06 17:00:14 +01:00
36495e0fd2 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-05-06 16:39:27 +01:00
93f6c15772 Warning squash 2017-05-06 16:38:58 +01:00
cb93eeff21 Update README 2017-05-06 16:28:12 +01:00
c7cc7e6101 Fix 2017-05-06 16:10:09 +01:00
c349aa6511 DEFINE warning elimination 2017-05-06 16:08:35 +01:00
3bae0a2d5c Drop a gcc warning 2017-05-06 15:51:42 +01:00
c1c7566089 GCC bug work around in 5.0 through 6.2 inclusive. 2017-05-06 15:20:25 +01:00
2439999ec8 Warning elimination; drop to -O2 on G++ bad versions 2017-05-06 14:44:49 +01:00
1d96f662e3 Fixed 4d fermion gparity force. Put strong tests on make check force tests 2017-05-06 00:46:31 +01:00
41d1889941 trusty ubuntu 2017-05-05 21:25:35 +01:00
0c3981e0c3 Trying to force recent automake 2017-05-05 21:15:22 +01:00
c727bd4609 Trying to work around automake version 2017-05-05 21:00:00 +01:00
db23749b67 Adding travis to make check 2017-05-05 20:42:08 +01:00
751f2b9703 Better check and benchmark driving 2017-05-05 19:54:38 +01:00
697c0603ce SITMO I/O for NERSC working now bit repro 2017-05-05 16:54:44 +01:00
14bedebb11 Source pointed to 2017-05-05 16:17:27 +01:00
47b5c07ffb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-05-05 14:27:02 +01:00
da86a2bf54 Merge branch 'feature/hmc_generalise' into develop 2017-05-05 14:23:02 +01:00
c1cb60a0b3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-05-05 14:22:37 +01:00
5ed5b4bfbf Merge branch 'develop' into feature/hmc_generalise 2017-05-05 14:22:33 +01:00
de84aacdfd Fixing a configure error for the smearing tests 2017-05-05 13:59:10 +01:00
2888003765 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-05-05 13:02:24 +01:00
da06bf5b95 Zmobius force test added 2017-05-05 12:52:45 +01:00
20999c1370 Merge branch 'develop' into feature/hmc_generalise 2017-05-05 12:47:17 +01:00
33f0ed1a33 No compile fix 2017-05-05 11:04:30 +01:00
50be56433b Delete old and defunct tests 2017-05-04 23:41:16 +01:00
43924007db Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-05-04 19:53:41 +01:00
78ef10e60f Mobius force improvement 2017-05-04 19:53:21 +01:00
679ae98b14 Merge branch 'feature/better-external-library' into develop 2017-05-04 15:42:12 +01:00
90f6bc16bb No compile clang fix 2017-05-04 12:15:06 +01:00
9b5b639546 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-05-03 20:51:40 -04:00
945767c6d8 More info 2017-05-03 20:26:35 -04:00
422cdf4979 Some checks 2017-05-03 18:37:38 -04:00
38db174f3b Print statement 2017-05-03 18:25:26 -04:00
92e364a35f Better reporting in benchmark for MPI3 2017-05-03 15:43:36 -04:00
58299b8ba2 Git info separated from version in git-config 2017-05-02 20:04:41 +01:00
124bf4d829 git ref in config summary 2017-05-02 19:41:01 +01:00
e8e56b3414 Config summary saved in git-config 2017-05-02 19:40:47 +01:00
89c430136d grid-config program 2017-05-02 19:13:13 +01:00
ea9aef7baa New header for standard headers (was an issue with Remez.h and external compilation) 2017-05-02 18:26:11 +01:00
c9e9e8061d Merge branch 'feature/hadrons' into develop 2017-05-02 18:23:47 +01:00
453cf2a1c6 Moving the topological charge outside the HMC related routines 2017-05-02 14:40:12 +01:00
de7bbfa5f9 Adding ParameterFile option for the HMC 2017-05-02 12:16:16 +01:00
dda8d77c87 Merge branch 'feature/hadrons' into feature/rare_kaon 2017-05-01 17:50:57 +01:00
aa29f4346a Hadrons: weird bus error with recent macOS clang 2017-05-01 17:49:08 +01:00
86116dbed6 Adding boundary condition switch (compile time) for the Mobius HMC example 2017-05-01 16:33:11 +01:00
7bd31e3f7c Adding external file support in the Mobius example (JSON) 2017-05-01 16:30:24 +01:00
74f451715f Fix for Mac compilation on the size_t uint64_t types 2017-05-01 15:12:07 +01:00
655be8ed76 Adding tests for the mobius operator 2017-05-01 14:42:16 +01:00
4063238943 Adding HMC test file example for Mobius + smearing 2017-05-01 13:44:00 +01:00
3344788fa1 Merge branch 'develop' into feature/hmc_generalise 2017-05-01 12:13:56 +01:00
99220f6531 Fixes and better timing 2017-04-26 17:24:11 -04:00
2a6d093749 move the sudo: required to match location on Guido's branch 2017-04-26 09:15:34 +01:00
c947947fad sudo required suggested by guido 2017-04-26 08:45:36 +01:00
f555b50547 Merge branch 'feature/half-prec-comms' into develop 2017-04-26 08:43:40 +01:00
738c1a11c2 longer nloop 2017-04-26 08:43:20 +01:00
f8797e1e3e bug fix. works now and great face performance 2017-04-26 03:14:02 -04:00
fd1eb7de13 Clean implementation of the exterior faces listing only those points on the boundary 2017-04-26 02:34:52 -04:00
2ce898efa3 Pretty code 2017-04-26 02:34:25 -04:00
ab66bac4e6 Think I'm getting on top of the reduced cost exterior precomputed list of links 2017-04-25 08:50:26 +01:00
56277a11c8 Build a list of whats on the surface 2017-04-24 17:06:15 +01:00
916e9e1d3e Merge branch 'feature/half-prec-comms' of https://github.com/paboyle/Grid into feature/half-prec-comms 2017-04-24 10:39:19 +01:00
5b55867a7a Slightly cheaper Ext assembly 2017-04-24 05:36:11 -04:00
3accb1ef89 Debugged assembly split phase with interior suppression 2017-04-23 19:30:19 -04:00
e3d0e31525 Debugged assembly split phase with interior suppression 2017-04-23 19:29:27 -04:00
5812eb8a8c Partially fixed. But the comms-overlap does not work yet. 2017-04-22 18:50:25 -04:00
4dd3763294 Use OMP as much as possible 2017-04-22 20:35:20 +01:00
c429ace748 Cleaner OpenMP use 2017-04-22 20:28:42 +01:00
ac58565d0a Dangerous rewrite of the assembly. If I make a mistake the debug will be painful. 2017-04-22 19:31:04 +01:00
3703b718aa Mark up a table if a given site only receives from itself; including MPI3 splitting info. 2017-04-22 19:28:37 +01:00
b722889234 Try a better load balancing loop 2017-04-22 19:27:41 +01:00
abba44a837 Hand unrolled for overlapped comms 2017-04-22 17:45:17 +01:00
f301be94ce Fixed 2017-04-22 17:42:31 +01:00
1d1b225497 Hand unrolled Nc=3 kernels support split phase compute (on-node, off-node). 2017-04-22 09:05:28 -04:00
53a785a3dd Fixing the KNL compile 2017-04-22 08:11:51 -04:00
736bf3c866 Major rework of stencil. Half precision and MPI3 now working. 2017-04-22 11:33:50 +01:00
b9bbe5d188 L1p config bg/q 2017-04-22 11:33:09 +01:00
3844bcf800 If no f16c instructions supported must use software half precision conversion.
This will also become useful on BG/Q, so will move out from SSE4 into a general area.
Lifted the Eigen half precision from web. Looks sensible, but not extensively regressed
against the intrinsics implementation yet.
2017-04-20 15:30:52 +01:00
e1a2319d01 Simple compressor moved out of cshift into stencil 2017-04-20 13:18:15 +01:00
180c732b4c Move compressors out of Cshift.
Slice iterators would help
2017-04-20 13:17:55 +01:00
957a706d0b Useful script 2017-04-20 13:17:44 +01:00
d2312e9874 Drop compressor entirely from Cshift to only Stencil. 2017-04-20 13:16:55 +01:00
fc4ab9ccd5 Working half precision comms 2017-04-20 11:20:26 +01:00
4a340aa5ca Massive compressor rework to support reduced precision comms 2017-04-20 09:28:27 +01:00
3b7de792d5 Type comparison in the traits work 2017-04-18 13:28:04 +01:00
557c3fa109 Pretty change 2017-04-18 13:27:38 +01:00
ec18e9f7f6 Merge branch 'develop' into feature/half-prec-comms 2017-04-18 11:39:39 +01:00
a839d5bc55 Updated todo list 2017-04-18 11:22:17 +01:00
de41b84c5c Merge branch 'feature/normHP' into develop 2017-04-18 10:57:21 +01:00
8e161152e4 MultiRHS solver improvements with slice operations moved into lattice and sped up.
Block solver requires a lot of performance work.
2017-04-18 10:51:55 +01:00
3141ebac10 MultiRHS working, starting to optimise. Block doesn't and I thought it already was; puzzled. 2017-04-17 10:50:19 +01:00
7ede696126 Non compile of tests fixed 2017-04-16 23:40:00 +01:00
bf516c3b81 higher precision reduction variables in norm and inner product 2017-04-15 12:27:28 +01:00
441a52ee5d First cut at higher precision reduction 2017-04-15 10:57:21 +01:00
a8db024c92 Cleaning up the dense matrix and lanczos sector 2017-04-15 08:54:11 +01:00
a9c22d5f43 Verbose removal 2017-04-14 14:38:49 +01:00
3ca41458a3 Fix to no USE_FP16 case 2017-04-14 14:20:54 +01:00
9e2d29c644 USE_FP16 macro 2017-04-14 14:17:14 +01:00
951be75292 Half precision conversion working on AVX512 now too 2017-04-13 17:35:11 +01:00
b9113ed310 Patches for knl 2017-04-13 12:02:12 -04:00
42fb49d3fd Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-04-13 14:12:47 +01:00
2a54c9aaab Merge branch 'feature/block-cg' into develop 2017-04-13 14:12:24 +01:00
0957378679 Fixing conditional ugly way 2017-04-13 13:47:56 +01:00
2ed6c76fc5 Getting multiline if then fi working 2017-04-13 13:43:13 +01:00
d3b9a7fa14 F16c apparently requires AVX, even if the 128 bit are used.
Seems odd.
2017-04-13 13:19:11 +01:00
75ea306ce9 Another try at travis 2017-04-13 13:05:32 +01:00
4226c633c4 Default to FP16 off again 2017-04-13 12:51:39 +01:00
5a4eafbf7e .travis 2017-04-13 12:50:43 +01:00
eb8e26018b Travis update for macos 2017-04-13 12:35:11 +01:00
db5ea001a3 Update to use Xcode 8.3 since -mfp16 causes SIGILL 2017-04-13 12:22:40 +01:00
2846f079e5 Predicate tests on fp16 being enabled 2017-04-13 12:08:05 +01:00
1d502e4ed6 FP16 optional compile time 2017-04-13 11:55:24 +01:00
73cdf0fffe Drop f16c from SSE because of a macos compile error on travis 2017-04-13 11:23:41 +01:00
1c25773319 Trap illegal instructions 2017-04-13 10:51:40 +01:00
c38400b26f Trap signals 2017-04-13 10:35:20 +01:00
9c3065b860 Debug flags off again 2017-04-13 10:01:32 +01:00
94eb829d08 Align cast fixed for __mm128i gcc complained 2017-04-13 08:40:44 +01:00
68392ddb5b Exchange in generic
Precision change in AVX, SSE, AVX512, Generic. QPX still to do.
2017-04-13 08:38:12 +01:00
cb6b81ae82 Half precision conversion 2017-04-12 19:32:37 +01:00
90ec6eda0c Rare K test solver name fix 2017-04-10 17:48:58 +01:00
fe8d625694 Merge commit '5e477ec553aa48d7d19b5a7c45d41acbb3392bcb' into feature/rare_kaon 2017-04-10 17:23:37 +01:00
53e76b41d2 Merge branch 'develop' into feature/hadrons 2017-04-10 17:00:53 +01:00
8ef4300412 spurious .dirstamp files removed 2017-04-10 17:00:22 +01:00
98a24ebf31 The macro “magics” is very intensive for the preprocessor in the measurement code which has numerous serialisable classes. Reducing the number of serialisable fields to 64 (instead of 1024) helps a lot, this is enough for now and can be extended trivially if needed in the future. 2017-04-10 16:58:54 +01:00
b12dc89d26 Commenting and clean up 2017-04-10 20:38:20 +09:00
d80d802f9d MultiRHS solver test 2017-04-10 00:12:12 +09:00
3d99b09dba Start of blockCG 2017-04-09 23:42:10 +09:00
db5f6d3ae3 Verbose fix 2017-04-09 23:41:30 +09:00
683550f116 Const args improvement 2017-04-09 23:41:04 +09:00
5e477ec553 Merge branch 'feature/hadrons' of https://github.com/paboyle/Grid into feature/rare_kaon 2017-04-07 11:51:09 +01:00
55d0329624 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-04-07 11:08:14 +09:00
86aaa35294 Christoph needs SchurDiagTwoKappa which is mobius specific. 2017-04-07 11:07:40 +09:00
172d3dc93a Correcting names in tests 2017-04-05 16:24:04 +01:00
8c540333d5 Merge branch 'develop' into feature/hmc_generalise 2017-04-05 14:41:04 +01:00
5592f7b8c1 Creation mode better implementation 2017-04-05 02:35:34 +09:00
35da4ece0b UID fix 2017-04-05 02:18:15 +09:00
061b15b9e9 Merge branch 'feature/sitmo-skipahead' into develop 2017-04-05 01:24:49 +09:00
ff4e54ef80 Merge branch 'develop' into feature/hadrons 2017-04-03 18:56:21 +01:00
561426f6eb Clean up 2017-04-02 23:13:48 +09:00
83f6fab8fa Big/Small crush test, and fast SITMO rng init, faster but not ideal
MT and Ranlux init.
2017-04-02 12:10:51 +09:00
0fade84ab2 No random device 2017-04-02 00:29:40 +09:00
9dc7ca4c3b Sitmo fast init 2017-04-02 00:28:22 +09:00
935d82f5b1 sanity checks 2017-04-02 00:27:28 +09:00
9cbcdd65d7 No random device seed 2017-04-02 00:26:57 +09:00
f18f5ed926 Drop random device 2017-04-02 00:26:26 +09:00
d1d63a4f2d sitmo default 2017-04-02 00:26:05 +09:00
7e5faa0f34 Multiple RNGs 2017-04-02 00:25:44 +09:00
6af459cae4 Christoph's coefficients. 2017-03-31 17:07:43 +09:00
1c4bc7ed38 Debugged staggered conventions 2017-03-31 14:41:48 +09:00
cd1bd921bd Reduced code duplication for Weak Hamiltonian contraction modules 2017-03-30 18:02:14 +01:00
fff5751b1a HADRONS: Updated rare kaon test program, including all contractions. Sink smearing still to be implemented. 2017-03-30 10:57:01 +01:00
2c81696fdd HADRONS: 4pt Weak + current disconnected topology (e.g. for rare neutral kaon decays) 2017-03-30 10:37:17 +01:00
c9dc22efa1 HADRONS: Standalone disconnected loop contraction. 2017-03-30 10:33:18 +01:00
0ab04a000f HADRONS: 3pt contraction with gamma insertion between two propagators. 2017-03-30 10:30:58 +01:00
93ea5d9468 Pretty code 2017-03-30 15:00:03 +09:00
1ec5d32369 Chulwoo's test to zmobius helped me shake out 2017-03-30 13:45:13 +09:00
9fd23faadf Pretty layout 2017-03-30 13:44:45 +09:00
10e4fa0dc8 Template instantiation improvements 2017-03-30 13:44:25 +09:00
c4aca1dde4 Conjugate coefficients on adjoint 2017-03-30 13:44:05 +09:00
b9e8ea3aaa conjugate coefficient on the dagger 2017-03-30 13:43:13 +09:00
077aa728b9 Fix the ZMobius (I think) 2017-03-30 13:42:09 +09:00
a8d83d886e Macro controls 2017-03-30 13:31:34 +09:00
7fd46eeec4 Trailing whitespace removal 2017-03-30 13:31:10 +09:00
e0c4eeb3ec Compiles again 2017-03-30 13:30:45 +09:00
cb9a297a0a Chulwoo's Zmobius test 2017-03-30 13:30:25 +09:00
2b115929dc Small AVX512 asm ifdef patch 2017-03-29 18:51:23 +09:00
5c6571dab1 Merge branch 'feature/bgq-asm' into develop 2017-03-29 18:48:55 +09:00
417ec56cca Release candidate 2017-03-29 05:45:33 -04:00
756bc25008 Verbose header print by default 2017-03-29 04:44:17 -04:00
35695ba57a Bug fix in MPI3 2017-03-29 04:43:55 -04:00
81ead48850 Log any errors to a file 2017-03-29 04:39:52 -04:00
d805867e02 Better init 2017-03-28 13:25:05 -04:00
e55a751e23 Merge branch 'feature/bgq-asm' of https://github.com/paboyle/Grid into feature/bgq-asm 2017-03-28 12:20:12 -04:00
358eb75995 Shorten loop 2017-03-28 12:20:02 -04:00
98f9318279 Build on AVX2 and MPI passing with clang++ 2017-03-28 23:16:04 +09:00
4b17e8eba8 Merge branch 'develop' into feature/bgq-asm
Conflicts:
	lib/qcd/action/fermion/Fermion.h
	lib/qcd/action/fermion/WilsonFermion.cc
	lib/util/Init.cc
	tests/Test_cayley_even_odd_vec.cc
2017-03-28 04:49:30 -04:00
75112a632a IO improvements to fail on IO error 2017-03-28 02:28:04 -04:00
18bde08d1b Merge branch 'feature/staggering' into develop 2017-03-28 15:25:55 +09:00
d45cd7e677 Adding a simple read of NERSC test 2017-03-26 09:24:26 -04:00
4e96679797 Added a bnl log 2017-03-25 09:25:46 -04:00
4c1ea8677e Small cosmetic changes and vscode gitignore 2017-03-23 14:09:35 +09:00
fc93f0b2ec Save some code for static huge tlb's. It is ifdef'ed out but an interesting root only experiment.
No gain from it.
2017-03-21 22:30:29 -04:00
8c8473998d Average over whole cluster the comm time. 2017-03-21 22:29:51 -04:00
120fb59978 Adding tests for WilsonFlow classes 2017-03-21 16:11:35 +09:00
fd56b3ff38 Merge branch 'develop' into feature/hmc_generalise 2017-03-21 13:33:41 +09:00
0ec6829edc Fixing compilation errors for the WilsonFlow 2017-03-21 13:06:32 +09:00
18b7845b7b Adding WilsonFlow smearing 2017-03-21 11:52:05 +09:00
3d0fe15374 Added topological charge measurement 2017-03-17 16:14:57 +09:00
91886068fe Fixed seg fault for observable modules 2017-03-17 13:59:31 +09:00
6d1e9e5f92 Small cleanup of the observables 2017-03-17 11:42:55 +09:00
b640230b1e Moving hmc observables in a different directory 2017-03-17 11:40:17 +09:00
e7c36771ed ZMobius prep for asm 2017-03-15 14:23:33 -04:00
8dc57a1e25 Layout change 2017-03-13 11:11:46 +00:00
f57bd770b0 Merge branch 'bugfix/dminus' into feature/bgq-asm 2017-03-13 11:11:03 +00:00
4ed10a3d06 Merge branch 'develop' into feature/bgq-asm 2017-03-13 11:10:10 +00:00
dfefc70b57 Merge pull request #93 from Lanny91/hotfix/qpx
Some fixes for QPX and generic SIMD types.
2017-03-13 09:31:26 +00:00
b64e004555 MPI run fail on macos 2017-03-13 01:59:01 +00:00
447c5e6cd7 Z mobius hermiticity correction 2017-03-13 01:30:43 +00:00
8b99d80d8c Merge branch 'bgq-asm-shmemfixes' into feature/bgq-asm 2017-03-12 23:30:09 +00:00
b3dede4dd3 Merge branch 'develop' into feature/hmc_generalise 2017-03-10 23:57:37 +09:00
4e34132f4d Correcting modules use in test files 2017-03-10 23:54:53 +09:00
c07cb10247 Merge branch 'feature/hmc_generalise' of https://github.com/paboyle/Grid into feature/hmc_generalise 2017-03-10 22:37:25 +09:00
d7767a2a62 Few more tests 2017-03-10 22:33:48 +09:00
ec035983fd Fixing the implicit integration 2017-03-01 11:56:35 +00:00
3901b17ade timings from BNL 2017-02-28 17:06:45 -05:00
af230a1fb8 Average the time across the whole machine for outliers 2017-02-28 17:05:22 -05:00
06a132e3f9 Fixes to SHMEM comms 2017-02-28 13:31:54 -08:00
596dcd85b2 Auxiliary fields 2017-02-27 13:16:38 +00:00
96d44d5c55 Header fix 2017-02-24 19:12:11 -05:00
7270c6a150 Integrator works now 2017-02-24 17:03:42 +00:00
7fe797daf8 SIMD vector length sanity checks 2017-02-23 16:49:44 +00:00
486a01294a Corrected QPX SIMD width 2017-02-23 16:47:56 +00:00
586a7c90b7 Merge branch 'develop' into feature/bgq-asm 2017-02-23 00:26:59 +00:00
e099dcdae7 Merge branch 'develop' into feature/bgq-asm 2017-02-23 00:25:29 +00:00
4e7ab3166f Refactoring header layout 2017-02-22 18:09:33 +00:00
aac80cbb44 Bug fix from Chris K 2017-02-22 12:19:09 -05:00
c80948411b Added tRotate function and MaddRealPart struct for generic SIMD, bugfix in MultRealPart and minor cosmetic changes. 2017-02-22 14:57:10 +00:00
95625a7bd1 Use Grid Integer type 2017-02-22 13:09:32 +00:00
0796696733 Emulated integer vector type for QPX and generic SIMD instruction sets. 2017-02-22 12:01:36 +00:00
f8b9ad7d50 Merge pull request #91 from sunpho84/public_modules_memebers
making public same serializable parameters in HMC Module
2017-02-22 00:53:20 +00:00
04a1959895 Merge pull request #90 from sunpho84/liming
adding --with switch to pass lime path
2017-02-22 00:52:53 +00:00
1c30e9a961 Verified 2017-02-21 23:01:25 +00:00
93cc270016 making public same serializable parameters in HMC Module
RNGModuleParameters
GridModuleParameters
2017-02-21 23:11:56 +01:00
29b60f7e1a adding --with switch to pass lime path 2017-02-21 23:09:39 +01:00
bf7e3f20d4 Staggered fermion optimised version 2017-02-21 14:35:42 +00:00
902afcfbaf Adding metric and the implicit steps 2017-02-21 11:30:57 +00:00
3ae92fa2e6 Global changes to parallel_for structure.
Move the comms flags to more sensible names
2017-02-21 05:24:27 -05:00
3906cd2149 Stencil fix on BNL KNL system 2017-02-20 17:51:31 -05:00
5a1fb29db7 Useful debug code info to preserve 2017-02-20 17:49:23 -05:00
661fc4d3d1 Debug AVX512 exchange code paths 2017-02-20 17:48:36 -05:00
41009cc142 Move exchange into the stencil only; keep Cshift fully general 2017-02-20 17:48:04 -05:00
37720c4db7 Count bytes off node only 2017-02-20 17:47:40 -05:00
1a30455a10 1000 iters on bmark for more accurate timing 2017-02-20 17:47:01 -05:00
97a6b61551 Covariant laplacian and implicit integration 2017-02-20 11:17:27 +00:00
cd0da81196 Merge branch 'feature/bgq-asm' of https://github.com/paboyle/Grid into feature/bgq-asm 2017-02-16 18:52:30 -05:00
f246fe3304 Improvements to avx for invertible to avoid latent bug 2017-02-16 23:52:44 +00:00
8a29c16bde Faster gather exchange 2017-02-16 23:52:22 +00:00
d68907fc3e Debug temp 2017-02-16 18:51:35 -05:00
5c0adf7bf2 Make clang happy with parenthesis 2017-02-16 23:51:33 +00:00
be3a8249c6 Faster gather 2017-02-16 23:51:15 +00:00
bd600702cf Vectorise the XYZT face gathering better.
Hard coded for simd_layout <= 2 in any given spread out direction; full generality is inconsistent
with efficiency.
2017-02-15 11:11:04 +00:00
f011bdb869 Fixed overwrite of pminus projection in construction of 4d propagator from 5d. 2017-02-14 14:07:17 +00:00
bafb101e4f Testing different versions of the Laplacian 2017-02-13 15:38:11 +00:00
08fdf05528 Added and tested the covariant laplacian + CG solver 2017-02-13 15:05:01 +00:00
aca7a3ef0a Optimisation control improvements 2017-02-10 18:22:31 -05:00
9e72a6b22e Reverting to Xcode 7.3 2017-02-10 12:57:03 +00:00
1c12c5612c Xcode 8.2 for travis 2017-02-10 12:12:01 +00:00
a8193c4bcb Correcting travis compilation on gcc 2017-02-10 10:59:30 +00:00
c3d7ec65fa All tests compile. 2017-02-10 10:27:51 +00:00
8b6a6c8236 Resolving small merge conflict 2017-02-09 16:20:24 +00:00
e0571c872b Merge branch 'develop' into feature/hmc_generalise 2017-02-09 16:12:00 +00:00
c67f41887b Reverting parameters to original 2017-02-09 15:59:56 +00:00
84687ccf1f Handling an Intel compiler warning for Json class 2017-02-09 15:33:33 +00:00
3274561cf8 Cleanup 2017-02-09 15:18:38 +00:00
e08fbb3771 Merge pull request #84 from Lanny91/feature/rare_kaon
Rare Kaon decay contraction code
2017-02-08 08:23:34 -08:00
d7464aa0fe Switched from XmlWriter to CorrWriter in contraction code 2017-02-08 16:13:44 +00:00
00d29153f0 Merge branch 'feature/hadrons' of https://github.com/paboyle/Grid into feature/rare_kaon 2017-02-08 16:11:15 +00:00
2ce989f220 Hadrons: default I/O to HDF5 2017-02-08 07:50:05 -08:00
d7a1dc85be Revert "Hadrons: test for rare kaon contraction code."
This reverts commit 1e257a1251.
2017-02-08 13:23:05 +00:00
fc19503673 Removed MSink namespace. 2017-02-08 13:17:39 +00:00
beba824136 Make use of GammaL class in Weak Hamiltonian contractions 2017-02-08 12:45:39 +00:00
6ebf8b12b6 Removed unnecessary repeat of write in Weak Hamiltonian contractions 2017-02-08 12:43:13 +00:00
e5a7ed4362 Moved write outside of loop, some physics corrections 2017-02-08 12:29:33 +00:00
b9f7ea47c3 Access hasModule function directly from Environment instance. 2017-02-08 10:10:06 +00:00
06f7ee202e Revert "Add function to say whether or not a module exists in application class"
This reverts commit 522f6bf91a.
2017-02-08 10:08:18 +00:00
2b2fc6453f Fixed single precision compatibility issues 2017-02-07 13:59:29 +00:00
bdd2765461 Added missing allocation of Weak Hamiltonian result vector 2017-02-07 13:06:42 +00:00
2c246551d0 Overlap comms and compute options in wilson kernels 2017-02-07 01:37:10 -05:00
71ac2e7940 Faster RNG init 2017-02-07 01:33:23 -05:00
2bf4688e83 Running on BNL KNL 2017-02-07 01:32:10 -05:00
a48ee6f0f2 Don't use MPI3_leader any more. No real gain and complex 2017-02-07 01:31:24 -05:00
73547cca66 MPI3 working i think 2017-02-07 01:30:02 -05:00
123c673db7 Policy to control async or sync SendRecv 2017-02-07 01:24:54 -05:00
61f82216e2 Communicator Policy, NodeCount distinct from Rank count 2017-02-07 01:22:53 -05:00
8e7ca92278 Debugged cshift case 2017-02-07 01:21:32 -05:00
485ad6fde0 Stencil working in SHM MPI3 2017-02-07 01:20:39 -05:00
6ea2184e18 OMP define change 2017-02-07 01:17:16 -05:00
fdc170b8a3 Parallel fors in lattice transfer 2017-02-07 01:16:39 -05:00
060da786e9 Comms benchmark improvements 2017-02-07 01:07:39 -05:00
85c7bc4321 Bug fixes for cases that physics code couldn't hit but latent
and discovered on KNL (long vector, y SIMD dir) and checker dir set to y.
Remove the assertions on these code paths now they are tested.
2017-02-07 01:01:15 -05:00
0883d6a7ce Overlap comms compute support; make reg naming consistent with bgq asm 2017-02-07 00:59:32 -05:00
9ff97b4711 Improved stencil tests passing all on KNL multinode 2017-02-07 00:58:34 -05:00
b5e9c900a4 Better printing and signal handling options 2017-02-07 00:57:55 -05:00
4bbdfb434c Overlap comms compute modifications 2017-02-07 00:57:01 -05:00
4a45c06dd7 Code cleaning and addition of Weak Hamiltonian contraction log message 2017-02-06 20:12:30 +00:00
d6a7d7d1e0 Hadrons: added missing momentum parameter in rare kaon contraction test 2017-02-06 18:15:49 +00:00
1a122a0dd8 Hadrons: corrected gamma matrix inputs in rare kaon test 2017-02-06 17:35:41 +00:00
20e20733e8 Merge branch 'feature/hadrons' into feature/rare_kaon 2017-02-06 14:12:21 +00:00
b7cd1a19e3 Utilities for reading and writing "pair" objects. 2017-02-06 14:08:59 +00:00
f510002a62 Merge remote-tracking branch 'paboyle/feature/hadrons' into feature/hadrons 2017-02-03 14:37:34 +00:00
1e257a1251 Hadrons: test for rare kaon contraction code. 2017-02-01 16:36:40 +00:00
522f6bf91a Add function to say whether or not a module exists in application class 2017-02-01 16:36:08 +00:00
d35d87d2c2 Weak Hamiltonian Eye-type contraction execution 2017-02-01 16:33:24 +00:00
74a5cda84b Removed unnecessary "3pt" labels 2017-02-01 15:03:49 +00:00
5be05d85b8 Fixed collision of Wall source and sink header ifdefs 2017-02-01 13:56:22 +00:00
35ac85aea8 Updated Weak Hamiltonian contractions to use zero-flop gamma matrices 2017-02-01 12:57:34 +00:00
fa237401ff Consistent variable name in macro 2017-02-01 12:56:55 +00:00
97053adcb5 Merge branch 'feature/hadrons' into feature/rare_kaon 2017-02-01 10:13:29 +00:00
f8fbe4d7a3 Merge remote-tracking branch 'paboyle/feature/hadrons' into feature/hadrons
# Conflicts:
#	extras/Hadrons/Modules/MContraction/Meson.hpp
#	tests/hadrons/Test_hadrons_meson_3pt.cc

Updated Meson.hpp to utilise zero-flop gamma matrices.
2017-02-01 09:27:00 +00:00
ef31c012bf Merge remote-tracking branch 'paboyle/develop' into feature/hadrons 2017-01-31 17:36:10 +00:00
9e9f621d5d Hadrons: added Weak Hamiltonian module dependencies, some reformatting. 2017-01-30 17:54:21 +00:00
651e1a7cbc Hadrons: Momentum inserted as multiples of 2*pi/L 2017-01-30 17:14:33 +00:00
c4d3672720 Hadrons: Momentum projection in meson module. 2017-01-30 17:09:04 +00:00
16be6d378c Now action factory support different Fields (templated) 2017-01-30 14:22:41 +00:00
f05d0565aa Adding ScalarField theory 2017-01-30 10:59:28 +00:00
9bf4108d1f Weak Hamiltonian contraction modules, for Eye and NonEye contraction topologies. Execution for NonEye type diagrams has been implemented, but not yet for Eye type. 2017-01-27 16:58:11 +00:00
6929a84c70 Reformatting files 2017-01-27 11:54:44 +00:00
5c779a789b Moving registrations in an independent file 2017-01-27 11:23:51 +00:00
e863a948e3 Cleaning up files and directories 2017-01-26 15:24:49 +00:00
977f34dca6 Added missing typename 2017-01-26 13:18:33 +00:00
90ad956340 Merge branch 'develop' of https://github.com/paboyle/Grid into feature/rare_kaon 2017-01-26 12:08:41 +00:00
7996f06335 Commented out registrations.
Move to an independent file that is linked only for the factory managed HMC
2017-01-25 18:27:45 +00:00
7b40a3e3e5 Reorganizing files 2017-01-25 18:09:46 +00:00
f7fbbaaca3 Compiles after merging 2017-01-25 12:11:58 +00:00
17629b8d9e Merge branch 'develop' into feature/hmc_generalise 2017-01-25 11:33:53 +00:00
0baa20d292 Again fixing compilation on Travis, no LIME lib present 2017-01-25 11:18:44 +00:00
4571c918a4 Fixing compilation error when compiling without LIME 2017-01-25 11:14:43 +00:00
5251ea4d30 Adding more fermion action modules, generalised DWF 2017-01-25 11:10:44 +00:00
7f456b4173 👷 Added all pseudofermion actions to the serialiser 2017-01-24 13:57:32 +00:00
c291ef77b5 Merge branch 'feature/hadrons' of https://github.com/paboyle/Grid into feature/hadrons 2017-01-23 15:24:47 +00:00
7dd2764bb2 Wall sink smearing 2017-01-23 15:17:54 +00:00
244f8fb6dc Added JSON parser (without NextElement) 2017-01-23 14:57:38 +00:00
05c1924819 Timing loop change 2017-01-23 10:43:45 +00:00
27dfe816fa Added TwoFlavorsEO
Had to remove a conformability check in the Derivative of SchurDiff,
see the comments in the file
2017-01-20 16:59:31 +00:00
af29be2c90 Simplified operation of meson module. Result has been modified to output one contraction at a time for each pair of gamma insertions at source and sink. 2017-01-20 16:38:50 +00:00
f96fac0aee All functionalities ready.
Todo: add all the fermion action modules
2017-01-20 12:56:20 +00:00
07f2ebea1b Meson module now takes list of gamma matrices to insert at source and sink. 2017-01-19 22:18:42 +00:00
851f2ad8ef Adding fermions actions support in the factories 2017-01-19 10:00:02 +00:00
23e0561dd6 Added all required functionalities, time for cleaning
All actions to be added
2017-01-18 16:31:51 +00:00
8ae1a95ec6 Legal banners and module descriptions 2017-01-17 18:14:20 +00:00
82b7d4eaf0 Added noise loop dependencies 2017-01-17 15:58:32 +00:00
78774fbdc0 Construct loop propagator 2017-01-17 15:29:45 +00:00
924130833e Moved more parameters to serialization 2017-01-17 13:22:18 +00:00
0157274762 HMC factories 2017-01-17 10:46:49 +00:00
87e8aad5a0 Added support for input file HMC modules (missing the actions yet) 2017-01-16 16:07:12 +00:00
c6f59c2933 Adding factories 2017-01-16 10:18:09 +00:00
b7f90aa011 Added momentum choice for wall source 2017-01-13 15:54:19 +00:00
f22b79da8f Added missing type aliases 2017-01-12 12:52:12 +00:00
3855673ebf Added header for wall source 2017-01-12 11:42:37 +00:00
4db82da0db Wall sources 2017-01-12 11:41:10 +00:00
0cdc3d2fa5 Merge remote-tracking branch 'refs/remotes/paboyle/feature/hadrons' into feature/hadrons 2017-01-12 11:26:55 +00:00
0dfda4bb90 Working on the RNGModule 2017-01-09 11:06:18 +00:00
1189ebc8b5 Cleaning up the checkpointers interface 2017-01-05 15:52:52 +00:00
1bb8578173 Added module for checkpointers 2017-01-05 13:09:32 +00:00
c3b6d573b9 Merge branch 'feature/bgq-asm' of https://github.com/paboyle/Grid into feature/bgq-asm 2016-12-30 22:42:17 +00:00
1e179c903d Worried about integer; suspect where statements are broken 2016-12-27 17:46:38 +00:00
669cfca9b7 No inline 2016-12-27 17:45:40 +00:00
ff2f559a57 Remove inline on gather optimised path 2016-12-27 17:45:19 +00:00
03c81bd902 Merge branch 'feature/bgq-asm' of https://github.com/paboyle/Grid into feature/bgq-asm 2016-12-27 11:25:35 +00:00
a869addef1 Stats switch off 2016-12-27 11:25:22 +00:00
1caa3fbc2d LOCK UNLOCK only 2016-12-27 11:24:45 +00:00
3d21297bbb Call the fast path compressor for wilson kernels to avoid if else on projector 2016-12-27 11:23:13 +00:00
25efefc5b4 Back to original thread policy post test 2016-12-23 09:49:04 +00:00
eabf316ed9 BGQ performance ASM 2016-12-22 21:56:08 +00:00
04ae7929a3 BGQ or KNL assembler now 2016-12-22 17:53:22 +00:00
caba0d42a5 L1p controls 2016-12-22 17:52:55 +00:00
9ae81c06d2 L1p controls for BG/Q 2016-12-22 17:52:21 +00:00
0903c48caa Hot start SU3 2016-12-22 17:51:45 +00:00
7dc36628a1 QPX finishing 2016-12-22 17:50:48 +00:00
b8cdb3e90a Debug hack; raises from 62GF/s to 72 GF/s per node on BG/Q 2016-12-22 17:50:14 +00:00
5241245534 Default to static scheduling 2016-12-22 17:49:21 +00:00
960316e207 type conversion in printf 2016-12-22 17:27:01 +00:00
5214846341 Adding a resource manager 2016-12-22 12:41:56 +00:00
ce1a115e0b Removing redundant arguments for integrator functions, step 1 2016-12-20 17:51:30 +00:00
3f2d53a994 BGQ assembler beginning 2016-12-20 10:21:26 +00:00
df9108154d Debugged 2 versions of assembler; ls vectorised, xyzt vectorised 2016-12-17 23:47:51 +00:00
b3e7f600da Partial implementation of 4d vectorisation assembler 2016-12-16 23:50:30 +00:00
d4071daf2a Template specialise 2016-12-16 22:28:29 +00:00
a2a6329094 AVX512 only for ASM compilation 2016-12-16 22:03:29 +00:00
eabc577940 Assembler possibly working 2016-12-16 16:55:36 +00:00
0bd296dda4 Adding check of the Dag part in the benchmark 2016-12-14 03:15:09 +00:00
af0ccdd8e9 Moving output order 2016-12-14 02:02:42 +00:00
2fb92dbc6e Cleaning up previous debug lines 2016-12-13 07:53:43 +00:00
5c74b6028b Commit for debugging, lot of IO 2016-12-13 06:35:30 +00:00
e0be2b6e6c Adding a new tests for the Ls vec CG 2016-12-13 04:59:18 +00:00
ef72f322d2 consistency of tests 2016-12-13 02:24:20 +00:00
426197e446 Nc=3 2016-12-12 09:10:54 +00:00
99e2c1e666 Kernels options 2016-12-12 09:08:53 +00:00
1440565a10 Decrease verbosity 2016-12-12 09:08:04 +00:00
e9f0c0ea39 Staggered kernels options 2016-12-12 09:07:38 +00:00
7bc2065113 Adding report at the end of the DWF HMC tests 2016-12-12 04:21:34 +00:00
2bd4233919 Completed testing of the HMC for Ls vectorised version (on AVX2) 2016-12-07 04:56:37 +00:00
143c70e29f Debugged the threaded version. Cleaning up 2016-12-07 04:40:25 +00:00
b812d5e39c Added single threaded version of the derivative for the Ls vectorised DWF 2016-12-06 16:31:13 +00:00
01480da0a8 Merge branch 'develop' into feature/hmc_generalise 2016-12-05 05:10:27 +00:00
c097fd041a Merge branch 'develop' of https://github.com/paboyle/Grid into feature/staggering 2016-11-29 13:44:17 +00:00
77fb25fb29 Push 5d tests 2016-11-29 13:43:56 +00:00
389e0a77bd Staggerd Fermion 5D 2016-11-29 13:13:56 +00:00
95f43d27ae Merge branch 'develop' of https://github.com/paboyle/Grid into feature/staggering 2016-11-22 13:49:22 +00:00
668ca57702 Merge branch 'develop' of https://github.com/paboyle/Grid into feature/staggering 2016-11-22 13:49:11 +00:00
62749d05a6 Naming the scalar action 2016-11-17 12:26:20 +00:00
3834feb4b7 Adding action names 2016-11-16 16:46:49 +00:00
454302414d Small modif at the test hmc 2016-11-15 12:31:13 +00:00
6f8b771a37 Adding date of the last commit 2016-11-10 18:52:00 +00:00
4e1ffdd17c Adding git info to the configure output 2016-11-10 18:44:36 +00:00
a783282b8b Merge branch 'develop' into feature/hmc_generalise 2016-11-10 18:13:07 +00:00
19b85d8486 Some comments in the hmc files 2016-11-10 17:55:58 +00:00
c363bdd784 Merge branch 'release/v0.6.0' 2016-11-09 12:43:14 +00:00
ee686a7d85 Compiles now 2016-11-03 16:58:23 +00:00
1c5b7a6be5 Staggered phases first cut, c1, c2, u0 2016-11-03 16:26:56 +00:00
164d3691db Staggered 2016-11-01 14:24:22 +00:00
1d666771f9 Debugging the RNG, eliminate the barrier after broadcast 2016-10-26 16:08:23 +01:00
d50055cd96 Making the ILDG support optional 2016-10-26 09:48:01 +01:00
47c7159177 ILDG reader/writer works
Fill the xml header with the required information, todo.
2016-10-24 21:57:54 +01:00
f415db583a Adding ILDG format 2016-10-24 15:48:22 +01:00
f55c16f984 Adding a barrier in the RNG save 2016-10-24 11:02:14 +01:00
df67e013ca More debug output for the RNG 2016-10-22 13:34:17 +01:00
3e990c9d0a Reverting the broadcast change 2016-10-22 13:26:43 +01:00
4b740fc8fd Debugging the RNG state save 2016-10-22 13:06:00 +01:00
cccd14b09e Small cleanup 2016-10-21 17:20:54 +01:00
e6acffdfc2 Fixing the plaquette computation 2016-10-21 16:06:34 +01:00
392130a537 Working on the 5d 2016-10-21 14:22:25 +01:00
deef2673b2 Separating the Lattice theories stub from the QCD.h file 2016-10-20 17:24:08 +01:00
977b0a6dd9 Merge branch 'develop' into feature/hmc_generalise 2016-10-20 17:04:41 +01:00
977d844394 Few modifications on stdout messages 2016-10-20 17:01:59 +01:00
590675e2ca Csum in hex format 2016-10-19 17:26:25 +01:00
8c65bdf6d3 Printing checksum for the RNG file 2016-10-19 16:56:11 +01:00
74f1ed3bc5 Adding some documentation for HMC 2016-10-19 10:51:13 +01:00
79270ef510 Added a test for EODWF Scaled Shamir with general HMC 2016-10-14 17:34:26 +01:00
e250e6b7bb Moving parameters outside of the HMCrunner 2016-10-14 17:22:32 +01:00
261342c15f Adding gh-pages 2016-10-13 11:51:25 +01:00
eda4dd622e Some more edit 2016-10-11 15:45:20 +01:00
c68a2b9637 Minor fix 2016-10-10 11:54:58 +01:00
293df6cd20 Generalising the HMCRunner and moving parameters to the user level 2016-10-10 11:49:55 +01:00
65f61bb3bf Reset QCD colours to 3 2016-10-10 09:46:17 +01:00
26b9740d53 Some fix for the GenericHMCrunner 2016-10-10 09:43:05 +01:00
6eb873dd96 Added scalar action phi^4
Check Norm2 output (Complex type assumption)
2016-10-07 17:28:46 +01:00
11b4c80b27 Added support for hmc and binary IO for a general field 2016-10-07 13:37:29 +01:00
c065e454c3 Adding Binrary IO, untested 2016-10-06 10:12:11 +01:00
d9b5fbd374 In the middle of adding a general binary writer 2016-10-04 11:24:08 +01:00
cfbc1a26b8 Now the gauge implementation has to take care of the Nexp 2016-10-03 16:20:06 +01:00
257f69f931 One more function to generalise the HMC integrator 2016-10-03 15:50:04 +01:00
e415260961 First cut on generalised HMC
Backward compatibility OK
2016-10-03 15:28:00 +01:00
446c768cd3 Merge branch 'hotfix/v0.5.1'
Double precision compile fix
2016-07-01 16:33:59 +01:00
382 changed files with 57078 additions and 9801 deletions

8
.gitignore vendored
View File

@ -92,6 +92,7 @@ build*/*
#####################
*.xcodeproj/*
build.sh
.vscode
# Eigen source #
################
@ -106,6 +107,10 @@ lib/fftw/*
m4/lt*
m4/libtool.m4
# github pages #
################
gh-pages/
# Buck files #
##############
.buck*
@ -116,4 +121,5 @@ make-bin-BUCK.sh
# generated sources #
#####################
lib/qcd/spin/gamma-gen/*.h
lib/qcd/spin/gamma-gen/*.cc
lib/qcd/spin/gamma-gen/*.cc

View File

@ -7,9 +7,11 @@ cache:
matrix:
include:
- os: osx
osx_image: xcode7.2
osx_image: xcode8.3
compiler: clang
- compiler: gcc
dist: trusty
sudo: required
addons:
apt:
sources:
@ -24,6 +26,8 @@ matrix:
- binutils-dev
env: VERSION=-4.9
- compiler: gcc
dist: trusty
sudo: required
addons:
apt:
sources:
@ -38,6 +42,7 @@ matrix:
- binutils-dev
env: VERSION=-5
- compiler: clang
dist: trusty
addons:
apt:
sources:
@ -52,6 +57,7 @@ matrix:
- binutils-dev
env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
- compiler: clang
dist: trusty
addons:
apt:
sources:
@ -73,13 +79,15 @@ before_install:
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
install:
- export CC=$CC$VERSION
- export CXX=$CXX$VERSION
- echo $PATH
- which autoconf
- autoconf --version
- which automake
- automake --version
- which $CC
- $CC --version
- which $CXX
@ -92,15 +100,15 @@ script:
- cd build
- ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- echo make clean
- ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check
- echo make clean
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi
- ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
- make -j4
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi

View File

@ -3,10 +3,15 @@ SUBDIRS = lib benchmarks tests extras
include $(top_srcdir)/doxygen.inc
tests: all
$(MAKE) -C tests tests
bin_SCRIPTS=grid-config
.PHONY: tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)
.PHONY: bench check tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)
tests-local: all
bench-local: all
check-local: all
AM_CXXFLAGS += -I$(top_builddir)/include
ACLOCAL_AMFLAGS = -I m4

View File

@ -22,6 +22,26 @@ Last update Nov 2016.
_Please do not send pull requests to the `master` branch which is reserved for releases._
### Compilers
Intel ICPC v16.0.3 and later
Clang v3.5 and later (need 3.8 and later for OpenMP)
GCC v4.9.x (recommended)
GCC v6.3 and later
### Important:
Some versions of GCC appear to have a bug under high optimisation (-O2, -O3).
The safety of these compiler versions cannot be guaranteed at this time. Follow Issue 100 for details and updates.
GCC v5.x
GCC v6.1, v6.2
### Bug report
_To help us track and resolve issues with Grid more efficiently, please report problems using the issue system of GitHub rather than sending emails to Grid developers._
@ -32,7 +52,7 @@ When you file an issue, please go through the following checklist:
2. Give a description of the target platform (CPU, network, compiler). Please give the full CPU part description, using for example `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux) or `sysctl machdep.cpu.brand_string` (macOS) and the full output of the `--version` option of your compiler.
3. Give the exact `configure` command used.
4. Attach `config.log`.
5. Attach `config.summary`.
5. Attach `grid.config.summary`.
6. Attach the output of `make V=1`.
7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
@ -95,10 +115,10 @@ install Grid. Other options are detailed in the next section, you can also use `
`CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be modified to
customise the build.
Finally, you can build and install Grid:
Finally, you can build, check, and install Grid:
``` bash
make; make install
make; make check; make install
```
To minimise the build time, only the tests at the root of the `tests` directory are built by default. If you want to build tests in the sub-directory `<subdir>` you can execute:
@ -121,7 +141,7 @@ If you want to build all the tests at once just use `make tests`.
- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
- `--enable-precision={single|double}`: set the default precision (default: `double`).
- `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible comms targets is detailed in a section below.
- `--enable-rng={ranlux48|mt19937}`: choose the RNG (default: `ranlux48 `).
- `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo`).
- `--disable-timers`: disable system dependent high-resolution timers.
- `--enable-chroma`: enable Chroma regression tests.
- `--enable-doxygen-doc`: enable the Doxygen documentation generation (build with `make doxygen-doc`)
@ -159,7 +179,6 @@ Alternatively, some CPU codenames can be directly used:
| `<code>` | Description |
| ----------- | -------------------------------------- |
| `KNC` | [Intel Xeon Phi codename Knights Corner](http://ark.intel.com/products/codename/57721/Knights-Corner) |
| `KNL` | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
| `BGQ` | Blue Gene/Q |

61
TODO
View File

@ -1,6 +1,26 @@
TODO:
---------------
Peter's work list:
2)- Precision conversion and sort out localConvert <--
3)- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started
4)- Binary I/O speed up & x-strips
-- Profile CG, BlockCG, etc... Flop count/rate -- PARTIAL, time but no flop/s yet
-- Physical propagator interface
-- Conserved currents
-- GaugeFix into central location
-- Multigrid Wilson and DWF, compare to other Multigrid implementations
-- HDCR resume
Recent DONE
-- Cut down the exterior overhead <-- DONE
-- Interior legs from SHM comms <-- DONE
-- Half-precision comms <-- DONE
-- Merge high precision reduction into develop
-- multiRHS DWF; benchmark on Cori/BNL for comms elimination
-- slice* linalg routines for multiRHS, BlockCG
-----
* Forces; the UdSdU term in gauge force term is half of what I think it should
be. This is a consequence of taking ONLY the first term in:
@ -21,16 +41,8 @@ TODO:
This means we must double the force in the Test_xxx_force routines, and is the origin of the factor of two.
This 2x is applied by hand in the fermion routines and in the Test_rect_force routine.
Policies:
* Link smearing/boundary conds; Policy class based implementation ; framework more in place
* Support different boundary conditions (finite temp, chem. potential ... )
* Support different fermion representations?
- contained entirely within the integrator presently
- Sign of force term.
- Reversibility test.
@ -41,11 +53,6 @@ Policies:
- Audit oIndex usage for cb behaviour
- Rectangle gauge actions.
Iwasaki,
Symanzik,
... etc...
- Prepare multigrid for HMC. - Alternate setup schemes.
- Support for ILDG --- ugly, not done
@ -55,9 +62,11 @@ Policies:
- FFTnD ?
- Gparity; hand opt use template specialisation elegance to enable the optimised paths ?
- Gparity force term; Gparity (R)HMC.
- Random number state save restore
- Mobius implementation clean up to remove #if 0 stale code sequences
- CG -- profile carefully, kernel fusion, whole CG performance measurements.
================================================================
@ -90,6 +99,7 @@ Insert/Extract
Not sure of status of this -- reverify. Things are working nicely now though.
* Make the Tensor types and Complex etc... play more nicely.
- TensorRemove is a hack, come up with a long term rationalised approach to Complex vs. Scalar<Scalar<Scalar<Complex > > >
QDP forces use of "toDouble" to get back to non tensor scalar. This role is presently taken by TensorRemove, but I
want to introduce a syntax that does not require this.
@ -112,6 +122,8 @@ Not sure of status of this -- reverify. Things are working nicely now though.
RECENT
---------------
- Support different fermion representations? -- DONE
- contained entirely within the integrator presently
- Clean up HMC -- DONE
- LorentzScalar<GaugeField> gets Gauge link type (cleaner). -- DONE
- Simplified the integrators a bit. -- DONE
@ -123,6 +135,26 @@ RECENT
- Parallel io improvements -- DONE
- Plaquette and link trace checks into nersc reader from the Grid_nersc_io.cc test. -- DONE
DONE:
- MultiArray -- MultiRHS done
- ConjugateGradientMultiShift -- DONE
- MCR -- DONE
- Remez -- Mike or Boost? -- DONE
- Proto (ET) -- DONE
- uBlas -- DONE ; Eigen
- Potentially Useful Boost libraries -- DONE ; Eigen
- Aligned allocator; memory pool -- DONE
- Multiprecision -- DONE
- Serialization -- DONE
- Regex -- Not needed
- Tokenize -- Why?
- Random number state save restore -- DONE
- Rectangle gauge actions. -- DONE
Iwasaki,
Symanzik,
... etc...
Done: Cayley, Partial , ContFrac force terms.
DONE
@ -207,6 +239,7 @@ Done
FUNCTIONALITY: it pleases me to keep track of things I have done (keeps me arguably sane)
======================================================================================================
* Link smearing/boundary conds; Policy class based implementation ; framework more in place -- DONE
* Command line args for geometry, simd, etc. layout. Is it necessary to have -- DONE
user pass these? Is this a QCD specific?

View File

@ -1,6 +1,5 @@
Version : 0.6.0
Version : 0.7.0
- AVX512, AVX2, AVX, SSE good
- Clang 3.5 and above, ICPC v16 and above, GCC 4.9 and above
- MPI and MPI3
- HiRep, Smearing, Generic gauge group
- Clang 3.5 and above, ICPC v16 and above, GCC 6.3 and above recommended
- MPI and MPI3 comms optimisations for KNL and OPA finished
- Half precision comms

View File

@ -48,9 +48,9 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
int maxlat=16;
for(int lat=4;lat<=maxlat;lat+=2){
for(int Ls=1;Ls<=16;Ls*=2){
int maxlat=24;
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=32;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@ -124,8 +124,8 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
for(int lat=4;lat<=maxlat;lat+=2){
for(int Ls=1;Ls<=16;Ls*=2){
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=32;Ls*=2){
std::vector<int> latt_size ({lat,lat,lat,lat});
@ -194,14 +194,14 @@ int main (int argc, char ** argv)
}
Nloop=100;
Nloop=10;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
for(int lat=4;lat<=maxlat;lat+=2){
for(int Ls=1;Ls<=16;Ls*=2){
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=32;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@ -281,8 +281,8 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
for(int lat=4;lat<=maxlat;lat+=2){
for(int Ls=1;Ls<=16;Ls*=2){
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=32;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@ -324,8 +324,8 @@ int main (int argc, char ** argv)
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
// Grid.StencilSendToRecvFromComplete(requests);
// requests.resize(0);
Grid.StencilSendToRecvFromComplete(requests);
requests.resize(0);
comm_proc = mpi_layout[mu]-1;

View File

@ -1,28 +1,22 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
@ -48,16 +42,16 @@ typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> latt4 = GridDefaultLatt();
const int Ls=8;
const int Ls=16;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@ -71,35 +65,66 @@ int main (int argc, char ** argv)
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermion src (FGrid); random(RNG5,src);
#if 0
src = zero;
{
std::vector<int> origin({0,0,0,latt4[2]-1,0});
SpinColourVectorF tmp;
tmp=zero;
tmp()(0)(0)=Complex(-2.0,0.0);
std::cout << " source site 0 " << tmp<<std::endl;
pokeSite(tmp,src,origin);
}
#else
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
#endif
LatticeFermion result(FGrid); result=zero;
LatticeFermion ref(FGrid); ref=zero;
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeField Umu(UGrid);
random(RNG4,Umu);
LatticeGaugeField Umu5d(FGrid);
SU3::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrix ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
// replicate across fifth dimension
LatticeGaugeField Umu5d(FGrid);
std::vector<LatticeColourMatrix> U(4,FGrid);
for(int ss=0;ss<Umu._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
}
}
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
std::vector<LatticeColourMatrix> U(4,FGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
@ -120,8 +145,7 @@ int main (int argc, char ** argv)
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
@ -131,15 +155,22 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
int ncall =100;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =1000;
if (1) {
FGrid->Barrier();
Dw.ZeroCounters();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
@ -153,16 +184,55 @@ int main (int argc, char ** argv)
double flops=1344*volume*ncall;
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
/*
if(( norm2(err)>1.0e-4) ) {
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
FGrid->Barrier();
exit(-1);
}
*/
assert (norm2(err)< 1.0e-4 );
Dw.Report();
}
DomainWallFermionRL DwH(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
if (1) {
FGrid->Barrier();
DwH.ZeroCounters();
DwH.Dhop(src,result,0);
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
DwH.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert (norm2(err)< 1.0e-3 );
DwH.Report();
}
if (1)
{
@ -171,6 +241,10 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
@ -182,21 +256,13 @@ int main (int argc, char ** argv)
LatticeFermion sresult(sFGrid);
WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){
for(int z=0;z<latt4[2];z++){
for(int t=0;t<latt4[3];t++){
for(int s=0;s<Ls;s++){
std::vector<int> site({s,x,y,z,t});
SpinColourVector tmp;
peekSite(tmp,src,site);
pokeSite(tmp,ssrc,site);
}}}}}
localConvert(src,ssrc);
std::cout<<GridLogMessage<< "src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
FGrid->Barrier();
double t0=usecond();
sDw.Dhop(ssrc,sresult,0);
sDw.ZeroCounters();
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
sDw.Dhop(ssrc,sresult,0);
@ -210,46 +276,52 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
// std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
sDw.Report();
if(0){
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
sDw.Dhop(ssrc,sresult,0);
PerformanceCounter Counter(i);
Counter.Start();
sDw.Dhop(ssrc,sresult,0);
Counter.Stop();
Counter.Report();
}
}
std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
RealD sum=0;
for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){
for(int z=0;z<latt4[2];z++){
for(int t=0;t<latt4[3];t++){
for(int s=0;s<Ls;s++){
std::vector<int> site({s,x,y,z,t});
SpinColourVector normal, simd;
peekSite(normal,result,site);
peekSite(simd,sresult,site);
sum=sum+norm2(normal-simd);
if (norm2(normal-simd) > 1.0e-6 ) {
std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" normal "<<normal<<std::endl;
std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" simd "<<simd<<std::endl;
}
}}}}}
std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
assert (sum< 1.0e-4 );
err=zero;
localConvert(sresult,err);
err = err - ref;
sum = norm2(err);
std::cout<<GridLogMessage<<" difference between normal ref and simd is "<<sum<<std::endl;
if(sum > 1.0e-4 ){
std::cout<< "sD REF\n " <<ref << std::endl;
std::cout<< "sD ERR \n " <<err <<std::endl;
}
// assert(sum < 1.0e-4);
if (1) {
err=zero;
localConvert(sresult,err);
err = err - result;
sum = norm2(err);
std::cout<<GridLogMessage<<" difference between normal result and simd is "<<sum<<std::endl;
if(sum > 1.0e-4 ){
std::cout<< "sD REF\n " <<result << std::endl;
std::cout<< "sD ERR \n " << err <<std::endl;
}
assert(sum < 1.0e-4);
if(1){
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric )
std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll)
std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm )
std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
LatticeFermion sr_eo(sFGrid);
LatticeFermion ssrc_e (sFrbGrid);
LatticeFermion ssrc_o (sFrbGrid);
LatticeFermion sr_e (sFrbGrid);
@ -257,39 +329,30 @@ int main (int argc, char ** argv)
pickCheckerboard(Even,ssrc_e,ssrc);
pickCheckerboard(Odd,ssrc_o,ssrc);
setCheckerboard(sr_eo,ssrc_o);
setCheckerboard(sr_eo,ssrc_e);
// setCheckerboard(sr_eo,ssrc_o);
// setCheckerboard(sr_eo,ssrc_e);
sr_e = zero;
sr_o = zero;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
FGrid->Barrier();
sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
sDw.ZeroCounters();
sDw.stat.init("DhopEO");
// sDw.stat.init("DhopEO");
double t0=usecond();
for (int i = 0; i < ncall; i++) {
sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
}
double t1=usecond();
FGrid->Barrier();
sDw.stat.print();
// sDw.stat.print();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;
std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "sDeo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "sDeo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
sDw.Report();
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
@ -298,22 +361,26 @@ int main (int argc, char ** argv)
pickCheckerboard(Even,ssrc_e,sresult);
pickCheckerboard(Odd ,ssrc_o,sresult);
ssrc_e = ssrc_e - sr_e;
RealD error = norm2(ssrc_e);
std::cout<<GridLogMessage << "sE norm diff "<< norm2(ssrc_e)<< " vec nrm"<<norm2(sr_e) <<std::endl;
ssrc_o = ssrc_o - sr_o;
ssrc_o = ssrc_o - sr_o;
error+= norm2(ssrc_o);
std::cout<<GridLogMessage << "sO norm diff "<< norm2(ssrc_o)<< " vec nrm"<<norm2(sr_o) <<std::endl;
if(error>1.0e-4) {
if(( error>1.0e-4) ) {
setCheckerboard(ssrc,ssrc_o);
setCheckerboard(ssrc,ssrc_e);
std::cout<< ssrc << std::endl;
std::cout<< "DIFF\n " <<ssrc << std::endl;
setCheckerboard(ssrc,sr_o);
setCheckerboard(ssrc,sr_e);
std::cout<< "CBRESULT\n " <<ssrc << std::endl;
std::cout<< "RESULT\n " <<sresult<< std::endl;
}
assert(error<1.0e-4);
}
}
if (1)
@ -324,25 +391,30 @@ int main (int argc, char ** argv)
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu+1,1);
for(int i=0;i<ref._odata.size();i++){
ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
}
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
for(int i=0;i<ref._odata.size();i++){
ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
}
}
ref = -0.5*ref;
}
// dump=1;
Dw.Dhop(src,result,1);
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert(norm2(err)<1.0e-4);
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
}
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
@ -350,18 +422,24 @@ int main (int argc, char ** argv)
LatticeFermion r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and assert Deo+Doe == Dunprec"<<std::endl;
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
@ -369,6 +447,7 @@ int main (int argc, char ** argv)
{
Dw.ZeroCounters();
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
@ -381,6 +460,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
Dw.Report();
}
Dw.DhopEO(src_o,r_e,DaggerNo);
@ -396,14 +476,20 @@ int main (int argc, char ** argv)
err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert(norm2(err)<1.0e-4);
if((norm2(err)>1.0e-4)){
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
std::cout<< "Deo REF\n " <<result << std::endl;
std::cout<< "Deo ERR \n " << err <<std::endl;
}
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
//assert(norm2(src_e)<1.0e-4);
//assert(norm2(src_o)<1.0e-4);
Grid_finalize();
}

View File

@ -66,7 +66,8 @@ int main (int argc, char ** argv)
Vec tsum; tsum = zero;
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
GridParallelRNG pRNG(&Grid);
pRNG.SeedFixedIntegers(std::vector<int>({56,17,89,101}));
std::vector<double> stop(threads);
Vector<Vec> sum(threads);
@ -77,8 +78,7 @@ int main (int argc, char ** argv)
}
double start=usecond();
PARALLEL_FOR_LOOP
for(int t=0;t<threads;t++){
parallel_for(int t=0;t<threads;t++){
sum[t] = x[t]._odata[0];
for(int i=0;i<Nloop;i++){

View File

@ -65,7 +65,7 @@ int main (int argc, char ** argv)
uint64_t Nloop=NLOOP;
// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
@ -100,7 +100,7 @@ int main (int argc, char ** argv)
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
@ -138,7 +138,7 @@ int main (int argc, char ** argv)
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
@ -173,7 +173,7 @@ int main (int argc, char ** argv)
uint64_t Nloop=NLOOP;
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
LatticeVec y(&Grid); //random(pRNG,y);

View File

@ -0,0 +1,134 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_staggered.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
// Benchmark driver for the improved staggered Dhop kernel.
//
// Sets up a full and a red-black grid from the default command-line lattice
// geometry, fills a source fermion field and a gauge field with fixed-seed
// random numbers, applies ImprovedStaggeredFermionR::Dhop `ncall` times and
// reports the resulting rate together with a few field norms.
//
// NOTE: the naive reference implementation below is commented out, so `ref`
// remains zero and the printed "norm diff" is simply norm2(result); it is not
// a correctness check.
int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  // Lattice geometry taken from Grid's defaults / command line.
  std::vector<int> latt_size = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout = GridDefaultMpi();
  GridCartesian Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
  std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
  std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;

  // Fixed seeds so that successive runs use the same random fields.
  std::vector<int> seeds({1,2,3,4});
  GridParallelRNG pRNG(&Grid);
  pRNG.SeedFixedIntegers(seeds);
  // pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

  typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
  typename ImprovedStaggeredFermionR::ImplParams params;

  // The order of the random() calls is preserved (src first, then Umu) so the
  // generated fields are unchanged.
  FermionField src (&Grid); random(pRNG,src);
  FermionField result(&Grid); result=zero;
  FermionField ref(&Grid); ref=zero;
  FermionField tmp(&Grid); tmp=zero;
  // Fix: this line previously zeroed `tmp` a second time and left `err`
  // without an explicit initial value.
  FermionField err(&Grid); err=zero;
  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
  std::vector<LatticeColourMatrix> U(4,&Grid);

  // Global lattice volume (number of sites), used for the flop count.
  double volume=1;
  for(int mu=0;mu<Nd;mu++){
    volume=volume*latt_size[mu];
  }

  // Only one non-zero (y)
#if 0
  Umu=zero;
  Complex cone(1.0,0.0);
  for(int nn=0;nn<Nd;nn++){
    random(pRNG,U[nn]);
    if(1) {
      if (nn!=2) { U[nn]=zero; std::cout<<GridLogMessage << "zeroing gauge field in dir "<<nn<<std::endl; }
      // else { U[nn]= cone;std::cout<<GridLogMessage << "unit gauge field in dir "<<nn<<std::endl; }
      else { std::cout<<GridLogMessage << "random gauge field in dir "<<nn<<std::endl; }
    }
    PokeIndex<LorentzIndex>(Umu,U[nn],nn);
  }
#endif

  // Split the gauge field into its per-direction link matrices.
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
  }

  ref = zero;
  /*
  { // Naive wilson implementation
    ref = zero;
    for(int mu=0;mu<Nd;mu++){
      // ref = src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
      tmp = U[mu]*Cshift(src,mu,1);
      for(int i=0;i<ref._odata.size();i++){
        ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
      }
      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu,-1);
      for(int i=0;i<ref._odata.size();i++){
        ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
      }
    }
  }
  ref = -0.5*ref;
  */

  // Operator parameters passed straight to the ImprovedStaggeredFermionR
  // constructor.
  RealD mass=0.1;
  RealD c1=9.0/8.0;
  RealD c2=-1.0/24.0;
  RealD u0=1.0;
  // NOTE(review): the same gauge field is supplied for both leading
  // gauge-field arguments -- confirm against the constructor's signature.
  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);

  std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
  int ncall=1000;

  // Timed section: ncall back-to-back applications of Dhop.
  // NOTE(review): the last argument (0) presumably selects the non-daggered
  // operator -- confirm.
  double t0=usecond();
  for(int i=0;i<ncall;i++){
    Ds.Dhop(src,result,0);
  }
  double t1=usecond();

  // Per-site flop count used by the original author: 16*66 + 90 == 1146.
  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146

  std::cout<<GridLogMessage << "Called Ds"<<std::endl;
  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
  std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
  // Assuming usecond() returns microseconds, flops per microsecond is Mflop/s.
  std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;

  // ref is zero here (see the note at the top), so this prints norm2(result).
  err = ref-result;
  std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;

  Grid_finalize();
}

View File

@ -35,8 +35,9 @@ using namespace Grid::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
#define LMAX (32)
int Nloop=1000;
int Nloop=200;
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
@ -50,12 +51,12 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=2;lat<=32;lat+=2){
for(int lat=2;lat<=LMAX;lat+=2){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
LatticeColourMatrix z(&Grid);// random(pRNG,z);
LatticeColourMatrix x(&Grid);// random(pRNG,x);
@ -82,13 +83,13 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=2;lat<=32;lat+=2){
for(int lat=2;lat<=LMAX;lat+=2){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
LatticeColourMatrix z(&Grid); //random(pRNG,z);
LatticeColourMatrix x(&Grid); //random(pRNG,x);
@ -113,13 +114,13 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=2;lat<=32;lat+=2){
for(int lat=2;lat<=LMAX;lat+=2){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
LatticeColourMatrix z(&Grid); //random(pRNG,z);
LatticeColourMatrix x(&Grid); //random(pRNG,x);
@ -144,13 +145,13 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=2;lat<=32;lat+=2){
for(int lat=2;lat<=LMAX;lat+=2){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
LatticeColourMatrix z(&Grid); //random(pRNG,z);
LatticeColourMatrix x(&Grid); //random(pRNG,x);

View File

@ -69,7 +69,7 @@ int main (int argc, char ** argv)
std::vector<int> seeds({1,2,3,4});
GridParallelRNG pRNG(&Grid);
pRNG.SeedFixedIntegers(seeds);
// pRNG.SeedRandomDevice();
// pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
LatticeFermion src (&Grid); random(pRNG,src);
LatticeFermion result(&Grid); result=zero;

View File

@ -1,11 +1,7 @@
include Make.inc
simple: simple_su3_test.o simple_su3_expr.o simple_simd_test.o
EXTRA_LIBRARIES = libsimple_su3_test.a libsimple_su3_expr.a libsimple_simd_test.a
libsimple_su3_test_a_SOURCES = simple_su3_test.cc
libsimple_su3_expr_a_SOURCES = simple_su3_expr.cc
libsimple_simd_test_a_SOURCES = simple_simd_test.cc
bench-local: all
./Benchmark_su3
./Benchmark_memory_bandwidth
./Benchmark_wilson
./Benchmark_dwf --dslash-unroll

View File

@ -1,6 +1,6 @@
#!/usr/bin/env bash
]#!/usr/bin/env bash
EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2'
echo "-- deploying Eigen source..."
wget ${EIGEN_URL} --no-check-certificate

View File

@ -1,16 +1,19 @@
AC_PREREQ([2.63])
AC_INIT([Grid], [0.6.0], [https://github.com/paboyle/Grid], [Grid])
AC_INIT([Grid], [0.7.0], [https://github.com/paboyle/Grid], [Grid])
AC_CANONICAL_BUILD
AC_CANONICAL_HOST
AC_CANONICAL_TARGET
AM_INIT_AUTOMAKE(subdir-objects)
AM_INIT_AUTOMAKE([subdir-objects 1.13])
AM_EXTRA_RECURSIVE_TARGETS([tests bench])
AC_CONFIG_MACRO_DIR([m4])
AC_CONFIG_SRCDIR([lib/Grid.h])
AC_CONFIG_HEADERS([lib/Config.h],[sed -i 's|PACKAGE_|GRID_|' lib/Config.h])
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
################ Get git info
#AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])])
############### Checks for programs
CXXFLAGS="-O3 $CXXFLAGS"
AC_PROG_CXX
AC_PROG_RANLIB
@ -24,12 +27,15 @@ AX_GXX_VERSION
AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
[version of g++ that will compile the code])
CXXFLAGS="-O3 $CXXFLAGS"
############### Checks for typedefs, structures, and compiler characteristics
AC_TYPE_SIZE_T
AC_TYPE_UINT32_T
AC_TYPE_UINT64_T
############### OpenMP
############### OpenMP
AC_OPENMP
ac_openmp=no
if test "${OPENMP_CXXFLAGS}X" != "X"; then
@ -60,16 +66,23 @@ AC_ARG_WITH([mpfr],
[AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"])
############### FFTW3
AC_ARG_WITH([fftw],
############### FFTW3
AC_ARG_WITH([fftw],
[AS_HELP_STRING([--with-fftw=prefix],
[try this for a non-standard install prefix of the FFTW3 library])],
[AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"])
############### lapack
############### LIME
AC_ARG_WITH([lime],
[AS_HELP_STRING([--with-lime=prefix],
[try this for a non-standard install prefix of the LIME library])],
[AM_CXXFLAGS="-I$with_lime/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_lime/lib $AM_LDFLAGS"])
############### lapack
AC_ARG_ENABLE([lapack],
[AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])],
[AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])],
[ac_LAPACK=${enable_lapack}], [ac_LAPACK=no])
case ${ac_LAPACK} in
@ -83,6 +96,18 @@ case ${ac_LAPACK} in
AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
esac
############### FP16 conversions
AC_ARG_ENABLE([sfw-fp16],
[AC_HELP_STRING([--enable-sfw-fp16=yes|no], [enable software fp16 comms])],
[ac_SFW_FP16=${enable_sfw_fp16}], [ac_SFW_FP16=yes])
case ${ac_SFW_FP16} in
yes)
AC_DEFINE([SFW_FP16],[1],[software conversion to fp16]);;
no);;
*)
AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
esac
############### MKL
AC_ARG_ENABLE([mkl],
[AC_HELP_STRING([--enable-mkl=yes|no|prefix], [enable Intel MKL for LAPACK & FFTW])],
@ -108,7 +133,7 @@ AC_ARG_WITH([hdf5],
############### first-touch
AC_ARG_ENABLE([numa],
[AC_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])],
[AC_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])],
[ac_NUMA=${enable_NUMA}],[ac_NUMA=no])
case ${ac_NUMA} in
@ -134,8 +159,8 @@ if test "${ac_MKL}x" != "nox"; then
fi
AC_SEARCH_LIBS([__gmpf_init], [gmp],
[AC_SEARCH_LIBS([mpfr_init], [mpfr],
[AC_DEFINE([HAVE_LIBMPFR], [1],
[AC_SEARCH_LIBS([mpfr_init], [mpfr],
[AC_DEFINE([HAVE_LIBMPFR], [1],
[Define to 1 if you have the `MPFR' library])]
[have_mpfr=true], [AC_MSG_ERROR([MPFR library not found])])]
[AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library])]
@ -144,7 +169,7 @@ AC_SEARCH_LIBS([__gmpf_init], [gmp],
if test "${ac_LAPACK}x" != "nox"; then
AC_SEARCH_LIBS([LAPACKE_sbdsdc], [lapack], [],
[AC_MSG_ERROR("LAPACK enabled but library not found")])
fi
fi
AC_SEARCH_LIBS([fftw_execute], [fftw3],
[AC_SEARCH_LIBS([fftwf_execute], [fftw3f], [],
@ -152,6 +177,14 @@ AC_SEARCH_LIBS([fftw_execute], [fftw3],
[AC_DEFINE([HAVE_FFTW], [1], [Define to 1 if you have the `FFTW' library])]
[have_fftw=true])
AC_SEARCH_LIBS([limeCreateReader], [lime],
[AC_DEFINE([HAVE_LIME], [1], [Define to 1 if you have the `LIME' library])]
[have_lime=true],
[AC_MSG_WARN(C-LIME library was not found in your system.
In order to use ILGG file format please install or provide the correct path to your installation
Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
[AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
[have_hdf5=true]
@ -176,19 +209,26 @@ case ${ax_cv_cxx_compiler_vendor} in
case ${ac_SIMD} in
SSE4)
AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
SIMD_FLAGS='-msse4.2';;
case ${ac_SFW_FP16} in
yes)
SIMD_FLAGS='-msse4.2';;
no)
SIMD_FLAGS='-msse4.2 -mf16c';;
*)
AC_MSG_ERROR(["SFW_FP16 must be either yes or no value ${ac_SFW_FP16} "]);;
esac;;
AVX)
AC_DEFINE([AVX1],[1],[AVX intrinsics])
SIMD_FLAGS='-mavx';;
SIMD_FLAGS='-mavx -mf16c';;
AVXFMA4)
AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
SIMD_FLAGS='-mavx -mfma4';;
SIMD_FLAGS='-mavx -mfma4 -mf16c';;
AVXFMA)
AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
SIMD_FLAGS='-mavx -mfma';;
SIMD_FLAGS='-mavx -mfma -mf16c';;
AVX2)
AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
SIMD_FLAGS='-mavx2 -mfma';;
SIMD_FLAGS='-mavx2 -mfma -mf16c';;
AVX512)
AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
@ -297,7 +337,7 @@ case ${ac_COMMS} in
comms_type='shmem'
;;
*)
AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);
AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);
;;
esac
case ${ac_COMMS} in
@ -321,7 +361,7 @@ AM_CONDITIONAL(BUILD_COMMS_NONE, [ test "${comms_type}X" == "noneX" ])
############### RNG selection
AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937|sitmo],\
[Select Random Number Generator to be used])],\
[ac_RNG=${enable_rng}],[ac_RNG=ranlux48])
[ac_RNG=${enable_rng}],[ac_RNG=sitmo])
case ${ac_RNG} in
ranlux48)
@ -334,7 +374,7 @@ case ${ac_RNG} in
AC_DEFINE([RNG_SITMO],[1],[RNG_SITMO] )
;;
*)
AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]);
AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]);
;;
esac
@ -351,7 +391,7 @@ case ${ac_TIMERS} in
AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
;;
*)
AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]);
AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]);
;;
esac
@ -363,7 +403,7 @@ case ${ac_CHROMA} in
yes|no)
;;
*)
AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]);
AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]);
;;
esac
@ -384,12 +424,65 @@ DX_INIT_DOXYGEN([$PACKAGE_NAME], [doxygen.cfg])
############### Ouput
cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
GRID_CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
GRID_LDFLAGS="$AM_LDFLAGS $LDFLAGS"
GRID_LIBS=$LIBS
GRID_SHORT_SHA=`git rev-parse --short HEAD`
GRID_SHA=`git rev-parse HEAD`
GRID_BRANCH=`git rev-parse --abbrev-ref HEAD`
AM_CXXFLAGS="-I${abs_srcdir}/include $AM_CXXFLAGS"
AM_CFLAGS="-I${abs_srcdir}/include $AM_CFLAGS"
AM_LDFLAGS="-L${cwd}/lib $AM_LDFLAGS"
AC_SUBST([AM_CFLAGS])
AC_SUBST([AM_CXXFLAGS])
AC_SUBST([AM_LDFLAGS])
AC_SUBST([GRID_CXXFLAGS])
AC_SUBST([GRID_LDFLAGS])
AC_SUBST([GRID_LIBS])
AC_SUBST([GRID_SHA])
AC_SUBST([GRID_BRANCH])
git_commit=`cd $srcdir && ./scripts/configure.commit`
echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Summary of configuration for $PACKAGE v$VERSION
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
----- GIT VERSION -------------------------------------
$git_commit
----- PLATFORM ----------------------------------------
architecture (build) : $build_cpu
os (build) : $build_os
architecture (target) : $target_cpu
os (target) : $target_os
compiler vendor : ${ax_cv_cxx_compiler_vendor}
compiler version : ${ax_cv_gxx_version}
----- BUILD OPTIONS -----------------------------------
SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
Threading : ${ac_openmp}
Communications type : ${comms_type}
Default precision : ${ac_PRECISION}
Software FP16 conversion : ${ac_SFW_FP16}
RNG choice : ${ac_RNG}
GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
LAPACK : ${ac_LAPACK}
FFTW : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
LIME (ILDG support) : `if test "x$have_lime" = xtrue; then echo yes; else echo no; fi`
HDF5 : `if test "x$have_hdf5" = xtrue; then echo yes; else echo no; fi`
build DOXYGEN documentation : `if test "$DX_FLAG_doc" = '1'; then echo yes; else echo no; fi`
----- BUILD FLAGS -------------------------------------
CXXFLAGS:
`echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
LDFLAGS:
`echo ${AM_LDFLAGS} ${LDFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
LIBS:
`echo ${LIBS} | tr ' ' '\n' | sed 's/^-/ -/g'`
-------------------------------------------------------" > grid.configure.summary
GRID_SUMMARY="`cat grid.configure.summary`"
AM_SUBST_NOTMAKE([GRID_SUMMARY])
AC_SUBST([GRID_SUMMARY])
AC_CONFIG_FILES([grid-config], [chmod +x grid-config])
AC_CONFIG_FILES(Makefile)
AC_CONFIG_FILES(lib/Makefile)
AC_CONFIG_FILES(tests/Makefile)
@ -400,42 +493,15 @@ AC_CONFIG_FILES(tests/forces/Makefile)
AC_CONFIG_FILES(tests/hadrons/Makefile)
AC_CONFIG_FILES(tests/hmc/Makefile)
AC_CONFIG_FILES(tests/solver/Makefile)
AC_CONFIG_FILES(tests/smearing/Makefile)
AC_CONFIG_FILES(tests/qdpxx/Makefile)
AC_CONFIG_FILES(tests/testu01/Makefile)
AC_CONFIG_FILES(benchmarks/Makefile)
AC_CONFIG_FILES(extras/Makefile)
AC_CONFIG_FILES(extras/Hadrons/Makefile)
AC_OUTPUT
echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Summary of configuration for $PACKAGE v$VERSION
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
echo ""
cat grid.configure.summary
echo ""
----- PLATFORM ----------------------------------------
architecture (build) : $build_cpu
os (build) : $build_os
architecture (target) : $target_cpu
os (target) : $target_os
compiler vendor : ${ax_cv_cxx_compiler_vendor}
compiler version : ${ax_cv_gxx_version}
----- BUILD OPTIONS -----------------------------------
SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
Threading : ${ac_openmp}
Communications type : ${comms_type}
Default precision : ${ac_PRECISION}
RNG choice : ${ac_RNG}
GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
LAPACK : ${ac_LAPACK}
FFTW : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
HDF5 : `if test "x$have_hdf5" = xtrue; then echo yes; else echo no; fi`
build DOXYGEN documentation : `if test "$DX_FLAG_doc" = '1'; then echo yes; else echo no; fi`
----- BUILD FLAGS -------------------------------------
CXXFLAGS:
`echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
LDFLAGS:
`echo ${AM_LDFLAGS} ${LDFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
LIBS:
`echo ${LIBS} | tr ' ' '\n' | sed 's/^-/ -/g'`
-------------------------------------------------------" > config.summary
echo ""
cat config.summary
echo ""

View File

@ -162,7 +162,8 @@ void Application::saveParameterFile(const std::string parameterFileName)
sizeString((size)*locVol_) << " (" << sizeString(size) << "/site)"
#define DEFINE_MEMPEAK \
auto memPeak = [this](const std::vector<unsigned int> &program)\
GeneticScheduler<unsigned int>::ObjFunc memPeak = \
[this](const std::vector<unsigned int> &program)\
{\
unsigned int memPeak;\
bool msg;\

View File

@ -145,6 +145,15 @@ std::string typeName(void)
return typeName(typeIdPt<T>());
}
// default writers/readers
#ifdef HAVE_HDF5
typedef Hdf5Reader CorrReader;
typedef Hdf5Writer CorrWriter;
#else
typedef XmlReader CorrReader;
typedef XmlWriter CorrWriter;
#endif
END_HADRONS_NAMESPACE
#endif // Hadrons_Global_hpp_

View File

@ -29,12 +29,20 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Grid/Hadrons/Modules/MAction/DWF.hpp>
#include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
#include <Grid/Hadrons/Modules/MContraction/Baryon.hpp>
#include <Grid/Hadrons/Modules/MContraction/DiscLoop.hpp>
#include <Grid/Hadrons/Modules/MContraction/Gamma3pt.hpp>
#include <Grid/Hadrons/Modules/MContraction/Meson.hpp>
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
#include <Grid/Hadrons/Modules/MGauge/Load.hpp>
#include <Grid/Hadrons/Modules/MGauge/Random.hpp>
#include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
#include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp>
#include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
#include <Grid/Hadrons/Modules/MSource/Point.hpp>
#include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
#include <Grid/Hadrons/Modules/MSource/Wall.hpp>
#include <Grid/Hadrons/Modules/MSource/Z2.hpp>
#include <Grid/Hadrons/Modules/Quark.hpp>

View File

@ -112,7 +112,7 @@ void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)
<< " quarks '" << par().q1 << "', '" << par().q2 << "', and '"
<< par().q3 << "'" << std::endl;
XmlWriter writer(par().output);
CorrWriter writer(par().output);
PropagatorField1 &q1 = *env().template getObject<PropagatorField1>(par().q1);
PropagatorField2 &q2 = *env().template getObject<PropagatorField2>(par().q2);
PropagatorField3 &q3 = *env().template getObject<PropagatorField3>(par().q2);

View File

@ -0,0 +1,144 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/DiscLoop.hpp
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_DiscLoop_hpp_
#define Hadrons_DiscLoop_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* DiscLoop *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
// Parameter set of the DiscLoop module. The members are declared through
// GRID_SERIALIZABLE_CLASS_MEMBERS:
//   q_loop - name of the propagator object looked up in the environment
//   gamma  - gamma matrix inserted inside the trace
//   output - argument handed to the CorrWriter constructor
class DiscLoopPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(DiscLoopPar,
std::string, q_loop,
Gamma::Algebra, gamma,
std::string, output);
};
// Module computing trace(gamma * q_loop), summed over time slices.
// The nested Result class is the serialized output record: the gamma matrix
// used and one complex value per time slice.
template <typename FImpl>
class TDiscLoop: public Module<DiscLoopPar>
{
TYPE_ALIASES(FImpl,);
class Result: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
Gamma::Algebra, gamma,
std::vector<Complex>, corr);
};
public:
// constructor
TDiscLoop(const std::string name);
// destructor
virtual ~TDiscLoop(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
// Register the module under the name DiscLoop in the MContraction namespace,
// instantiated with FIMPL.
MODULE_REGISTER_NS(DiscLoop, TDiscLoop<FIMPL>, MContraction);
/******************************************************************************
* TDiscLoop implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
// Forwards the module name to the Module<DiscLoopPar> base constructor.
template <typename FImpl>
TDiscLoop<FImpl>::TDiscLoop(const std::string name)
: Module<DiscLoopPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
// Returns a one-element list holding the q_loop parameter (the propagator
// this module depends on).
template <typename FImpl>
std::vector<std::string> TDiscLoop<FImpl>::getInput(void)
{
    return std::vector<std::string>{par().q_loop};
}
// Returns a one-element list holding this module's own name.
template <typename FImpl>
std::vector<std::string> TDiscLoop<FImpl>::getOutput(void)
{
    return std::vector<std::string>{getName()};
}
// setup ///////////////////////////////////////////////////////////////////////
// No setup work is performed by this module (the body is empty).
template <typename FImpl>
void TDiscLoop<FImpl>::setup(void)
{
}
// execution ///////////////////////////////////////////////////////////////////
// Computes trace(gamma * q_loop), sums it over each time slice (direction Tp)
// and writes the resulting Result record under the tag "disc".
template <typename FImpl>
void TDiscLoop<FImpl>::execute(void)
{
    LOG(Message) << "Computing disconnected loop contraction '" << getName()
                 << "' using '" << par().q_loop << "' with " << par().gamma
                 << " insertion." << std::endl;

    CorrWriter            writer(par().output);
    PropagatorField       &loopProp = *env().template getObject<PropagatorField>(par().q_loop);
    LatticeComplex        traced(env().getGrid());
    Gamma                 insertion(par().gamma);
    std::vector<TComplex> sliced;
    Result                record;

    // Contract, then reduce over every direction except Tp.
    traced = trace(insertion*loopProp);
    sliceSum(traced, sliced, Tp);

    // Strip the tensor wrapper from each time-slice value.
    record.gamma = par().gamma;
    record.corr.resize(sliced.size());
    for (unsigned int slice = 0; slice < sliced.size(); ++slice)
    {
        record.corr[slice] = TensorRemove(sliced[slice]);
    }

    write(writer, "disc", record);
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_DiscLoop_hpp_

View File

@ -0,0 +1,170 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/Gamma3pt.hpp
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Gamma3pt_hpp_
#define Hadrons_Gamma3pt_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/*
* 3pt contraction with gamma matrix insertion.
*
* Schematic:
*
* q2 q3
* /----<------*------<----¬
* / gamma \
* / \
* i * * f
* \ /
* \ /
* \----------->----------/
* q1
*
* trace(g5*q1*adj(q2)*g5*gamma*q3)
*/
/******************************************************************************
* Gamma3pt *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
// Parameter set of the Gamma3pt module. The members are declared through
// GRID_SERIALIZABLE_CLASS_MEMBERS:
//   q1, q2, q3 - names of the three propagator objects looked up in the
//                environment
//   gamma      - gamma matrix inserted in the contraction
//   output     - argument handed to the CorrWriter constructor
class Gamma3ptPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Gamma3ptPar,
std::string, q1,
std::string, q2,
std::string, q3,
Gamma::Algebra, gamma,
std::string, output);
};
// Module computing the 3pt contraction trace(g5*q1*adj(q2)*g5*gamma*q3),
// summed over time slices. Each of the three propagators may use its own
// fermion implementation (FImpl1..3); TYPE_ALIASES is invoked once per
// implementation with suffixes 1, 2 and 3.
// The nested Result class is the serialized output record: the gamma matrix
// used and one complex value per time slice.
template <typename FImpl1, typename FImpl2, typename FImpl3>
class TGamma3pt: public Module<Gamma3ptPar>
{
TYPE_ALIASES(FImpl1, 1);
TYPE_ALIASES(FImpl2, 2);
TYPE_ALIASES(FImpl3, 3);
class Result: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
Gamma::Algebra, gamma,
std::vector<Complex>, corr);
};
public:
// constructor
TGamma3pt(const std::string name);
// destructor
virtual ~TGamma3pt(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
// Register the module under the name Gamma3pt in the MContraction namespace,
// with all three implementations set to FIMPL. ARG() wraps the template-id
// because it contains commas.
MODULE_REGISTER_NS(Gamma3pt, ARG(TGamma3pt<FIMPL, FIMPL, FIMPL>), MContraction);
/******************************************************************************
* TGamma3pt implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
// Forwards the module name to the Module<Gamma3ptPar> base constructor.
template <typename FImpl1, typename FImpl2, typename FImpl3>
TGamma3pt<FImpl1, FImpl2, FImpl3>::TGamma3pt(const std::string name)
: Module<Gamma3ptPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
// Returns the three propagator names (q1, q2, q3) this module depends on.
template <typename FImpl1, typename FImpl2, typename FImpl3>
std::vector<std::string> TGamma3pt<FImpl1, FImpl2, FImpl3>::getInput(void)
{
    return std::vector<std::string>{par().q1, par().q2, par().q3};
}
// Returns a one-element list holding this module's own name.
template <typename FImpl1, typename FImpl2, typename FImpl3>
std::vector<std::string> TGamma3pt<FImpl1, FImpl2, FImpl3>::getOutput(void)
{
    return std::vector<std::string>{getName()};
}
// setup ///////////////////////////////////////////////////////////////////////
// No setup work is performed by this module (the body is empty).
template <typename FImpl1, typename FImpl2, typename FImpl3>
void TGamma3pt<FImpl1, FImpl2, FImpl3>::setup(void)
{
}
// execution ///////////////////////////////////////////////////////////////////
// Computes trace(g5*q1*adj(q2)*(g5*gamma)*q3), sums it over each time slice
// (direction Tp) and writes the resulting Result record under the tag
// "gamma3pt".
template <typename FImpl1, typename FImpl2, typename FImpl3>
void TGamma3pt<FImpl1, FImpl2, FImpl3>::execute(void)
{
    LOG(Message) << "Computing 3pt contractions '" << getName() << "' using"
                 << " quarks '" << par().q1 << "', '" << par().q2 << "' and '"
                 << par().q3 << "', with " << par().gamma << " insertion."
                 << std::endl;

    CorrWriter            writer(par().output);
    PropagatorField1      &prop1 = *env().template getObject<PropagatorField1>(par().q1);
    PropagatorField2      &prop2 = *env().template getObject<PropagatorField2>(par().q2);
    PropagatorField3      &prop3 = *env().template getObject<PropagatorField3>(par().q3);
    LatticeComplex        traced(env().getGrid());
    Gamma                 g5(Gamma::Algebra::Gamma5);
    Gamma                 insertion(par().gamma);
    std::vector<TComplex> sliced;
    Result                record;

    // Contract, then reduce over every direction except Tp.
    traced = trace(g5*prop1*adj(prop2)*(g5*insertion)*prop3);
    sliceSum(traced, sliced, Tp);

    // Strip the tensor wrapper from each time-slice value.
    record.gamma = par().gamma;
    record.corr.resize(sliced.size());
    for (unsigned int slice = 0; slice < sliced.size(); ++slice)
    {
        record.corr[slice] = TensorRemove(sliced[slice]);
    }

    write(writer, "gamma3pt", record);
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_Gamma3pt_hpp_

View File

@ -6,8 +6,10 @@ Source file: extras/Hadrons/Modules/MContraction/Meson.hpp
Copyright (C) 2015
Copyright (C) 2016
Copyright (C) 2017
Author: Antonin Portelli <antonin.portelli@me.com>
Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -36,20 +38,39 @@ See the full license in the file "LICENSE" in the top level distribution directo
BEGIN_HADRONS_NAMESPACE
/*
Meson contractions
-----------------------------
* options:
- q1: input propagator 1 (string)
- q2: input propagator 2 (string)
- gammas: gamma products to insert at sink & source, pairs of gamma matrices
(space-separated strings) in angled brackets (i.e. <g_sink g_src>),
in a sequence (e.g. "<Gamma5 Gamma5><Gamma5 GammaT>").
Special values: "all" - perform all possible contractions.
- mom: momentum insertion, space-separated float sequence (e.g. ".1 .2 1. 0."),
given as multiples of (2*pi) / L.
*/
/******************************************************************************
* TMeson *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
// A (first, second) pair of gamma-matrix identifiers.
using GammaPair = std::pair<Gamma::Algebra, Gamma::Algebra>;
class MesonPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(MesonPar,
std::string, q1,
std::string, q2,
std::string, output,
Gamma::Algebra, gammaSource,
Gamma::Algebra, gammaSink);
std::string, q1,
std::string, q2,
std::string, gammas,
std::string, mom,
std::string, output);
};
template <typename FImpl1, typename FImpl2>
@ -61,7 +82,10 @@ public:
class Result: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(Result, std::vector<Complex>, corr);
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
Gamma::Algebra, gamma_snk,
Gamma::Algebra, gamma_src,
std::vector<Complex>, corr);
};
public:
// constructor
@ -71,6 +95,7 @@ public:
// dependencies/products
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
virtual void parseGammaString(std::vector<GammaPair> &gammaList);
// execution
virtual void execute(void);
};
@ -103,6 +128,32 @@ std::vector<std::string> TMeson<FImpl1, FImpl2>::getOutput(void)
return output;
}
// Build the list of gamma-matrix pairs to contract.
//
// gammaList (output): any previous content is discarded. If the "gammas"
// parameter is exactly "all", the list receives every pair (i, j) with i and
// j running over the odd indices in [1, Gamma::nGamma). Otherwise the
// parameter string is parsed with strToVec<GammaPair>.
template <typename FImpl1, typename FImpl2>
void TMeson<FImpl1, FImpl2>::parseGammaString(std::vector<GammaPair> &gammaList)
{
    // Determine gamma matrices to insert at source/sink.
    if (par().gammas.compare("all") == 0)
    {
        // Do all contractions.
        // n_gam is only a capacity hint.
        // NOTE(review): assumes Ns*Ns equals the number of odd indices below
        // Gamma::nGamma - confirm against the Gamma definition.
        const unsigned int n_gam = Ns * Ns;

        // reserve(), not resize(): resize() would insert n_gam*n_gam
        // default-constructed pairs in front of the ones pushed below.
        gammaList.clear();
        gammaList.reserve(n_gam*n_gam);
        for (unsigned int i = 1; i < Gamma::nGamma; i += 2)
        {
            for (unsigned int j = 1; j < Gamma::nGamma; j += 2)
            {
                gammaList.push_back(std::make_pair(static_cast<Gamma::Algebra>(i),
                                                   static_cast<Gamma::Algebra>(j)));
            }
        }
    }
    else
    {
        // Parse individual contractions from input string.
        gammaList = strToVec<GammaPair>(par().gammas);
    }
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl1, typename FImpl2>
void TMeson<FImpl1, FImpl2>::execute(void)
@ -111,21 +162,44 @@ void TMeson<FImpl1, FImpl2>::execute(void)
<< " quarks '" << par().q1 << "' and '" << par().q2 << "'"
<< std::endl;
XmlWriter writer(par().output);
PropagatorField1 &q1 = *env().template getObject<PropagatorField1>(par().q1);
PropagatorField2 &q2 = *env().template getObject<PropagatorField2>(par().q2);
LatticeComplex c(env().getGrid());
Gamma gSrc(par().gammaSource), gSnk(par().gammaSink);
Gamma g5(Gamma::Algebra::Gamma5);
std::vector<TComplex> buf;
Result result;
c = trace(gSnk*q1*adj(gSrc)*g5*adj(q2)*g5);
sliceSum(c, buf, Tp);
result.corr.resize(buf.size());
for (unsigned int t = 0; t < buf.size(); ++t)
CorrWriter writer(par().output);
PropagatorField1 &q1 = *env().template getObject<PropagatorField1>(par().q1);
PropagatorField2 &q2 = *env().template getObject<PropagatorField2>(par().q2);
LatticeComplex c(env().getGrid());
Gamma g5(Gamma::Algebra::Gamma5);
std::vector<GammaPair> gammaList;
std::vector<TComplex> buf;
std::vector<Result> result;
std::vector<Real> p;
p = strToVec<Real>(par().mom);
LatticeComplex ph(env().getGrid()), coor(env().getGrid());
Complex i(0.0,1.0);
ph = zero;
for(unsigned int mu = 0; mu < env().getNd(); mu++)
{
result.corr[t] = TensorRemove(buf[t]);
LatticeCoordinate(coor, mu);
ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu])));
}
ph = exp((Real)(2*M_PI)*i*ph);
parseGammaString(gammaList);
result.resize(gammaList.size());
for (unsigned int i = 0; i < result.size(); ++i)
{
Gamma gSnk(gammaList[i].first);
Gamma gSrc(gammaList[i].second);
c = trace((g5*gSnk)*q1*(adj(gSrc)*g5)*adj(q2))*ph;
sliceSum(c, buf, Tp);
result[i].gamma_snk = gammaList[i].first;
result[i].gamma_src = gammaList[i].second;
result[i].corr.resize(buf.size());
for (unsigned int t = 0; t < buf.size(); ++t)
{
result[i].corr[t] = TensorRemove(buf[t]);
}
}
write(writer, "meson", result);
}

View File

@ -0,0 +1,114 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonian.hpp
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_WeakHamiltonian_hpp_
#define Hadrons_WeakHamiltonian_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* WeakHamiltonian *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
/*******************************************************************************
* Utilities for contractions involving the Weak Hamiltonian.
******************************************************************************/
//// Sum and store correlator.
//// exp: lattice expression passed to sliceSum along direction Tp
//// buf: scratch vector receiving the slice sums
//// res: record whose 'name' and 'corr' members are filled in
//// n  : value assigned to res.name
//// Expands to several statements (not a single expression); the arguments
//// buf and res are evaluated more than once.
#define MAKE_DIAG(exp, buf, res, n)\
sliceSum(exp, buf, Tp);\
res.name = (n);\
res.corr.resize(buf.size());\
for (unsigned int t = 0; t < buf.size(); ++t)\
{\
res.corr[t] = TensorRemove(buf[t]);\
}
//// Contraction of mu index: use 'mu' variable in exp.
//// Zeroes buf, then accumulates exp for mu = 0 .. ndim-1. A variable named
//// 'ndim' must be in scope where the macro is used; 'mu' is declared by the
//// macro's own loop. Expands to several statements.
#define SUM_MU(buf,exp)\
buf = zero;\
for (unsigned int mu = 0; mu < ndim; ++mu)\
{\
buf += exp;\
}
// Two indices (0 and 1) and their count.
// NOTE(review): presumably these label the "V" and "A" entries, in the same
// order as the VA_label member declared by MAKE_WEAK_MODULE - confirm.
enum
{
i_V = 0,
i_A = 1,
n_i = 2
};
// Parameter set shared by the modules declared with MAKE_WEAK_MODULE.
// The members are declared through GRID_SERIALIZABLE_CLASS_MEMBERS:
//   q1..q4 - names of the four propagator objects
//   output - output argument for the correlator writer
class WeakHamiltonianPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(WeakHamiltonianPar,
std::string, q1,
std::string, q2,
std::string, q3,
std::string, q4,
std::string, output);
};
// Declares a non-template module class named T<modname> deriving from
// Module<WeakHamiltonianPar>, with:
//   - type aliases generated by TYPE_ALIASES(FIMPL,)
//   - a nested serializable Result record (name, corr)
//   - the usual constructor / getInput / getOutput / setup / execute
//     declarations (only declared here; no definitions are generated)
//   - a VA_label member initialised to {"V", "A"}
// and registers it in the MContraction namespace via MODULE_REGISTER_NS.
#define MAKE_WEAK_MODULE(modname)\
class T##modname: public Module<WeakHamiltonianPar>\
{\
public:\
TYPE_ALIASES(FIMPL,)\
class Result: Serializable\
{\
public:\
GRID_SERIALIZABLE_CLASS_MEMBERS(Result,\
std::string, name,\
std::vector<Complex>, corr);\
};\
public:\
/* constructor */ \
T##modname(const std::string name);\
/* destructor */ \
virtual ~T##modname(void) = default;\
/* dependency relation */ \
virtual std::vector<std::string> getInput(void);\
virtual std::vector<std::string> getOutput(void);\
/* setup */ \
virtual void setup(void);\
/* execution */ \
virtual void execute(void);\
std::vector<std::string> VA_label = {"V", "A"};\
};\
MODULE_REGISTER_NS(modname, T##modname, MContraction);
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WeakHamiltonian_hpp_

View File

@ -0,0 +1,137 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.cc
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MContraction;
/*
* Weak Hamiltonian current-current contractions, Eye-type.
*
* These contractions are generated by the Q1 and Q2 operators in the physical
* basis (see e.g. Fig 3 of arXiv:1507.03094).
*
* Schematics: q4 |
* /-<-¬ |
* / \ | q2 q3
* \ / | /----<------*------<----¬
* q2 \ / q3 | / /-*-¬ \
* /-----<-----* *-----<----¬ | / / \ \
* i * H_W * f | i * \ / q4 * f
* \ / | \ \->-/ /
* \ / | \ /
* \---------->---------/ | \----------->----------/
* q1 | q1
* |
* Saucer (S) | Eye (E)
*
* S: trace(q3*g5*q1*adj(q2)*g5*gL[mu][p_1]*q4*gL[mu][p_2])
* E: trace(q3*g5*q1*adj(q2)*g5*gL[mu][p_1])*trace(q4*gL[mu][p_2])
*/
/******************************************************************************
* TWeakHamiltonianEye implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
// Forwards the module name to the Module<WeakHamiltonianPar> base constructor.
TWeakHamiltonianEye::TWeakHamiltonianEye(const std::string name)
: Module<WeakHamiltonianPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TWeakHamiltonianEye::getInput(void)
{
std::vector<std::string> in = {par().q1, par().q2, par().q3, par().q4};
return in;
}
std::vector<std::string> TWeakHamiltonianEye::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
// No setup work is performed by this module (the body is empty).
void TWeakHamiltonianEye::setup(void)
{
}
// execution ///////////////////////////////////////////////////////////////////
void TWeakHamiltonianEye::execute(void)
{
LOG(Message) << "Computing Weak Hamiltonian (Eye type) contractions '"
<< getName() << "' using quarks '" << par().q1 << "', '"
<< par().q2 << ", '" << par().q3 << "' and '" << par().q4
<< "'." << std::endl;
CorrWriter writer(par().output);
PropagatorField &q1 = *env().template getObject<PropagatorField>(par().q1);
PropagatorField &q2 = *env().template getObject<PropagatorField>(par().q2);
PropagatorField &q3 = *env().template getObject<PropagatorField>(par().q3);
PropagatorField &q4 = *env().template getObject<PropagatorField>(par().q4);
Gamma g5 = Gamma(Gamma::Algebra::Gamma5);
LatticeComplex expbuf(env().getGrid());
std::vector<TComplex> corrbuf;
std::vector<Result> result(n_eye_diag);
unsigned int ndim = env().getNd();
PropagatorField tmp1(env().getGrid());
LatticeComplex tmp2(env().getGrid());
std::vector<PropagatorField> S_body(ndim, tmp1);
std::vector<PropagatorField> S_loop(ndim, tmp1);
std::vector<LatticeComplex> E_body(ndim, tmp2);
std::vector<LatticeComplex> E_loop(ndim, tmp2);
// Setup for S-type contractions.
for (int mu = 0; mu < ndim; ++mu)
{
S_body[mu] = MAKE_SE_BODY(q1, q2, q3, GammaL(Gamma::gmu[mu]));
S_loop[mu] = MAKE_SE_LOOP(q4, GammaL(Gamma::gmu[mu]));
}
// Perform S-type contractions.
SUM_MU(expbuf, trace(S_body[mu]*S_loop[mu]))
MAKE_DIAG(expbuf, corrbuf, result[S_diag], "HW_S")
// Recycle sub-expressions for E-type contractions.
for (unsigned int mu = 0; mu < ndim; ++mu)
{
E_body[mu] = trace(S_body[mu]);
E_loop[mu] = trace(S_loop[mu]);
}
// Perform E-type contractions.
SUM_MU(expbuf, E_body[mu]*E_loop[mu])
MAKE_DIAG(expbuf, corrbuf, result[E_diag], "HW_E")
write(writer, "HW_Eye", result);
}

View File

@ -0,0 +1,58 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_WeakHamiltonianEye_hpp_
#define Hadrons_WeakHamiltonianEye_hpp_
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* WeakHamiltonianEye *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
// Positions of the Saucer (S) and Eye (E) results inside the result vector
// written by TWeakHamiltonianEye::execute; n_eye_diag is the vector's size.
enum
{
S_diag = 0,
E_diag = 1,
n_eye_diag = 2
};
// Saucer and Eye subdiagram contractions.
// Both macros expand to a single parenthesised expression. MAKE_SE_BODY
// refers to a variable named g5, which must be in scope where it is used.
#define MAKE_SE_BODY(Q_1, Q_2, Q_3, gamma) (Q_3*g5*Q_1*adj(Q_2)*g5*gamma)
#define MAKE_SE_LOOP(Q_loop, gamma) (Q_loop*gamma)
// Declare and register the TWeakHamiltonianEye module class.
MAKE_WEAK_MODULE(WeakHamiltonianEye)
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WeakHamiltonianEye_hpp_

View File

@ -0,0 +1,139 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.cc
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MContraction;
/*
* Weak Hamiltonian current-current contractions, Non-Eye-type.
*
* These contractions are generated by the Q1 and Q2 operators in the physical
* basis (see e.g. Fig 3 of arXiv:1507.03094).
*
* Schematic:
* q2 q3 | q2 q3
* /--<--¬ /--<--¬ | /--<--¬ /--<--¬
* / \ / \ | / \ / \
* / \ / \ | / \ / \
* / \ / \ | / \ / \
* i * * H_W * f | i * * * H_W * f
* \ * | | \ / \ /
* \ / \ / | \ / \ /
* \ / \ / | \ / \ /
* \ / \ / | \-->--/ \-->--/
* \-->--/ \-->--/ | q1 q4
* q1 q4 |
* Connected (C) | Wing (W)
*
* C: trace(q1*adj(q2)*g5*gL[mu]*q3*adj(q4)*g5*gL[mu])
* W: trace(q1*adj(q2)*g5*gL[mu])*trace(q3*adj(q4)*g5*gL[mu])
*
*/
/******************************************************************************
* TWeakHamiltonianNonEye implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
// Forwards the module name to the Module<WeakHamiltonianPar> base constructor.
TWeakHamiltonianNonEye::TWeakHamiltonianNonEye(const std::string name)
: Module<WeakHamiltonianPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TWeakHamiltonianNonEye::getInput(void)
{
std::vector<std::string> in = {par().q1, par().q2, par().q3, par().q4};
return in;
}
std::vector<std::string> TWeakHamiltonianNonEye::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
// No setup work is performed by this module (the body is empty).
void TWeakHamiltonianNonEye::setup(void)
{
}
// execution ///////////////////////////////////////////////////////////////////
void TWeakHamiltonianNonEye::execute(void)
{
LOG(Message) << "Computing Weak Hamiltonian (Non-Eye type) contractions '"
<< getName() << "' using quarks '" << par().q1 << "', '"
<< par().q2 << ", '" << par().q3 << "' and '" << par().q4
<< "'." << std::endl;
CorrWriter writer(par().output);
PropagatorField &q1 = *env().template getObject<PropagatorField>(par().q1);
PropagatorField &q2 = *env().template getObject<PropagatorField>(par().q2);
PropagatorField &q3 = *env().template getObject<PropagatorField>(par().q3);
PropagatorField &q4 = *env().template getObject<PropagatorField>(par().q4);
Gamma g5 = Gamma(Gamma::Algebra::Gamma5);
LatticeComplex expbuf(env().getGrid());
std::vector<TComplex> corrbuf;
std::vector<Result> result(n_noneye_diag);
unsigned int ndim = env().getNd();
PropagatorField tmp1(env().getGrid());
LatticeComplex tmp2(env().getGrid());
std::vector<PropagatorField> C_i_side_loop(ndim, tmp1);
std::vector<PropagatorField> C_f_side_loop(ndim, tmp1);
std::vector<LatticeComplex> W_i_side_loop(ndim, tmp2);
std::vector<LatticeComplex> W_f_side_loop(ndim, tmp2);
// Setup for C-type contractions.
for (int mu = 0; mu < ndim; ++mu)
{
C_i_side_loop[mu] = MAKE_CW_SUBDIAG(q1, q2, GammaL(Gamma::gmu[mu]));
C_f_side_loop[mu] = MAKE_CW_SUBDIAG(q3, q4, GammaL(Gamma::gmu[mu]));
}
// Perform C-type contractions.
SUM_MU(expbuf, trace(C_i_side_loop[mu]*C_f_side_loop[mu]))
MAKE_DIAG(expbuf, corrbuf, result[C_diag], "HW_C")
// Recycle sub-expressions for W-type contractions.
for (unsigned int mu = 0; mu < ndim; ++mu)
{
W_i_side_loop[mu] = trace(C_i_side_loop[mu]);
W_f_side_loop[mu] = trace(C_f_side_loop[mu]);
}
// Perform W-type contractions.
SUM_MU(expbuf, W_i_side_loop[mu]*W_f_side_loop[mu])
MAKE_DIAG(expbuf, corrbuf, result[W_diag], "HW_W")
write(writer, "HW_NonEye", result);
}

View File

@ -0,0 +1,57 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_WeakHamiltonianNonEye_hpp_
#define Hadrons_WeakHamiltonianNonEye_hpp_
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* WeakHamiltonianNonEye *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
// Positions of the Wing (W) and Connected (C) results inside the result
// vector written by TWeakHamiltonianNonEye::execute; n_noneye_diag is the
// vector's size.
enum
{
W_diag = 0,
C_diag = 1,
n_noneye_diag = 2
};
// Wing and Connected subdiagram contractions
// Expands to a single parenthesised expression; refers to a variable named
// g5, which must be in scope where the macro is used.
#define MAKE_CW_SUBDIAG(Q_1, Q_2, gamma) (Q_1*adj(Q_2)*g5*gamma)
// Declare and register the TWeakHamiltonianNonEye module class.
MAKE_WEAK_MODULE(WeakHamiltonianNonEye)
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WeakHamiltonianNonEye_hpp_

View File

@ -0,0 +1,135 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.cc
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MContraction;
/*
* Weak Hamiltonian + current contractions, disconnected topology for neutral
* mesons.
*
* These contractions are generated by operators Q_1,...,10 of the dS=1 Weak
* Hamiltonian in the physical basis and an additional current J (see e.g.
* Fig 11 of arXiv:1507.03094).
*
* Schematic:
*
* q2 q4 q3
* /--<--¬ /---<--¬ /---<--¬
* / \ / \ / \
* i * * H_W | J * * f
* \ / \ / \ /
* \--->---/ \-------/ \------/
* q1
*
* options
* - q1: input propagator 1 (string)
* - q2: input propagator 2 (string)
* - q3: input propagator 3 (string), assumed to be sequential propagator
* - q4: input propagator 4 (string), assumed to be a loop
*
* type 1: trace(q1*adj(q2)*g5*gL[mu])*trace(loop*gL[mu])*trace(q3*g5)
* type 2: trace(q1*adj(q2)*g5*gL[mu]*loop*gL[mu])*trace(q3*g5)
*/
/*******************************************************************************
* TWeakNeutral4ptDisc implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
// Construct the module: the name is simply forwarded to the
// Module<WeakHamiltonianPar> base; no other state is initialised here.
TWeakNeutral4ptDisc::TWeakNeutral4ptDisc(const std::string name)
    : Module<WeakHamiltonianPar>(name)
{
}
// dependencies/products ///////////////////////////////////////////////////////
// Names of the objects this module reads: the four propagators q1..q4 given
// in the module parameters, in that order.
std::vector<std::string> TWeakNeutral4ptDisc::getInput(void)
{
    return {par().q1, par().q2, par().q3, par().q4};
}
// Names of the objects this module produces: a single entry, the module's
// own name.
std::vector<std::string> TWeakNeutral4ptDisc::getOutput(void)
{
    return {getName()};
}
// setup ///////////////////////////////////////////////////////////////////////
void TWeakNeutral4ptDisc::setup(void)
{
    // No setup work is performed by this module.
}
// execution ///////////////////////////////////////////////////////////////////
void TWeakNeutral4ptDisc::execute(void)
{
LOG(Message) << "Computing Weak Hamiltonian neutral disconnected contractions '"
<< getName() << "' using quarks '" << par().q1 << "', '"
<< par().q2 << ", '" << par().q3 << "' and '" << par().q4
<< "'." << std::endl;
CorrWriter writer(par().output);
PropagatorField &q1 = *env().template getObject<PropagatorField>(par().q1);
PropagatorField &q2 = *env().template getObject<PropagatorField>(par().q2);
PropagatorField &q3 = *env().template getObject<PropagatorField>(par().q3);
PropagatorField &q4 = *env().template getObject<PropagatorField>(par().q4);
Gamma g5 = Gamma(Gamma::Algebra::Gamma5);
LatticeComplex expbuf(env().getGrid());
std::vector<TComplex> corrbuf;
std::vector<Result> result(n_neut_disc_diag);
unsigned int ndim = env().getNd();
PropagatorField tmp(env().getGrid());
std::vector<PropagatorField> meson(ndim, tmp);
std::vector<PropagatorField> loop(ndim, tmp);
LatticeComplex curr(env().getGrid());
// Setup for type 1 contractions.
for (int mu = 0; mu < ndim; ++mu)
{
meson[mu] = MAKE_DISC_MESON(q1, q2, GammaL(Gamma::gmu[mu]));
loop[mu] = MAKE_DISC_LOOP(q4, GammaL(Gamma::gmu[mu]));
}
curr = MAKE_DISC_CURR(q3, GammaL(Gamma::Algebra::Gamma5));
// Perform type 1 contractions.
SUM_MU(expbuf, trace(meson[mu]*loop[mu]))
expbuf *= curr;
MAKE_DIAG(expbuf, corrbuf, result[neut_disc_1_diag], "HW_disc0_1")
// Perform type 2 contractions.
SUM_MU(expbuf, trace(meson[mu])*trace(loop[mu]))
expbuf *= curr;
MAKE_DIAG(expbuf, corrbuf, result[neut_disc_2_diag], "HW_disc0_2")
write(writer, "HW_disc0", result);
}

View File

@ -0,0 +1,59 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_WeakNeutral4ptDisc_hpp_
#define Hadrons_WeakNeutral4ptDisc_hpp_
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* WeakNeutral4ptDisc *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MContraction)
// Diagram indices for the neutral disconnected contractions. The enumerators
// count up from zero, so the last one equals the number of diagrams.
enum
{
    neut_disc_1_diag, // 0
    neut_disc_2_diag, // 1
    n_neut_disc_diag  // 2: number of diagrams
};
// Neutral 4pt disconnected subdiagram contractions.
// Every macro argument is parenthesised so that a compound expression passed
// as an argument keeps its grouping after expansion.
// MAKE_DISC_MESON expands to a use of `g5`, which must be in scope at the
// point of use; `adj` and `trace` are likewise resolved at the point of use.
#define MAKE_DISC_MESON(Q_1, Q_2, gamma) ((Q_1)*adj(Q_2)*g5*(gamma))
#define MAKE_DISC_LOOP(Q_LOOP, gamma) ((Q_LOOP)*(gamma))
#define MAKE_DISC_CURR(Q_c, gamma) (trace((Q_c)*(gamma)))
// Declare the module through the MAKE_WEAK_MODULE macro made available by the
// WeakHamiltonian.hpp include (presumably this declares the
// TWeakNeutral4ptDisc class implemented in the matching .cc file -- confirm).
MAKE_WEAK_MODULE(WeakNeutral4ptDisc)
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WeakNeutral4ptDisc_hpp_

View File

@ -0,0 +1,132 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MLoop/NoiseLoop.hpp
Copyright (C) 2016
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_NoiseLoop_hpp_
#define Hadrons_NoiseLoop_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/*
Noise loop propagator
-----------------------------
* loop_x = q_x * adj(eta_x)
* options:
- q = Result of inversion on noise source.
- eta = noise source.
*/
/******************************************************************************
* NoiseLoop *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MLoop)
// Parameters of the NoiseLoop module. Both members are names of objects that
// the module fetches from the environment:
//  - q:   result of the inversion on the noise source
//  - eta: the noise source
// The members are declared through GRID_SERIALIZABLE_CLASS_MEMBERS.
class NoiseLoopPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(NoiseLoopPar,
std::string, q,
std::string, eta);
};
// Module computing the noise loop propagator loop_x = q_x * adj(eta_x).
// FImpl is the template parameter handed to TYPE_ALIASES (presumably the
// source of the PropagatorField alias used by the member definitions --
// confirm against the macro definition).
template <typename FImpl>
class TNoiseLoop: public Module<NoiseLoopPar>
{
public:
TYPE_ALIASES(FImpl,);
public:
// constructor: takes the module name
TNoiseLoop(const std::string name);
// destructor
virtual ~TNoiseLoop(void) = default;
// dependency relation: getInput returns the names q and eta from the
// parameters, getOutput returns the module's own name
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup: registers the output lattice under the module's name
virtual void setup(void);
// execution: creates the output lattice and fills it with q*adj(eta)
virtual void execute(void);
};
// Module registration (presumably makes TNoiseLoop<FIMPL> available under the
// name NoiseLoop in the MLoop namespace -- confirm against MODULE_REGISTER_NS).
MODULE_REGISTER_NS(NoiseLoop, TNoiseLoop<FIMPL>, MLoop);
/******************************************************************************
* TNoiseLoop implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
// Construct the module: the name is forwarded to the Module<NoiseLoopPar>
// base; nothing else is initialised here.
template <typename FImpl>
TNoiseLoop<FImpl>::TNoiseLoop(const std::string name)
    : Module<NoiseLoopPar>(name)
{
}
// dependencies/products ///////////////////////////////////////////////////////
// Names of the objects this module reads: q followed by eta, as given in the
// module parameters.
template <typename FImpl>
std::vector<std::string> TNoiseLoop<FImpl>::getInput(void)
{
    return {par().q, par().eta};
}
// Names of the objects this module produces: a single entry, the module's
// own name.
template <typename FImpl>
std::vector<std::string> TNoiseLoop<FImpl>::getOutput(void)
{
    return {getName()};
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TNoiseLoop<FImpl>::setup(void)
{
    // Register a PropagatorField lattice under this module's name; execute()
    // later creates it under the same name.
    env().template registerLattice<PropagatorField>(getName());
}
// execution ///////////////////////////////////////////////////////////////////
// Build the noise loop: result = q * adj(eta).
template <typename FImpl>
void TNoiseLoop<FImpl>::execute(void)
{
    // Output lattice, created under this module's name.
    PropagatorField &result = *env().template createLattice<PropagatorField>(getName());

    // Input fields, looked up by the names given in the parameters.
    PropagatorField &solution = *env().template getObject<PropagatorField>(par().q);
    PropagatorField &noise = *env().template getObject<PropagatorField>(par().eta);

    result = solution*adj(noise);
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_NoiseLoop_hpp_

View File

@ -6,6 +6,7 @@ Source file: extras/Hadrons/Modules/MSource/SeqGamma.hpp
Copyright (C) 2015
Copyright (C) 2016
Copyright (C) 2017
Author: Antonin Portelli <antonin.portelli@me.com>
@ -149,9 +150,9 @@ void TSeqGamma<FImpl>::execute(void)
for(unsigned int mu = 0; mu < env().getNd(); mu++)
{
LatticeCoordinate(coor, mu);
ph = ph + p[mu]*coor;
ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu])));
}
ph = exp(i*ph);
ph = exp((Real)(2*M_PI)*i*ph);
LatticeCoordinate(t, Tp);
src = where((t >= par().tA) and (t <= par().tB), ph*(g*q), 0.*q);
}

View File

@ -0,0 +1,147 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MSource/Wall.hpp
Copyright (C) 2017
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_WallSource_hpp_
#define Hadrons_WallSource_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/*
Wall source
-----------------------------
* src_x = delta(x_3 - tW) * exp(2*pi*i * sum_mu mom_mu*x_mu/L_mu),
  where L_mu is the full lattice extent in direction mu
* options:
- tW: source timeslice (integer)
- mom: momentum insertion in units of 2*pi/(lattice extent), space-separated float sequence with one entry per dimension (e.g. ".1 .2 1. 0.")
*/
/******************************************************************************
* Wall *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MSource)
// Parameters of the Wall source module:
//  - tW:  source timeslice
//  - mom: momentum insertion, given as a string that execute() parses into a
//         vector of reals
// The members are declared through GRID_SERIALIZABLE_CLASS_MEMBERS.
class WallPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(WallPar,
unsigned int, tW,
std::string, mom);
};
// Module generating a wall source on timeslice tW with a momentum phase.
// FImpl is the template parameter handed to TYPE_ALIASES (presumably the
// source of the PropagatorField alias used by the member definitions --
// confirm against the macro definition).
template <typename FImpl>
class TWall: public Module<WallPar>
{
public:
TYPE_ALIASES(FImpl,);
public:
// constructor: takes the module name
TWall(const std::string name);
// destructor
virtual ~TWall(void) = default;
// dependency relation: getInput returns an empty list, getOutput returns
// the module's own name
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup: registers the output lattice under the module's name
virtual void setup(void);
// execution: creates the output lattice and fills it with the wall source
virtual void execute(void);
};
// Module registration (presumably makes TWall<FIMPL> available under the name
// Wall in the MSource namespace -- confirm against MODULE_REGISTER_NS).
MODULE_REGISTER_NS(Wall, TWall<FIMPL>, MSource);
/******************************************************************************
* TWall implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
// Construct the module: the name is forwarded to the Module<WallPar> base;
// nothing else is initialised here.
template <typename FImpl>
TWall<FImpl>::TWall(const std::string name)
    : Module<WallPar>(name)
{
}
// dependencies/products ///////////////////////////////////////////////////////
// The wall source reads no other objects, so the input list is empty.
template <typename FImpl>
std::vector<std::string> TWall<FImpl>::getInput(void)
{
    return std::vector<std::string>();
}
// Names of the objects this module produces: a single entry, the module's
// own name.
template <typename FImpl>
std::vector<std::string> TWall<FImpl>::getOutput(void)
{
    return {getName()};
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TWall<FImpl>::setup(void)
{
    // Register a PropagatorField lattice under this module's name; execute()
    // later creates it under the same name.
    env().template registerLattice<PropagatorField>(getName());
}
// execution ///////////////////////////////////////////////////////////////////
// Generate the wall source:
//   src_x = [x_Tp == tW] * exp(2*pi*i * sum_mu mom[mu]*x_mu/L_mu)
// where L_mu is the grid's _fdimensions[mu] and mom is parsed from par().mom.
// mom is indexed for every mu < Nd, so the parameter string has to supply at
// least Nd values (this is not checked here).
template <typename FImpl>
void TWall<FImpl>::execute(void)
{
    LOG(Message) << "Generating wall source at t = " << par().tW
                 << " with momentum " << par().mom << std::endl;

    PropagatorField &src = *env().template createLattice<PropagatorField>(getName());
    Lattice<iScalar<vInteger>> time(env().getGrid());
    LatticeComplex phase(env().getGrid()), coord(env().getGrid());
    std::vector<Real> mom = strToVec<Real>(par().mom);
    Complex imag(0.0, 1.0);

    // Accumulate sum_mu mom[mu]*x_mu/L_mu site by site.
    phase = zero;
    for (unsigned int mu = 0; mu < Nd; ++mu)
    {
        LatticeCoordinate(coord, mu);
        phase = phase + mom[mu]*coord*((1./(env().getGrid()->_fdimensions[mu])));
    }
    phase = exp(static_cast<Real>(2*M_PI)*imag*phase);

    // Keep the phase on the timeslice tW only; all other sites are zeroed.
    LatticeCoordinate(time, Tp);
    src = 1.;
    src = where((time == par().tW), src*phase, 0.*src);
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WallSource_hpp_

View File

@ -173,7 +173,7 @@ void TQuark<FImpl>::execute(void)
*env().template getObject<PropagatorField>(getName());
axpby_ssp_pminus(sol, 0., sol, 1., sol, 0, 0);
axpby_ssp_pplus(sol, 0., sol, 1., sol, 0, Ls_-1);
axpby_ssp_pplus(sol, 1., sol, 1., sol, 0, Ls_-1);
ExtractSlice(tmp, sol, 0, 0);
FermToProp(p4d, tmp, s, c);
}

View File

@ -1,4 +1,7 @@
modules_cc =\
Modules/MContraction/WeakHamiltonianEye.cc \
Modules/MContraction/WeakHamiltonianNonEye.cc \
Modules/MContraction/WeakNeutral4ptDisc.cc \
Modules/MGauge/Load.cc \
Modules/MGauge/Random.cc \
Modules/MGauge/Unit.cc
@ -7,13 +10,21 @@ modules_hpp =\
Modules/MAction/DWF.hpp \
Modules/MAction/Wilson.hpp \
Modules/MContraction/Baryon.hpp \
Modules/MContraction/DiscLoop.hpp \
Modules/MContraction/Gamma3pt.hpp \
Modules/MContraction/Meson.hpp \
Modules/MContraction/WeakHamiltonian.hpp \
Modules/MContraction/WeakHamiltonianEye.hpp \
Modules/MContraction/WeakHamiltonianNonEye.hpp \
Modules/MContraction/WeakNeutral4ptDisc.hpp \
Modules/MGauge/Load.hpp \
Modules/MGauge/Random.hpp \
Modules/MGauge/Unit.hpp \
Modules/MLoop/NoiseLoop.hpp \
Modules/MSolver/RBPrecCG.hpp \
Modules/MSource/Point.hpp \
Modules/MSource/SeqGamma.hpp \
Modules/MSource/Wall.hpp \
Modules/MSource/Z2.hpp \
Modules/Quark.hpp

View File

@ -20,4 +20,17 @@ The simple testcase in this directory is the submitted bug report that encapsula
problem. The test case works with icpc and with clang++, but fails consistently on g++
current variants.
Peter
Peter
************
Second GCC bug reported, see Issue 100.
https://wandbox.org/permlink/tzssJza6R9XnqANw
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80652
Travis now fails under gcc-5 for Test_simd, since I added more comprehensive testing to the
CI test suite. This exposes the limitations of Travis: runtime limits and weak cores.
Travis uses 5.4.1 for g++-5.

86
grid-config.in Executable file
View File

@ -0,0 +1,86 @@
#! /bin/sh
prefix=@prefix@
exec_prefix=@exec_prefix@
includedir=@includedir@
# Print the list of supported options on standard output, then exit the
# script with the status passed as the first argument.
usage()
{
cat <<EOF
Usage: grid-config [OPTION]
Known values for OPTION are:
--prefix show Grid installation prefix
--cxxflags print pre-processor and compiler flags
--ldflags print library linking flags
--libs print library linking information
--summary print full build summary
--help display this help and exit
--version output version information
--git print git revision
EOF
# $1 is unquoted: when usage is called without an argument this becomes a
# bare `exit`, which returns the status of the preceding command (cat).
exit $1
}
# No option given: print the help text and exit with a failure status.
if test $# -eq 0; then
    usage 1
fi

# Handle the options from left to right. --version, --git and --help end the
# script immediately; every other known option prints its value and
# processing continues with the next argument.
# (The unused cflags/libs/optarg variables of the original were dropped;
# nothing in the script ever read them.)
while test $# -gt 0; do
    case "$1" in
        --prefix)
            echo "$prefix"
            ;;

        --version)
            echo @VERSION@
            exit 0
            ;;

        --git)
            echo "@GRID_BRANCH@ @GRID_SHA@"
            exit 0
            ;;

        --help)
            usage 0
            ;;

        --cxxflags)
            echo @GRID_CXXFLAGS@
            ;;

        --ldflags)
            echo @GRID_LDFLAGS@
            ;;

        --libs)
            echo @GRID_LIBS@
            ;;

        --summary)
            echo ""
            echo "@GRID_SUMMARY@"
            echo ""
            ;;

        *)
            # Unknown option: pass the failure status explicitly. usage never
            # returns, and a bare `usage` would exit with the status of its
            # last command (0), so the script used to report success here.
            usage 1
            ;;
    esac
    shift
done

exit 0

37
lib/DisableWarnings.h Normal file
View File

@ -0,0 +1,37 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/DisableWarnings.h
Copyright (C) 2016
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef DISABLE_WARNINGS_H
#define DISABLE_WARNINGS_H

// Disables an Intel-compiler-specific warning (number 488) raised in json.hpp.
// The pragma uses Intel syntax, so it is only issued when the Intel compiler
// is in use; other compilers would report it as an unknown pragma.
#if defined(__INTEL_COMPILER)
#pragma warning disable 488
#endif

#endif

View File

@ -38,52 +38,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_H
#define GRID_H
///////////////////
// Std C++ dependencies
///////////////////
#include <cassert>
#include <complex>
#include <vector>
#include <iostream>
#include <iomanip>
#include <random>
#include <functional>
#include <stdio.h>
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <ctime>
#include <sys/time.h>
#include <chrono>
///////////////////
// Grid headers
///////////////////
#include "Config.h"
#include <Grid/Timer.h>
#include <Grid/PerfCount.h>
#include <Grid/Log.h>
#include <Grid/AlignedAllocator.h>
#include <Grid/Simd.h>
#include <Grid/serialisation/Serialisation.h>
#include <Grid/Threads.h>
#include <Grid/Lexicographic.h>
#include <Grid/Init.h>
#include <Grid/Communicator.h>
#include <Grid/Cartesian.h>
#include <Grid/Tensors.h>
#include <Grid/Lattice.h>
#include <Grid/Cshift.h>
#include <Grid/Stencil.h>
#include <Grid/Algorithms.h>
#include <Grid/parallelIO/BinaryIO.h>
#include <Grid/FFT.h>
#include <Grid/qcd/QCD.h>
#include <Grid/parallelIO/NerscIO.h>
#include <Grid/qcd/hmc/NerscCheckpointer.h>
#include <Grid/qcd/hmc/HmcRunner.h>
#include <Grid/GridCore.h>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/Action.h>
#include <Grid/qcd/smearing/Smearing.h>
#include <Grid/qcd/hmc/HMC_aggregate.h>
#endif

View File

@ -2,11 +2,13 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/MatrixUtils.h
Source file: ./lib/Grid.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: azusayamaguchi <ayamaguc@YAMAKAZE.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -25,51 +27,34 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_MATRIX_UTILS_H
#define GRID_MATRIX_UTILS_H
//
// Grid.h
// simd
//
// Created by Peter Boyle on 09/05/2014.
// Copyright (c) 2014 University of Edinburgh. All rights reserved.
//
namespace Grid {
#ifndef GRID_BASE_H
#define GRID_BASE_H
namespace MatrixUtils {
#include <Grid/GridStd.h>
template<class T> inline void Size(Matrix<T>& A,int &N,int &M){
N=A.size(); assert(N>0);
M=A[0].size();
for(int i=0;i<N;i++){
assert(A[i].size()==M);
}
}
#include <Grid/perfmon/Timer.h>
#include <Grid/perfmon/PerfCount.h>
#include <Grid/log/Log.h>
#include <Grid/allocator/AlignedAllocator.h>
#include <Grid/simd/Simd.h>
#include <Grid/serialisation/Serialisation.h>
#include <Grid/threads/Threads.h>
#include <Grid/util/Util.h>
#include <Grid/communicator/Communicator.h>
#include <Grid/cartesian/Cartesian.h>
#include <Grid/tensors/Tensors.h>
#include <Grid/lattice/Lattice.h>
#include <Grid/cshift/Cshift.h>
#include <Grid/stencil/Stencil.h>
#include <Grid/parallelIO/BinaryIO.h>
#include <Grid/algorithms/Algorithms.h>
template<class T> inline void SizeSquare(Matrix<T>& A,int &N)
{
int M;
Size(A,N,M);
assert(N==M);
}
template<class T> inline void Fill(Matrix<T>& A,T & val)
{
int N,M;
Size(A,N,M);
for(int i=0;i<N;i++){
for(int j=0;j<M;j++){
A[i][j]=val;
}}
}
template<class T> inline void Diagonal(Matrix<T>& A,T & val)
{
int N;
SizeSquare(A,N);
for(int i=0;i<N;i++){
A[i][i]=val;
}
}
template<class T> inline void Identity(Matrix<T>& A)
{
Fill(A,0.0);
Diagonal(A,1.0);
}
};
}
#endif

View File

@ -2,12 +2,12 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/hmc/HMC.cc
Source file: ./lib/Grid.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: azusayamaguchi <ayamaguc@YAMAKAZE.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
@ -27,10 +27,16 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifndef GRID_QCD_CORE_H
#define GRID_QCD_CORE_H
namespace Grid{
namespace QCD{
/////////////////////////
// Core Grid QCD headers
/////////////////////////
#include <Grid/GridCore.h>
#include <Grid/qcd/QCD.h>
#include <Grid/qcd/spin/Spin.h>
#include <Grid/qcd/utils/Utils.h>
#include <Grid/qcd/representations/Representations.h>
}
}
#endif

27
lib/GridStd.h Normal file
View File

@ -0,0 +1,27 @@
#ifndef GRID_STD_H
#define GRID_STD_H
///////////////////
// Std C++ dependencies
///////////////////
#include <cassert>
#include <complex>
#include <vector>
#include <iostream>
#include <iomanip>
#include <random>
#include <functional>
#include <stdio.h>
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <ctime>
#include <sys/time.h>
#include <chrono>
///////////////////
// Grid config
///////////////////
#include "Config.h"
#endif /* GRID_STD_H */

9
lib/Grid_Eigen_Dense.h Normal file
View File

@ -0,0 +1,9 @@
#pragma once

// Wrapper around the bundled Eigen Dense header. For compilers that define
// __GNUC__, deprecated-declaration diagnostics are switched off while Eigen
// is included and the previous diagnostic state is restored afterwards.
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif

#include <Grid/Eigen/Dense>

#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif

Binary file not shown.

View File

@ -1,154 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Old/Tensor_peek.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_MATH_PEEK_H
#define GRID_MATH_PEEK_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////
// Peek on a specific index; returns a scalar in that index, tensor inherits rest
//////////////////////////////////////////////////////////////////////////////
// If we hit the right index, return scalar with no further recursion
//template<int Level> inline ComplexF peekIndex(const ComplexF arg) { return arg;}
//template<int Level> inline ComplexD peekIndex(const ComplexD arg) { return arg;}
//template<int Level> inline RealF peekIndex(const RealF arg) { return arg;}
//template<int Level> inline RealD peekIndex(const RealD arg) { return arg;}
#if 0
// Scalar peek, no indices
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
auto peekIndex(const iScalar<vtype> &arg) -> iScalar<vtype>
{
return arg;
}
// Vector peek, one index
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
auto peekIndex(const iVector<vtype,N> &arg,int i) -> iScalar<vtype> // Index matches
{
iScalar<vtype> ret; // return scalar
ret._internal = arg._internal[i];
return ret;
}
// Matrix peek, two indices
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
auto peekIndex(const iMatrix<vtype,N> &arg,int i,int j) -> iScalar<vtype>
{
iScalar<vtype> ret; // return scalar
ret._internal = arg._internal[i][j];
return ret;
}
/////////////
// No match peek for scalar,vector,matrix must forward on either 0,1,2 args. Must have 9 routines with notvalue
/////////////
// scalar
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iScalar<vtype> &arg) -> iScalar<decltype(peekIndex<Level>(arg._internal))>
{
iScalar<decltype(peekIndex<Level>(arg._internal))> ret;
ret._internal= peekIndex<Level>(arg._internal);
return ret;
}
template<int Level,class vtype, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iScalar<vtype> &arg,int i) -> iScalar<decltype(peekIndex<Level>(arg._internal,i))>
{
iScalar<decltype(peekIndex<Level>(arg._internal,i))> ret;
ret._internal=peekIndex<Level>(arg._internal,i);
return ret;
}
template<int Level,class vtype, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iScalar<vtype> &arg,int i,int j) -> iScalar<decltype(peekIndex<Level>(arg._internal,i,j))>
{
iScalar<decltype(peekIndex<Level>(arg._internal,i,j))> ret;
ret._internal=peekIndex<Level>(arg._internal,i,j);
return ret;
}
// vector
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iVector<vtype,N> &arg) -> iVector<decltype(peekIndex<Level>(arg._internal[0])),N>
{
iVector<decltype(peekIndex<Level>(arg._internal[0])),N> ret;
for(int ii=0;ii<N;ii++){
ret._internal[ii]=peekIndex<Level>(arg._internal[ii]);
}
return ret;
}
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iVector<vtype,N> &arg,int i) -> iVector<decltype(peekIndex<Level>(arg._internal[0],i)),N>
{
iVector<decltype(peekIndex<Level>(arg._internal[0],i)),N> ret;
for(int ii=0;ii<N;ii++){
ret._internal[ii]=peekIndex<Level>(arg._internal[ii],i);
}
return ret;
}
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iVector<vtype,N> &arg,int i,int j) -> iVector<decltype(peekIndex<Level>(arg._internal[0],i,j)),N>
{
iVector<decltype(peekIndex<Level>(arg._internal[0],i,j)),N> ret;
for(int ii=0;ii<N;ii++){
ret._internal[ii]=peekIndex<Level>(arg._internal[ii],i,j);
}
return ret;
}
// matrix
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iMatrix<vtype,N> &arg) -> iMatrix<decltype(peekIndex<Level>(arg._internal[0][0])),N>
{
iMatrix<decltype(peekIndex<Level>(arg._internal[0][0])),N> ret;
for(int ii=0;ii<N;ii++){
for(int jj=0;jj<N;jj++){
ret._internal[ii][jj]=peekIndex<Level>(arg._internal[ii][jj]);// Could avoid this because peeking a scalar is dumb
}}
return ret;
}
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iMatrix<vtype,N> &arg,int i) -> iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i)),N>
{
iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i)),N> ret;
for(int ii=0;ii<N;ii++){
for(int jj=0;jj<N;jj++){
ret._internal[ii][jj]=peekIndex<Level>(arg._internal[ii][jj],i);
}}
return ret;
}
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
auto peekIndex(const iMatrix<vtype,N> &arg,int i,int j) -> iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i,j)),N>
{
iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i,j)),N> ret;
for(int ii=0;ii<N;ii++){
for(int jj=0;jj<N;jj++){
ret._internal[ii][jj]=peekIndex<Level>(arg._internal[ii][jj],i,j);
}}
return ret;
}
#endif
}
#endif

View File

@ -1,127 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Old/Tensor_poke.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_MATH_POKE_H
#define GRID_MATH_POKE_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////
// Poke a specific index;
//////////////////////////////////////////////////////////////////////////////
#if 0
// Scalar poke
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
void pokeIndex(iScalar<vtype> &ret, const iScalar<vtype> &arg)
{
ret._internal = arg._internal;
}
// Vector poke, one index
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
void pokeIndex(iVector<vtype,N> &ret, const iScalar<vtype> &arg,int i)
{
ret._internal[i] = arg._internal;
}
//Matrix poke, two indices
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline
void pokeIndex(iMatrix<vtype,N> &ret, const iScalar<vtype> &arg,int i,int j)
{
ret._internal[i][j] = arg._internal;
}
/////////////
// No match poke for scalar,vector,matrix must forward on either 0,1,2 args. Must have 9 routines with notvalue
/////////////
// scalar
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(peekIndex<Level>(ret._internal))> &arg)
{
pokeIndex<Level>(ret._internal,arg._internal);
}
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(peekIndex<Level>(ret._internal,0))> &arg, int i)
{
pokeIndex<Level>(ret._internal,arg._internal,i);
}
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(peekIndex<Level>(ret._internal,0,0))> &arg,int i,int j)
{
pokeIndex<Level>(ret._internal,arg._internal,i,j);
}
// Vector
// Pass-through for an iVector wrapper (zero-index form): selected when
// iScalar<vtype>::TensorLevel differs from Level. Applies pokeIndex<Level>
// element by element, pairing ret._internal[ii] with arg._internal[ii].
//
// arg is taken by const reference, matching every other pokeIndex overload;
// it is only read here, and a non-const reference would refuse const or
// temporary sources.
template <int Level, class vtype, int N,
          typename std::enable_if<iScalar<vtype>::TensorLevel != Level>::type * = nullptr>
inline void pokeIndex(iVector<vtype, N> &ret,
                      const iVector<decltype(peekIndex<Level>(ret._internal)), N> &arg)
{
  for (int ii = 0; ii < N; ii++) {
    pokeIndex<Level>(ret._internal[ii], arg._internal[ii]);
  }
}
// Pass-through for an iVector wrapper (one-index form): selected when
// iScalar<vtype>::TensorLevel differs from Level. Every element of ret is
// poked with the matching element of arg, forwarding the index i unchanged.
template <int Level, class vtype, int N,
          typename std::enable_if<iScalar<vtype>::TensorLevel != Level>::type * = nullptr>
inline void pokeIndex(iVector<vtype, N> &ret,
                      const iVector<decltype(peekIndex<Level>(ret._internal, 0)), N> &arg,
                      int i)
{
  for (int elem = 0; elem < N; ++elem) {
    pokeIndex<Level>(ret._internal[elem], arg._internal[elem], i);
  }
}
// Pass-through for an iVector wrapper (two-index form): selected when
// iScalar<vtype>::TensorLevel differs from Level. Every element of ret is
// poked with the matching element of arg, forwarding (i,j) unchanged.
template <int Level, class vtype, int N,
          typename std::enable_if<iScalar<vtype>::TensorLevel != Level>::type * = nullptr>
inline void pokeIndex(iVector<vtype, N> &ret,
                      const iVector<decltype(peekIndex<Level>(ret._internal, 0, 0)), N> &arg,
                      int i, int j)
{
  for (int elem = 0; elem < N; ++elem) {
    pokeIndex<Level>(ret._internal[elem], arg._internal[elem], i, j);
  }
}
// Matrix
// Pass-through for an iMatrix wrapper (zero-index form): selected when
// iScalar<vtype>::TensorLevel differs from Level. Visits the N x N entries
// in row-major order and pokes each entry of ret with the matching entry of arg.
template <int Level, class vtype, int N,
          typename std::enable_if<iScalar<vtype>::TensorLevel != Level>::type * = nullptr>
inline void pokeIndex(iMatrix<vtype, N> &ret,
                      const iMatrix<decltype(peekIndex<Level>(ret._internal)), N> &arg)
{
  for (int row = 0; row < N; ++row) {
    for (int col = 0; col < N; ++col) {
      pokeIndex<Level>(ret._internal[row][col], arg._internal[row][col]);
    }
  }
}
// Pass-through for an iMatrix wrapper (one-index form): selected when
// iScalar<vtype>::TensorLevel differs from Level. Visits the N x N entries
// in row-major order, forwarding the index i unchanged to each nested poke.
template <int Level, class vtype, int N,
          typename std::enable_if<iScalar<vtype>::TensorLevel != Level>::type * = nullptr>
inline void pokeIndex(iMatrix<vtype, N> &ret,
                      const iMatrix<decltype(peekIndex<Level>(ret._internal, 0)), N> &arg,
                      int i)
{
  for (int row = 0; row < N; ++row) {
    for (int col = 0; col < N; ++col) {
      pokeIndex<Level>(ret._internal[row][col], arg._internal[row][col], i);
    }
  }
}
// Pass-through for an iMatrix wrapper (two-index form): selected when
// iScalar<vtype>::TensorLevel differs from Level. Visits the N x N entries
// in row-major order, forwarding (i,j) unchanged to each nested poke.
template <int Level, class vtype, int N,
          typename std::enable_if<iScalar<vtype>::TensorLevel != Level>::type * = nullptr>
inline void pokeIndex(iMatrix<vtype, N> &ret,
                      const iMatrix<decltype(peekIndex<Level>(ret._internal, 0, 0)), N> &arg,
                      int i, int j)
{
  for (int row = 0; row < N; ++row) {
    for (int col = 0; col < N; ++col) {
      pokeIndex<Level>(ret._internal[row][col], arg._internal[row][col], i, j);
    }
  }
}
#endif
}
#endif

View File

@ -39,19 +39,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/approx/MultiShiftFunction.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientShifted.h>
#include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <Grid/algorithms/iterative/NormalEquations.h>
#include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
// Lanczos support
#include <Grid/algorithms/iterative/MatrixUtils.h>
//#include <Grid/algorithms/iterative/MatrixUtils.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/CoarsenedMatrix.h>
#include <Grid/algorithms/FFT.h>
// Eigen/lanczos
// EigCg

View File

@ -267,8 +267,7 @@ namespace Grid {
SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor);
PARALLEL_FOR_LOOP
for(int ss=0;ss<Grid()->oSites();ss++){
parallel_for(int ss=0;ss<Grid()->oSites();ss++){
siteVector res = zero;
siteVector nbr;
int ptype;
@ -380,8 +379,7 @@ PARALLEL_FOR_LOOP
Subspace.ProjectToSubspace(oProj,oblock);
// blockProject(iProj,iblock,Subspace.subspace);
// blockProject(oProj,oblock,Subspace.subspace);
PARALLEL_FOR_LOOP
for(int ss=0;ss<Grid()->oSites();ss++){
parallel_for(int ss=0;ss<Grid()->oSites();ss++){
for(int j=0;j<nbasis;j++){
if( disp!= 0 ) {
A[p]._odata[ss](j,i) = oProj._odata[ss](j);
@ -427,7 +425,7 @@ PARALLEL_FOR_LOOP
A[p]=zero;
}
GridParallelRNG RNG(Grid()); RNG.SeedRandomDevice();
GridParallelRNG RNG(Grid()); RNG.SeedFixedIntegers(std::vector<int>({55,72,19,17,34}));
Lattice<iScalar<CComplex> > val(Grid()); random(RNG,val);
Complex one(1.0);

View File

@ -235,7 +235,7 @@ namespace Grid {
Field tmp(in._grid);
_Mat.MeooeDag(in,tmp);
_Mat.MooeeInvDag(tmp,out);
_Mat.MooeeInvDag(tmp,out);
_Mat.MeooeDag(out,tmp);
_Mat.MooeeDag(in,out);

View File

@ -197,8 +197,9 @@ namespace Grid {
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in._grid;
//std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
//<<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
// std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
//std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
int vol=grid->gSites();

View File

@ -25,7 +25,7 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/GridCore.h>
namespace Grid {
double MultiShiftFunction::approx(double x)

View File

@ -16,7 +16,7 @@
#define INCLUDED_ALG_REMEZ_H
#include <stddef.h>
#include <Config.h>
#include <Grid/GridStd.h>
#ifdef HAVE_LIBGMP
#include "bigfloat.h"

View File

@ -0,0 +1,366 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h
Copyright (C) 2017
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H
#define GRID_BLOCK_CONJUGATE_GRADIENT_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////
// Block conjugate gradient. Dimension zero should be the block direction
//////////////////////////////////////////////////////////////////////////
// Block conjugate gradient solver.
//
// Solves Linop.HermOp(Psi) = Src for Nblock right-hand sides at once, where the
// right-hand sides are the slices of the fields along lattice dimension
// `blockDim`. The scalar CG coefficients are replaced by Nblock x Nblock complex
// matrices, so every search direction is shared between all right-hand sides.
//
// Psi is used as the initial guess and is overwritten with the solution.
// Convergence is declared when, for every block b,
//     |r_b|^2 / |src_b|^2 < Tolerance^2.
template <class Field>
class BlockConjugateGradient : public OperatorFunction<Field> {
 public:
  typedef typename Field::scalar_type scomplex;

  const int blockDim = 0;  // Lattice dimension used as the block direction

  int Nblock;              // Extent of the block dimension; refreshed from Src at every solve
  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                           // Defaults true.
  RealD Tolerance;         // Target relative residual per block
  Integer MaxIterations;   // Upper bound on the number of CG iterations
  Integer IterationsToComplete;  // Number of iterations the CG took to finish. Filled in upon completion

  // Initialisers are listed in member declaration order (the order in which
  // they actually run). Nblock and IterationsToComplete start at zero so that
  // they hold a defined value before the first solve.
  BlockConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
      : Nblock(0),
        ErrorOnNoConverge(err_on_no_conv),
        Tolerance(tol),
        MaxIterations(maxit),
        IterationsToComplete(0) {}

  // Run the solve.
  //   Linop : operator whose HermOp is applied; NOTE(review): presumably must be
  //           Hermitian positive definite for CG - confirm against callers.
  //   Src   : right-hand sides (one per slice of dimension blockDim).
  //   Psi   : in: initial guess, out: solution. Its checkerboard is set to Src's.
  // On convergence IterationsToComplete is the iteration count k. On failure the
  // routine asserts if ErrorOnNoConverge is set, otherwise it records the loop
  // exit value of k, which is MaxIterations+1.
  void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
  {
    int Orthog = blockDim;  // First dimension is block dim
    Nblock = Src._grid->_fdimensions[Orthog];
    std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;

    Psi.checkerboard = Src.checkerboard;
    conformable(Psi, Src);

    // Work fields: search direction, operator applied to it, residual.
    Field P(Src);
    Field AP(Src);
    Field R(Src);

    Eigen::MatrixXcd m_pAp    = Eigen::MatrixXcd::Identity(Nblock,Nblock);
    Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock);
    Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(Nblock,Nblock);
    Eigen::MatrixXcd m_rr_inv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
    Eigen::MatrixXcd m_alpha  = Eigen::MatrixXcd::Zero(Nblock,Nblock);
    Eigen::MatrixXcd m_beta   = Eigen::MatrixXcd::Zero(Nblock,Nblock);

    // Initial residual computation & set up
    std::vector<RealD> residuals(Nblock);
    std::vector<RealD> ssq(Nblock);

    // Per-block source norms: used for logging and as the convergence scale.
    sliceNorm(ssq,Src,Orthog);
    RealD sssum=0;
    for(int b=0;b<Nblock;b++) sssum+=ssq[b];

    // Reject NaN input. ssq already holds the slice norms of Src, so they are
    // checked directly instead of being computed a second time.
    for(int b=0;b<Nblock;b++){ assert(std::isnan(ssq[b])==0); }

    sliceNorm(residuals,Psi,Orthog);
    for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }

    // Initial search dir is guess
    Linop.HermOp(Psi, AP);

    /************************************************************************
     * Block conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O'Leary 1980)
     ************************************************************************
     * O'Leary : R = B - A X
     * O'Leary : P = M R ; preconditioner M = 1
     * O'Leary : alpha = PAP^{-1} RMR
     * O'Leary : beta  = RMR^{-1}_old RMR_new
     * O'Leary : X=X+Palpha
     * O'Leary : R_new=R_old-AP alpha
     * O'Leary : P=MR_new+P beta
     */

    R = Src - AP;
    P = R;
    sliceInnerProductMatrix(m_rr,R,R,Orthog);

    GridStopWatch sliceInnerTimer;
    GridStopWatch sliceMaddTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;
    SolverTimer.Start();

    int k;
    for (k = 1; k <= MaxIterations; k++) {

      // Progress log: summed residual norm against summed source norm.
      RealD rrsum=0;
      for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b));

      std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
                <<" / "<<std::sqrt(rrsum/sssum) <<std::endl;

      MatrixTimer.Start();
      Linop.HermOp(P, AP);
      MatrixTimer.Stop();

      // Alpha
      sliceInnerTimer.Start();
      sliceInnerProductMatrix(m_pAp,P,AP,Orthog);
      sliceInnerTimer.Stop();
      m_pAp_inv = m_pAp.inverse();
      m_alpha   = m_pAp_inv * m_rr ;

      // Psi, R update
      sliceMaddTimer.Start();
      sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog);     // add alpha *  P to psi
      sliceMaddMatrix(R  ,m_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid
      sliceMaddTimer.Stop();

      // Beta: the inverse must be taken before m_rr is overwritten with the
      // new residual inner products.
      m_rr_inv = m_rr.inverse();
      sliceInnerTimer.Start();
      sliceInnerProductMatrix(m_rr,R,R,Orthog);
      sliceInnerTimer.Stop();
      m_beta = m_rr_inv *m_rr;

      // Search update: built in AP and then copied, because P is also an
      // input of the same call.
      sliceMaddTimer.Start();
      sliceMaddMatrix(AP,m_beta,P,R,Orthog);
      sliceMaddTimer.Stop();
      P= AP;

      /*********************
       * convergence monitor
       *********************
       */
      RealD max_resid=0;
      for(int b=0;b<Nblock;b++){
        RealD rr = real(m_rr(b,b))/ssq[b];
        if ( rr > max_resid ) max_resid = rr;
      }

      if ( max_resid < Tolerance*Tolerance ) {

        SolverTimer.Stop();

        std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
        for(int b=0;b<Nblock;b++){
          std::cout << GridLogMessage<< "\t\tblock "<<b<<" resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
        }
        std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;

        // Recompute the residual from scratch as a check on the recurrence.
        Linop.HermOp(Psi, AP);
        AP = AP-Src;
        std::cout << GridLogMessage <<"\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;

        std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
        std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
        std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
        std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
        std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;

        IterationsToComplete = k;
        return;
      }

    }
    std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl;

    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;
  }
};
//////////////////////////////////////////////////////////////////////////
// multiRHS conjugate gradient. Dimension zero should be the block direction
//////////////////////////////////////////////////////////////////////////
// Multiple right-hand-side conjugate gradient solver.
//
// Solves Linop.HermOp(Psi) = Src for Nblock right-hand sides in lock step, where
// the right-hand sides are the slices of the fields along lattice dimension
// `blockDim`. Each right-hand side keeps its own scalar alpha and beta, so the
// systems stay independent; only the operator application and the linear
// algebra calls are shared.
//
// Psi is used as the initial guess and is overwritten with the solution.
// Convergence is declared when, for every block b,
//     |r_b|^2 / |src_b|^2 < Tolerance^2.
template <class Field>
class MultiRHSConjugateGradient : public OperatorFunction<Field> {
 public:
  typedef typename Field::scalar_type scomplex;

  const int blockDim = 0;  // Lattice dimension used as the block direction

  int Nblock;              // Extent of the block dimension; refreshed from Src at every solve
  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                           // Defaults true.
  RealD Tolerance;         // Target relative residual per block
  Integer MaxIterations;   // Upper bound on the number of CG iterations
  Integer IterationsToComplete;  // Number of iterations the CG took to finish. Filled in upon completion

  // Initialisers are listed in member declaration order (the order in which
  // they actually run). Nblock and IterationsToComplete start at zero so that
  // they hold a defined value before the first solve.
  MultiRHSConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
      : Nblock(0),
        ErrorOnNoConverge(err_on_no_conv),
        Tolerance(tol),
        MaxIterations(maxit),
        IterationsToComplete(0) {}

  // Run the solve.
  //   Linop : operator whose HermOp is applied; NOTE(review): presumably must be
  //           Hermitian positive definite for CG - confirm against callers.
  //   Src   : right-hand sides (one per slice of dimension blockDim).
  //   Psi   : in: initial guess, out: solution. Its checkerboard is set to Src's.
  // On convergence IterationsToComplete is the iteration count k. On failure the
  // routine asserts if ErrorOnNoConverge is set, otherwise it records the loop
  // exit value of k, which is MaxIterations+1.
  void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
  {
    int Orthog = blockDim;  // First dimension is block dim
    Nblock = Src._grid->_fdimensions[Orthog];
    std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;

    Psi.checkerboard = Src.checkerboard;
    conformable(Psi, Src);

    // Work fields: search direction, operator applied to it, residual.
    Field P(Src);
    Field AP(Src);
    Field R(Src);

    // Per-block CG scalars.
    std::vector<ComplexD> v_pAp(Nblock);
    std::vector<RealD> v_rr (Nblock);
    std::vector<RealD> v_rr_inv(Nblock);
    std::vector<RealD> v_alpha(Nblock);
    std::vector<RealD> v_beta(Nblock);

    // Initial residual computation & set up
    std::vector<RealD> residuals(Nblock);
    std::vector<RealD> ssq(Nblock);

    // Per-block source norms: used for logging and as the convergence scale.
    sliceNorm(ssq,Src,Orthog);
    RealD sssum=0;
    for(int b=0;b<Nblock;b++) sssum+=ssq[b];

    // Reject NaN input. ssq already holds the slice norms of Src, so they are
    // checked directly instead of being computed a second time.
    for(int b=0;b<Nblock;b++){ assert(std::isnan(ssq[b])==0); }

    sliceNorm(residuals,Psi,Orthog);
    for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }

    // Initial search dir is guess
    Linop.HermOp(Psi, AP);

    R = Src - AP;
    P = R;
    sliceNorm(v_rr,R,Orthog);

    GridStopWatch sliceInnerTimer;
    GridStopWatch sliceMaddTimer;
    GridStopWatch sliceNormTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;

    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {

      // Progress log: summed residual norm against summed source norm.
      RealD rrsum=0;
      for(int b=0;b<Nblock;b++) rrsum+=real(v_rr[b]);

      std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
                <<" / "<<std::sqrt(rrsum/sssum) <<std::endl;

      MatrixTimer.Start();
      Linop.HermOp(P, AP);
      MatrixTimer.Stop();

      // Alpha
      // sliceInnerProductVectorTest(v_pAp_test,P,AP,Orthog);
      sliceInnerTimer.Start();
      sliceInnerProductVector(v_pAp,P,AP,Orthog);
      sliceInnerTimer.Stop();
      for(int b=0;b<Nblock;b++){
        // std::cout << " "<< v_pAp[b]<<" "<< v_pAp_test[b]<<std::endl;
        v_alpha[b] = v_rr[b]/real(v_pAp[b]);
      }

      // Psi, R update
      sliceMaddTimer.Start();
      sliceMaddVector(Psi,v_alpha, P,Psi,Orthog);     // add alpha *  P to psi
      sliceMaddVector(R  ,v_alpha,AP,  R,Orthog,-1.0);// sub alpha * AP to resid
      sliceMaddTimer.Stop();

      // Beta: the reciprocals must be taken before v_rr is overwritten with
      // the new residual norms.
      for(int b=0;b<Nblock;b++){
        v_rr_inv[b] = 1.0/v_rr[b];
      }
      sliceNormTimer.Start();
      sliceNorm(v_rr,R,Orthog);
      sliceNormTimer.Stop();
      for(int b=0;b<Nblock;b++){
        v_beta[b] = v_rr_inv[b] *v_rr[b];
      }

      // Search update
      sliceMaddTimer.Start();
      sliceMaddVector(P,v_beta,P,R,Orthog);
      sliceMaddTimer.Stop();

      /*********************
       * convergence monitor
       *********************
       */
      RealD max_resid=0;
      for(int b=0;b<Nblock;b++){
        RealD rr = v_rr[b]/ssq[b];
        if ( rr > max_resid ) max_resid = rr;
      }

      if ( max_resid < Tolerance*Tolerance ) {

        SolverTimer.Stop();

        std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
        for(int b=0;b<Nblock;b++){
          std::cout << GridLogMessage<< "\t\tBlock "<<b<<" resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
        }
        std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;

        // Recompute the residual from scratch as a check on the recurrence.
        Linop.HermOp(Psi, AP);
        AP = AP-Src;
        std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;

        std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
        std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
        std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
        std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
        std::cout << GridLogMessage << "\tNorm " << sliceNormTimer.Elapsed() <<std::endl;
        std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;

        IterationsToComplete = k;
        return;
      }

    }
    std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;

    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;
  }
};
}
#endif

View File

@ -45,6 +45,8 @@ class ConjugateGradient : public OperatorFunction<Field> {
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
@ -76,18 +78,12 @@ class ConjugateGradient : public OperatorFunction<Field> {
cp = a;
ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: p " << a << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: p " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
@ -97,8 +93,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
}
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq
<< std::endl;
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
@ -128,8 +123,11 @@ class ConjugateGradient : public OperatorFunction<Field> {
p = p * b + r;
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
// Stopping condition
if (cp <= rsq) {
@ -137,31 +135,33 @@ class ConjugateGradient : public OperatorFunction<Field> {
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD mmpnorm = sqrt(norm2(mmp));
RealD psinorm = sqrt(norm2(psi));
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage
<< "ConjugateGradient: Converged on iteration " << k << std::endl;
std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual << " target "
<< Tolerance << std::endl;
std::cout << GridLogMessage << "Time elapsed: Iterations "
<< SolverTimer.Elapsed() << " Matrix "
<< MatrixTimer.Elapsed() << " Linalg "
<< LinalgTimer.Elapsed();
std::cout << std::endl;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
};
}

View File

@ -35,6 +35,7 @@ namespace Grid {
class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
public:
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
@ -42,12 +43,16 @@ namespace Grid {
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL){ };
void useGuesser(LinearFunction<FieldF> &g){
@ -55,9 +60,8 @@ namespace Grid {
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
(*this)(src_d_in,sol_d,NULL);
}
void operator() (const FieldD &src_d_in, FieldD &sol_d, RealD *shift){
TotalInnerIterations = 0;
GridStopWatch TotalTimer;
TotalTimer.Start();
@ -77,7 +81,7 @@ namespace Grid {
FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation
RealD inner_tol = Tolerance;
RealD inner_tol = InnerTolerance;
FieldF src_f(SinglePrecGrid);
src_f.checkerboard = cb;
@ -85,17 +89,18 @@ namespace Grid {
FieldF sol_f(SinglePrecGrid);
sol_f.checkerboard = cb;
ConjugateGradientShifted<FieldF> CG_f(inner_tol, MaxInnerIterations);
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
for(Integer outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d);
if(shift) axpy(tmp_d,*shift,sol_d,tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
@ -119,8 +124,9 @@ namespace Grid {
//Inner CG
CG_f.Tolerance = inner_tol;
InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f,shift);
CG_f(Linop_f, src_f, sol_f);
InnerCGtimer.Stop();
TotalInnerIterations += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
@ -133,11 +139,13 @@ namespace Grid {
//Final trial CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
ConjugateGradientShifted<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d,shift);
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete;
TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};

View File

@ -45,7 +45,6 @@ public:
Integer MaxIterations;
int verbose;
MultiShiftFunction shifts;
int iter;
ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :
MaxIterations(maxit),
@ -61,7 +60,6 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
std::vector<Field> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
{
int nshift = shifts.order;
@ -107,12 +105,11 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
RealD a,b,c,d;
RealD cp,bp,qq; //prev
int cb=src.checkerboard;
// Matrix mult fields
Field r(grid);
Field p(grid); p.checkerboard = src.checkerboard;
Field p(grid);
Field tmp(grid);
Field mmp(grid);mmp.checkerboard = src.checkerboard;
Field mmp(grid);
// Check lightest mass
for(int s=0;s<nshift;s++){
@ -135,9 +132,6 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
p=src;
//MdagM+m[0]
std::cout << "p.checkerboard " << p.checkerboard
<< "mmp.checkerboard " << mmp.checkerboard << std::endl;
Linop.HermOpAndNorm(p,mmp,d,qq);
axpy(mmp,mass[0],p,mmp);
RealD rn = norm2(p);
@ -275,7 +269,6 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
RealD cn = norm2(src);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
}
iter = k;
return;
}
}

View File

@ -1,404 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@quark.phy.bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END/ LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MULTI_MIXED_PREC_H
#define GRID_CONJUGATE_GRADIENT_MULTI_MIXED_PREC_H
namespace Grid {
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF
//, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0
//, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0
>
class MixedPrecisionConjugateGradientMultiShift : public LinearFunction<FieldD> {
public:
// RealD Tolerance;
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
MultiShiftFunction shifts;
Integer iter;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
// LinearFunction<FieldF> *guesser;
MixedPrecisionConjugateGradientMultiShift(GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d,
Integer maxinnerit, MultiShiftFunction &_shifts ) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
MaxInnerIterations(maxinnerit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), shifts(_shifts) {};
void operator() (const FieldD &src_d_in, FieldD &sol_d){
assert(0); // not yet implemented
}
void operator() (const FieldD &src_d_in, std::vector<FieldD> &sol_d){
GridStopWatch TotalTimer;
TotalTimer.Start();
int cb = src_d_in.checkerboard;
int nshift = shifts.order;
assert(nshift == sol_d.size());
for(int i=0;i<nshift;i++) sol_d[i].checkerboard = cb;
RealD src_norm = norm2(src_d_in);
// RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in._grid;
FieldD tmp_d(DoublePrecGrid); tmp_d.checkerboard = cb;
FieldD tmp2_d(DoublePrecGrid); tmp2_d.checkerboard = cb;
FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation
// RealD inner_tol = Tolerance;
FieldD psi_d(DoublePrecGrid);psi_d.checkerboard = cb;
FieldF src_f(SinglePrecGrid);
src_f.checkerboard = cb;
std::vector<FieldF> sol_f(nshift,SinglePrecGrid);
for(int i=0;i<nshift;i++) sol_f[i].checkerboard = cb;
// ConjugateGradientShifted<FieldF> CG_f(inner_tol, MaxInnerIterations);
ConjugateGradientMultiShift<FieldF> MSCG(MaxInnerIterations,shifts);
// CG_f.ErrorOnNoConverge = false;
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
{
// std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
// if(norm < OuterLoopNormMult * stop){
// std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
// break;
// }
// while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start();
precisionChange(src_f, src_d);
PrecChangeTimer.Stop();
// zeroit(sol_f);
//Inner CG
InnerCGtimer.Start();
int if_relup = 0;
#if 0
MSCG(Linop_f,src_f,sol_f);
#else
{
GridBase *grid = SinglePrecGrid;
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.);
std::vector<FieldF> ps(nshift,grid);// Search directions
assert(sol_f.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
int cb=src_f.checkerboard;
// Matrix mult fields
FieldF r(grid); r.checkerboard = src_f.checkerboard;
FieldF p(grid); p.checkerboard = src_f.checkerboard;
FieldF tmp(grid); tmp.checkerboard = src_f.checkerboard;
FieldF mmp(grid);mmp.checkerboard = src_f.checkerboard;
FieldF psi(grid);psi.checkerboard = src_f.checkerboard;
std::cout.precision(12);
std::cout<<GridLogMessage<<"norm2(psi_d)= "<<norm2(psi_d)<<std::endl;
std::cout<<GridLogMessage<<"norm2(psi)= "<<norm2(psi)<<std::endl;
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_f);
Real c_relup = cp;
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientMultiShift: shift "<<s
<<" target resid "<<rsq[s]<<std::endl;
ps[s] = src_f;
}
// r and p for primary
r=src_f;
p=src_f;
//MdagM+m[0]
std::cout << "p.checkerboard " << p.checkerboard
<< "mmp.checkerboard " << mmp.checkerboard << std::endl;
Linop_f.HermOpAndNorm(p,mmp,d,qq);
axpy(mmp,mass[0],p,mmp);
RealD rn = norm2(p);
d += rn*mass[0];
// have verified that inner product of
// p and mmp is equal to d after this since
// the d computation is tricky
// qq = real(innerProduct(p,mmp));
// std::cout<<GridLogMessage << "debug equal ? qq "<<qq<<" d "<< d<<std::endl;
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r,b,mmp,r);
axpby(psi,0.,-bs[0],src_f,src_f);
for(int s=0;s<nshift;s++) {
axpby(sol_f[s],0.,-bs[s]*alpha[s],src_f,src_f);
}
// Iteration loop
int k;
// inefficient zeroing, please replace!
// RealD sol_norm = axpy_norm(sol_d[0],-1.,sol_d[0],sol_d[0]);
zeroit(sol_d[0]);
std::cout<<GridLogMessage<<"norm(sol_d[0])= "<<norm2(sol_d[0])<<std::endl;
int all_converged = 1;
RealD tmp1,tmp2;
for (k=1;k<=MaxOuterIterations;k++){
a = c /cp;
axpy(p,a,p,r);
// Note to self - direction ps is iterated seperately
// for each shift. Does not appear to have any scope
// for avoiding linear algebra in "single" case.
//
// However SAME r is used. Could load "r" and update
// ALL ps[s]. 2/3 Bandwidth saving
// New Kernel: Load r, vector of coeffs, vector of pointers ps
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps[s],a,ps[s],r);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps[s],z[s][iz],as,r,ps[s]);
}
}
}
cp=c;
Linop_f.HermOpAndNorm(p,mmp,d,qq);
axpy(mmp,mass[0],p,mmp);
RealD rn = norm2(p);
d += rn*mass[0];
bp=b;
b=-cp/d;
c=axpy_norm(r,b,mmp,r);
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
axpy(psi,-bs[0],ps[0],psi);
for(int s=0;s<nshift;s++){
int ss = s;
// Scope for optimisation here in case of "single".
// Could load sol_f[0] and pull all ps[s] in.
// if ( single ) ss=primary;
// Bandwith saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving
// Pipelined CG gain:
//
// New Kernel: Load r, vector of coeffs, vector of pointers ps
// New Kernel: Load sol_f[0], vector of coeffs, vector of pointers ps
// If can predict the coefficient bs then we can fuse these and avoid write reread cyce
// on ps[s].
// Before: 3 x npole + 3 x npole
// After : 2 x npole (ps[s]) => 3x speed up of multishift CG.
if( (!converged[s]) ) {
axpy(sol_f[ss],-bs[s]*alpha[s],ps[s],sol_f[ss]);
}
}
if (k%MaxInnerIterations==0){
// if (c < 1e-4*c_relup){
RealD c_f=c;
precisionChange(tmp_d,psi);
RealD sol_norm =axpy_norm (psi_d,1.,tmp_d,psi_d);
tmp1 = norm2(psi);
zeroit(psi);
tmp2 = norm2(psi);
std::cout<<GridLogMessage<<"k= "<<k<<" norm2(sol)= "<<sol_norm<<" "<<tmp1<<" "<<tmp2<<std::endl;
// precisionChange(sol_d[0],sol_f[0]);
Linop_d.HermOpAndNorm(psi_d,tmp_d,tmp1,tmp2);
axpy(tmp2_d,mass[0],psi_d,tmp_d);
axpy(tmp_d,-1.,tmp2_d,src_d);
precisionChange(r,tmp_d);
c_relup = norm2(r);
std::cout<<GridLogMessage<<"k= "<<k<<" norm2(r)= "<<c<<" "<<c_relup<<" "<<c_f<<std::endl;
if_relup=1;
}
// Convergence checks
all_converged=1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsq[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
if (k%MaxInnerIterations==0)
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has not converged "<<css<<"<"<<rsq[s]<<std::endl;
all_converged=0;
}
}
}
#if 0
if ( all_converged ){
std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
#else
if ( converged[0] ){
std::cout<<GridLogMessage<< "CGMultiShift: Shift 0 have converged iteration, terminating "<<k<<std::endl;
#endif
#if 1
for(int s=1; s < nshift; s++) {
Linop_f.HermOpAndNorm(sol_f[s],mmp,d,qq);
axpy(tmp,mass[s],sol_f[s],mmp);
axpy(r,-alpha[s],src_f,tmp);
RealD rn = norm2(r);
RealD cn = norm2(src_f);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
}
#endif
iter = k;
break;
}
}
// ugly hack
if ( !all_converged )
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
}
#endif
InnerCGtimer.Stop();
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
sol_d[0]=psi_d;
for(int i=1;i<nshift;i++)precisionChange(sol_d[i], sol_f[i]);
std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
// Check answers
for(int s=0; s < nshift; s++) {
RealD tmp1,tmp2;
Linop_d.HermOpAndNorm(sol_d[s],tmp_d,tmp1,tmp2);
axpy(tmp2_d,shifts.poles[s],sol_d[s],tmp_d);
axpy(tmp_d,-1.,src_d,tmp2_d);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(norm2(tmp_d)/norm2(src_d))<<std::endl;
}
PrecChangeTimer.Stop();
}
//Final trial CG
// std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
}
#endif

View File

@ -1,168 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradient.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_SHIFTED_H
#define GRID_CONJUGATE_GRADIENT_SHIFTED_H
namespace Grid {
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class ConjugateGradientShifted : public OperatorFunction<Field> {
public:
  // When true, a solve that reaches the target asserts that the recomputed
  // ("true") residual is within a factor 1000 of Tolerance. Defaults true.
  // NOTE(review): running out of iterations only logs a message below; it
  // does not consult this flag.
  bool ErrorOnNoConverge;
  RealD Tolerance;        // relative target: stop once |r|^2 <= Tolerance^2 * |src|^2
  Integer MaxIterations;  // upper bound on the number of CG iterations

  // Initialisers are listed in member declaration order, which is the order
  // in which they actually run.
  ConjugateGradientShifted(RealD tol, Integer maxit, bool err_on_no_conv = true)
    : ErrorOnNoConverge(err_on_no_conv), Tolerance(tol), MaxIterations(maxit) {}

  // Unshifted solve: forwards to the shifted overload with no shift.
  void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
    (*this)(Linop, src, psi, nullptr);
  }

  // Conjugate gradient on (HermOp + shift) psi = src, using psi as the
  // initial guess and overwriting it with the solution.
  //   Linop : operator applied through HermOpAndNorm
  //   src   : right-hand side
  //   psi   : in: initial guess, out: solution
  //   shift : optional pointer to a real shift added to the operator;
  //           a null pointer means no shift is applied
  void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi, RealD *shift) {
    psi.checkerboard = src.checkerboard;
    conformable(psi, src);

    RealD cp, c, a, d, b, ssq, qq;

    Field p(src);
    Field mmp(src);
    Field r(src);

    // Initial residual computation & set up
    RealD guess = norm2(psi);
    assert(std::isnan(guess) == 0);

    Linop.HermOpAndNorm(psi, mmp, d, b);
    if (shift) {
      axpy(mmp, *shift, psi, mmp);
      // psi is untouched since 'guess' was taken, so reuse it instead of
      // paying for a second global reduction.
      d += guess * (*shift);
    }
    b = norm2(mmp);

    ssq = norm2(src);
    r = src - mmp;
    p = r;
    a = norm2(p);
    cp = a;

    std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
    std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: src "<<ssq <<std::endl;
    std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: mp "<<d <<std::endl;
    std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: mmp "<<b <<std::endl;
    std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: cp,r "<<cp <<std::endl;
    std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: p "<<a <<std::endl;

    RealD rsq = Tolerance * Tolerance * ssq;

    // The guess already satisfies the target: nothing to do.
    if (cp <= rsq) {
      return;
    }

    std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" target "<<rsq<<std::endl;

    GridStopWatch LinalgTimer;
    GridStopWatch MatrixTimer;
    GridStopWatch SolverTimer;

    SolverTimer.Start();
    int k;
    for (k = 1; k <= MaxIterations; k++) {
      c = cp;

      MatrixTimer.Start();
      Linop.HermOpAndNorm(p, mmp, d, qq);
      MatrixTimer.Stop();

      LinalgTimer.Start();
      if (shift) {
        // Fold the shift into both the operator result and <p|A|p>.
        axpy(mmp, *shift, p, mmp);
        d += norm2(p) * (*shift);
      }

      // Periodic cross-check of d against an explicit inner product; the
      // reduction is only paid for on the iterations that report it.
      const bool report = (k % 10 == 1);
      if (report) {
        RealD d2 = real(innerProduct(p, mmp));
        std::cout<<GridLogIterative<< std::setprecision(4)<< "d: "<<d<<" d2= "<<d2<<std::endl;
      }

      a = c / d;
      cp = axpy_norm(r, -a, mmp, r);
      b = cp / c;
      if (report) {
        std::cout<<GridLogIterative<< std::setprecision(4)<<"k= "<<k<<" src: "<<ssq<<" r= "<<cp<<std::endl;
      }

      // Fuse these loops ; should be really easy
      psi = a * p + psi;
      p = p * b + r;
      LinalgTimer.Stop();

      std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target "<< rsq<<std::endl;

      // Stopping condition
      if (cp <= rsq) {
        SolverTimer.Stop();

        // Recompute the residual from scratch rather than trusting the recurrence.
        Linop.HermOpAndNorm(psi, mmp, d, qq);
        if (shift) mmp = mmp + (*shift) * psi;
        p = mmp - src;

        RealD srcnorm = sqrt(ssq);
        RealD resnorm = sqrt(norm2(p));
        RealD true_residual = resnorm / srcnorm;

        std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
                 <<" computed residual "<<sqrt(cp/ssq)
                 <<" true residual " <<true_residual
                 <<" target "<<Tolerance<<std::endl;
        std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
        std::cout<<std::endl;

        if (ErrorOnNoConverge)
          assert(true_residual/Tolerance < 1000.0);

        return;
      }
    }
    std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl;
    // assert(0);
  }
};
}
#endif

View File

@ -30,20 +30,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define GRID_IRL_H
#include <string.h> //memset
#ifdef USE_LAPACK
#ifdef USE_MKL
#include<mkl_lapack.h>
#else
void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
double *vl, double *vu, int *il, int *iu, double *abstol,
int *m, double *w, double *z, int *ldz, int *isuppz,
double *work, int *lwork, int *iwork, int *liwork,
int *info);
//#include <lapacke/lapacke.h>
#endif
#endif
#include "DenseMatrix.h"
#include "EigenSort.h"
#include <Grid/algorithms/densematrix/DenseMatrix.h>
#include <Grid/algorithms/iterative/EigenSort.h>
namespace Grid {
@ -67,13 +64,12 @@ public:
int Np; // Np -- Number of spare vecs in kryloc space
int Nm; // Nm -- total number of vectors
RealD OrthoTime;
RealD eresid;
SortEigen<Field> _sort;
// GridCartesian &_fgrid;
LinearOperatorBase<Field> &_Linop;
OperatorFunction<Field> &_poly;
@ -130,23 +126,23 @@ public:
GridBase *grid = evec[0]._grid;
Field w(grid);
std::cout<<GridLogMessage << "RitzMatrix "<<std::endl;
std::cout << "RitzMatrix "<<std::endl;
for(int i=0;i<k;i++){
_poly(_Linop,evec[i],w);
std::cout<<GridLogMessage << "["<<i<<"] ";
std::cout << "["<<i<<"] ";
for(int j=0;j<k;j++){
ComplexD in = innerProduct(evec[j],w);
if ( fabs((double)i-j)>1 ) {
if (abs(in) >1.0e-9 ) {
std::cout<<GridLogMessage<<"oops"<<std::endl;
std::cout<<"oops"<<std::endl;
abort();
} else
std::cout<<GridLogMessage << " 0 ";
std::cout << " 0 ";
} else {
std::cout<<GridLogMessage << " "<<in<<" ";
std::cout << " "<<in<<" ";
}
}
std::cout<<GridLogMessage << std::endl;
std::cout << std::endl;
}
}
@ -180,10 +176,10 @@ public:
RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
// 7. vk+1 := wk/βk+1
std::cout<<GridLogMessage << "alpha = " << zalph << " beta "<<beta<<std::endl;
// std::cout << "alpha = " << zalph << " beta "<<beta<<std::endl;
const RealD tiny = 1.0e-20;
if ( beta < tiny ) {
std::cout<<GridLogMessage << " beta is tiny "<<beta<<std::endl;
std::cout << " beta is tiny "<<beta<<std::endl;
}
lmd[k] = alph;
lme[k] = beta;
@ -259,7 +255,6 @@ public:
}
#ifdef USE_LAPACK
#define LAPACK_INT long long
void diagonalize_lapack(DenseVector<RealD>& lmd,
DenseVector<RealD>& lme,
int N1,
@ -269,7 +264,7 @@ public:
const int size = Nm;
// tevals.resize(size);
// tevecs.resize(size);
LAPACK_INT NN = N1;
int NN = N1;
double evals_tmp[NN];
double evec_tmp[NN][NN];
memset(evec_tmp[0],0,sizeof(double)*NN*NN);
@ -283,19 +278,19 @@ public:
if (i==j) evals_tmp[i] = lmd[i];
if (j==(i-1)) EE[j] = lme[j];
}
LAPACK_INT evals_found;
LAPACK_INT lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
LAPACK_INT liwork = 3+NN*10 ;
LAPACK_INT iwork[liwork];
int evals_found;
int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
int liwork = 3+NN*10 ;
int iwork[liwork];
double work[lwork];
LAPACK_INT isuppz[2*NN];
int isuppz[2*NN];
char jobz = 'V'; // calculate evals & evecs
char range = 'I'; // calculate all evals
// char range = 'A'; // calculate all evals
char uplo = 'U'; // refer to upper half of original matrix
char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
int ifail[NN];
long long info;
int info;
// int total = QMP_get_number_of_nodes();
// int node = QMP_get_node_number();
// GridBase *grid = evec[0]._grid;
@ -303,18 +298,14 @@ public:
int node = grid->_processor;
int interval = (NN/total)+1;
double vl = 0.0, vu = 0.0;
LAPACK_INT il = interval*node+1 , iu = interval*(node+1);
int il = interval*node+1 , iu = interval*(node+1);
if (iu > NN) iu=NN;
double tol = 0.0;
if (1) {
memset(evals_tmp,0,sizeof(double)*NN);
if ( il <= NN){
printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu);
#ifdef USE_MKL
dstegr(&jobz, &range, &NN,
#else
LAPACK_dstegr(&jobz, &range, &NN,
#endif
(double*)DD, (double*)EE,
&vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
&tol, // tolerance
@ -346,7 +337,6 @@ public:
lmd [NN-1-i]=evals_tmp[i];
}
}
#undef LAPACK_INT
#endif
@ -377,14 +367,12 @@ public:
// diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
#endif
int Niter = 10000*N1;
int Niter = 100*N1;
int kmin = 1;
int kmax = N2;
// (this should be more sophisticated)
for(int iter=0; ; ++iter){
if ( (iter+1)%(100*N1)==0)
std::cout<<GridLogMessage << "[QL method] Not converged - iteration "<<iter+1<<"\n";
for(int iter=0; iter<Niter; ++iter){
// determination of 2x2 leading submatrix
RealD dsub = lmd[kmax-1]-lmd[kmax-2];
@ -413,11 +401,11 @@ public:
_sort.push(lmd3,N2);
_sort.push(lmd2,N2);
for(int k=0; k<N2; ++k){
if (fabs(lmd2[k] - lmd3[k]) >SMALL) std::cout<<GridLogMessage <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl;
// if (fabs(lme2[k] - lme[k]) >SMALL) std::cout<<GridLogMessage <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl;
if (fabs(lmd2[k] - lmd3[k]) >SMALL) std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl;
// if (fabs(lme2[k] - lme[k]) >SMALL) std::cout <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl;
}
for(int k=0; k<N1*N1; ++k){
// if (fabs(Qt2[k] - Qt[k]) >SMALL) std::cout<<GridLogMessage <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl;
// if (fabs(Qt2[k] - Qt[k]) >SMALL) std::cout <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl;
}
}
#endif
@ -432,7 +420,7 @@ public:
}
}
}
std::cout<<GridLogMessage << "[QL method] Error - Too many iteration: "<<Niter<<"\n";
std::cout << "[QL method] Error - Too many iteration: "<<Niter<<"\n";
abort();
}
@ -449,7 +437,6 @@ public:
DenseVector<Field>& evec,
int k)
{
double t0=-usecond()/1e6;
typedef typename Field::scalar_type MyComplex;
MyComplex ip;
@ -468,8 +455,6 @@ public:
w = w - ip * evec[j];
}
normalise(w);
t0+=usecond()/1e6;
OrthoTime +=t0;
}
void setUnit_Qt(int Nm, DenseVector<RealD> &Qt) {
@ -503,10 +488,10 @@ until convergence
GridBase *grid = evec[0]._grid;
assert(grid == src._grid);
std::cout<<GridLogMessage << " -- Nk = " << Nk << " Np = "<< Np << std::endl;
std::cout<<GridLogMessage << " -- Nm = " << Nm << std::endl;
std::cout<<GridLogMessage << " -- size of eval = " << eval.size() << std::endl;
std::cout<<GridLogMessage << " -- size of evec = " << evec.size() << std::endl;
std::cout << " -- Nk = " << Nk << " Np = "<< Np << std::endl;
std::cout << " -- Nm = " << Nm << std::endl;
std::cout << " -- size of eval = " << eval.size() << std::endl;
std::cout << " -- size of evec = " << evec.size() << std::endl;
assert(Nm == evec.size() && Nm == eval.size());
@ -517,7 +502,6 @@ until convergence
DenseVector<int> Iconv(Nm);
DenseVector<Field> B(Nm,grid); // waste of space replicating
// DenseVector<Field> Btemp(Nm,grid); // waste of space replicating
Field f(grid);
Field v(grid);
@ -533,48 +517,35 @@ until convergence
// (uniform vector) Why not src??
// evec[0] = 1.0;
evec[0] = src;
std:: cout<<GridLogMessage <<"norm2(src)= " << norm2(src)<<std::endl;
std:: cout <<"norm2(src)= " << norm2(src)<<std::endl;
// << src._grid << std::endl;
normalise(evec[0]);
std:: cout<<GridLogMessage <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
std:: cout <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
// << evec[0]._grid << std::endl;
// Initial Nk steps
OrthoTime=0.;
double t0=usecond()/1e6;
for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
double t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL::Initial steps: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
std::cout<<GridLogMessage <<"IRL::Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
// std:: cout<<GridLogMessage <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl;
// std:: cout<<GridLogMessage <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl;
// std:: cout <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl;
// std:: cout <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl;
RitzMatrix(evec,Nk);
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL::RitzMatrix: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
for(int k=0; k<Nk; ++k){
// std:: cout<<GridLogMessage <<"eval " << k << " " <<eval[k] << std::endl;
// std:: cout<<GridLogMessage <<"lme " << k << " " << lme[k] << std::endl;
// std:: cout <<"eval " << k << " " <<eval[k] << std::endl;
// std:: cout <<"lme " << k << " " << lme[k] << std::endl;
}
// Restarting loop begins
for(int iter = 0; iter<Niter; ++iter){
std::cout<<GridLogMessage<<"\n Restart iteration = "<< iter << std::endl;
std::cout<<"\n Restart iteration = "<< iter << std::endl;
//
// Rudy does a sort first which looks very different. Getting fed up with sorting out the algo defs.
// We loop over
//
OrthoTime=0.;
for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL:: "<<Np <<" steps: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
std::cout<<GridLogMessage <<"IRL::Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
f *= lme[Nm-1];
RitzMatrix(evec,k2);
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL:: RitzMatrix: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
// getting eigenvalues
for(int k=0; k<Nm; ++k){
@ -583,27 +554,18 @@ until convergence
}
setUnit_Qt(Nm,Qt);
diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL:: diagonalize: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
// sorting
_sort.push(eval2,Nm);
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL:: eval sorting: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
// Implicitly shifted QR transformations
setUnit_Qt(Nm,Qt);
for(int ip=0; ip<k2; ++ip){
std::cout<<GridLogMessage << "eval "<< ip << " "<< eval2[ip] << std::endl;
}
for(int ip=k2; ip<Nm; ++ip){
std::cout<<GridLogMessage << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
}
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL::qr_decomp: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
if (0) {
for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
for(int j=k1-1; j<k2+1; ++j){
@ -612,38 +574,14 @@ if (0) {
B[j] += Qt[k+Nm*j] * evec[k];
}
}
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL::QR Rotate: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
}
if (1) {
for(int i=0; i<(Nk+1); ++i) {
B[i] = 0.0;
B[i].checkerboard = evec[0].checkerboard;
}
int j_block = 24; int k_block=24;
PARALLEL_FOR_LOOP
for(int ss=0;ss < grid->oSites();ss++){
for(int jj=k1-1; jj<k2+1; jj += j_block)
for(int kk=0; kk<Nm; kk += k_block)
for(int j=jj; (j<(k2+1)) && j<(jj+j_block); ++j){
for(int k=kk; (k<Nm) && k<(kk+k_block) ; ++k){
B[j]._odata[ss] +=Qt[k+Nm*j] * evec[k]._odata[ss];
}
}
}
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL::QR rotation: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
}
for(int j=k1-1; j<k2+1; ++j) evec[j] = B[j];
for(int j=k1-1; j<k2+1; ++j) evec[j] = B[j];
// Compressed vector f and beta(k2)
f *= Qt[Nm-1+Nm*(k2-1)];
f += lme[k2-1] * evec[k2];
beta_k = norm2(f);
beta_k = sqrt(beta_k);
std::cout<<GridLogMessage<<" beta(k) = "<<beta_k<<std::endl;
std::cout<<" beta(k) = "<<beta_k<<std::endl;
RealD betar = 1.0/beta_k;
evec[k2] = betar * f;
@ -656,10 +594,7 @@ PARALLEL_FOR_LOOP
}
setUnit_Qt(Nm,Qt);
diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL::diagonalize: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
if (0) {
for(int k = 0; k<Nk; ++k) B[k]=0.0;
for(int j = 0; j<Nk; ++j){
@ -667,34 +602,12 @@ if (0) {
B[j].checkerboard = evec[k].checkerboard;
B[j] += Qt[k+j*Nm] * evec[k];
}
std::cout<<GridLogMessage << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl;
// std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl;
}
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL::Convergence rotation: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
}
if (1) {
for(int i=0; i<(Nk+1); ++i) {
B[i] = 0.0;
B[i].checkerboard = evec[0].checkerboard;
}
int j_block = 24; int k_block=24;
PARALLEL_FOR_LOOP
for(int ss=0;ss < grid->oSites();ss++){
for(int jj=0; jj<Nk; jj += j_block)
for(int kk=0; kk<Nk; kk += k_block)
for(int j=jj; (j<Nk) && j<(jj+j_block); ++j){
for(int k=kk; (k<Nk) && k<(kk+k_block) ; ++k){
B[j]._odata[ss] +=Qt[k+Nm*j] * evec[k]._odata[ss];
}
}
}
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL::convergence rotation : "<<t1-t0<< "seconds"<<std::endl; t0=t1;
}
// _sort.push(eval2,B,Nk);
Nconv = 0;
// std::cout<<GridLogMessage << std::setiosflags(std::ios_base::scientific);
// std::cout << std::setiosflags(std::ios_base::scientific);
for(int i=0; i<Nk; ++i){
// _poly(_Linop,B[i],v);
@ -702,16 +615,14 @@ PARALLEL_FOR_LOOP
RealD vnum = real(innerProduct(B[i],v)); // HermOp.
RealD vden = norm2(B[i]);
RealD vv0 = norm2(v);
eval2[i] = vnum/vden;
v -= eval2[i]*B[i];
RealD vv = norm2(v);
std::cout.precision(13);
std::cout<<GridLogMessage << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
std::cout<<"eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
std::cout<<"|H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv;
std::cout<<" "<< vnum/(sqrt(vden)*sqrt(vv0)) << std::endl;
std::cout << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
std::cout <<" |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
// change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
if((vv<eresid*eresid) && (i == Nconv) ){
@ -720,19 +631,17 @@ PARALLEL_FOR_LOOP
}
} // i-loop end
// std::cout<<GridLogMessage << std::resetiosflags(std::ios_base::scientific);
t1=usecond()/1e6;
std::cout<<GridLogMessage <<"IRL::convergence testing: "<<t1-t0<< "seconds"<<std::endl; t0=t1;
// std::cout << std::resetiosflags(std::ios_base::scientific);
std::cout<<GridLogMessage<<" #modes converged: "<<Nconv<<std::endl;
std::cout<<" #modes converged: "<<Nconv<<std::endl;
if( Nconv>=Nstop ){
goto converged;
}
} // end of iter loop
std::cout<<GridLogMessage<<"\n NOT converged.\n";
std::cout<<"\n NOT converged.\n";
abort();
converged:
@ -745,10 +654,10 @@ PARALLEL_FOR_LOOP
}
_sort.push(eval,evec,Nconv);
std::cout<<GridLogMessage << "\n Converged\n Summary :\n";
std::cout<<GridLogMessage << " -- Iterations = "<< Nconv << "\n";
std::cout<<GridLogMessage << " -- beta(k) = "<< beta_k << "\n";
std::cout<<GridLogMessage << " -- Nconv = "<< Nconv << "\n";
std::cout << "\n Converged\n Summary :\n";
std::cout << " -- Iterations = "<< Nconv << "\n";
std::cout << " -- beta(k) = "<< beta_k << "\n";
std::cout << " -- Nconv = "<< Nconv << "\n";
}
/////////////////////////////////////////////////
@ -771,25 +680,25 @@ PARALLEL_FOR_LOOP
}
}
std::cout<<GridLogMessage<<"Lanczos_Factor start/end " <<start <<"/"<<end<<std::endl;
std::cout<<"Lanczos_Factor start/end " <<start <<"/"<<end<<std::endl;
// Starting from scratch, bq[0] contains a random vector and |bq[0]| = 1
int first;
if(start == 0){
std::cout<<GridLogMessage << "start == 0\n"; //TESTING
std::cout << "start == 0\n"; //TESTING
_poly(_Linop,bq[0],bf);
alpha = real(innerProduct(bq[0],bf));//alpha = bq[0]^dag A bq[0]
std::cout<<GridLogMessage << "alpha = " << alpha << std::endl;
std::cout << "alpha = " << alpha << std::endl;
bf = bf - alpha * bq[0]; //bf = A bq[0] - alpha bq[0]
H[0][0]=alpha;
std::cout<<GridLogMessage << "Set H(0,0) to " << H[0][0] << std::endl;
std::cout << "Set H(0,0) to " << H[0][0] << std::endl;
first = 1;
@ -809,19 +718,19 @@ PARALLEL_FOR_LOOP
beta = 0;sqbt = 0;
std::cout<<GridLogMessage << "cont is true so setting beta to zero\n";
std::cout << "cont is true so setting beta to zero\n";
} else {
beta = norm2(bf);
sqbt = sqrt(beta);
std::cout<<GridLogMessage << "beta = " << beta << std::endl;
std::cout << "beta = " << beta << std::endl;
}
for(int j=first;j<end;j++){
std::cout<<GridLogMessage << "Factor j " << j <<std::endl;
std::cout << "Factor j " << j <<std::endl;
if(cont){ // switches to factoring; understand start!=0 and initial bf value is right.
bq[j] = bf; cont = false;
@ -844,7 +753,7 @@ PARALLEL_FOR_LOOP
beta = fnorm;
sqbt = sqrt(beta);
std::cout<<GridLogMessage << "alpha = " << alpha << " fnorm = " << fnorm << '\n';
std::cout << "alpha = " << alpha << " fnorm = " << fnorm << '\n';
///Iterative refinement of orthogonality V = [ bq[0] bq[1] ... bq[M] ]
int re = 0;
@ -879,8 +788,8 @@ PARALLEL_FOR_LOOP
bck = sqrt( nmbex );
re++;
}
std::cout<<GridLogMessage << "Iteratively refined orthogonality, changes alpha\n";
if(re > 1) std::cout<<GridLogMessage << "orthagonality refined " << re << " times" <<std::endl;
std::cout << "Iteratively refined orthogonality, changes alpha\n";
if(re > 1) std::cout << "orthagonality refined " << re << " times" <<std::endl;
H[j][j]=alpha;
}
@ -895,13 +804,11 @@ PARALLEL_FOR_LOOP
void ImplicitRestart(int TM, DenseVector<RealD> &evals, DenseVector<DenseVector<RealD> > &evecs, DenseVector<Field> &bq, Field &bf, int cont)
{
std::cout<<GridLogMessage << "ImplicitRestart begin. Eigensort starting\n";
std::cout << "ImplicitRestart begin. Eigensort starting\n";
DenseMatrix<RealD> H; Resize(H,Nm,Nm);
#ifndef USE_LAPACK
EigenSort(evals, evecs);
#endif
///Assign shifts
int K=Nk;
@ -924,15 +831,15 @@ PARALLEL_FOR_LOOP
/// Shifted H defines a new K step Arnoldi factorization
RealD beta = H[ff][ff-1];
RealD sig = Q[TM - 1][ff - 1];
std::cout<<GridLogMessage << "beta = " << beta << " sig = " << real(sig) <<std::endl;
std::cout << "beta = " << beta << " sig = " << real(sig) <<std::endl;
std::cout<<GridLogMessage << "TM = " << TM << " ";
std::cout<<GridLogMessage << norm2(bq[0]) << " -- before" <<std::endl;
std::cout << "TM = " << TM << " ";
std::cout << norm2(bq[0]) << " -- before" <<std::endl;
/// q -> q Q
times_real(bq, Q, TM);
std::cout<<GridLogMessage << norm2(bq[0]) << " -- after " << ff <<std::endl;
std::cout << norm2(bq[0]) << " -- after " << ff <<std::endl;
bf = beta* bq[ff] + sig* bf;
/// Do the rest of the factorization
@ -956,7 +863,7 @@ PARALLEL_FOR_LOOP
int ff = Lanczos_Factor(0, M, cont, bq,bf,H); // 0--M to begin with
if(ff < M) {
std::cout<<GridLogMessage << "Krylov: aborting ff "<<ff <<" "<<M<<std::endl;
std::cout << "Krylov: aborting ff "<<ff <<" "<<M<<std::endl;
abort(); // Why would this happen?
}
@ -965,7 +872,7 @@ PARALLEL_FOR_LOOP
for(int it = 0; it < Niter && (converged < Nk); ++it) {
std::cout<<GridLogMessage << "Krylov: Iteration --> " << it << std::endl;
std::cout << "Krylov: Iteration --> " << it << std::endl;
int lock_num = lock ? converged : 0;
DenseVector<RealD> tevals(M - lock_num );
DenseMatrix<RealD> tevecs; Resize(tevecs,M - lock_num,M - lock_num);
@ -981,7 +888,7 @@ PARALLEL_FOR_LOOP
Wilkinson<RealD>(H, evals, evecs, small);
// Check();
std::cout<<GridLogMessage << "Done "<<std::endl;
std::cout << "Done "<<std::endl;
}
@ -1046,7 +953,7 @@ PARALLEL_FOR_LOOP
DenseVector<RealD> &tevals, DenseVector<DenseVector<RealD> > &tevecs,
int lock, int converged)
{
std::cout<<GridLogMessage << "Converged " << converged << " so far." << std::endl;
std::cout << "Converged " << converged << " so far." << std::endl;
int lock_num = lock ? converged : 0;
int M = Nm;
@ -1061,9 +968,7 @@ PARALLEL_FOR_LOOP
RealD small=1.0e-16;
Wilkinson<RealD>(AH, tevals, tevecs, small);
#ifndef USE_LAPACK
EigenSort(tevals, tevecs);
#endif
RealD resid_nrm= norm2(bf);
@ -1074,7 +979,7 @@ PARALLEL_FOR_LOOP
RealD diff = 0;
diff = abs( tevecs[i][Nm - 1 - lock_num] ) * resid_nrm;
std::cout<<GridLogMessage << "residual estimate " << SS-1-i << " " << diff << " of (" << tevals[i] << ")" << std::endl;
std::cout << "residual estimate " << SS-1-i << " " << diff << " of (" << tevals[i] << ")" << std::endl;
if(diff < converged) {
@ -1090,13 +995,13 @@ PARALLEL_FOR_LOOP
lock_num++;
}
converged++;
std::cout<<GridLogMessage << " converged on eval " << converged << " of " << Nk << std::endl;
std::cout << " converged on eval " << converged << " of " << Nk << std::endl;
} else {
break;
}
}
#endif
std::cout<<GridLogMessage << "Got " << converged << " so far " <<std::endl;
std::cout << "Got " << converged << " so far " <<std::endl;
}
///Check
@ -1105,9 +1010,7 @@ PARALLEL_FOR_LOOP
DenseVector<RealD> goodval(this->get);
#ifndef USE_LAPACK
EigenSort(evals,evecs);
#endif
int NM = Nm;
@ -1179,16 +1082,14 @@ say con = 2
**/
template<class T>
static void Lock(DenseMatrix<T> &H, ///Hess mtx
DenseMatrix<T> &Q, ///Lock Transform
T val, ///value to be locked
int con, ///number already locked
static void Lock(DenseMatrix<T> &H, // Hess mtx
DenseMatrix<T> &Q, // Lock Transform
T val, // value to be locked
int con, // number already locked
RealD small,
int dfg,
bool herm)
{
//ForceTridiagonal(H);
int M = H.dim;
@ -1220,7 +1121,6 @@ static void Lock(DenseMatrix<T> &H, ///Hess mtx
AH = Hermitian(QQ)*AH;
AH = AH*QQ;
for(int i=con;i<M;i++){
for(int j=con;j<M;j++){

View File

@ -1,453 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/Matrix.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef MATRIX_H
#define MATRIX_H
#include <cstdlib>
#include <string>
#include <cmath>
#include <vector>
#include <iostream>
#include <iomanip>
#include <complex>
#include <typeinfo>
#include <Grid.h>
/** Sign function: returns p/|p|, i.e. a value of unit magnitude carrying the sign (or phase) of p.
 *  p must be non-zero: for p == 0 the division below divides by zero. **/
template <class T> T sign(T p){
  // Bring the std overloads into scope so that floating point and complex T use their own abs()
  // instead of silently falling back to the C library's int abs(int), which truncates.
  using std::abs;
  return ( p/abs(p) );
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////// Hijack STL containers for our wicked means /////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// NOTE(review): this alias names itself on its right-hand side, so it only resolves if another
// Vector template is already visible at this point (presumably the one provided by <Grid.h>) -- confirm.
template<class T> using Vector = Vector<T>;
// A Matrix is a Vector of Vectors: the outer index selects a row, the inner index an entry of that row.
template<class T> using Matrix = Vector<Vector<T> >;
/** Resize a Vector so that it holds N elements **/
template<class T>
void Resize(Vector<T> &vec, int N)
{
  vec.resize(N);
}
/** Resize a Matrix to N rows, every row holding M entries **/
template<class T>
void Resize(Matrix<T> &mat, int N, int M)
{
  mat.resize(N);
  // After the outer resize the matrix has exactly N rows; give each of them M entries.
  for(auto &row : mat){
    row.resize(M);
  }
}
/** Report the number of elements of a Vector through N **/
template<class T>
void Size(Vector<T> &vec, int &N)
{
  N = static_cast<int>(vec.size());
}
/** Report the shape of a Matrix: N = number of rows, M = length of the first row.
 *  A Matrix without rows reports M = 0 rather than reading the non-existent row mat[0]. **/
template<class T> void Size(Matrix<T> & mat, int &N,int &M)
{
  N= mat.size();
  M= mat.empty() ? 0 : mat[0].size();
}
/** Report the dimension N of a square Matrix; asserts that rows == columns **/
template<class T>
void SizeSquare(Matrix<T> &mat, int &N)
{
  int cols;
  Size(mat,N,cols);
  assert(N==cols);
}
/** Report the shape (N1 rows, M1 columns) of mat1 and assert that mat2 has the same shape **/
template<class T>
void SizeSame(Matrix<T> &mat1, Matrix<T> &mat2, int &N1, int &M1)
{
  Size(mat1,N1,M1);

  int rows2,cols2;
  Size(mat2,rows2,cols2);

  assert(N1==rows2);
  assert(M1==cols2);
}
//*****************************************
//* (Complex) Vector operations *
//*****************************************
/** Element-wise conjugate of a Vector: result[k] = conj(p[k]) **/
template <class T> Vector<T> conj(Vector<T> p){
  Vector<T> result(p.size());
  for(int k=0;k<p.size();k++){
    result[k] = conj(p[k]);
  }
  return result;
}
/** Norm of a Vector: abs( sqrt( sum_k p[k]*conj(p[k]) ) ) **/
template <class T> T norm(Vector<T> p){
  T acc = 0;
  for(auto &elem : p){
    acc = acc + elem*conj(elem);
  }
  return abs(sqrt(acc));
}
/** Norm squared of a Vector: abs( sum_k p[k]*conj(p[k]) ) **/
template <class T> T norm2(Vector<T> p){
  T acc = 0;
  for(auto &elem : p){
    acc = acc + elem*conj(elem);
  }
  return abs((acc));
}
/** Sum of all elements of a Vector **/
template <class T> T trace(Vector<T> p){
  T total = 0;
  for(auto &elem : p){
    total = total + elem;
  }
  return total;
}
/** Overwrite every element of a Vector with the constant c **/
template <class T> void Fill(Vector<T> &p, T c){
  for(auto &elem : p){
    elem = c;
  }
}
/** Normalize a Vector in place by dividing each element by norm(p).
 *  The Vector is left unchanged unless abs(norm) compares greater than zero. **/
template <class T> void normalize(Vector<T> &p){
  T m = norm(p);
  if( !(abs(m) > 0.0) ) return;
  for(int k=0;k<p.size();k++){
    p[k] /= m;
  }
}
/** Vector times scalar: returns a copy of p with every element multiplied by s **/
template <class T, class U> Vector<T> times(Vector<T> p, U s){
  // p is received by value, so scaling it in place does not touch the caller's Vector.
  for(auto &elem : p){
    elem *= s;
  }
  return p;
}
/** Scalar times Vector: returns a copy of p with every element multiplied by s **/
template <class T, class U> Vector<T> times(U s, Vector<T> p){
  // p is received by value, so scaling it in place does not touch the caller's Vector.
  for(auto &elem : p){
    elem *= s;
  }
  return p;
}
/** Inner product of a and b = sum_k conj(a[k]) * b[k]; the sum runs over the length of a **/
template <class T> T inner(Vector<T> a, Vector<T> b){
  T acc = 0.;
  for(int k=0;k<a.size();k++){
    acc = acc + conj(a[k])*b[k];
  }
  return acc;
}
/** Element-wise sum a + b; the result has the length of a **/
template <class T> Vector<T> add(Vector<T> a, Vector<T> b){
  Vector<T> result(a.size());
  for(int k=0;k<a.size();k++){
    result[k] = a[k] + b[k];
  }
  return result;
}
/** Element-wise difference a - b; the result has the length of a **/
template <class T> Vector<T> sub(Vector<T> a, Vector<T> b){
  Vector<T> result(a.size());
  for(int k=0;k<a.size();k++){
    result[k] = a[k] - b[k];
  }
  return result;
}
/**
*********************************
* Matrices *
*********************************
**/
/** Overwrite every entry of a Matrix with val **/
template<class T> void Fill(Matrix<T> & mat, T&val) {
  int rows,cols;
  Size(mat,rows,cols);
  for(int r=0;r<rows;r++){
    for(int c=0;c<cols;c++){
      mat[r][c] = val;
    }
  }
}
/** Transpose of a matrix: for an N x M input returns the M x N matrix C with C[i][j] = mat[j][i] **/
template<class T> Matrix<T> Transpose(Matrix<T> & mat){
  int N,M;
  Size(mat,N,M);
  Matrix<T> C; Resize(C,M,N);
  for(int i=0;i<M;i++){
    for(int j=0;j<N;j++){
      C[i][j] = mat[j][i];
    }
  }
  return C;
}
/** Set a square Matrix to the unit matrix: 1 on the diagonal, 0 elsewhere.
 *  SizeSquare asserts that the matrix is square. **/
template<class T> void Unity(Matrix<T> &mat){
  int N; SizeSquare(mat,N);
  for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
      if ( i==j ) mat[i][j] = 1;
      else        mat[i][j] = 0;
    }
  }
}
/** Add c times the unit matrix to the square Matrix A, in place **/
template<class T>
void PlusUnit(Matrix<T> & A,T c){
  int n; SizeSquare(A,n);
  for(int d=0;d<n;d++){
    A[d][d] = A[d][d] + c;
  }
}
/** Return the Hermitian conjugate of a square matrix: C[i][j] = conj(mat[j][i]).
 *  SizeSquare asserts that the matrix is square. **/
template<class T> Matrix<T> HermitianConj(Matrix<T> &mat){
  int dim; SizeSquare(mat,dim);
  Matrix<T> C; Resize(C,dim,dim);
  for(int i=0;i<dim;i++){
    for(int j=0;j<dim;j++){
      C[i][j] = conj(mat[j][i]);
    }
  }
  return C;
}
/** Return the diagonal entries of a square matrix as a Vector: d[i] = A[i][i] **/
template<class T> Vector<T> diag(Matrix<T> &A)
{
  int dim; SizeSquare(A,dim);
  Vector<T> d; Resize(d,dim);
  for(int i=0;i<dim;i++){
    d[i] = A[i][i];
  }
  return d;
}
/** Left multiply a matrix by a (row) Vector: C[j] = sum_i B[i] * A[i][j].
 *  Asserts that the length of B equals the number of rows of A. **/
template<class T> Vector<T> operator *(Vector<T> &B,Matrix<T> &A)
{
  int K,M,N;
  Size(B,K);
  Size(A,M,N);
  assert(K==M);
  Vector<T> C; Resize(C,N);
  for(int j=0;j<N;j++){
    T sum = 0.0;
    for(int i=0;i<M;i++){
      sum += B[i] * A[i][j];
    }
    C[j] = sum;
  }
  return C;
}
/** Return the reciprocals of the diagonal entries of a square matrix: d[i] = 1.0/A[i][i].
 *  No check is made for zero diagonal entries. **/
template<class T> Vector<T> inv_diag(Matrix<T> & A){
  int dim; SizeSquare(A,dim);
  Vector<T> d; Resize(d,dim);
  for(int i=0;i<dim;i++){
    d[i] = 1.0/A[i][i];
  }
  return d;
}
/** Matrix addition: C[i][j] = A[i][j] + B[i][j]. SizeSame asserts that A and B have equal shape. **/
template<class T> inline Matrix<T> operator + (Matrix<T> &A,Matrix<T> &B)
{
  int N,M ; SizeSame(A,B,N,M);
  Matrix<T> C; Resize(C,N,M);
  for(int i=0;i<N;i++){
    for(int j=0;j<M;j++){
      C[i][j] = A[i][j] + B[i][j];
    }
  }
  return C;
}
/** Matrix subtraction: C[i][j] = A[i][j] - B[i][j]. SizeSame asserts that A and B have equal shape. **/
template<class T> inline Matrix<T> operator- (Matrix<T> & A,Matrix<T> &B){
  int N,M ; SizeSame(A,B,N,M);
  Matrix<T> C; Resize(C,N,M);
  for(int i=0;i<N;i++){
    for(int j=0;j<M;j++){
      C[i][j] = A[i][j] - B[i][j];
    }
  }
  return C;
}
/** Matrix times scalar: C[i][j] = A[i][j]*c **/
template<class T> inline Matrix<T> operator* (Matrix<T> & A,T c){
  int N,M; Size(A,N,M);
  Matrix<T> C; Resize(C,N,M);
  for(int i=0;i<N;i++){
    for(int j=0;j<M;j++){
      C[i][j] = A[i][j]*c;
    }
  }
  return C;
}
/** Matrix-matrix product: for A (K x L) and B (N x M) returns the K x M matrix
 *  C[i][j] = sum_k A[i][k]*B[k][j]. Asserts that the inner dimensions agree (L == N). **/
template<class T> inline Matrix<T> operator* (Matrix<T> &A,Matrix<T> &B){
  int K,L,N,M;
  Size(A,K,L);
  Size(B,N,M); assert(L==N);
  Matrix<T> C; Resize(C,K,M);
  for(int i=0;i<K;i++){
    for(int j=0;j<M;j++){
      T sum = 0.0;
      for(int k=0;k<N;k++) sum += A[i][k]*B[k][j];
      C[i][j] =sum;
    }
  }
  return C;
}
/** Matrix-vector product: for A (N x M) and B of length M returns C[i] = sum_j A[i][j]*B[j].
 *  Asserts that the length of B equals the number of columns of A. **/
template<class T> inline Vector<T> operator* (Matrix<T> &A,Vector<T> &B){
  int M,N,K;
  Size(A,N,M);
  Size(B,K); assert(K==M);
  Vector<T> C; Resize(C,N);
  for(int i=0;i<N;i++){
    T sum = 0.0;
    for(int j=0;j<M;j++) sum += A[i][j]*B[j];
    C[i] = sum;
  }
  return C;
}
/** Some version of Matrix norm **/
/*
inline T Norm(){ // this is not a usual L2 norm
T norm = 0;
for(int i=0;i<dim;i++){
for(int j=0;j<dim;j++){
norm += abs(A[i][j]);
}}
return norm;
}
*/
/** Largest diagonal magnitude of a square matrix: the maximum of abs(A[i][i]) over the diagonal **/
template<class T> T LargestDiag(Matrix<T> &A)
{
  int n; SizeSquare(A,n);
  T best = abs(A[0][0]);
  for(int d=1;d<n;d++){
    T candidate = abs(A[d][d]);
    if(abs(candidate) > abs(best) ){
      best = candidate;
    }
  }
  return best;
}
/** Scan the leading subdiagonal upwards, starting at row dim-1-offset, for an entry whose
 *  magnitude is smaller than 'small'. The first such entry A[l][l-1] is set to zero and its
 *  row index l is returned; 0 is returned when no entry qualifies. The 'norm' argument is unused. **/
template <class T,class U> int Chop_subdiag(Matrix<T> &A,T norm, int offset, U small)
{
  int n; SizeSquare(A,n);
  int row = n - 1 - offset;
  while(row >= 1) {
    if((U)abs(A[row][row - 1]) < (U)small) {
      A[row][row-1]=(U)0.0;
      return row;
    }
    row--;
  }
  return 0;
}
/** Symmetric variant of the subdiagonal chop: scan upwards from row dim-1-offset for a
 *  subdiagonal entry whose magnitude is smaller than 'small'. The first hit zeroes both
 *  A[l][l-1] and its mirror A[l-1][l] and returns l; 0 is returned when no entry qualifies.
 *  The 'norm' argument is unused. **/
template <class T,class U> int Chop_symm_subdiag(Matrix<T> & A,T norm, int offset, U small)
{
  int n; SizeSquare(A,n);
  int row = n - 1 - offset;
  while(row >= 1) {
    if((U)abs(A[row][row - 1]) < (U)small) {
      A[row][row - 1] = (U)0.0;
      A[row - 1][row] = (U)0.0;
      return row;
    }
    row--;
  }
  return 0;
}
/** Copy the matrix S into the window [row_st,row_end) x [col_st,col_end) of the larger matrix A;
 *  S is indexed relative to the window origin. **/
template<class T>
void AssignSubMtx(Matrix<T> & A,int row_st, int row_end, int col_st, int col_end, Matrix<T> &S)
{
  for(int r = row_st; r<row_end; r++){
    const int sr = r - row_st;
    for(int c = col_st; c<col_end; c++){
      const int sc = c - col_st;
      A[r][c] = S[sr][sc];
    }
  }
}
/** Extract the window [row_st,row_end) x [col_st,col_end) of A as a new matrix H,
 *  with H[i-row_st][j-col_st] = A[i][j]. **/
template <class T>
Matrix<T> GetSubMtx(Matrix<T> &A,int row_st, int row_end, int col_st, int col_end)
{
  // The result must be sized before it is filled; Resize needs the matrix as its first argument.
  Matrix<T> H; Resize(H,row_end - row_st,col_end-col_st);
  for(int i = row_st; i<row_end; i++){
    for(int j = col_st; j<col_end; j++){
      H[i-row_st][j-col_st]=A[i][j];
    }
  }
  return H;
}
/**Assign a submatrix to a larger one NB remember Vector Vectors are transposes of the matrices they represent**/
// NOTE(review): this repeats, with the same signature and the same body, the AssignSubMtx template
// already defined earlier in this header; presumably one of the two copies should go -- confirm.
template<class T>
void AssignSubMtx(Matrix<T> & A,int row_st, int row_end, int col_st, int col_end, Matrix<T> &S)
{
// Copy S, indexed relative to (row_st,col_st), into the window [row_st,row_end) x [col_st,col_end) of A.
for(int i = row_st; i<row_end; i++){
for(int j = col_st; j<col_end; j++){
A[i][j] = S[i - row_st][j - col_st];
}}
}
/** Compute the bilinear form sum_ij B[i] * A[i][j] * B[j] for a square matrix A.
 *  Asserts that B has the dimension of A. Note that B[i] enters without a conjugate. **/
template<class T> T proj(Matrix<T> A, Vector<T> B){
  int n; SizeSquare(A,n);
  int lenB; Size(B,lenB);
  assert(lenB==n);
  T total = 0;
  for(int r=0;r<n;r++){
    // rowdot = (A B)[r]
    T rowdot = 0.0;
    for(int c=0;c<n;c++){
      rowdot += A[r][c]*B[c];
    }
    total += B[r]*rowdot; // no conj applied to B[r]
  }
  return total;
}
/*
*************************************************************
*
* Matrix Vector products
*
*************************************************************
*/
// Instead make a linop and call my CG;
/// q -> q Q
/// Checks that Q is square with a dimension equal to the number of entries of q, then forwards
/// to the three-argument times() with that dimension.
template <class T,class Fermion> void times(Vector<Fermion> &q, Matrix<T> &Q)
{
  int nvec; Size(q,nvec);
  int dimQ; SizeSquare(Q,dimQ);
  assert(dimQ==nvec);
  times(q,Q,nvec);
}
/// q -> q Q on the leading N vectors: q[j] <- sum_{k<N} q[k]*Q[k][j] for j < N.
/// Entries q[j] with j >= N are left untouched.
/// N may be at most the dimension of Q and at most the number of entries of q; N equal to both
/// is allowed, which is what the two-argument times() passes.
/// NOTE(review): Size() is declared for Vector<T>; whether it accepts multi1d is not visible here -- confirm.
template <class T> void times(multi1d<LatticeFermion> &q, Matrix<T> &Q, int N)
{
  GridBase *grid = q[0]._grid;
  int M; SizeSquare(Q,M);
  int K; Size(q,K);
  assert(N<=M);
  assert(N<=K);
  // Accumulate into scratch fields so that every q[k] is read before any q[j] is overwritten.
  Vector<LatticeFermion> S(N,grid );
  for(int j=0;j<N;j++){
    S[j] = zero;
    for(int k=0;k<N;k++){
      S[j] = S[j] + q[k]* Q[k][j];
    }
  }
  // Only the N computed combinations exist in S; copying more would read past its end.
  for(int j=0;j<N;j++){
    q[j] = S[j];
  }
}
#endif

View File

@ -1,15 +0,0 @@
- ConjugateGradientMultiShift
- MCR
- Potentially Useful Boost libraries
- MultiArray
- Aligned allocator; memory pool
- Remez -- Mike or Boost?
- Multiprecision
- quaternions
- Tokenize
- Serialization
- Regex
- Proto (ET)
- uBlas

View File

@ -1,122 +0,0 @@
#include <math.h>
#include <stdlib.h>
#include <vector>
// Helper holding two static routines: get_eig2, a driver that calls bisec twice and sorts the
// second result, and bisec, an interval-bisection eigenvalue search working on 1-based arrays
// c (used as the diagonal) and b (used squared as the off-diagonal coupling).
// NOTE(review): presumably this is the classic Sturm-sequence bisection for symmetric tridiagonal
// matrices -- confirm against the original reference.
// NOTE(review): the struct's closing brace is not followed by a semicolon.
struct Bisection {
// Driver: fills ALPHA/BETHA from A, runs bisec at two tolerances, sorts evec2 by mag() and writes
// the sorted values onto the diagonal of H.
// NOTE(review): BETHA, A, H and mag are not declared in this struct or in the parameter list
// (the parameter is spelled BETA), and the output parameter eig is never written -- confirm intent.
static void get_eig2(int row_num,std::vector<RealD> &ALPHA,std::vector<RealD> &BETA, std::vector<RealD> & eig)
{
int i,j;
// Result buffers for the two bisec calls; bisec indexes them from m1=1 to m2=row_num.
std::vector<RealD> evec1(row_num+3);
std::vector<RealD> evec2(row_num+3);
RealD eps2;
ALPHA[1]=0.;
BETHA[1]=0.;
// A is read at index i*(row_num+1) (a diagonal entry of a row-major row_num x row_num array)
// and at the entry immediately after it; only the real part is kept.
for(i=0;i<row_num-1;i++) {
ALPHA[i+1] = A[i*(row_num+1)].real();
BETHA[i+2] = A[i*(row_num+1)+1].real();
}
ALPHA[row_num] = A[(row_num-1)*(row_num+1)].real();
// Two passes with different tolerances; only evec2 (the 1e-16 pass) is used below.
bisec(ALPHA,BETHA,row_num,1,row_num,1e-10,1e-10,evec1,eps2);
bisec(ALPHA,BETHA,row_num,1,row_num,1e-16,1e-16,evec2,eps2);
// Do we really need to sort here?
// Bidirectional bubble sort of evec2[1..row_num] into ascending mag(): a forward sweep, then a
// backward sweep, narrowing [begin,end] after each.
// NOTE(review): swap() is handed evec2+i, but evec2 is a std::vector, not a pointer -- confirm.
int begin=1;
int end = row_num;
int swapped=1;
while(swapped) {
swapped=0;
for(i=begin;i<end;i++){
if(mag(evec2[i])>mag(evec2[i+1])) {
swap(evec2+i,evec2+i+1);
swapped=1;
}
}
end--;
for(i=end-1;i>=begin;i--){
if(mag(evec2[i])>mag(evec2[i+1])) {
swap(evec2+i,evec2+i+1);
swapped=1;
}
}
begin++;
}
// Write the sorted values onto the diagonal of the row-major array H and zero everything else.
for(i=0;i<row_num;i++){
for(j=0;j<row_num;j++) {
if(i==j) H[i*row_num+j]=evec2[i+1];
else H[i*row_num+j]=0.;
}
}
}
// Bisection search for the values x[m1..m2].
//   c, b    : 1-based input arrays of length >= n+1; b[1] is overwritten with 0.
//   n       : number of entries used in c and b.
//   m1, m2  : first and last index of the values to compute.
//   eps1    : absolute tolerance; replaced by the computed eps2 when <= 0 (local copy only).
//   relfeh  : relative tolerance used in the stopping test and in the q == 0 fallback.
//   x       : output, x[k] receives the midpoint of the final interval for index k.
//   eps2    : output, set to 0.5*eps1 + 7*(relfeh * bound magnitude).
static void bisec(std::vector<RealD> &c,
std::vector<RealD> &b,
int n,
int m1,
int m2,
RealD eps1,
RealD relfeh,
std::vector<RealD> &x,
RealD &eps2)
{
// wu[i] holds the current lower bound for index i; x[i] holds the current upper bound.
std::vector<RealD> wu(n+2);
RealD h,q,x1,xu,x0,xmin,xmax;
int i,a,k;
b[1]=0.0;
// Initial enclosing interval [xmin,xmax] from c[i] -/+ (|b[i]|+|b[i+1]|), seeded with the last row.
xmin=c[n]-fabs(b[n]);
xmax=c[n]+fabs(b[n]);
for(i=1;i<n;i++){
h=fabs(b[i])+fabs(b[i+1]);
if(c[i]+h>xmax) xmax= c[i]+h;
if(c[i]-h<xmin) xmin= c[i]-h;
}
// NOTE(review): the upper bound is doubled here; the reason is not visible in this file.
xmax *=2.;
eps2=relfeh*((xmin+xmax)>0.0 ? xmax : -xmin);
if(eps1<=0.0) eps1=eps2;
eps2=0.5*eps1+7.0*(eps2);
x0=xmax;
for(i=m1;i<=m2;i++){
x[i]=xmax;
wu[i]=xmin;
}
// Work from the highest index down, reusing the bounds tightened while solving higher indices.
for(k=m2;k>=m1;k--){
xu=xmin;
i=k;
// Scan wu[k], wu[k-1], ... and take the first lower bound that exceeds xmin; setting i=m1-1
// terminates the scan after the first hit.
do{
if(xu<wu[i]){
xu=wu[i];
i=m1-1;
}
i--;
}while(i>=m1);
if(x0>x[k]) x0=x[k];
// Halve [xu,x0] until its width falls below the combined relative/absolute tolerance.
while((x0-xu)>2*relfeh*(fabs(xu)+fabs(x0))+eps1){
x1=(xu+x0)/2;
// a counts how many terms of the recurrence q_i = c[i] - x1 - b[i]^2/q_{i-1} are negative;
// when q is exactly zero, |b[i]|/relfeh is substituted to avoid dividing by zero.
a=0;
q=1.0;
for(i=1;i<=n;i++){
q=c[i]-x1-((q!=0.0)? b[i]*b[i]/q:fabs(b[i])/relfeh);
if(q<0) a++;
}
// printf("x1=%e a=%d\n",x1,a);
// a < k: x1 becomes the new lower bound, and is recorded in wu (and in x[a] as an upper bound
// when it improves it) for later indices. Otherwise x1 becomes the new upper bound.
if(a<k){
if(a<m1){
xu=x1;
wu[m1]=x1;
}else {
xu=x1;
wu[a+1]=x1;
if(x[a]>x1) x[a]=x1;
}
}else x0=x1;
}
x[k]=(x0+xu)/2;
}
}
}

View File

@ -1 +0,0 @@

View File

@ -1,7 +1,7 @@
#include <Grid/Grid.h>
#include <Grid/GridCore.h>
namespace Grid {
@ -13,9 +13,10 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
if (bytes < 4096 ) return NULL;
#ifdef _OPENMP
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;

View File

@ -6,8 +6,9 @@
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -52,7 +53,7 @@ public:
// Physics Grid information.
std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
std::vector<int> _fdimensions;// Global dimensions of array prior to cb removal
std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal
std::vector<int> _gdimensions;// Global dimensions of array after cb removal
std::vector<int> _ldimensions;// local dimensions of array with processor images removed
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
@ -99,7 +100,7 @@ public:
virtual int oIndex(std::vector<int> &coor)
{
int idx=0;
// Works with either global or local coordinates
// Works with either global or local coordinates
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
return idx;
}
@ -121,6 +122,11 @@ public:
Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
}
// Build lcoor from ocoor and icoor: lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d] for each
// of the _ndimension dimensions. lcoor is resized to _ndimension first.
inline void InOutCoorToLocalCoor (std::vector<int> &ocoor, std::vector<int> &icoor, std::vector<int> &lcoor) {
  lcoor.resize(_ndimension);
  int d = 0;
  while (d < _ndimension) {
    lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d];
    d++;
  }
}
//////////////////////////////////////////////////////////
// SIMD lane addressing
@ -129,6 +135,7 @@ public:
{
Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
}
// Returns 1 when the SIMD layout has extent greater than one in the given dimension, else 0.
inline int PermuteDim(int dimension){
  const int lanes = _simd_layout[dimension];
  return lanes>1;
}
@ -146,15 +153,15 @@ public:
// Distance should be either 0,1,2..
//
if ( _simd_layout[dimension] > 2 ) {
for(int d=0;d<_ndimension;d++){
if ( d != dimension ) assert ( (_simd_layout[d]==1) );
}
permute_type = RotateBit; // How to specify distance; this is not just direction.
return permute_type;
for(int d=0;d<_ndimension;d++){
if ( d != dimension ) assert ( (_simd_layout[d]==1) );
}
permute_type = RotateBit; // How to specify distance; this is not just direction.
return permute_type;
}
for(int d=_ndimension-1;d>dimension;d--){
if (_simd_layout[d]>1 ) permute_type++;
if (_simd_layout[d]>1 ) permute_type++;
}
return permute_type;
}
@ -174,21 +181,44 @@ public:
inline const std::vector<int> &LocalDimensions(void) { return _ldimensions;};
inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};
////////////////////////////////////////////////////////////////
// Utility to print the full decomposition details
////////////////////////////////////////////////////////////////
// Print the grid's dimension vectors, strides and site counts to std::cout, one labelled line
// each, every line prefixed with GridLogMessage.
void show_decomposition(){
std::cout << GridLogMessage << "Full Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "Global Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "Local Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "Outer strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "Inner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "iSites : " << _isites << std::endl;
std::cout << GridLogMessage << "oSites : " << _osites << std::endl;
// lSites and gSites are obtained through their accessor functions rather than read as members.
std::cout << GridLogMessage << "lSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "gSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "Nd : " << _ndimension << std::endl;
}
////////////////////////////////////////////////////////////////
// Global addressing
////////////////////////////////////////////////////////////////
// Convert a global site index into a global coordinate by lexicographic decomposition over
// _gdimensions. Asserts that the index is below gSites().
void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor)
{
  assert(gidx< gSites());
  Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
}
// Convert a local site index into a local coordinate by lexicographic decomposition over
// _ldimensions. Asserts that the index is below lSites().
void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor)
{
  assert(lidx<lSites());
  Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
}
void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
gidx=0;
int mult=1;
for(int mu=0;mu<_ndimension;mu++) {
gidx+=mult*gcoor[mu];
mult*=_gdimensions[mu];
gidx+=mult*gcoor[mu];
mult*=_gdimensions[mu];
}
}
void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
@ -196,9 +226,9 @@ public:
pcoor.resize(_ndimension);
lcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++){
int _fld = _fdimensions[mu]/_processors[mu];
pcoor[mu] = gcoor[mu]/_fld;
lcoor[mu] = gcoor[mu]%_fld;
int _fld = _fdimensions[mu]/_processors[mu];
pcoor[mu] = gcoor[mu]/_fld;
lcoor[mu] = gcoor[mu]%_fld;
}
}
void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
@ -207,16 +237,16 @@ public:
std::vector<int> lcoor;
GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
rank = RankFromProcessorCoor(pcoor);
/*
std::vector<int> cblcoor(lcoor);
for(int d=0;d<cblcoor.size();d++){
if( this->CheckerBoarded(d) ) {
cblcoor[d] = lcoor[d]/2;
}
if( this->CheckerBoarded(d) ) {
cblcoor[d] = lcoor[d]/2;
}
}
i_idx= iIndex(cblcoor);// this does not imply divide by 2 on checker dim
o_idx= oIndex(lcoor); // this implies divide by 2 on checkerdim
*/
i_idx= iIndex(lcoor);
o_idx= oIndex(lcoor);
}
void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
@ -238,7 +268,7 @@ public:
{
RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
if(CheckerBoarded(0)){
fcoor[0] = fcoor[0]*2+cb;
fcoor[0] = fcoor[0]*2+cb;
}
}
void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)

View File

@ -25,7 +25,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/GridCore.h>
namespace Grid {
///////////////////////////////////////////////////////////////
@ -33,6 +34,7 @@ namespace Grid {
///////////////////////////////////////////////////////////////
void * CartesianCommunicator::ShmCommBuf;
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;
CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
/////////////////////////////////
// Alloc, free shmem region
@ -88,7 +90,9 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
int CartesianCommunicator::NodeCount(void) { return ProcessorCount();};
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
@ -96,6 +100,7 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_
int bytes)
{
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
{

View File

@ -116,6 +116,12 @@ class CartesianCommunicator {
// Implemented in Communicator_base.C
/////////////////////////////////
static void * ShmCommBuf;
// Isend/Irecv/Wait, or Sendrecv blocking
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
static CommunicatorPolicy_t CommunicatorPolicy;
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
size_t heap_top;
size_t heap_bytes;
@ -148,6 +154,7 @@ class CartesianCommunicator {
const std::vector<int> & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ;
int ProcessorCount(void) ;
int NodeCount(void) ;
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
@ -200,7 +207,7 @@ class CartesianCommunicator {
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,

View File

@ -25,7 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/GridCore.h>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/ActionCore.h>
#include <mpi.h>
namespace Grid {
@ -39,9 +41,13 @@ MPI_Comm CartesianCommunicator::communicator_world;
// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
int provided;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init(argc,argv);
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
if ( provided != MPI_THREAD_MULTIPLE ) {
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
}
}
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
ShmInitGeneric();
@ -152,24 +158,34 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
int from,
int bytes)
{
MPI_Request xrq;
MPI_Request rrq;
int rank = _processor;
int myrank = _processor;
int ierr;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
MPI_Request xrq;
MPI_Request rrq;
list.push_back(xrq);
list.push_back(rrq);
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
}
}
void CartesianCommunicator::Barrier(void)

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -25,9 +25,23 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/GridCore.h>
#include <mpi.h>
#include <semaphore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>
//#include <zlib.h>
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
@ -50,6 +64,10 @@ std::vector<int> CartesianCommunicator::GroupRanks;
std::vector<int> CartesianCommunicator::MyGroup;
std::vector<void *> CartesianCommunicator::ShmCommBufs;
int CartesianCommunicator::NodeCount(void) { return GroupSize;};
#undef FORCE_COMMS
void *CartesianCommunicator::ShmBufferSelf(void)
{
return ShmCommBufs[ShmRank];
@ -57,6 +75,9 @@ void *CartesianCommunicator::ShmBufferSelf(void)
void *CartesianCommunicator::ShmBuffer(int rank)
{
int gpeer = GroupRanks[rank];
#ifdef FORCE_COMMS
return NULL;
#endif
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
@ -65,7 +86,13 @@ void *CartesianCommunicator::ShmBuffer(int rank)
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
{
static int count =0;
int gpeer = GroupRanks[rank];
assert(gpeer!=ShmRank); // never send to self
assert(rank!=WorldRank);// never send to self
#ifdef FORCE_COMMS
return NULL;
#endif
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
@ -76,16 +103,27 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
}
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
int provided;
// mtrace();
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init(argc,argv);
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
assert (provided == MPI_THREAD_MULTIPLE);
}
Grid_quiesce_nodes();
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
MPI_Comm_rank(communicator_world,&WorldRank);
MPI_Comm_size(communicator_world,&WorldSize);
if ( WorldRank == 0 ) {
std::cout << GridLogMessage<< "Initialising MPI "<< WorldRank <<"/"<<WorldSize <<std::endl;
}
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
@ -131,7 +169,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world);
assert(ierr==0);
///////////////////////////////////////////////////////////////////
// find the group leaders world rank
///////////////////////////////////////////////////////////////////
@ -141,7 +178,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
leaders_group[group++] = l;
}
}
///////////////////////////////////////////////////////////////////
// Identify the rank of the group in which I (and my leader) live
///////////////////////////////////////////////////////////////////
@ -152,39 +188,114 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
}
}
assert(GroupRank!=-1);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared window for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(ShmComm);
ShmCommBuf = 0;
ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow);
assert(ierr==0);
// KNL hack -- force to numa-domain 1 in flat
#if 0
//#include <numaif.h>
for(uint64_t page=0;page<MAX_MPI_SHM_BYTES;page+=4096){
void *pages = (void *) ( page + ShmCommBuf );
int status;
int flags=MPOL_MF_MOVE_ALL;
int nodes=1; // numa domain == MCDRAM
unsigned long count=1;
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
if (ierr && (page==0)) perror("numa relocate command failed");
}
#endif
MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Plan: allocate a fixed SHM region. Scratch that is just used via some scheme during stencil comms, with no allocate free.
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBufs.resize(ShmSize);
for(int r=0;r<ShmSize;r++){
MPI_Aint sz;
int dsp_unit;
MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
#if 1
char shm_name [NAME_MAX];
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r);
shm_unlink(shm_name);
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
ftruncate(fd, size);
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);
ShmCommBufs[r] =ptr;
}
}
MPI_Barrier(ShmComm);
if ( ShmRank != 0 ) {
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r);
int fd=shm_open(shm_name,O_RDWR,0666);
if ( fd<0 ) { perror("failed shm_open"); assert(0); }
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);
ShmCommBufs[r] =ptr;
}
}
#else
std::vector<int> shmids(ShmSize);
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
key_t key = 0x4545 + r;
if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
int errsv = errno;
printf("Errno %d\n",errsv);
perror("shmget");
exit(1);
}
printf("shmid: 0x%x\n", shmids[r]);
}
}
MPI_Barrier(ShmComm);
MPI_Bcast(&shmids[0],ShmSize*sizeof(int),MPI_BYTE,0,ShmComm);
MPI_Barrier(ShmComm);
for(int r=0;r<ShmSize;r++){
ShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
if (ShmCommBufs[r] == (uint64_t *)-1) {
perror("Shared memory attach failure");
shmctl(shmids[r], IPC_RMID, NULL);
exit(2);
}
printf("shmaddr: %p\n", ShmCommBufs[r]);
}
MPI_Barrier(ShmComm);
// Mark for clean up
for(int r=0;r<ShmSize;r++){
shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
}
MPI_Barrier(ShmComm);
#endif
ShmCommBuf = ShmCommBufs[ShmRank];
MPI_Barrier(ShmComm);
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
check[0] = GroupRank;
check[1] = r;
check[2] = 0x5A5A5A;
}
}
MPI_Barrier(ShmComm);
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
assert(check[0]==GroupRank);
assert(check[1]==r);
assert(check[2]==0x5A5A5A);
}
MPI_Barrier(ShmComm);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Verbose for now
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -192,7 +303,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected ";
std::cout<< WorldSize << " Ranks " ;
std::cout<< GroupSize << " Nodes " ;
std::cout<< ShmSize << " with ranks-per-node "<<std::endl;
std::cout<< " with "<< ShmSize << " ranks-per-node "<<std::endl;
std::cout<<GridLogMessage <<"Grid MPI-3 configuration: allocated shared memory region of size ";
std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl;
@ -207,7 +318,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
if(g!=ShmSize-1) std::cout<<",";
else std::cout<<"}"<<std::endl;
}
}
for(int g=0;g<GroupSize;g++){
@ -216,23 +326,21 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
if ( (ShmRank == 0) && (GroupRank==g) ) {
std::cout<<MyGroup[r];
if(r<ShmSize-1) std::cout<<",";
else std::cout<<"}"<<std::endl;
else std::cout<<"}"<<std::endl<<std::flush;
}
MPI_Barrier(communicator_world);
}
}
assert(ShmSetup==0); ShmSetup=1;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Want to implement some magic ... Group sub-cubes into those on same node
////////////////////////////////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &dest,int &source)
{
std::vector<int> coor = _processor_coor;
std::vector<int> coor = _processor_coor; // my coord
assert(std::abs(shift) <_processors[dim]);
coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
@ -242,28 +350,32 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
Lexicographic::IndexFromCoor(coor,dest,_processors);
dest = LexicographicToWorldRank[dest];
}
}// rank is world rank.
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
Lexicographic::IndexFromCoor(coor,rank,_processors);
rank = LexicographicToWorldRank[rank];
return rank;
}
}// rank is world rank
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
Lexicographic::CoorFromIndex(coor,rank,_processors);
rank = LexicographicToWorldRank[rank];
int lr=-1;
for(int r=0;r<WorldSize;r++){// map world Rank to lexico and then to coor
if( LexicographicToWorldRank[r]==rank) lr = r;
}
assert(lr!=-1);
Lexicographic::CoorFromIndex(coor,lr,_processors);
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
int ierr;
communicator=communicator_world;
_ndimension = processors.size();
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
@ -275,24 +387,22 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
}
}
assert(log2size != -1);
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
int dim = 0;
std::vector<int> WorldDims = processors;
ShmDims.resize(_ndimension,1);
ShmDims.resize (_ndimension,1);
GroupDims.resize(_ndimension);
ShmCoor.resize(_ndimension);
ShmCoor.resize (_ndimension);
GroupCoor.resize(_ndimension);
WorldCoor.resize(_ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension;
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%_ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%_ndimension;
}
@ -304,6 +414,29 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
GroupDims[d] = WorldDims[d]/ShmDims[d];
}
////////////////////////////////////////////////////////////////
// Verbose
////////////////////////////////////////////////////////////////
#if 0
std::cout<< GridLogMessage << "MPI-3 usage "<<std::endl;
std::cout<< GridLogMessage << "SHM ";
for(int d=0;d<_ndimension;d++){
std::cout<< ShmDims[d] <<" ";
}
std::cout<< std::endl;
std::cout<< GridLogMessage << "Group ";
for(int d=0;d<_ndimension;d++){
std::cout<< GroupDims[d] <<" ";
}
std::cout<< std::endl;
std::cout<< GridLogMessage<<"World ";
for(int d=0;d<_ndimension;d++){
std::cout<< WorldDims[d] <<" ";
}
std::cout<< std::endl;
#endif
////////////////////////////////////////////////////////////////
// Check processor counts match
////////////////////////////////////////////////////////////////
@ -317,29 +450,57 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
//
////////////////////////////////////////////////////////////////
LexicographicToWorldRank.resize(WorldSize,0);
Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims);
Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims);
for(int d=0;d<_ndimension;d++){
WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d];
}
_processor_coor = WorldCoor;
int lexico;
Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims);
LexicographicToWorldRank[lexico]=WorldRank;
_processor = lexico;
_processor = WorldRank;
///////////////////////////////////////////////////////////////////
// global sum Lexico to World mapping
///////////////////////////////////////////////////////////////////
int lexico;
LexicographicToWorldRank.resize(WorldSize,0);
Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims);
LexicographicToWorldRank[lexico] = WorldRank;
ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator);
assert(ierr==0);
};
for(int i=0;i<WorldSize;i++){
int wr = LexicographicToWorldRank[i];
// int wr = i;
std::vector<int> coor(_ndimension);
ProcessorCoorFromRank(wr,coor); // from world rank
int ck = RankFromProcessorCoor(coor);
assert(ck==wr);
if ( wr == WorldRank ) {
for(int j=0;j<coor.size();j++) {
assert(coor[j] == _processor_coor[j]);
}
}
/*
std::cout << GridLogMessage<< " Lexicographic "<<i;
std::cout << " MPI rank "<<wr;
std::cout << " Coor ";
for(int j=0;j<coor.size();j++) std::cout << coor[j];
std::cout<< std::endl;
*/
/////////////////////////////////////////////////////
// Check everyone agrees on everyone elses coords
/////////////////////////////////////////////////////
std::vector<int> mcoor = coor;
this->Broadcast(0,(void *)&mcoor[0],mcoor.size()*sizeof(int));
for(int d = 0 ; d< _ndimension; d++) {
assert(coor[d] == mcoor[d]);
}
}
};
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
@ -367,8 +528,6 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
@ -377,10 +536,14 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
// unsigned long xcrc = crc32(0L, Z_NULL, 0);
// unsigned long rcrc = crc32(0L, Z_NULL, 0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
@ -397,7 +560,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
@ -406,95 +568,29 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
int from,
int bytes)
{
#if 0
this->StencilBarrier();
MPI_Request xrq;
MPI_Request rrq;
static int sequence;
int myrank = _processor;
int ierr;
int tag;
int check;
assert(dest != _processor);
assert(from != _processor);
int gdest = GroupRanks[dest];
int gfrom = GroupRanks[from];
int gme = GroupRanks[_processor];
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
MPI_Request xrq;
MPI_Request rrq;
sequence++;
char *from_ptr = (char *)ShmCommBufs[ShmRank];
int small = (bytes<MAX_MPI_SHM_BYTES);
typedef uint64_t T;
int words = bytes/sizeof(T);
assert(((size_t)bytes &(sizeof(T)-1))==0);
assert(gme == ShmRank);
if ( small && (gdest !=MPI_UNDEFINED) ) {
char *to_ptr = (char *)ShmCommBufs[gdest];
assert(gme != gdest);
T *ip = (T *)xmit;
T *op = (T *)to_ptr;
PARALLEL_FOR_LOOP
for(int w=0;w<words;w++) {
op[w]=ip[w];
}
bcopy(&_processor,&to_ptr[bytes],sizeof(_processor));
bcopy(& sequence,&to_ptr[bytes+4],sizeof(sequence));
} else {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
}
this->StencilBarrier();
if (small && (gfrom !=MPI_UNDEFINED) ) {
T *ip = (T *)from_ptr;
T *op = (T *)recv;
PARALLEL_FOR_LOOP
for(int w=0;w<words;w++) {
op[w]=ip[w];
}
bcopy(&from_ptr[bytes] ,&tag ,sizeof(tag));
bcopy(&from_ptr[bytes+4],&check,sizeof(check));
assert(check==sequence);
assert(tag==from);
} else {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
this->StencilBarrier();
#else
MPI_Request xrq;
MPI_Request rrq;
int rank = _processor;
int ierr;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
#endif
}
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
@ -505,57 +601,63 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_
MPI_Request rrq;
int ierr;
assert(dest != _processor);
assert(from != _processor);
int gdest = GroupRanks[dest];
int gfrom = GroupRanks[from];
int gme = GroupRanks[_processor];
assert(gme == ShmRank);
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
#ifdef FORCE_COMMS
gdest = MPI_UNDEFINED;
gfrom = MPI_UNDEFINED;
#endif
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
}
if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
}
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list);
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
{
SendToRecvFromComplete(list);
SendToRecvFromComplete(waitall);
}
// Synchronisation point for the shared-memory window: calls MPI_Win_sync on
// ShmWindow, then MPI_Barrier on ShmComm, then MPI_Win_sync on ShmWindow again.
// The return codes of the three MPI calls are not checked.
// NOTE(review): presumably the sync / barrier / sync sequence is what makes
// stores into the shared buffer visible to the other ranks of ShmComm before
// any of them proceeds -- confirm against the MPI-3 RMA memory model.
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Win_sync (ShmWindow);
MPI_Barrier (ShmComm);
MPI_Win_sync (ShmWindow);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
int nreq=list.size();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
}
// Barrier over `communicator`; asserts that MPI reports success.
void CartesianCommunicator::Barrier(void)
{
  const int status = MPI_Barrier(communicator);
  assert(status == 0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
int ierr=MPI_Bcast(data,
@ -565,7 +667,11 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
communicator);
assert(ierr==0);
}
// Return the calling process's rank within communicator_world.
int CartesianCommunicator::RankWorld(void){
  int world_rank = -1; // defined value even if the query were to fail
  const int ierr = MPI_Comm_rank(communicator_world, &world_rank);
  assert(ierr==0);     // same return-code check as the other MPI calls in this file
  return world_rank;
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,

View File

@ -27,6 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include "Grid.h"
#include <mpi.h>
//#include <numaif.h>
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Workarounds:
@ -42,19 +43,27 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
typedef sem_t *Grid_semaphore;
#error /*THis is deprecated*/
#if 0
#define SEM_INIT(S) S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED );
#define SEM_INIT_EXCL(S) sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED );
#define SEM_POST(S) assert ( sem_post(S) == 0 );
#define SEM_WAIT(S) assert ( sem_wait(S) == 0 );
#else
#define SEM_INIT(S) ;
#define SEM_INIT_EXCL(S) ;
#define SEM_POST(S) ;
#define SEM_WAIT(S) ;
#endif
#include <sys/mman.h>
namespace Grid {
enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL };
enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL, COMMAND_SENDRECV };
struct Descriptor {
uint64_t buf;
@ -62,6 +71,12 @@ struct Descriptor {
int rank;
int tag;
int command;
uint64_t xbuf;
uint64_t rbuf;
int xtag;
int rtag;
int src;
int dest;
MPI_Request request;
};
@ -94,18 +109,14 @@ public:
void SemInit(void) {
sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
// printf("SEM_NAME: %s \n",sem_name);
SEM_INIT(sem_head);
sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
// printf("SEM_NAME: %s \n",sem_name);
SEM_INIT(sem_tail);
}
void SemInitExcl(void) {
sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
// printf("SEM_INIT_EXCL: %s \n",sem_name);
SEM_INIT_EXCL(sem_head);
sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
// printf("SEM_INIT_EXCL: %s \n",sem_name);
SEM_INIT_EXCL(sem_tail);
}
void WakeUpDMA(void) {
@ -125,6 +136,13 @@ public:
while(1){
WaitForCommand();
// std::cout << "Getting command "<<std::endl;
#if 0
_mm_monitor((void *)&state->head,0,0);
int s=state->start;
if ( s != state->head ) {
_mm_mwait(0,0);
}
#endif
Event();
}
}
@ -132,6 +150,7 @@ public:
int Event (void) ;
uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ;
void QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) ;
void WaitAll() {
// std::cout << "Queueing WAIT command "<<std::endl;
@ -141,7 +160,7 @@ public:
// std::cout << "Waiting from semaphore "<<std::endl;
WaitForComplete();
// std::cout << "Checking FIFO is empty "<<std::endl;
assert ( state->tail == state->head );
while ( state->tail != state->head );
}
};
@ -196,6 +215,12 @@ public:
// std::cout << "Waking up DMA "<< slave<<std::endl;
};
// Queue a combined send/receive on slave number `slave`, then call
// WakeUpDMA() on that same slave.
static void QueueSendRecv(int slave,void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src)
{
  auto &worker = Slaves[slave];
  worker.QueueSendRecv(xbuf,rbuf,bytes,xtag,rtag,comm,dest,src);
  worker.WakeUpDMA();
}
static void QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
// std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank <<std::endl;
Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
@ -226,6 +251,28 @@ public:
return;
};
// Hand the whole transfer to a single slave, cycling through slave indices
// 1 .. VerticalSize-1 on successive calls.
static void QueueRoundRobinSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
  static int next_slot = 0;                 // persists across calls
  const int slave_count = VerticalSize-1;
  uint8_t *send_bytes = static_cast<uint8_t *>(xbuf);
  uint8_t *recv_bytes = static_cast<uint8_t *>(rbuf);

  // No splitting here: offset 0 and the full byte count go to one slave.
  QueueSendRecv(next_slot+1, send_bytes, recv_bytes, bytes, xtag,rtag,comm,dest,src);

  ++next_slot;
  if ( next_slot == slave_count ) next_slot = 0;
}
// Split one send/receive across slaves 1 .. VerticalSize-1. For each worker
// index GetWork supplies a byte count and an offset into both buffers (the
// partitioning rule itself is defined in GetWork, not here).
static void QueueMultiplexedSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) {
  uint8_t *send_base = static_cast<uint8_t *>(xbuf);
  uint8_t *recv_base = static_cast<uint8_t *>(rbuf);
  int workers = VerticalSize-1;
  int chunk_bytes, chunk_offset;            // filled in by GetWork on every pass
  for(int w=0; w<workers; w++) {
    GetWork(bytes,w,chunk_bytes,chunk_offset,workers);
    QueueSendRecv(w+1, send_base+chunk_offset, recv_base+chunk_offset, chunk_bytes, xtag,rtag,comm,dest,src);
  }
};
static void QueueMultiplexedSend(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
uint8_t * cbuf = (uint8_t *) buf;
int mywork, myoff, procs;
@ -275,6 +322,7 @@ std::vector<void *> MPIoffloadEngine::VerticalShmBufs;
std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks;
std::vector<int> MPIoffloadEngine::UserCommunicatorToWorldRanks;
int CartesianCommunicator::NodeCount(void) { return HorizontalSize;};
int MPIoffloadEngine::ShmSetup = 0;
void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
@ -370,12 +418,22 @@ void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
ftruncate(fd, size);
VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( VerticalShmBufs[r] == MAP_FAILED ) {
perror("failed mmap");
assert(0);
}
/*
for(uint64_t page=0;page<size;page+=4096){
void *pages = (void *) ( page + (uint64_t)VerticalShmBufs[r] );
int status;
int flags=MPOL_MF_MOVE_ALL;
int nodes=1; // numa domain == MCDRAM
unsigned long count=1;
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
if (ierr && (page==0)) perror("numa relocate command failed");
}
*/
uint64_t * check = (uint64_t *) VerticalShmBufs[r];
check[0] = WorldRank;
check[1] = r;
@ -404,7 +462,7 @@ void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
uint64_t * check = (uint64_t *) VerticalShmBufs[r];
assert(check[0]== WorldRank);
assert(check[1]== r);
std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
// std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
}
}
#endif
@ -542,6 +600,8 @@ int Slave::Event (void) {
static int head_last;
static int start_last;
int ierr;
MPI_Status stat;
static int i=0;
////////////////////////////////////////////////////
// Try to advance the start pointers
@ -550,11 +610,6 @@ int Slave::Event (void) {
if ( s != state->head ) {
switch ( state->Descrs[s].command ) {
case COMMAND_ISEND:
/*
std::cout<< " Send "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
<< " to " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
<< " Comm " << MPIoffloadEngine::communicator_universe<< " me " <<universe_rank<< std::endl;
*/
ierr = MPI_Isend((void *)(state->Descrs[s].buf+base),
state->Descrs[s].bytes,
MPI_CHAR,
@ -568,11 +623,6 @@ int Slave::Event (void) {
break;
case COMMAND_IRECV:
/*
std::cout<< " Recv "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
<< " from " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
<< " Comm " << MPIoffloadEngine::communicator_universe<< " me "<< universe_rank<< std::endl;
*/
ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base),
state->Descrs[s].bytes,
MPI_CHAR,
@ -588,10 +638,32 @@ int Slave::Event (void) {
return 1;
break;
case COMMAND_SENDRECV:
// fprintf(stderr,"Sendrecv ->%d %d : <-%d %d \n",state->Descrs[s].dest, state->Descrs[s].xtag+i*10,state->Descrs[s].src, state->Descrs[s].rtag+i*10);
ierr=MPI_Sendrecv((void *)(state->Descrs[s].xbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].dest, state->Descrs[s].xtag+i*10,
(void *)(state->Descrs[s].rbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].src , state->Descrs[s].rtag+i*10,
MPIoffloadEngine::communicator_universe,MPI_STATUS_IGNORE);
assert(ierr==0);
// fprintf(stderr,"Sendrecv done %d %d\n",ierr,i);
// MPI_Barrier(MPIoffloadEngine::HorizontalComm);
// fprintf(stderr,"Barrier\n");
i++;
state->start = PERI_PLUS(s);
return 1;
break;
case COMMAND_WAITALL:
for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){
MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
if ( state->Descrs[t].command != COMMAND_SENDRECV ) {
MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
}
};
s=PERI_PLUS(s);
state->start = s;
@ -613,6 +685,45 @@ int Slave::Event (void) {
// External interaction with the queue
//////////////////////////////////////////////////////////////////////////////
// Append a COMMAND_SENDRECV descriptor to this slave's command FIFO.
//   xbuf, rbuf : transmit / receive buffers; stored in the descriptor as byte
//                offsets from `base` (Slave::Event adds `base` back before use).
//   bytes      : transfer size, used for both directions.
//   xtag, rtag : tags passed through MapCommRankToWorldRank; the resulting
//                hashtag is what is stored in the descriptor.
//   comm       : communicator in which dest and src are expressed.
//   dest, src  : peer ranks in `comm`, translated to universe ranks through
//                UniverseRanks[worldrank][vertical_rank].
// The descriptor slot at `head` is filled in first and `head` is advanced last.
void Slave::QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src)
{
int head =state->head;
int next = PERI_PLUS(head);
// Set up descriptor
int worldrank;
int hashtag;
// NOTE(review): communicator and request are declared but never used below.
MPI_Comm communicator;
MPI_Request request;
uint64_t relative;
// Buffers are recorded relative to base rather than as absolute addresses.
relative = (uint64_t)xbuf - base;
state->Descrs[head].xbuf = relative;
relative= (uint64_t)rbuf - base;
state->Descrs[head].rbuf = relative;
state->Descrs[head].bytes = bytes;
// Send side: destination rank and tag.
MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,xtag,comm,dest);
state->Descrs[head].dest = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
state->Descrs[head].xtag = hashtag;
// Receive side: source rank and tag (hashtag and worldrank are reused).
MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,rtag,comm,src);
state->Descrs[head].src = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
state->Descrs[head].rtag = hashtag;
state->Descrs[head].command= COMMAND_SENDRECV;
// Block until FIFO has space
// (busy-wait: spins for as long as advancing head would make it equal tail)
while( state->tail==next );
// Msync on weak order architectures
// NOTE(review): no fence is actually issued at this point -- confirm whether
// one is required before head is published.
// Advance pointer
state->head = next;
};
uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank)
{
/////////////////////////////////////////
@ -812,19 +923,22 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_
assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
assert(from!=_processor);
assert(dest!=_processor);
MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
}
MPIoffloadEngine::QueueMultiplexedSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from);
//MPIoffloadEngine::QueueRoundRobinSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from);
//MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
//MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
}
// Complete the stencil communication by delegating to
// MPIoffloadEngine::WaitAll(). The `list` argument is not used in this
// implementation, and the trailing Barrier call is commented out.
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
MPIoffloadEngine::WaitAll();
//this->Barrier();
}
void CartesianCommunicator::StencilBarrier(void)
{
}
void CartesianCommunicator::StencilBarrier(void) { }
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{

View File

@ -25,7 +25,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/GridCore.h>
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
@ -87,6 +88,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
{
assert(0);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
assert(0);
@ -97,7 +99,7 @@ void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor ;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor; }
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
source =0;

View File

@ -27,6 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include <Grid/Grid.h>
#include <mpp/shmem.h>
#include <array>
namespace Grid {
@ -51,7 +52,7 @@ typedef struct HandShake_t {
} HandShake;
std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) {
array<long,_SHMEM_REDUCE_SYNC_SIZE> ret;
std::array<long,_SHMEM_REDUCE_SYNC_SIZE> ret;
ret.fill(SHMEM_SYNC_VALUE);
return ret;
}
@ -109,7 +110,7 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){
source = u;
dest = 0;
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all(); // necessary?
u = dest;
}
@ -125,7 +126,7 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
source = u;
dest = 0;
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all(); // necessary?
u = dest;
}
@ -137,7 +138,8 @@ void CartesianCommunicator::GlobalSum(float &f){
source = f;
dest =0.0;
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
f = dest;
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
@ -148,14 +150,16 @@ void CartesianCommunicator::GlobalSumVector(float *f,int N)
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
if ( shmem_addr_accessible(f,_processor) ){
shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync);
shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
return;
}
for(int i=0;i<N;i++){
dest =0.0;
source = f[i];
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
f[i] = dest;
}
}
@ -168,7 +172,8 @@ void CartesianCommunicator::GlobalSum(double &d)
source = d;
dest = 0;
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
d = dest;
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
@ -180,14 +185,16 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
if ( shmem_addr_accessible(d,_processor) ){
shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync);
shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
return;
}
for(int i=0;i<N;i++){
source = d[i];
dest =0.0;
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data());
shmem_barrier_all();
d[i] = dest;
}
}
@ -282,11 +289,13 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
SHMEM_VET(recv);
// shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
shmem_putmem(recv,xmit,bytes,dest);
if ( CommunicatorPolicy == CommunicatorPolicySequential ) shmem_barrier_all();
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
// shmem_quiet(); // I'm done
shmem_barrier_all();// He's done too
if( CommunicatorPolicy == CommunicatorPolicyConcurrent ) shmem_barrier_all();// He's done too
}
void CartesianCommunicator::Barrier(void)
{
@ -301,13 +310,13 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
int words = bytes/4;
if ( shmem_addr_accessible(data,_processor) ){
shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync);
shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync.data());
return;
}
for(int w=0;w<words;w++){
word = array[w];
shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data());
if ( shmem_my_pe() != root ) {
array[w] = word;
}
@ -325,13 +334,17 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
for(int w=0;w<words;w++){
word = array[w];
shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data());
if ( shmem_my_pe() != root ) {
array[w]= word;
}
shmem_barrier_all();
}
}
// World rank for the SHMEM build: returns the value of shmem_my_pe().
int CartesianCommunicator::RankWorld(void){
return shmem_my_pe();
}
}

View File

@ -1,5 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -31,21 +30,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid {
template<class vobj>
class SimpleCompressor {
public:
void Point(int) {};
vobj operator() (const vobj &arg) {
return arg;
}
};
///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split with compression
// Gather for when there is no need to SIMD split
///////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor> void
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{
int rd = rhs._grid->_rdimensions[dimension];
@ -53,19 +42,17 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimen
cbmask = 0x3;
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int stride=rhs._grid->_slice_stride[dimension];
if ( cbmask == 0x3 ) {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int bo = n*e2;
buffer[off+bo+b]=compress(rhs._odata[so+o+b]);
buffer[off+bo+b]=rhs._odata[so+o+b];
}
}
} else {
@ -74,25 +61,23 @@ PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int ocb=1<<rhs._grid->CheckerBoardFromOindexTable(o+b);
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb &cbmask ) {
table.push_back(std::pair<int,int> (bo++,o+b));
}
}
}
PARALLEL_FOR_LOOP
for(int i=0;i<table.size();i++){
buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
parallel_for(int i=0;i<table.size();i++){
buffer[off+table[i].first]=rhs._odata[so+table[i].second];
}
}
}
///////////////////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split with compression
// Gather for when there *is* need to SIMD split
///////////////////////////////////////////////////////////////////
template<class cobj,class vobj,class compressor> void
Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_object *> pointers,int dimension,int plane,int cbmask,compressor &compress)
template<class vobj> void
Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
@ -105,57 +90,40 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int n1=rhs._grid->_slice_stride[dimension];
int n2=rhs._grid->_slice_block[dimension];
if ( cbmask ==0x3){
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*n1;
int offset = b+n*n2;
cobj temp =compress(rhs._odata[so+o+b]);
extract<cobj>(temp,pointers,offset);
int offset = b+n*e2;
vobj temp =rhs._odata[so+o+b];
extract<vobj>(temp,pointers,offset);
}
}
} else {
assert(0); //Fixme think this is buggy
for(int n=0;n<e1;n++){
// Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code.
std::cout << " Dense packed buffer WARNING " <<std::endl;
parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o=n*rhs._grid->_slice_stride[dimension];
int o=n*n1;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
int offset = b+n*rhs._grid->_slice_block[dimension];
int offset = b+n*e2;
if ( ocb & cbmask ) {
cobj temp =compress(rhs._odata[so+o+b]);
extract<cobj>(temp,pointers,offset);
vobj temp =rhs._odata[so+o+b];
extract<vobj>(temp,pointers,offset);
}
}
}
}
}
//////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
//////////////////////////////////////////////////////
template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
{
SimpleCompressor<vobj> dontcompress;
Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress);
}
//////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split
//////////////////////////////////////////////////////
template<class vobj> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
{
SimpleCompressor<vobj> dontcompress;
Gather_plane_extract<vobj,vobj,decltype(dontcompress)>(rhs,pointers,dimension,plane,cbmask,dontcompress);
}
//////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split
//////////////////////////////////////////////////////
@ -171,10 +139,10 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int stride=rhs._grid->_slice_stride[dimension];
if ( cbmask ==0x3 ) {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
@ -182,24 +150,28 @@ PARALLEL_NESTED_LOOP2
}
}
} else {
std::vector<std::pair<int,int> > table;
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
rhs._odata[so+o+b]=buffer[bo++];
table.push_back(std::pair<int,int> (so+o+b,bo++));
}
}
}
parallel_for(int i=0;i<table.size();i++){
// std::cout << "Rcv"<< table[i].first << " " << table[i].second << " " <<buffer[table[i].second]<<std::endl;
rhs._odata[table[i].first]=buffer[table[i].second];
}
}
}
//////////////////////////////////////////////////////
// Scatter for when there *is* need to SIMD split
//////////////////////////////////////////////////////
template<class vobj,class cobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<cobj *> pointers,int dimension,int plane,int cbmask)
template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
@ -213,8 +185,7 @@ PARALLEL_NESTED_LOOP2
int e2=rhs._grid->_slice_block[dimension];
if(cbmask ==0x3 ) {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
@ -222,7 +193,11 @@ PARALLEL_NESTED_LOOP2
}
}
} else {
assert(0); // think this is buggy FIXME
// Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code.
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
@ -254,8 +229,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int e2=rhs._grid->_slice_block[dimension];
int stride = rhs._grid->_slice_stride[dimension];
if(cbmask == 0x3 ){
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
@ -264,8 +238,7 @@ PARALLEL_NESTED_LOOP2
}
}
} else {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
@ -295,8 +268,8 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block [dimension];
int stride = rhs._grid->_slice_stride[dimension];
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
@ -338,8 +311,8 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
// the permute type
ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
int permute_dim =grid->PermuteDim(dimension);
int permute_type=grid->PermuteType(dimension);
int permute_type_dist;
@ -348,7 +321,6 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
int o = 0;
int bo = x * grid->_ostride[dimension];
int cb= (cbmask==0x2)? Odd : Even;
int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
@ -361,9 +333,23 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
// wrap is whether sshift > rd.
// num is sshift mod rd.
//
// shift 7
//
// XoXo YcYc
// oXoX cYcY
// XoXo YcYc
// oXoX cYcY
//
// sshift --
//
// XX YY ; 3
// XX YY ; 0
// XX YY ; 3
// XX YY ; 0
//
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int wrap = sshift/rd; wrap=wrap % ly;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
@ -375,7 +361,6 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
} else {
permute_type_dist = permute_type;
}
}
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);

View File

@ -74,7 +74,6 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
// std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
// std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift,0x3);
@ -154,10 +153,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
(void *)&recv_buf[0],
recv_from_rank,
bytes);
grid->Barrier();
// for(int i=0;i<words;i++){
// std::cout << "SendRecv ["<<i<<"] snd "<<send_buf[i]<<" rcv " << recv_buf[i] << " 0x" << cbmask<<std::endl;
// }
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
}
}
@ -243,7 +240,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];

12276
lib/json/json.hpp Normal file

File diff suppressed because it is too large Load Diff

View File

@ -39,8 +39,7 @@ namespace Grid {
ret.checkerboard = lhs.checkerboard;
conformable(ret,rhs);
conformable(lhs,rhs);
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@ -56,8 +55,7 @@ PARALLEL_FOR_LOOP
ret.checkerboard = lhs.checkerboard;
conformable(ret,rhs);
conformable(lhs,rhs);
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@ -73,8 +71,7 @@ PARALLEL_FOR_LOOP
ret.checkerboard = lhs.checkerboard;
conformable(ret,rhs);
conformable(lhs,rhs);
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@ -89,8 +86,7 @@ PARALLEL_FOR_LOOP
ret.checkerboard = lhs.checkerboard;
conformable(ret,rhs);
conformable(lhs,rhs);
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
@ -108,8 +104,7 @@ PARALLEL_FOR_LOOP
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(lhs,ret);
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
obj1 tmp;
mult(&tmp,&lhs._odata[ss],&rhs);
vstream(ret._odata[ss],tmp);
@ -120,8 +115,7 @@ PARALLEL_FOR_LOOP
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(ret,lhs);
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
obj1 tmp;
mac(&tmp,&lhs._odata[ss],&rhs);
vstream(ret._odata[ss],tmp);
@ -132,8 +126,7 @@ PARALLEL_FOR_LOOP
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(ret,lhs);
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
sub(&tmp,&lhs._odata[ss],&rhs);
@ -147,8 +140,7 @@ PARALLEL_FOR_LOOP
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(lhs,ret);
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
add(&tmp,&lhs._odata[ss],&rhs);
@ -166,8 +158,7 @@ PARALLEL_FOR_LOOP
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
mult(&tmp,&lhs,&rhs._odata[ss]);
@ -182,8 +173,7 @@ PARALLEL_FOR_LOOP
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
mac(&tmp,&lhs,&rhs._odata[ss]);
@ -198,8 +188,7 @@ PARALLEL_FOR_LOOP
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
sub(&tmp,&lhs,&rhs._odata[ss]);
@ -213,8 +202,7 @@ PARALLEL_FOR_LOOP
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
add(&tmp,&lhs,&rhs._odata[ss]);
@ -230,8 +218,7 @@ PARALLEL_FOR_LOOP
ret.checkerboard = x.checkerboard;
conformable(ret,x);
conformable(x,y);
PARALLEL_FOR_LOOP
for(int ss=0;ss<x._grid->oSites();ss++){
parallel_for(int ss=0;ss<x._grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = a*x._odata[ss]+y._odata[ss];
vstream(ret._odata[ss],tmp);
@ -245,8 +232,7 @@ PARALLEL_FOR_LOOP
ret.checkerboard = x.checkerboard;
conformable(ret,x);
conformable(x,y);
PARALLEL_FOR_LOOP
for(int ss=0;ss<x._grid->oSites();ss++){
parallel_for(int ss=0;ss<x._grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = a*x._odata[ss]+b*y._odata[ss];
vstream(ret._odata[ss],tmp);

View File

@ -121,8 +121,7 @@ public:
assert( (cb==Odd) || (cb==Even));
checkerboard=cb;
PARALLEL_FOR_LOOP
for(int ss=0;ss<_grid->oSites();ss++){
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp);
@ -144,8 +143,7 @@ PARALLEL_FOR_LOOP
assert( (cb==Odd) || (cb==Even));
checkerboard=cb;
PARALLEL_FOR_LOOP
for(int ss=0;ss<_grid->oSites();ss++){
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp);
@ -167,8 +165,7 @@ PARALLEL_FOR_LOOP
assert( (cb==Odd) || (cb==Even));
checkerboard=cb;
PARALLEL_FOR_LOOP
for(int ss=0;ss<_grid->oSites();ss++){
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES
//vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,eval(ss,expr));
@ -191,8 +188,7 @@ PARALLEL_FOR_LOOP
checkerboard=cb;
_odata.resize(_grid->oSites());
PARALLEL_FOR_LOOP
for(int ss=0;ss<_grid->oSites();ss++){
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp);
@ -213,8 +209,7 @@ PARALLEL_FOR_LOOP
checkerboard=cb;
_odata.resize(_grid->oSites());
PARALLEL_FOR_LOOP
for(int ss=0;ss<_grid->oSites();ss++){
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp);
@ -235,73 +230,79 @@ PARALLEL_FOR_LOOP
checkerboard=cb;
_odata.resize(_grid->oSites());
PARALLEL_FOR_LOOP
for(int ss=0;ss<_grid->oSites();ss++){
parallel_for(int ss=0;ss<_grid->oSites();ss++){
vstream(_odata[ss] ,eval(ss,expr));
}
};
//////////////////////////////////////////////////////////////////
// Constructor requires "grid" passed.
// what about a default grid?
//////////////////////////////////////////////////////////////////
Lattice(GridBase *grid) : _odata(grid->oSites()) {
_grid = grid;
//////////////////////////////////////////////////////////////////
// Constructor requires "grid" passed.
// what about a default grid?
//////////////////////////////////////////////////////////////////
Lattice(GridBase *grid) : _odata(grid->oSites()) {
_grid = grid;
// _odata.reserve(_grid->oSites());
// _odata.resize(_grid->oSites());
// std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
assert((((uint64_t)&_odata[0])&0xF) ==0);
checkerboard=0;
}
Lattice(const Lattice& r){ // copy constructor
_grid = r._grid;
checkerboard = r.checkerboard;
_odata.resize(_grid->oSites());// essential
PARALLEL_FOR_LOOP
for(int ss=0;ss<_grid->oSites();ss++){
_odata[ss]=r._odata[ss];
}
}
virtual ~Lattice(void) = default;
assert((((uint64_t)&_odata[0])&0xF) ==0);
checkerboard=0;
}
// Copy constructor.
// Binds this lattice to the same grid object as r (the pointer is copied, the
// grid itself is not), takes over r's checkerboard, and copies r's per-site
// data element by element inside a parallel_for loop over the grid's oSites().
Lattice(const Lattice& r){ // copy constructor
_grid = r._grid;
checkerboard = r.checkerboard;
_odata.resize(_grid->oSites());// essential: this constructor has no initializer sizing _odata, so size it before the indexed writes below
parallel_for(int ss=0;ss<_grid->oSites();ss++){
_odata[ss]=r._odata[ss];
}
}
virtual ~Lattice(void) = default;
template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
PARALLEL_FOR_LOOP
for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r;
}
return *this;
}
template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
this->checkerboard = r.checkerboard;
conformable(*this,r);
PARALLEL_FOR_LOOP
for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r._odata[ss];
}
return *this;
// Re-targets this lattice at `grid`.
// A call with the grid the lattice is already bound to is a no-op. Otherwise
// the grid pointer is replaced, the site storage is resized to grid->oSites(),
// and the checkerboard is set back to 0.
void reset(GridBase* grid) {
  if (_grid == grid) return;   // already bound to this grid: nothing to do

  _grid = grid;
  _odata.resize(grid->oSites());
  checkerboard = 0;
}
// *=, +=, -= operators inherit their behaviour from the corresponding *, +, - operations
template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {
*this = (*this)*r;
return *this;
template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
parallel_for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r;
}
template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) {
*this = (*this)-r;
return *this;
return *this;
}
template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
this->checkerboard = r.checkerboard;
conformable(*this,r);
parallel_for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r._odata[ss];
}
template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) {
*this = (*this)+r;
return *this;
}
}; // class Lattice
return *this;
}
// *=, +=, -= operators inherit their behaviour from the corresponding *, +, - operations
// In-place multiply: evaluates (*this)*r with whichever operator* overload
// matches T, assigns the result back to this lattice, and returns *this so the
// expression can be chained.
template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {
*this = (*this)*r;
return *this;
}
// In-place subtract: evaluates (*this)-r with whichever operator- overload
// matches T, assigns the result back to this lattice, and returns *this so the
// expression can be chained.
template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) {
*this = (*this)-r;
return *this;
}
// In-place add: evaluates (*this)+r with whichever operator+ overload matches
// T, assigns the result back to this lattice, and returns *this so the
// expression can be chained.
template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) {
*this = (*this)+r;
return *this;
}
}; // class Lattice
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
std::vector<int> gcoor;
typedef typename vobj::scalar_object sobj;
@ -319,7 +320,7 @@ PARALLEL_FOR_LOOP
}
return stream;
}
}

View File

@ -45,90 +45,87 @@ namespace Grid {
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{
Lattice<vInteger> ret(rhs._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
{
Lattice<vInteger> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
//////////////////////////////////////////////////////////////////////////
// compare lattice to scalar
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{
Lattice<vInteger> ret(lhs._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs);
}
return ret;
{
Lattice<vInteger> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs);
}
return ret;
}
//////////////////////////////////////////////////////////////////////////
// compare scalar to lattice
//////////////////////////////////////////////////////////////////////////
// Compares a scalar against every site of a lattice.
//
//   op   binary comparison functor, invoked as op(scalar, site_value)
//   lhs  the scalar operand (left-hand side of the comparison)
//   rhs  the lattice operand (right-hand side of the comparison)
//
// Returns a Lattice<vInteger> on rhs's grid whose site ss holds
// op(lhs, rhs._odata[ss]).
//
// The previous body evaluated op(lhs._odata[ss], rhs), i.e. it indexed the
// scalar and passed the whole lattice. Because this is a template, that only
// surfaced once a scalar-vs-lattice comparison was actually instantiated.
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{
  Lattice<vInteger> ret(rhs._grid);
  parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
    // Only the lattice is indexed by site; the scalar is used as-is.
    ret._odata[ss]=op(lhs,rhs._odata[ss]);
  }
  return ret;
}
//////////////////////////////////////////////////////////////////////////
// Map to functors
//////////////////////////////////////////////////////////////////////////
// Less than
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vlt<lobj,robj>(),lhs,rhs);
}
// Less than equal
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vle<lobj,robj>(),lhs,rhs);
}
// Greater than
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
// Less than
// Three overloads of operator<, one per operand combination. Each builds a
// vlt<lobj,robj> functor and forwards to the matching helper, returning that
// helper's Lattice<vInteger> result:
//   lattice < lattice  -> LLComparison
//   lattice < scalar   -> LSComparison
//   scalar  < lattice  -> SLComparison
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vlt<lobj,robj>(),lhs,rhs);
}
// Less than or equal
// Three overloads of operator<=, one per operand combination. Each builds a
// vle<lobj,robj> functor and forwards to the matching helper, returning that
// helper's Lattice<vInteger> result:
//   lattice <= lattice -> LLComparison
//   lattice <= scalar  -> LSComparison
//   scalar  <= lattice -> SLComparison
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vle<lobj,robj>(),lhs,rhs);
}
// Greater than
// Three overloads of operator>, one per operand combination. Each builds a
// vgt<lobj,robj> functor and forwards to the matching helper, returning that
// helper's Lattice<vInteger> result:
//   lattice > lattice  -> LLComparison
//   lattice > scalar   -> LSComparison
//   scalar  > lattice  -> SLComparison
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vgt<lobj,robj>(),lhs,rhs);
}
// Greater than equal
}
// Greater than equal
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
@ -136,38 +133,37 @@ PARALLEL_FOR_LOOP
return LSComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vge<lobj,robj>(),lhs,rhs);
}
// equal
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(veq<lobj,robj>(),lhs,rhs);
}
// not equal
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vne<lobj,robj>(),lhs,rhs);
}
}
#endif

View File

@ -34,47 +34,42 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid {
/////////////////////////////////////////////////////
// Non site, reduced locally reduced routines
/////////////////////////////////////////////////////
// localNorm2,
template<class vobj>
/////////////////////////////////////////////////////
// Non site, reduced locally reduced routines
/////////////////////////////////////////////////////
// localNorm2,
template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
}
return ret;
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
// localInnerProduct
template<class vobj>
// localInnerProduct
template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites(); ss++){
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
// outerProduct Scalar x Scalar -> Scalar
// Vector x Vector -> Matrix
template<class ll,class rr>
// outerProduct Scalar x Scalar -> Scalar
// Vector x Vector -> Matrix
template<class ll,class rr>
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
{
Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
{
Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
}
#endif

View File

@ -37,8 +37,7 @@ namespace Grid {
inline Lattice<vobj> operator -(const Lattice<vobj> &r)
{
Lattice<vobj> ret(r._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<r._grid->oSites();ss++){
parallel_for(int ss=0;ss<r._grid->oSites();ss++){
vstream(ret._odata[ss], -r._odata[ss]);
}
return ret;
@ -74,8 +73,7 @@ PARALLEL_FOR_LOOP
inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites(); ss++){
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss];
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs*rhs._odata[ss];
@ -86,8 +84,7 @@ PARALLEL_FOR_LOOP
inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
{
Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites(); ss++){
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs+rhs._odata[0]) tmp =lhs-rhs._odata[ss];
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs+rhs._odata[ss];
@ -98,11 +95,9 @@ PARALLEL_FOR_LOOP
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
{
Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites(); ss++){
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs-rhs._odata[ss];
}
return ret;
}
@ -110,8 +105,7 @@ PARALLEL_FOR_LOOP
inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites(); ss++){
parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]*rhs;
@ -122,8 +116,7 @@ PARALLEL_FOR_LOOP
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
{
Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites(); ss++){
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]+rhs;
@ -134,15 +127,12 @@ PARALLEL_FOR_LOOP
inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
{
Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites(); ss++){
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]-rhs;
}
return ret;
}
}
#endif

View File

@ -44,22 +44,20 @@ namespace Grid {
{
Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
ret.checkerboard=lhs.checkerboard;
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
}
return ret;
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
}
return ret;
};
template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
{
Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
ret.checkerboard=lhs.checkerboard;
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
}
return ret;
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
}
return ret;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -68,18 +66,16 @@ PARALLEL_FOR_LOOP
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
{
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
}
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
}
}
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
{
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
}
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
}
}
//////////////////////////////////////////////////////
@ -131,9 +127,6 @@ PARALLEL_FOR_LOOP
assert( l.checkerboard == l._grid->CheckerBoard(site));
// FIXME
// assert( sizeof(sobj)*Nsimd == sizeof(vobj));
int rank,odx,idx;
grid->GlobalCoorToRankIndex(rank,odx,idx,site);

View File

@ -40,8 +40,7 @@ namespace Grid {
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = adj(lhs._odata[ss]);
}
return ret;
@ -49,13 +48,10 @@ PARALLEL_FOR_LOOP
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = conjugate(lhs._odata[ss]);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = conjugate(lhs._odata[ss]);
}
return ret;
};
}
#endif

View File

@ -1,159 +1,154 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_reduction.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_REDUCTION_H
#define GRID_LATTICE_REDUCTION_H
#include <Grid/Grid_Eigen_Dense.h>
namespace Grid {
#ifdef GRID_WARN_SUBOPTIMAL
#warning "Optimisation alert all these reduction loops are NOT threaded "
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
ComplexD nrm = innerProduct(arg,arg);
return std::real(nrm);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
ComplexD nrm = innerProduct(arg,arg);
return std::real(nrm);
}
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
scalar_type nrm;
GridBase *grid = left._grid;
std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
for(int i=0;i<grid->SumArraySize();i++){
sumarray[i]=zero;
}
PARALLEL_FOR_LOOP
for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
for(int ss=myoff;ss<mywork+myoff; ss++){
vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
}
sumarray[thr]=TensorRemove(vnrm) ;
}
// Double inner product
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
scalar_type nrm;
GridBase *grid = left._grid;
std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
vector_type vvnrm; vvnrm=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vvnrm = vvnrm+sumarray[i];
}
nrm = Reduce(vvnrm);// sum across simd
right._grid->GlobalSum(nrm);
return nrm;
decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
for(int ss=myoff;ss<mywork+myoff; ss++){
vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]);
}
sumarray[thr]=TensorRemove(vnrm) ;
}
vector_type vvnrm; vvnrm=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vvnrm = vvnrm+sumarray[i];
}
nrm = Reduce(vvnrm);// sum across simd
right._grid->GlobalSum(nrm);
return nrm;
}
template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
{
return sum(closure(expr));
}
template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
{
return sum(closure(expr));
}
template<class Op,class T1,class T2>
inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
template<class Op,class T1,class T2>
inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
{
return sum(closure(expr));
}
template<class Op,class T1,class T2,class T3>
inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
eval(0,std::get<1>(expr.second)),
eval(0,std::get<2>(expr.second))
))::scalar_object
{
return sum(closure(expr));
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg){
GridBase *grid=arg._grid;
int Nsimd = grid->Nsimd();
std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
for(int i=0;i<grid->SumArraySize();i++){
sumarray[i]=zero;
}
PARALLEL_FOR_LOOP
for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
vobj vvsum=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg._odata[ss];
}
sumarray[thr]=vvsum;
}
vobj vsum=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vsum = vsum+sumarray[i];
}
typedef typename vobj::scalar_object sobj;
sobj ssum=zero;
std::vector<sobj> buf(Nsimd);
extract(vsum,buf);
for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
arg._grid->GlobalSum(ssum);
return ssum;
{
return sum(closure(expr));
}
template<class Op,class T1,class T2,class T3>
inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
eval(0,std::get<1>(expr.second)),
eval(0,std::get<2>(expr.second))
))::scalar_object
{
return sum(closure(expr));
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
GridBase *grid=arg._grid;
int Nsimd = grid->Nsimd();
std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
for(int i=0;i<grid->SumArraySize();i++){
sumarray[i]=zero;
}
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
vobj vvsum=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg._odata[ss];
}
sumarray[thr]=vvsum;
}
vobj vsum=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vsum = vsum+sumarray[i];
}
typedef typename vobj::scalar_object sobj;
sobj ssum=zero;
std::vector<sobj> buf(Nsimd);
extract(vsum,buf);
for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
arg._grid->GlobalSum(ssum);
return ssum;
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
{
///////////////////////////////////////////////////////
// FIXME precision promoted summation
// may be important for correlation functions
// But easily avoided by using double precision fields
///////////////////////////////////////////////////////
typedef typename vobj::scalar_object sobj;
GridBase *grid = Data._grid;
assert(grid!=NULL);
// FIXME
// std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
@ -165,23 +160,31 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
int rd=grid->_rdimensions[orthogdim];
std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
std::vector<sobj> lsSum(ld,zero); // sum across these down to scalars
std::vector<sobj> extracted(Nsimd); // splitting the SIMD
std::vector<sobj> lsSum(ld,zero); // sum across these down to scalars
std::vector<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
result.resize(fd); // And then global sum to return the same vector to every node
for(int r=0;r<rd;r++){
lvSum[r]=zero;
}
std::vector<int> coor(Nd);
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
parallel_for(int r=0;r<rd;r++){
for(int ss=0;ss<grid->oSites();ss++){
Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions);
int r = coor[orthogdim];
lvSum[r]=lvSum[r]+Data._odata[ss];
}
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data._odata[ss];
}
}
}
// Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd);
@ -216,10 +219,303 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
result[t]=gsum;
}
}
template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid;
assert(grid!=NULL);
conformable(grid,rhs._grid);
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
std::vector<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){
lvSum[r]=zero;
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
parallel_for(int r=0;r<rd;r++){
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
vector_type vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss]));
lvSum[r]=lvSum[r]+vv;
}
}
}
// Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd);
for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp;
temp._internal = lvSum[rt];
extract(temp,extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
}
}
// sum over nodes.
scalar_type gsum;
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
gsum=lsSum[lt];
} else {
gsum=scalar_type(0.0);
}
grid->GlobalSum(gsum);
result[t]=gsum;
}
}
// Per-slice squared norm of `rhs` along dimension `Orthog`.
//
// sn     : output, resized to the global extent of dimension Orthog;
//          sn[t] receives the real part of the slice inner product <rhs|rhs>.
// rhs    : field whose slices are measured.
// Orthog : dimension that labels the slices.
template<class vobj>
static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog)
{
  const int Nblock = rhs._grid->GlobalDimensions()[Orthog];

  std::vector<ComplexD> ip(Nblock);
  sn.resize(Nblock);

  sliceInnerProductVector(ip,rhs,rhs,Orthog);

  for(int ss=0;ss<Nblock;ss++){
    sn[ss] = std::real(ip[ss]); // same spelling as norm2() uses for ComplexD
  }
}
// Slice-wise multiply-add:  R = (scale * a[t]) * X + Y  on every plane t
// orthogonal to `orthogdim`.
//
// R         : output field (written site by site).
// a         : one real coefficient per plane; indexed here by the *local*
//             plane index r + lane_coordinate*rd, so it must hold at least
//             _ldimensions[orthogdim] entries.
// X, Y      : input fields.
// orthogdim : dimension that labels the planes.
// scale     : common prefactor applied to every a[t].
template<class vobj>
static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
                            int orthogdim,RealD scale=1.0)
{
  typedef typename vobj::scalar_type    scalar_type;
  typedef typename vobj::vector_type    vector_type;
  typedef typename vobj::tensor_reduced tensor_reduced;

  GridBase *grid = X._grid;
  assert(grid!=NULL);

  const int Nd = grid->_ndimension;
  assert(orthogdim >= 0);
  assert(orthogdim < Nd);

  const int Nsimd  =grid->Nsimd();
  const int ld     =grid->_ldimensions[orthogdim];
  const int rd     =grid->_rdimensions[orthogdim];

  const int e1     =grid->_slice_nblock[orthogdim];
  const int e2     =grid->_slice_block [orthogdim];
  const int stride =grid->_slice_stride[orthogdim];

  // The coefficient lookup below reaches index ld-1; refuse a short vector
  // instead of reading past its end.
  assert(a.size() >= static_cast<std::size_t>(ld));

  // Sized up front, as sliceInnerProductVector does, so this does not rely on
  // iCoorFromIindex() resizing its output argument.
  std::vector<int> icoor(Nd);

  for(int r=0;r<rd;r++){

    int so=r*grid->_ostride[orthogdim]; // base offset for start of plane

    // Build a SIMD vector whose lane l carries the coefficient of the local
    // plane that lane l of reduced plane r maps to.
    vector_type av;
    scalar_type *as =(scalar_type *)&av; // lane-wise view of av
    for(int l=0;l<Nsimd;l++){
      grid->iCoorFromIindex(icoor,l);
      int ldx =r+icoor[orthogdim]*rd;
      as[l] = scalar_type(a[ldx])*scale;
    }

    tensor_reduced at; at=av;

    // The two loops must stay perfectly nested for parallel_for_nest2.
    parallel_for_nest2(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int ss= so+n*stride+b;
        R._odata[ss] = at*X._odata[ss]+Y._odata[ss];
      }
    }
  }
}
/*
template<class vobj>
static void sliceMaddVectorSlow (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
int Orthog,RealD scale=1.0)
{
// FIXME: Implementation is slow
// Best base the linear combination by constructing a
// set of vectors of size grid->_rdimensions[Orthog].
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Xslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
// If we based this on Cshift it would work for spread out
// but it would be even slower
for(int i=0;i<Nblock;i++){
ExtractSlice(Rslice,Y,i,Orthog);
ExtractSlice(Xslice,X,i,Orthog);
Rslice = Rslice + Xslice*(scale*a[i]);
InsertSlice(Rslice,R,i,Orthog);
}
};
template<class vobj>
static void sliceInnerProductVectorSlow( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
// FIXME: Implementation is slow
// Look at localInnerProduct implementation,
// and do inside a site loop with block strided iterators
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::tensor_reduced scalar;
typedef typename scalar::scalar_object scomplex;
int Nblock = lhs._grid->GlobalDimensions()[Orthog];
vec.resize(Nblock);
std::vector<scomplex> sip(Nblock);
Lattice<scalar> IP(lhs._grid);
IP=localInnerProduct(lhs,rhs);
sliceSum(IP,sip,Orthog);
for(int ss=0;ss<Nblock;ss++){
vec[ss] = TensorRemove(sip[ss]);
}
}
*/
//////////////////////////////////////////////////////////////////////////////////////////
// FIXME: Implementation is slow
// If we based this on Cshift it would work for spread out
// but it would be even slower
//
// Repeated extract slice is inefficient
//
// Best base the linear combination by constructing a
// set of vectors of size grid->_rdimensions[Orthog].
//////////////////////////////////////////////////////////////////////////////////////////
// Build a grid with dimension `Orthog` removed from BlockSolverGrid.
//
// The full lattice extent, SIMD layout and processor layout of every other
// dimension are copied over in order.
//
// Ownership: the grid is allocated with `new GridCartesian(...)` and returned
// as a raw GridBase pointer; nothing here releases it, so the caller is
// responsible for its lifetime (it must outlive any Lattice built on it).
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{
  const int NN = BlockSolverGrid->_ndimension;

  std::vector<int> latt_phys;
  std::vector<int> simd_phys;
  std::vector<int> mpi_phys;

  for(int d=0;d<NN;d++){
    if( d==Orthog ) continue; // drop the sliced dimension
    latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
    simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
    mpi_phys.push_back(BlockSolverGrid->_processors[d]);
  }
  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
}
// Slice-wise matrix multiply-add along dimension `Orthog`:
//
//   R[slice i] = Y[slice i] + sum_j  X[slice j] * (scale * aa(j,i))
//
// R      : output field; every slice i is overwritten.
// aa     : Nblock x Nblock complex coefficient matrix, read as aa(j,i).
// X, Y   : input fields.
// Orthog : dimension that labels the slices.
// scale  : common prefactor applied to every matrix element.
//
// FIXME: slow -- X slices are re-extracted for every (i,j) pair.
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
  int Nblock = X._grid->GlobalDimensions()[Orthog];

  GridBase *FullGrid  = X._grid;
  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);

  {
    // Inner scope: both slice fields must be destroyed before the grid they
    // were built on is released below.
    Lattice<vobj> Xslice(SliceGrid);
    Lattice<vobj> Rslice(SliceGrid);

    for(int i=0;i<Nblock;i++){
      ExtractSlice(Rslice,Y,i,Orthog);
      for(int j=0;j<Nblock;j++){
        ExtractSlice(Xslice,X,j,Orthog);
        Rslice = Rslice + Xslice*(scale*aa(j,i));
      }
      InsertSlice(Rslice,R,i,Orthog);
    }
  }

  // makeSubSliceGrid() allocates a GridCartesian with new and hands it back as
  // GridBase*; previously it was never freed, leaking one grid per call.
  // Delete through the concrete type so this does not depend on GridBase
  // having a virtual destructor.
  delete static_cast<GridCartesian *>(SliceGrid);
}
// Matrix of slice-slice inner products along dimension `Orthog`:
//
//   mat(i,j) = innerProduct( lhs[slice i], rhs[slice j] )
//
// mat    : output, reset to an Nblock x Nblock zero matrix and then filled.
// lhs,rhs: input fields.
// Orthog : dimension that labels the slices.
//
// FIXME: Implementation is slow (rhs slices are re-extracted for every i).
// Not sure of best solution.. think about it
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
  GridBase *FullGrid  = lhs._grid;
  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);

  int Nblock = FullGrid->GlobalDimensions()[Orthog];

  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);

  {
    // Inner scope: both slice fields must be destroyed before the grid they
    // were built on is released below.
    Lattice<vobj> Lslice(SliceGrid);
    Lattice<vobj> Rslice(SliceGrid);

    for(int i=0;i<Nblock;i++){
      ExtractSlice(Lslice,lhs,i,Orthog);
      for(int j=0;j<Nblock;j++){
        ExtractSlice(Rslice,rhs,j,Orthog);
        mat(i,j) = innerProduct(Lslice,Rslice);
      }
    }
  }

  // makeSubSliceGrid() allocates a GridCartesian with new and hands it back as
  // GridBase*; previously it was never freed, leaking one grid per call.
  // Delete through the concrete type so this does not depend on GridBase
  // having a virtual destructor.
  delete static_cast<GridCartesian *>(SliceGrid);

  // Debug switch, disabled by the #undef: zero the off-diagonal elements.
#undef FORCE_DIAG
#ifdef FORCE_DIAG
  for(int i=0;i<Nblock;i++){
    for(int j=0;j<Nblock;j++){
      if ( i != j ) mat(i,j)=0.0;
    }
  }
#endif
  return;
}
} /*END NAMESPACE GRID*/
#endif

View File

@ -6,8 +6,8 @@
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -30,11 +30,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define GRID_LATTICE_RNG_H
#include <random>
#ifdef RNG_SITMO
#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
#endif
#if defined(RNG_SITMO)
#define RNG_FAST_DISCARD
#else
#undef RNG_FAST_DISCARD
#endif
namespace Grid {
//////////////////////////////////////////////////////////////
// Allow the RNG state to be less dense than the fine grid
//////////////////////////////////////////////////////////////
@ -64,115 +72,188 @@ namespace Grid {
multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
}
return multiplicity;
}
// merge of April 11 2017
//<<<<<<< HEAD
// this function is necessary for the LS vectorised field
inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
{
  // Number of fine-grid local sites served by each coarse-grid (RNG) local site.
  // The fine grid may carry extra leading dimensions relative to the coarse one.
  const int coarse_nd = coarse->_ndimension;
  const int extra_nd  = fine->_ndimension - coarse_nd;
  assert(extra_nd >= 0);

  // The extra leading dimensions must not be split across processors ...
  for(int d=0;d<extra_nd;d++){
    assert(fine->_processors[d]==1);
  }
  // ... and the shared dimensions must use the same processor layout.
  for(int d=0;d<coarse_nd;d++){
    assert(coarse->_processors[d] == fine->_processors[d+extra_nd]);
  }

  // Both grids must have the same total SIMD width.
  assert(fine->Nsimd() == coarse->Nsimd());

  // The local volumes must divide exactly.
  assert( fine->lSites() % coarse->lSites() == 0 );

  return fine->lSites() / coarse->lSites();
}
/*
// Wrap seed_seq to give common interface with random_device
class fixedSeed {
public:
typedef std::seed_seq::result_type result_type;
std::seed_seq src;
fixedSeed(const std::vector<int> &seeds) : src(seeds.begin(),seeds.end()) {};
result_type operator () (void){
std::vector<result_type> list(1);
src.generate(list.begin(),list.end());
return list[0];
}
};
=======
>>>>>>> develop
*/
// real scalars are one component
template<class scalar,class distribution,class generator> void fillScalar(scalar &s,distribution &dist,generator & gen)
template<class scalar,class distribution,class generator>
void fillScalar(scalar &s,distribution &dist,generator & gen)
{
s=dist(gen);
}
template<class distribution,class generator> void fillScalar(ComplexF &s,distribution &dist, generator &gen)
template<class distribution,class generator>
void fillScalar(ComplexF &s,distribution &dist, generator &gen)
{
s=ComplexF(dist(gen),dist(gen));
}
template<class distribution,class generator> void fillScalar(ComplexD &s,distribution &dist,generator &gen)
template<class distribution,class generator>
void fillScalar(ComplexD &s,distribution &dist,generator &gen)
{
s=ComplexD(dist(gen),dist(gen));
}
class GridRNGbase {
public:
int _seeded;
// One generator per site.
// Uniform and Gaussian distributions from these generators.
#ifdef RNG_RANLUX
typedef uint64_t RngStateType;
typedef std::ranlux48 RngEngine;
typedef uint64_t RngStateType;
static const int RngStateCount = 15;
#elif RNG_MT19937
#endif
#ifdef RNG_MT19937
typedef std::mt19937 RngEngine;
typedef uint32_t RngStateType;
static const int RngStateCount = std::mt19937::state_size;
#elif RNG_SITMO
#endif
#ifdef RNG_SITMO
typedef sitmo::prng_engine RngEngine;
typedef uint64_t RngStateType;
static const int RngStateCount = 4;
static const int RngStateCount = 13;
#endif
std::vector<RngEngine> _generators;
std::vector<std::uniform_real_distribution<RealD>> _uniform;
std::vector<std::normal_distribution<RealD>> _gaussian;
std::vector<std::discrete_distribution<int32_t>> _bernoulli;
void GetState(std::vector<RngStateType> & saved,int gen) {
std::vector<RngEngine> _generators;
std::vector<std::uniform_real_distribution<RealD> > _uniform;
std::vector<std::normal_distribution<RealD> > _gaussian;
std::vector<std::discrete_distribution<int32_t> > _bernoulli;
std::vector<std::uniform_int_distribution<uint32_t> > _uid;
///////////////////////
// support for parallel init
///////////////////////
#ifdef RNG_FAST_DISCARD
static void Skip(RngEngine &eng)
{
  /////////////////////////////////////////////////////////////////////////////////////
  // Advance the engine by 2^40 (roughly 10^12) draws in one discard() call.
  // Used to space the streams of successive lattice sites far apart within a
  // single sequence.
  //
  // Rationale for the spacing: quenched updating is unlikely to exceed about
  // 1000 sweeps per second on any machine, and HMC is far slower (at most about
  // a solve per second, tens of seconds per trajectory), so a stream should not
  // run into its neighbour in any reasonable run.
  // If more room were ever needed, Sitmo could be modified to skip in the
  // higher order words of its state.
  /////////////////////////////////////////////////////////////////////////////////////
  uint64_t stride = uint64_t(1) << 40;
  eng.discard(stride);
}
#endif
// Convenience form: derive a new engine from `eng` using throw-away scratch
// storage and a default-constructed 32-bit uniform distribution.
static RngEngine Reseed(RngEngine &eng)
{
  std::uniform_int_distribution<uint32_t> word_dist;
  std::vector<uint32_t> scratch;
  return Reseed(eng,scratch,word_dist);
}
// Derive a new engine from `eng`: draw four 32-bit words from it through
// `uid`, feed them to a std::seed_seq and construct the new engine from that.
// `newseed` is caller-provided scratch; on return it holds the four words.
static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed,
                        std::uniform_int_distribution<uint32_t> &uid)
{
  const int nwords=4;
  newseed.resize(nwords);
  for(auto &word : newseed){
    word = uid(eng);
  }
  std::seed_seq seq(newseed.begin(),newseed.end());
  return RngEngine(seq);
}
void GetState(std::vector<RngStateType> & saved,RngEngine &eng) {
saved.resize(RngStateCount);
std::stringstream ss;
ss<<_generators[gen];
ss<<eng;
ss.seekg(0,ss.beg);
for(int i=0;i<RngStateCount;i++){
ss>>saved[i];
ss>>saved[i];
}
}
void SetState(std::vector<RngStateType> & saved,int gen){
void GetState(std::vector<RngStateType> & saved,int gen) {
GetState(saved,_generators[gen]);
}
void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
assert(saved.size()==RngStateCount);
std::stringstream ss;
for(int i=0;i<RngStateCount;i++){
ss<< saved[i]<<" ";
ss<< saved[i]<<" ";
}
ss.seekg(0,ss.beg);
ss>>_generators[gen];
ss>>eng;
}
void SetState(std::vector<RngStateType> & saved,int gen){
SetState(saved,_generators[gen]);
}
// Overwrite generator number `gen` with a copy of `Eng`.
// `gen` is used directly as an index into _generators (no bounds check).
void SetEngine(RngEngine &Eng, int gen){
_generators[gen]=Eng;
}
// Copy generator number `gen` out into `Eng`.
// `gen` is used directly as an index into _generators (no bounds check).
void GetEngine(RngEngine &Eng, int gen){
Eng=_generators[gen];
}
// Replace generator number `gen` with a fresh engine constructed from `src`
// (whatever RngEngine's constructor accepts for the given source type).
template<class source> void Seed(source &src, int gen)
{
_generators[gen] = RngEngine(src);
}
};
class GridSerialRNG : public GridRNGbase {
public:
// FIXME ... do we require lockstep draws of randoms
// from all nodes keeping seeds consistent.
// place a barrier/broadcast in the fill routine
// Seed the single serial generator from one value drawn from `src`.
// The drawn value is passed through BroadcastWorld with root 0 before use, and
// the generator is then constructed from it; _seeded is set to 1 afterwards.
template<class source> void Seed(source &src)
{
typename source::result_type init = src();
CartesianCommunicator::BroadcastWorld(0,(void *)&init,sizeof(init));
_generators[0] = RngEngine(init);
_seeded=1;
}
GridSerialRNG() : GridRNGbase() {
  // The serial RNG holds exactly one generator and one distribution of each kind.
  const int nstreams = 1;

  _generators.resize(nstreams);

  _uniform.resize  (nstreams,std::uniform_real_distribution<RealD>{0,1});
  _gaussian.resize (nstreams,std::normal_distribution<RealD>(0.0,1.0) );
  _bernoulli.resize(nstreams,std::discrete_distribution<int32_t>{1,1});

  _seeded=0; // not seeded yet

  _uid.resize(nstreams,std::uniform_int_distribution<uint32_t>() );
}
template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){
typedef typename sobj::scalar_type scalar_type;
@ -183,9 +264,9 @@ namespace Grid {
dist[0].reset();
for(int idx=0;idx<words;idx++){
fillScalar(buf[idx],dist[0],_generators[0]);
fillScalar(buf[idx],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
};
@ -215,7 +296,7 @@ namespace Grid {
RealF *pointer=(RealF *)&l;
dist[0].reset();
for(int i=0;i<2*vComplexF::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
@ -223,7 +304,7 @@ namespace Grid {
RealD *pointer=(RealD *)&l;
dist[0].reset();
for(int i=0;i<2*vComplexD::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
@ -231,7 +312,7 @@ namespace Grid {
RealF *pointer=(RealF *)&l;
dist[0].reset();
for(int i=0;i<vRealF::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
@ -243,156 +324,185 @@ namespace Grid {
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
void SeedRandomDevice(void){
std::random_device rd;
Seed(rd);
}
void SeedFixedIntegers(const std::vector<int> &seeds){
fixedSeed src(seeds);
Seed(src);
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
std::seed_seq src(seeds.begin(),seeds.end());
Seed(src,0);
}
};
class GridParallelRNG : public GridRNGbase {
double _time_counter;
public:
GridBase *_grid;
int _vol;
unsigned int _vol;
int generator_idx(int os,int is){
int generator_idx(int os,int is) {
return is*_grid->oSites()+os;
}
// Construct a parallel RNG with one generator (and one distribution of each
// kind) per local site of `grid`, i.e. iSites()*oSites() of them.
// The generators are default-constructed here; seeding happens separately.
GridParallelRNG(GridBase *grid) : GridRNGbase() {
  _grid = grid;
  _vol  =_grid->iSites()*_grid->oSites();

  // fill() accumulates elapsed time into this counter with += and Report()
  // prints it, so it must start from a defined value.
  _time_counter = 0;

  _generators.resize(_vol);
  _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
  _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
  _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
  _seeded=0;
  _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
}
// This loop could be made faster, avoiding the Amdahl (serial) bottleneck, by
// i) seed generators on each timeslice, for x=y=z=0;
// ii) seed generators on each z for x=y=0
// iii)seed generators on each y,z for x=0
// iv) seed generators on each y,z,x
// made possible by physical indexing.
template<class source> void Seed(source &src)
{
std::vector<int> gcoor;
int gsites = _grid->_gsites;
typename source::result_type init = src();
RngEngine pseeder(init);
std::uniform_int_distribution<uint64_t> ui;
for(int gidx=0;gidx<gsites;gidx++){
int rank,o_idx,i_idx;
_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
int l_idx=generator_idx(o_idx,i_idx);
const int num_rand_seed=16;
std::vector<int> site_seeds(num_rand_seed);
for(int i=0;i<site_seeds.size();i++){
site_seeds[i]= ui(pseeder);
}
_grid->Broadcast(0,(void *)&site_seeds[0],sizeof(int)*site_seeds.size());
if( rank == _grid->ThisRank() ){
fixedSeed ssrc(site_seeds);
typename source::result_type sinit = ssrc();
_generators[l_idx] = RngEngine(sinit);
}
}
_seeded=1;
}
//FIXME implement generic IO and create state save/restore
//void SaveState(const std::string<char> &file);
//void LoadState(const std::string<char> &file);
template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int multiplicity = RNGfillable(_grid,l._grid);
int Nsimd =_grid->Nsimd();
int osites=_grid->oSites();
int words=sizeof(scalar_object)/sizeof(scalar_type);
double inner_time_counter = usecond();
int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid
int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l._grid too
int osites = _grid->oSites(); // guaranteed to be <= l._grid->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type);
PARALLEL_FOR_LOOP
for(int ss=0;ss<osites;ss++){
parallel_for(int ss=0;ss<osites;ss++){
std::vector<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
std::vector<scalar_object> buf(Nsimd);
for(int m=0;m<multiplicity;m++) {// Draw from same generator multiplicity times
int sm = multiplicity * ss + m; // Maps the generator site to the fine site
int sm=multiplicity*ss+m; // Maps the generator site to the fine site
for(int si=0;si<Nsimd;si++){
int gdx = generator_idx(ss,si); // index of generator state
scalar_type *pointer = (scalar_type *)&buf[si];
dist[gdx].reset();
for(int idx=0;idx<words;idx++){
fillScalar(pointer[idx],dist[gdx],_generators[gdx]);
}
}
// merge into SIMD lanes
merge(l._odata[sm],buf);
}
for (int si = 0; si < Nsimd; si++) {
int gdx = generator_idx(ss, si); // index of generator state
scalar_type *pointer = (scalar_type *)&buf[si];
dist[gdx].reset();
for (int idx = 0; idx < words; idx++)
fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
}
// merge into SIMD lanes, FIXME suboptimal implementation
merge(l._odata[sm], buf);
}
}
_time_counter += usecond()- inner_time_counter;
};
void SeedRandomDevice(void){
std::random_device rd;
Seed(rd);
}
void SeedFixedIntegers(const std::vector<int> &seeds){
fixedSeed src(seeds);
Seed(src);
// Everyone generates the same seed_seq based on input seeds
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
std::seed_seq source(seeds.begin(),seeds.end());
RngEngine master_engine(source);
#ifdef RNG_FAST_DISCARD
////////////////////////////////////////////////
// Skip ahead through a single stream.
// Applicable to SITMO and other hash-based/crypto RNGs
// Should be applicable to Mersenne Twister, but the C++11
// MT implementation does not implement fast discard even though
// in principle this is possible
////////////////////////////////////////////////
std::vector<int> gcoor;
int rank,o_idx,i_idx;
// Everybody loops over global volume.
for(int gidx=0;gidx<_grid->_gsites;gidx++){
Skip(master_engine); // Skip to next RNG sequence
// Where is it?
_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
// If this is one of mine we take it
if( rank == _grid->ThisRank() ){
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
}
}
#else
////////////////////////////////////////////////////////////////
// Machine and thread decomposition dependent seeding is efficient
// and maximally parallel; but NOT reproducible from machine to machine.
// Not ideal, but fastest way to reseed all nodes.
////////////////////////////////////////////////////////////////
{
// Obtain one Reseed per processor
int Nproc = _grid->ProcessorCount();
std::vector<RngEngine> seeders(Nproc);
int me= _grid->ThisRank();
for(int p=0;p<Nproc;p++){
seeders[p] = Reseed(master_engine);
}
master_engine = seeders[me];
}
{
// Obtain one reseeded generator per thread
int Nthread = GridThread::GetThreads();
std::vector<RngEngine> seeders(Nthread);
for(int t=0;t<Nthread;t++){
seeders[t] = Reseed(master_engine);
}
parallel_for(int t=0;t<Nthread;t++) {
// set up one per local site in threaded fashion
std::vector<uint32_t> newseeds;
std::uniform_int_distribution<uint32_t> uid;
for(int l=0;l<_grid->lSites();l++) {
if ( (l%Nthread)==t ) {
_generators[l] = Reseed(seeders[t],newseeds,uid);
}
}
}
}
#endif
}
// Print the time accumulated in fill(); the counter is kept in microseconds
// (it is built from usecond() differences) and shown here in milliseconds.
void Report(){
  const double elapsed_ms = _time_counter/1e3;
  std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< elapsed_ms << " ms" << std::endl;
}
////////////////////////////////////////////////////////////////////////
// Support for rigorous test of RNG's
// Return uniform random uint32_t from requested site generator
////////////////////////////////////////////////////////////////////////
uint32_t GlobalU01(int gsite){
  // Locate the generator for global site `gsite`: which rank holds it and
  // where it sits in that rank's local (outer,inner) indexing.
  std::vector<int> gcoor;
  int owner,o_idx,i_idx;
  _grid->GlobalIndexToGlobalCoor(gsite,gcoor);
  _grid->GlobalCoorToRankIndex(owner,o_idx,i_idx,gcoor);

  // Only the owning rank draws; every other rank receives the value through
  // the broadcast below.
  uint32_t draw;
  if( owner == _grid->ThisRank() ){
    const int l_idx=generator_idx(o_idx,i_idx);
    draw = _uid[l_idx](_generators[l_idx]);
  }

  // Broadcast from the owning rank and hand the same number back everywhere.
  _grid->Broadcast(owner,(void *)&draw,sizeof(draw));
  return draw;
}
};
template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l){
rng.fill(l,rng._uniform);
}
template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._uniform); }
template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l){
rng.fill(l,rng._gaussian);
}
template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){
rng.fill(l,rng._bernoulli);
}
template <class sobj> inline void random(GridSerialRNG &rng,sobj &l){
rng.fill(l,rng._uniform);
}
template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l){
rng.fill(l,rng._gaussian);
}
template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){
rng.fill(l,rng._bernoulli);
}
template <class sobj> inline void random(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._uniform ); }
template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
}
#endif

View File

@ -42,8 +42,7 @@ namespace Grid {
-> Lattice<decltype(trace(lhs._odata[0]))>
{
Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = trace(lhs._odata[ss]);
}
return ret;
@ -56,8 +55,7 @@ PARALLEL_FOR_LOOP
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
{
Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]);
}
return ret;

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -51,7 +51,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
half.checkerboard = cb;
int ssh=0;
//PARALLEL_FOR_LOOP
//parallel_for
for(int ss=0;ss<full._grid->oSites();ss++){
std::vector<int> coor;
int cbos;
@ -68,7 +68,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
int cb = half.checkerboard;
int ssh=0;
//PARALLEL_FOR_LOOP
//parallel_for
for(int ss=0;ss<full._grid->oSites();ss++){
std::vector<int> coor;
int cbos;
@ -153,8 +153,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
}
PARALLEL_FOR_LOOP
for(int sf=0;sf<fine->oSites();sf++){
parallel_for(int sf=0;sf<fine->oSites();sf++){
int sc;
std::vector<int> coor_c(_ndimension);
@ -186,8 +185,7 @@ template<class vobj,class CComplex>
fine_inner = localInnerProduct(fineX,fineY);
blockSum(coarse_inner,fine_inner);
PARALLEL_FOR_LOOP
for(int ss=0;ss<coarse->oSites();ss++){
parallel_for(int ss=0;ss<coarse->oSites();ss++){
CoarseInner._odata[ss] = coarse_inner._odata[ss];
}
}
@ -333,9 +331,6 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
typedef typename vobj::scalar_object sobj;
typedef typename vvobj::scalar_object ssobj;
sobj s;
ssobj ss;
GridBase *ig = in._grid;
GridBase *og = out._grid;
@ -347,10 +342,13 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
for(int d=0;d<no;d++){
assert(ig->_processors[d] == og->_processors[d]);
assert(ig->_ldimensions[d] == og->_ldimensions[d]);
assert(ig->lSites() == og->lSites());
}
//PARALLEL_FOR_LOOP
for(int idx=0;idx<ig->lSites();idx++){
parallel_for(int idx=0;idx<ig->lSites();idx++){
sobj s;
ssobj ss;
std::vector<int> lcoor(ni);
ig->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(s,in,lcoor);
@ -361,10 +359,9 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
template<class vobj>
void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
{
typedef typename vobj::scalar_object sobj;
sobj s;
GridBase *lg = lowDim._grid;
GridBase *hg = higherDim._grid;
@ -386,17 +383,16 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int
}
// the above should guarantee that the operations are local
// Guido: check the threading here
//PARALLEL_FOR_LOOP
for(int idx=0;idx<lg->lSites();idx++){
parallel_for(int idx=0;idx<lg->lSites();idx++){
sobj s;
std::vector<int> lcoor(nl);
std::vector<int> hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
dl=0;
int ddl=0;
hcoor[orthog] = slice;
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
hcoor[d]=lcoor[dl++];
hcoor[d]=lcoor[ddl++];
}
}
peekLocalSite(s,lowDim,lcoor);
@ -405,10 +401,9 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int
}
template<class vobj>
void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, int orthog)
void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice, int orthog)
{
typedef typename vobj::scalar_object sobj;
sobj s;
GridBase *lg = lowDim._grid;
GridBase *hg = higherDim._grid;
@ -429,16 +424,16 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
}
}
// the above should guarantee that the operations are local
//PARALLEL_FOR_LOOP
for(int idx=0;idx<lg->lSites();idx++){
parallel_for(int idx=0;idx<lg->lSites();idx++){
sobj s;
std::vector<int> lcoor(nl);
std::vector<int> hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
dl=0;
int ddl=0;
hcoor[orthog] = slice;
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
hcoor[d]=lcoor[dl++];
hcoor[d]=lcoor[ddl++];
}
}
peekLocalSite(s,higherDim,hcoor);
@ -449,10 +444,9 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
template<class vobj>
void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{
typedef typename vobj::scalar_object sobj;
sobj s;
GridBase *lg = lowDim._grid;
GridBase *hg = higherDim._grid;
@ -469,8 +463,8 @@ void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice
}
// the above should guarantee that the operations are local
//PARALLEL_FOR_LOOP
for(int idx=0;idx<lg->lSites();idx++){
parallel_for(int idx=0;idx<lg->lSites();idx++){
sobj s;
std::vector<int> lcoor(nl);
std::vector<int> hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
@ -488,7 +482,6 @@ template<class vobj>
void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{
typedef typename vobj::scalar_object sobj;
sobj s;
GridBase *lg = lowDim._grid;
GridBase *hg = higherDim._grid;
@ -505,8 +498,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic
}
// the above should guarantee that the operations are local
//PARALLEL_FOR_LOOP
for(int idx=0;idx<lg->lSites();idx++){
parallel_for(int idx=0;idx<lg->lSites();idx++){
sobj s;
std::vector<int> lcoor(nl);
std::vector<int> hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
@ -574,8 +567,7 @@ typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>
in_grid->iCoorFromIindex(in_icoor[lane], lane);
}
PARALLEL_FOR_LOOP
for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
//Assemble vector of pointers to output elements
std::vector<sobj*> out_ptrs(in_nsimd);
@ -623,8 +615,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
unvectorizeToLexOrdArray(in_slex_conv, in);
PARALLEL_FOR_LOOP
for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
parallel_for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
std::vector<int> out_ocoor(ndim);
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
@ -642,10 +633,6 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
merge(out._odata[out_oidx], ptrs, 0);
}
}
}
#endif

View File

@ -40,27 +40,24 @@ namespace Grid {
////////////////////////////////////////////////////////////////////////////////////////////////////
// Lattice transpose: applies transpose() to each of the oSites() entries of
// lhs._odata and returns the results in a new Lattice built on lhs._grid.
// The loop uses the parallel_for form only; the older
// PARALLEL_FOR_LOOP + for variant of the same body is dropped so the
// function has a single, well-formed definition.
template<class vobj>
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
  Lattice<vobj> ret(lhs._grid);
  parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
    ret._odata[ss] = transpose(lhs._odata[ss]);
  }
  return ret;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Index level dependent transpose
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj>
////////////////////////////////////////////////////////////////////////////////////////////////////
// Index level dependent transpose
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj>
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
{
Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
PARALLEL_FOR_LOOP
for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
}
return ret;
};
{
Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
}
return ret;
};
}
#endif

View File

@ -37,8 +37,7 @@ namespace Grid {
Lattice<obj> ret(rhs._grid);
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
ret._odata[ss]=pow(rhs._odata[ss],y);
}
return ret;
@ -47,8 +46,7 @@ PARALLEL_FOR_LOOP
Lattice<obj> ret(rhs._grid);
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
ret._odata[ss]=mod(rhs._odata[ss],y);
}
return ret;
@ -58,8 +56,7 @@ PARALLEL_FOR_LOOP
Lattice<obj> ret(rhs._grid);
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
ret._odata[ss]=div(rhs._odata[ss],y);
}
return ret;
@ -69,8 +66,7 @@ PARALLEL_FOR_LOOP
Lattice<obj> ret(rhs._grid);
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
PARALLEL_FOR_LOOP
for(int ss=0;ss<rhs._grid->oSites();ss++){
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp);
}
return ret;

View File

@ -56,8 +56,7 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
std::vector<scalar_object> truevals (Nsimd);
std::vector<scalar_object> falsevals(Nsimd);
PARALLEL_FOR_LOOP
for(int ss=0;ss<iftrue._grid->oSites(); ss++){
parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){
extract(iftrue._odata[ss] ,truevals);
extract(iffalse._odata[ss] ,falsevals);

Some files were not shown because too many files have changed in this diff Show More