mirror of https://github.com/paboyle/Grid.git synced 2025-07-04 15:37:05 +01:00

Compare commits


354 Commits

Author SHA1 Message Date
27ea2afe86 No compile on comms == none fix 2017-10-30 01:14:11 +00:00
78e8704eac Shaking out 2017-10-30 00:25:31 +00:00
67131d82f2 Get subrank info from communicator constructor 2017-10-30 00:24:11 +00:00
615a9448b9 Extended sub comm supported 2017-10-30 00:23:34 +00:00
00164f5ce5 : 2017-10-30 00:22:52 +00:00
a7f72eb994 Shaking out 2017-10-30 00:22:06 +00:00
501fa1614a Communicator updates for split grid 2017-10-30 00:16:12 +00:00
5bf42e1e15 Update 2017-10-30 00:05:21 +00:00
fe4d9b003c More digits 2017-10-30 00:04:47 +00:00
4a699b4da3 New rank can be found out 2017-10-30 00:04:14 +00:00
689323f4ee Reverse dim ordering lexico support 2017-10-30 00:03:15 +00:00
84b441800f Merge branch 'develop' into feature/lanczos-reorg 2017-10-27 14:21:38 +01:00
1ef424b139 Split grid Y2K bug fix attempt 2017-10-27 14:20:35 +01:00
aa66f41c69 Bug fix in the coarse restore...
Think this is nearly there
2017-10-27 10:29:34 +01:00
f96c800d25 Passes reload of coarse basis 2017-10-27 09:43:22 +01:00
32a52d7583 Move the local coherence lanczos into algorithms.
Keep the I/O in the tester. Other people can copy this method to write other I/O formats.
2017-10-27 09:04:31 +01:00
fa04b6d3c2 Finished ? Verifying coarse evec restore 2017-10-27 08:18:29 +01:00
7fab183c0e Better read test 2017-10-27 08:17:49 +01:00
9ec9850bdb 64bit ftello update 2017-10-26 23:34:31 +01:00
0c4ddaea0b Cleaning up 2017-10-26 23:31:46 +01:00
00ebc150ad Mistake in string parse; interface is ambiguous and must fix. Is char * a file, or a XML buffer ? 2017-10-26 23:30:37 +01:00
0f3e9ae57d Gsites error. Only appeared (so far) in I/O code for even odd fields 2017-10-26 23:29:59 +01:00
034de160bf Staggered updates : Schur fixed and added a unit test for Test_staggered_cg_schur.cc giving stronger check 2017-10-26 20:58:46 +01:00
14507fd6e4 Final? candidate for push back on the lanczos reorg feature 2017-10-26 16:25:01 +01:00
2db05ac214 Test for split/unsplit in isolation 2017-10-26 07:48:03 +01:00
31f99574fa Moving these out of algorithms 2017-10-26 07:47:42 +01:00
a34c8a2961 Update to IRL; getting close to the structure I would like. 2017-10-26 07:45:56 +01:00
ccd20df827 Better IRL interface 2017-10-26 01:59:59 +01:00
e9be293444 Better messaging 2017-10-26 01:59:30 +01:00
d577211cc3 Relax stopping condition 2017-10-25 23:57:54 +01:00
f4336e480a Faster converge time 2017-10-25 23:53:44 +01:00
e4d461cb03 Messaging 2017-10-25 23:53:19 +01:00
3d63b4894e Use existing functionality where possible 2017-10-25 23:52:47 +01:00
08583afaff Red black friendly coarsening 2017-10-25 23:51:18 +01:00
b395a312af Better error messaging 2017-10-25 23:50:37 +01:00
66295b99aa Bit less verbose SciDAC IO 2017-10-25 23:50:05 +01:00
b8654be0ef 64 bit safe offsets 2017-10-25 23:49:23 +01:00
a479325349 Rewrite of local coherence lanczos 2017-10-25 23:48:47 +01:00
f6c3f6bf2d XML serialisation of parms and initialise from parms object 2017-10-25 23:47:59 +01:00
d83868fdbb Identity linear op added -- useful in circumstances where a linear op may or may not be needed.
Supply a trivial one if not needed
2017-10-25 23:47:10 +01:00
303e0b927d Improvements for coarse grid compressed lanczos 2017-10-25 23:46:33 +01:00
28ba8a0f48 Force spacing more nicely 2017-10-25 23:45:57 +01:00
f9e28577f3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-10-25 21:07:56 +01:00
8a3aae98f6 Solving minor bug in compilation 2017-10-25 10:34:49 +01:00
8309f2364b Solving again the MPI comm bug with FFTs 2017-10-25 10:24:14 +01:00
cac1750078 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-10-24 23:30:36 +01:00
27936900e6 Putting the FG verbosity in the Integrator level 2017-10-18 13:08:09 +01:00
e325929851 All codes compile against the new Lanczos call signature 2017-10-13 14:02:43 +01:00
47af3565f4 Logging improvement; reunified the Lanczos codes 2017-10-13 13:23:07 +01:00
4b4d187935 Reunified the Lanczos implementations 2017-10-13 13:22:44 +01:00
9aff354ab5 Final version prior to reunification 2017-10-13 13:22:26 +01:00
cb9ff20249 Approx tests and lanczos improvement 2017-10-13 11:30:50 +01:00
9fe6ac71ea Starting reorg of Blocked lanczos 2017-10-11 10:12:07 +01:00
f1fa00b71b Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-10-10 14:26:44 +01:00
bf58557fb1 Block compressed Lanczos 2017-10-10 14:15:11 +01:00
10cb37f504 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-10-10 14:09:44 +01:00
1374c943d4 Correct Schur operator called 2017-10-10 13:59:50 +01:00
a1d80282ec cb factorise 2017-10-10 13:49:31 +01:00
4eb8bbbebe Christoph mods 2017-10-10 13:48:51 +01:00
d1c6288c5f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-10-10 13:38:40 +01:00
dd949bc428 Merge branch 'feature/staggering' into develop 2017-10-10 13:02:51 +01:00
bb7378cfc3 Schur for staggered 2017-10-10 12:02:18 +01:00
f0e084a88c Schur staggered 2017-10-10 10:00:43 +01:00
153672d8ec Split CG testing 2017-10-09 23:20:58 +01:00
08ca338875 Split grid communication 2017-10-09 23:19:45 +01:00
f7cbf82c04 Better stdout/err debug 2017-10-09 23:18:48 +01:00
07009c569a Comms splitting improvements 2017-10-09 23:16:51 +01:00
09f4cdb11e Merge branch 'develop' of https://github.com/paboyle/Grid into feature/staggering 2017-10-04 10:51:16 +01:00
1e54882f71 Stagger 2017-10-04 10:51:06 +01:00
d54807b8c0 MPIT works with split grid now 2017-10-02 23:14:56 +01:00
5625b47c7d Merge branch 'feature/dwf-multirhs' into develop 2017-10-02 12:42:32 +01:00
1edcf902b7 Macos ANON 2017-10-02 12:41:02 +01:00
e5c19e1fd7 RB constructor change 2017-10-02 12:25:52 +01:00
a11d0a33d1 Merge branch 'feature/dwf-multirhs' of https://github.com/paboyle/Grid into feature/dwf-multirhs 2017-10-02 11:42:07 +01:00
4f8b6f26b4 Merge branch 'develop' into feature/dwf-multirhs 2017-10-02 11:41:49 +01:00
073525c5b3 Small patch from cori 2017-10-02 03:38:21 -07:00
eb6153080a Merge branch 'develop' of https://github.com/paboyle/Grid into feature/staggering 2017-10-02 08:56:33 +01:00
f7072d1ac2 Solving an annoying compilation error in json 2017-10-02 07:13:40 +01:00
fddeb29d6b Bug fix with spreadout FFT 2017-09-21 11:10:08 +01:00
a9ec5cf564 Christoph bug report integrate 2017-09-21 10:32:41 +01:00
946a8671b9 Merge pull request #129 from djm2131/feature/eofa
Add support for DWF with the exact one flavor algorithm
2017-09-21 10:15:21 +01:00
a6eeea777b Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-09-21 10:12:41 +01:00
771a1b8e79 Merge pull request #128 from paboyle/feature/CG-reliable-update
Feature/cg reliable update
2017-09-21 10:12:03 +01:00
bfb68e6f02 Merge pull request #130 from giltirn/gparity-handunroll
Gparity handunroll
2017-09-21 10:11:00 +01:00
77f7737ccc Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-09-19 14:28:01 +01:00
18c335198a Merge branch 'hotfix/dirac-ITT-fix1' into develop 2017-09-16 18:19:02 +01:00
f9df685cde Merge branch 'hotfix/dirac-ITT-fix1' 2017-09-16 18:18:48 +01:00
17c5b0f152 Patching comparison point 2017-09-16 18:18:07 +01:00
5918769f97 Subtle Naik term bug updated in Stencil; less on logical && with a function call on right 2017-09-16 12:51:26 +01:00
bbaf1ada91 Merge branch 'feature/json-fix' into develop 2017-09-08 16:02:08 +01:00
1950ac9294 Fixed the Intel compiler problem with the JSON classes 2017-09-08 15:18:59 +01:00
13fa70ac1a Merge branch 'develop' into feature/json-fix 2017-09-08 13:42:20 +01:00
7cb2b11f26 Fixing Intel compiler error for the JSON parser 2017-09-08 13:41:53 +01:00
1184ed29ae Merge pull request #124 from nmeyer-ur/feature/arm-neon
Added integer reduce functionality
2017-09-08 10:54:35 +02:00
203c7bf6fa Merge branch 'hotfix/dirac-ITT-fix' into develop 2017-09-05 15:08:51 +01:00
c709883f3f Merge branch 'hotfix/dirac-ITT-fix' 2017-09-05 15:08:16 +01:00
aed5de4d50 Patching macos compile 2017-09-05 15:07:07 +01:00
ba27cc6571 Mac os happiness 2017-09-05 15:00:16 +01:00
d856327250 Merge branch 'release/dirac-ITT' into develop 2017-09-05 14:56:12 +01:00
d75369cb56 Merge branch 'release/dirac-ITT' 2017-09-05 14:55:54 +01:00
bf973d0d56 SHM complete 2017-09-05 14:30:29 +01:00
837bf8a5be Updating to control the SHM allocation scheme under configure time options 2017-09-05 12:51:02 +01:00
c05b2199f6 Improvements to huge memory 2017-09-04 10:41:21 -04:00
a5fe07c077 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-09-04 14:10:15 +01:00
b83b2b1415 Stability improvement to BCG. Force m_rr hermitian beyond rounding. 2017-09-04 14:09:47 +01:00
b331be9101 Better reporting 2017-08-31 11:32:57 +01:00
49c20a9fa8 Patch to reporting 2017-08-31 11:32:21 +01:00
7359df3501 Full reporting for benchmark; save robustness factor 2017-08-31 10:42:35 +01:00
59bd1fe21b Fix for 'perm' and 'local' not being set for hand-unrolled external-site Dslash, which caused incorrect behavior of G-parity kernel 2017-08-29 13:07:37 -07:00
4e907fef2c Merge remote-tracking branch 'grid/develop' into feature/arm-neon 2017-08-29 17:47:36 +02:00
67888b657f Merge branch 'gparity-handunroll' of https://github.com/giltirn/Grid into gparity-handunroll 2017-08-29 09:52:05 -04:00
74af885d4e Removed some no-longer-needed associated with G-parity hand unrolled kernel 2017-08-29 09:50:37 -04:00
d36d2fb40d Added ability to override default Ls in Benchmark_dwf 2017-08-28 06:53:56 -07:00
5b9267e88d Cleaner comms benchmark treatment for one node runs 2017-08-27 18:24:48 -04:00
15fd4003ef Improving presentation of results 2017-08-27 13:46:02 +01:00
4b4c2a715b fcntl.h needed 2017-08-26 11:38:04 +01:00
54a5e6c1d0 Check if we get huge pages on linux. Larry Meadows piece of magic. 2017-08-25 22:36:08 +01:00
73aeca7dea Merge branch 'feature/multi-communicator' into develop 2017-08-25 21:55:09 +01:00
ad89abb018 Fix 2017-08-25 20:43:37 +01:00
80c5bce5bb Merge branch 'develop' into feature/multi-communicator 2017-08-25 20:21:26 +01:00
f68b5de9c8 No compile fix on Clang 2017-08-25 19:35:21 +01:00
d0f3d525d5 Optimal block size for KNL 2017-08-25 19:33:54 +01:00
f365a83fae In G-parity unrolled kernel, replaced calls to permute and exchange with run-time-evaluated permute type with explicit calls to appropriate underlying functions 2017-08-25 14:24:11 -04:00
3a58217405 Updated 2017-08-25 14:29:53 +01:00
c289699d9a updated from cambridge mpi3 shakeout 2017-08-25 11:41:01 +01:00
c3b1263e75 Benchmark prep 2017-08-25 09:25:54 +01:00
34a9aeb331 Reduced number of if-statement evaluations in G-parity unrolled kernel 2017-08-24 13:53:50 -07:00
102ea9ae66 CI update 2017-08-24 18:17:09 +01:00
5fa386ddc9 FFT test compile fixed 2017-08-24 10:17:52 +01:00
edabb3577f Imported Benchmark_gparity 2017-08-23 16:54:06 -04:00
ce5df177ee Removed superfluous implementation of G-parity twist for hand-unrolled kernel from GparityWilsonImpl 2017-08-23 15:05:22 -04:00
a0bb8e5b46 Added hand-unrolled kernel implementations of all the other dslash precision / comms precision combinations with G-parity 2017-08-23 14:44:40 -04:00
46f88e6d72 G-parity hand-unrolled intrinsics twist now uses one less permute and one less temporary 2017-08-23 13:21:10 -04:00
dd8f1ea189 Vectorized Mobius EOFA Dperp + shift operation 2017-08-23 13:17:26 -04:00
b61835c1a5 Added inplace version of intrinsic G-parity twist to hand-unrolled kernel 2017-08-23 12:33:48 -04:00
d9cd4f0273 Staggered multinode block cg debugged. Missing global sum.
Code stalls and resumes on KNL at cambridge. Curious.

CG iterations 23ms each, then 3200 ms pauses. Mean bandwidth reports
as 200MB/s. Comms dominant in the report. However, the time behaviour suggests it
is *bursty*.... Could be swap to disk?
2017-08-23 15:07:18 +01:00
459f70e8d4 Check-in of working Mobius EOFA class and tests 2017-08-22 22:38:30 -04:00
061e48fd73 Replaced slow unpack-repack in G-parity BC twist with intrinsics version 2017-08-22 18:12:12 -04:00
ab50145001 Implemented first, unoptimized version of hand-unrolled G-parity kernels
Improved Test_gparity
2017-08-22 17:12:25 -04:00
b49bec0cec MAP_HUGETLB portability fix 2017-08-20 03:08:54 +01:00
ae56e556c6 finalise issue on new OPA revert 2017-08-20 02:53:12 +01:00
1cdf999668 Moving multicommunicator into mpi3 also for threading 2017-08-20 02:39:10 +01:00
11062fb686 Comms none fail fix 2017-08-20 01:37:07 +01:00
383ca7d392 Switch off comms for now until feature/multi-communicator is merged 2017-08-20 01:27:48 +01:00
a446d95c33 Trying to pass TeamCity and Travis 2017-08-20 01:10:50 +01:00
be66e7dd95 Merge branch 'develop' into feature/multi-communicator 2017-08-19 23:12:38 +01:00
6d0d064a6c Update TODO 2017-08-19 23:11:30 +01:00
bfef525ed2 New benchmark prep 2017-08-19 23:10:12 +01:00
0b0cf62193 Fix mpi 3 interface change 2017-08-19 13:18:50 -04:00
7d88198387 Merge branch 'develop' into feature/multi-communicator 2017-08-19 13:03:35 -04:00
2f619482b8 Enable blocking stencil send 2017-08-19 12:53:59 -04:00
d6472eda8d Use mmap 2017-08-19 12:53:18 -04:00
9e658de238 Use Vector 2017-08-19 12:52:44 -04:00
bcefdd7c4e Align both allocator calls to 2MB 2017-08-19 12:49:02 -04:00
9d45fca8bc Implement MobiusEOFAFermioncache.cc 2017-08-17 23:45:36 -04:00
ac9e6b63c0 More re-import of Mobius EOFA 2017-08-17 19:28:53 -04:00
e140b3f802 Beginning to re-import Mobius EOFA 2017-08-16 23:36:23 -04:00
d9d3d30cc7 Minor clean-up 2017-08-16 20:57:51 -04:00
47a12ec7b5 Implement EOFA pseudofermion force and Shamir tests for G-parity and non G-parity cases 2017-08-16 19:50:08 -04:00
ec1e2f7a40 Add (mostly implemented) ExactOneFlavourRatio pseudofermion class and tests of Shamir heatbath and action 2017-08-16 12:38:59 -04:00
41f73ec083 Add ChronoForecast class for forecasting solutions across poles in the EOFA heatbath 2017-08-16 12:37:38 -04:00
fd367d8bfd Debugging the PointerCache 2017-08-16 09:42:57 +01:00
6d0786ff9d Typo fixes and check-in of G-parity action test for DWF 2017-08-15 22:47:00 -04:00
b7f93aeb4d Change CayleyFermion5D::SetCoefficientsInternal to virtual to allow overriding in derived EOFA classes 2017-08-15 14:18:51 -04:00
202a7fe900 Re-import DWF and abstract base EOFA fermion classes and tests 2017-08-15 13:36:08 -04:00
8a3fe60a27 Added more asserts at grid creation time 2017-08-08 11:36:20 +01:00
44051aecd1 Checking for integer divisions in cartesian full 2017-08-08 10:31:12 +01:00
06e6f8de00 Check that the reduced dim is an integer 2017-08-08 10:22:12 +01:00
dbe4d7850c Make a test file compatible with all architectures 2017-08-06 10:49:45 +01:00
4fe182e5a7 Added high level HMC support for overriding default SIMD lane decomposition 2017-08-06 10:46:19 +01:00
175f393f9d Binary IO error checking 2017-08-04 12:14:10 +01:00
7d867a8134 Merge branch 'develop' into feature/CG-reliable-update 2017-08-02 09:48:04 -04:00
9939b267d2 Added switching to fallback linear operator in reliable update CG, and added recalculation of b parameter on update. 2017-07-31 13:39:44 -04:00
14d53e1c9e Threaded MPI calls patches 2017-07-29 13:08:10 -04:00
8bd869da37 Correcting a bug in the IO routines 2017-07-27 15:12:50 +01:00
c7036f6717 Adding checks for libm and libstdc++ 2017-07-27 11:15:09 +01:00
c0485d799d Explicit parameter declaration in the WilsonGauge test 2017-07-26 16:26:04 +01:00
7abc5613bd Added smearing to the topological charge observable 2017-07-26 16:21:17 +01:00
237cfd11ab Solving the spurious O2 flags 2017-07-26 12:08:51 +01:00
a4b7dddb67 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-07-26 12:07:38 +01:00
5696781862 Debug error in Tensor mult 2017-07-26 12:07:34 +01:00
8f4b3049cd Merge branch 'feature/CG-reliable-update' into ckelly_develop 2017-07-25 11:55:26 -04:00
2a6e673a91 Merge branch 'develop' into feature/CG-reliable-update 2017-07-25 11:54:43 -04:00
9b6cde173f Merge branch 'feature/CG-reliable-update' into ckelly_develop 2017-07-25 11:51:08 -04:00
9f280b82c4 Added mixed-precision CG with reliable updates 2017-07-25 11:30:41 -04:00
c3f0889eda Merge pull request #123 from giltirn/develop
Fix for 'using namespace' in lib/qcd/utils/GaugeFix.h
2017-07-25 11:32:02 -03:00
7a53dc3715 Added integer reduce functionality 2017-07-24 11:12:59 +02:00
0f214ad427 Moved FourierAcceleratedGaugeFixer into Grid::QCD namespace and removed 'using namespace' directives from header 2017-07-21 11:13:51 -04:00
fe4912880d Update README.md 2017-07-17 09:53:07 +01:00
f038c6babe Update README.md 2017-07-14 22:59:16 +01:00
169f4b2711 Update README.md 2017-07-14 22:56:06 +01:00
2d8aff36fe Update README.md 2017-07-14 22:52:16 +01:00
9fa07eecde Merge branch 'develop' into feature/json-fix 2017-07-12 15:47:22 +01:00
659d7d1a40 For test/solver
Fixed
2017-07-12 15:01:48 +01:00
f64fb7bd77 Fix gcc error on JSON compilation 2017-07-12 14:55:42 +01:00
2a35449b91 Merge branch 'develop' into feature/json-fix 2017-07-12 14:47:00 +01:00
184af5bd05 Added support for std::pair in the JSON serialiser 2017-07-12 14:44:53 +01:00
097c9637ee Fixed the JSON parsing error 2017-07-11 14:31:57 +01:00
dc6f078246 fixed the header file for mpi3 2017-07-11 14:15:08 +01:00
8a4714a4a6 Update README.md 2017-07-09 00:11:54 +01:00
40e119c61c NUMA improvements worth preserving from AMD EPYC tests 2017-07-08 22:27:11 -04:00
d9593c4b81 Merge branch 'develop' into feature/json-fix 2017-07-07 14:17:50 +01:00
ac740f73ce Works on Cori 2017-07-02 16:47:58 -07:00
75dc7794b9 Working on Cori 2017-07-02 16:47:42 -07:00
dee68fc728 IO working multiple nodes again. Strategy of all nodes writing metadata is unsafe.
Only one rank should do this. must identify this rank. Means pass communicator to the
Objects.
2017-07-02 23:33:48 +01:00
a2d3643634 Merge branch 'feature/dwf-multirhs' of https://github.com/paboyle/Grid into feature/dwf-multirhs 2017-07-02 14:59:22 -07:00
57002924bc NERSC shakeout of this 2017-07-02 14:58:30 -07:00
7b0237b081 Update README.md 2017-07-01 10:24:41 +01:00
b68ad0cc0b Update README.md 2017-07-01 10:20:07 +01:00
37263fd9b1 Update README.md 2017-07-01 10:06:24 +01:00
3d09e3e9e0 Update README.md 2017-07-01 10:05:46 +01:00
1354b46338 Update README.md 2017-07-01 10:04:32 +01:00
251a97fe1b Update README.md 2017-07-01 09:55:36 +01:00
e18929eaa0 Update README.md 2017-07-01 09:53:15 +01:00
f3b0a92e71 Update README.md 2017-07-01 09:48:00 +01:00
a0be3f7330 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-06-30 10:53:50 +01:00
b5a6e4f1fd Best option for Xeon cache blocking set 2017-06-30 10:53:22 +01:00
7a788db3dc Guard first touch 2017-06-30 10:49:08 +01:00
f20eceb6cd First touch once per page in a threaded loop 2017-06-30 10:48:27 +01:00
38325ebbc6 Interleave code path; not enabled 2017-06-30 10:23:51 +01:00
b73bd151bb Switch off counters by default 2017-06-30 10:16:35 +01:00
694b305cab Update to reporting 2017-06-30 10:16:13 +01:00
2d3737a133 O3, KNL 2017-06-30 10:15:59 +01:00
ac1f1838bc KNL only 2017-06-30 10:15:32 +01:00
09d09d0fe5 Update README.md 2017-06-29 11:48:11 +01:00
bf630a6821 README file update 2017-06-29 11:42:25 +01:00
8859a151cc Small corrections to the NEON port 2017-06-29 11:30:29 +01:00
688a39cfd9 Merge pull request #114 from nmeyer-ur/feature/arm-neon
ARM neon intrinsics support
Guido: checked and approved
2017-06-29 09:57:17 +01:00
6f5a5cd9b3 Improved threaded comms benchmark 2017-06-28 23:27:02 +01:00
0933aeefd4 corrected Grid_neon.h 2017-06-28 20:22:22 +02:00
322f61acee Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-06-28 15:30:35 +01:00
08e04b9676 Better benchmarks 2017-06-28 15:30:06 +01:00
feaa2ac947 Merge branch 'feature/scalar-hmc-update' into develop 2017-06-28 12:46:18 +01:00
07de925127 minor scalar action fixes 2017-06-28 12:45:44 +01:00
a9c816a268 moved file to correct folder 2017-06-27 21:39:15 +02:00
e43a8b6b8a removed comments 2017-06-27 20:58:48 +02:00
bf729766dd removed collision with QPX implementation 2017-06-27 20:32:24 +02:00
dafb351d38 Merge pull request #120 from paboyle/feature/scalar-hmc-update
Scalar HMC update. 
I agree with the changes.
2017-06-27 16:23:14 +01:00
0b707b861c Merge branch 'develop' into feature/scalar-hmc-update 2017-06-27 14:40:05 +01:00
15e87a4607 HDF5 IO fix 2017-06-27 14:39:27 +01:00
7d7220cbd7 scalar: lambda/4! convention 2017-06-27 14:38:45 +01:00
54e94360ad Experimental: Multiple communicators to see if we can avoid thread locks in --enable-comms=mpit 2017-06-24 23:10:24 +01:00
0af740dc15 minor scalar HMC code improvement 2017-06-24 23:04:05 +01:00
d2e8372df3 SU(N) algebra fix (was not working) 2017-06-24 23:03:39 +01:00
869b99ec1e Threaded calls to multiple communicators 2017-06-24 10:55:54 +01:00
4a29ab0d0a Merge branch 'feature/dwf-multirhs' of https://github.com/paboyle/Grid into feature/dwf-multirhs 2017-06-23 23:10:43 +01:00
0165bcb58e Added an update to TODO list 2017-06-23 23:10:24 +01:00
4372d04ad4 Merge pull request #118 from Lanny91/hotfix/bgq
Hotfix/bgq
2017-06-23 16:59:27 +01:00
349d75e483 Precision fix 2017-06-23 02:57:59 -07:00
56abbdf4c2 AVX512 integer reduce fix (for non-intel compiler) 2017-06-23 11:09:14 +02:00
af71c63f4c AVX2 fix 2017-06-23 11:03:12 +02:00
e51475703a Ticking off lots on the TODO list 2017-06-23 09:42:21 +01:00
1feddf4ba6 const fixes 2017-06-22 19:32:41 +01:00
600d7ddc2e Proof of concept : Multi RHS solver, running independent solves on different ranks 2017-06-22 18:54:34 +01:00
e504260f3d Able to run a test job splitting into multiple MPI subdomains. 2017-06-22 18:53:11 +01:00
0440d4ce66 Merge branch 'develop' of https://github.com/paboyle/Grid into hotfix/bgq 2017-06-22 17:09:42 +02:00
5e4bea8f20 Benchmark DWF works 2017-06-22 08:38:54 +01:00
6ebf9f15b7 Splitting communicators first cut 2017-06-22 08:14:34 +01:00
1d7aa673a4 Include BlockCG by default 2017-06-21 21:08:53 +01:00
b9104f3072 Block CG 2017-06-21 21:08:03 +01:00
b22eab8c8b Merge commit 'a7d56523abee6c9030fdd9303c79954897b1086f' into feature/hadrons 2017-06-21 18:32:48 +01:00
a7d56523ab Merge branch 'feature/lanczos-simplify' into develop 2017-06-21 14:03:20 +01:00
1e8a2e1621 various compatibility fixes after merge 2017-06-20 17:24:55 +01:00
7587df831a Merge branch 'develop' into feature/hadrons
# Conflicts:
#	lib/qcd/action/scalar/ScalarImpl.h
2017-06-20 15:50:39 +01:00
b672717096 Test_serialiation update for JSON 2017-06-19 14:38:39 +01:00
284ee194b1 JSON update 2017-06-19 14:38:15 +01:00
81b18f843a Merge branch 'feature/scalar_adjointFT' into feature/hadrons
# Conflicts:
#	lib/qcd/action/scalar/ScalarImpl.h
2017-06-16 17:59:55 +01:00
a833f88c32 Added missing SIMD integer reduction implementation for AVX, AVX-512, SSE4, IMCI 2017-06-16 15:58:47 +01:00
07b2c1b253 Placeholder precision change functions to allow Grid to compile with QPX (warning: no actual functionality) 2017-06-16 15:04:26 +01:00
735cbdb983 QPX Integer reduction (+ integer reduction test) 2017-06-14 10:55:10 +01:00
2ad54c5a02 QPX exchange support 2017-06-14 10:53:39 +01:00
3d04dc33c6 ARM neon intrinsics support 2017-06-13 13:26:59 +02:00
2490816297 Hadrons: rare kaon program removed 2017-06-07 20:11:02 -05:00
5f55bca378 Hadrons: Quark module renamed MFermion::GaugeProp 2017-06-07 20:10:48 -05:00
f6aa82b7f2 Merge branch 'develop' into feature/hadrons 2017-06-06 11:46:33 -05:00
22749699a3 Fixes after merge and point sink module 2017-06-06 11:45:30 -05:00
0503c028be Merge branch 'feature/qed-fvol' into feature/hadrons (non-trivial conflicts on scalar Impl)
# Conflicts:
#	configure.ac
#	lib/qcd/action/scalar/Scalar.h
2017-06-05 16:37:47 -05:00
22f4feee7b Merge branch 'develop' into feature/scalar_adjointFT 2017-05-17 13:27:13 +02:00
3f858d6755 Scalar: phi^2 observable 2017-05-17 13:25:14 +02:00
35fa3d1dfd Merge branch 'master' into feature/scalar_adjointFT 2017-05-12 10:41:39 +01:00
c4435e6beb Merge branch 'release/v0.7.0' 2017-05-12 01:15:59 +01:00
d1ece74137 HMC scalar test: magnetisation measurement 2017-05-11 11:40:44 +01:00
43c817cc67 Scalar action: const fix 2017-05-11 00:07:17 +01:00
51bf1501fc Merge branch 'release/v0.7.0' 2017-05-06 18:42:50 +01:00
741bc836f6 Exposing support for Ncolours and Ndimensions and JSON input file for the ScalarAction 2017-05-05 17:36:43 +01:00
8546d01a4c Merge branch 'develop' into feature/scalar_adjointFT 2017-05-05 15:47:33 +01:00
1407418755 Old qed-fvol program build disabled 2017-04-13 15:32:30 +01:00
a6a0da873f Merge branch 'feature/hadrons' into feature/qed-fvol 2017-04-13 15:31:06 +01:00
7b03d8d087 Fixing the remaining merge conflicts 2017-04-05 16:17:46 +01:00
4b759b8f2a Merge branch 'feature/hmc_generalise' into feature/scalar_adjointFT 2017-04-05 14:50:28 +01:00
038b6ee9cd Fixing JSON compilation error 2017-03-16 01:09:24 +09:00
38806343a8 Improving efficiency of the force term 2017-03-15 15:16:16 +09:00
831ca4e3bf Added Scalar action for fields in the adjoint representation 2017-03-14 14:55:18 +09:00
eedcaf6470 Merge branch 'feature/hadrons' into feature/qed-fvol 2017-02-01 15:53:10 -08:00
b39f0d1fb6 Hadrons: default I/O to HDF5 if possible, XML otherwise 2017-01-27 18:12:35 -08:00
9f1267dfe6 Merge branch 'feature/qed-fvol' of github.com:paboyle/Grid into feature/qed-fvol 2017-01-27 17:06:34 -08:00
2e90285232 Merge pull request #80 from jch1g10/feature/qed-fvol
ChargedProp: remove ScalarField fs
2017-01-27 17:06:13 -08:00
e254de982e Merge branch 'feature/qed-fvol' of github.com:paboyle/Grid into feature/qed-fvol 2017-01-27 17:02:35 -08:00
28d99b5297 Merge branch 'develop' into feature/qed-fvol 2017-01-27 16:59:53 -08:00
ee93f0218b ChargedProp: remove ScalarField fs 2017-01-27 12:22:48 +00:00
161ed102a5 Merge pull request #79 from jch1g10/feature/qed-fvol
Fixed bug in ChargedProp
2017-01-26 19:49:14 -08:00
f65a585236 ChargedProp: Switch to HDF5 output 2017-01-26 15:02:30 +00:00
ae99e99da2 Fixed bug in ChargedProp 2017-01-23 17:27:50 +00:00
f3ca29af6c Merge branch 'feature/hadrons' into feature/qed-fvol 2017-01-21 13:41:05 -08:00
37988221a8 Merge branch 'feature/serialisation-hdf5' into feature/qed-fvol 2017-01-20 14:04:20 -08:00
7a327a3f28 Merge branch 'develop' into feature/qed-fvol 2017-01-19 14:22:36 -08:00
92f8950a56 Charged scalar prop: cleaning and output 2017-01-13 13:30:56 +00:00
65987a8a58 First implementation of the scalar QED propagator, runs but absolutely not checked 2017-01-12 20:44:23 +00:00
889d828bc2 Code cleaning 2017-01-12 18:17:44 +00:00
ad98b6193d creating the necessary caches for the FFT EM scalar propagator 2017-01-11 18:40:43 +00:00
fc760016b3 More uniform cache name for scalar momentum propagators 2017-01-11 18:39:58 +00:00
2da86f7dae Merge branch 'feature/hadrons' into feature/qed-fvol 2017-01-11 18:38:05 +00:00
97843e2b58 Hadrons: free scalar buffer fix and output 2017-01-05 14:58:55 +00:00
82b3f54697 scalar free propagator fix 2017-01-05 14:58:07 +00:00
673994b281 Hadrons: modules for scalar propagators 2016-12-29 22:44:58 +01:00
bbc0eff078 Hadrons: scalar sources 2016-12-29 22:44:22 +01:00
4c60e31070 Hadrons: code cleaning 2016-12-29 22:44:08 +01:00
afbf7d4c37 QED Gimpl moved in Photon.h 2016-12-29 22:43:38 +01:00
8c3cc32364 Scalar action 2016-12-29 22:42:58 +01:00
4c3fd9fa3f stochastic QED field module in Hadrons 2016-12-22 00:29:41 +01:00
17b3a10d46 stochastic QED: function to cache 1/sqrt(khat^2) 2016-12-22 00:29:19 +01:00
149a46b92c Merge branch 'feature/hadrons' into feature/qed-fvol 2016-12-22 00:26:43 +01:00
db9c28a773 qed-fvol: Photon parameter name fix 2016-12-20 12:41:39 +01:00
9ac3ac41df serialisable Photon parameters 2016-12-20 12:41:01 +01:00
2af9ab9034 old Makefile cleaning 2016-12-20 12:40:26 +01:00
6f1ea96293 Merge branch 'develop' into feature/qed-fvol 2016-12-20 12:33:02 +01:00
2e3c5890b6 qed-fvol: build fix 2016-12-15 20:06:46 +00:00
bc6678732f Merge branch 'feature/hadrons' into feature/qed-fvol
# Conflicts:
#	Makefile.am
#	configure.ac
#	lib/qcd/action/gauge/Photon.h
2016-12-15 19:53:00 +00:00
b10ae00c8a Merge commit '6ad73145bc9754a5f26093eee5a34473ba0cff82' into feature/qed-fvol 2016-12-15 19:48:58 +00:00
0cd6b1858c Merge branch 'develop' of https://github.com/paboyle/Grid into feature/staggering 2016-12-14 09:23:22 +00:00
6ad73145bc Calculate Wilson loop average over multiple configurations. 2016-11-30 15:17:22 +00:00
f7293f2ddb Merge pull request #69 from jch1g10/feature/qed-fvol 2016-11-26 07:04:04 +09:00
6b8ee7bae0 Merge branch 'feature/feynman-rules' into feature/qed-fvol 2016-11-15 13:08:08 +00:00
739c2308b5 Set imaginary part of stochastic QED field to zero using real() instead of conjugate(). 2016-11-15 13:07:52 +00:00
a71b69389b QedFVol: calculate square Wilson loops up to 10x10 2016-11-14 18:23:04 +00:00
d49e502f53 Merge branch 'feature/feynman-rules' into feature/qed-fvol 2016-11-14 18:00:33 +00:00
92ec3404f8 Set imaginary part of stochastic QED field to zero after FFT into position space 2016-11-14 17:59:02 +00:00
f4ebea3381 QedFVol: add functions for computing spatial and timelike Wilson loops 2016-11-14 17:51:53 +00:00
cf167d0cd1 QedFVol: implement exponentiation of photon field 2016-11-14 17:02:29 +00:00
c363bdd784 Merge branch 'release/v0.6.0' 2016-11-09 12:43:14 +00:00
c30d96ea50 QedFVol: x86intrin.h namespace fix 2016-11-09 11:06:20 +00:00
7ffe17ada1 Merge branch 'feature/feynman-rules' into feature/qed-fvol 2016-11-08 14:52:43 +00:00
330a9b3f4c Merge pull request #65 from jch1g10/feature/qed-fvol 2016-11-01 19:53:25 +00:00
28ff66a381 Merge branch 'feature/feynman-rules' into feature/qed-fvol 2016-11-01 16:07:46 +00:00
78c7bcee36 QedFVol: Change variables of type "double" to type "Real". 2016-11-01 16:06:05 +00:00
00a7b95631 Merge remote-tracking branch 'gh-james/feature/qed-fvol' into feature/qed-fvol 2016-10-31 18:46:23 +00:00
94d8321d01 Merge branch 'feature/feynman-rules' into feature/qed-fvol 2016-10-31 18:41:30 +00:00
ac24cc9f99 Merge branch 'feature/feynman-rules' into feature/qed-fvol 2016-10-29 11:05:26 +01:00
3ab4c8c0bb QedFVol: calculate plaquette and 2x2 Wilson loop of stochastic QED field 2016-10-25 13:32:02 +01:00
26d124283e Merge branch 'feature/feynman-rules' into feature/qed-fvol 2016-10-21 15:23:31 +01:00
0d889b7041 QedFVol: first attempt at generating a QED field 2016-10-21 15:21:32 +01:00
ab31ad006a Merge branch 'feature/feynman-rules' into feature/qed-fvol 2016-10-21 14:42:18 +01:00
6e4a06e180 qed-fvol: initial commit 2016-10-20 15:04:00 +01:00
446c768cd3 Merge branch 'hotfix/v0.5.1'
Double precision compile fix
2016-07-01 16:33:59 +01:00
223 changed files with 29504 additions and 9560 deletions

@@ -9,68 +9,6 @@ matrix:
- os: osx
osx_image: xcode8.3
compiler: clang
- compiler: gcc
dist: trusty
sudo: required
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.9
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: VERSION=-4.9
- compiler: gcc
dist: trusty
sudo: required
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-5
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: VERSION=-5
- compiler: clang
dist: trusty
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.8
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
- compiler: clang
dist: trusty
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.8
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
before_install:
- export GRIDDIR=`pwd`
@@ -106,9 +44,3 @@ script:
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check
- echo make clean
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi

README.md

@@ -1,27 +1,44 @@
# Grid
<table>
<tr>
<td>Last stable release</td>
<td><a href="https://travis-ci.org/paboyle/Grid">
<img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
</td>
</tr>
<tr>
<td>Development branch</td>
<td><a href="https://travis-ci.org/paboyle/Grid">
<img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
</td>
</tr>
</table>
# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid)
**Data parallel C++ mathematical object library.**
License: GPL v2.
Last update Nov 2016.
Last update June 2017.
_Please do not send pull requests to the `master` branch which is reserved for releases._
### Description
This library provides data parallel C++ container classes with internal memory layout
that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
are provided, similar to HPF and cmfortran, and user control is given over the mapping of
array indices to both MPI tasks and SIMD processing elements.
* Identically shaped arrays can then be processed with perfect data parallelisation.
* Such identically shaped arrays are called conformable arrays.
The transformation is based on the observation that Cartesian array processing involves
identical processing to be performed on different regions of the Cartesian array.
The library will geometrically decompose both into MPI tasks and across SIMD lanes.
Local vector loops are parallelised with OpenMP pragmas.
Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
for most programmers.
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
Presently SSE4, ARM NEON (128 bits), AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types.
The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
MPI, OpenMP, and SIMD parallelism are present in the library.
Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
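To give a flavour of the interface, here is a minimal, illustrative sketch (not part of the repository) of a conformable-array operation; it follows the style of the benchmark code added in this changeset:
``` c++
#include <Grid/Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  // Lattice decomposed over MPI tasks and SIMD lanes from the default layout
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian Grid(latt_size,simd_layout,mpi_layout);

  GridParallelRNG RNG(&Grid); RNG.SeedFixedIntegers(std::vector<int>({1,2,3,4}));

  // Conformable (identically shaped) arrays processed data parallel
  LatticeComplex x(&Grid), y(&Grid), z(&Grid);
  random(RNG,x);
  random(RNG,y);

  ComplexD a(2.0,0.0);
  z = a*x + y;        // one data parallel statement; MPI+OpenMP+SIMD under the hood
  z = Cshift(z,0,1);  // CSHIFT: circular shift by one site in the x direction

  std::cout << GridLogMessage << "norm2(z) = " << norm2(z) << std::endl;

  Grid_finalize();
}
```
Because `x`, `y` and `z` are built on the same `GridCartesian` they are conformable, and the single statement `z = a*x + y` executes data parallel across MPI ranks, OpenMP threads and SIMD lanes.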
### Compilers
Intel ICPC v16.0.3 and later
@@ -56,35 +73,25 @@ When you file an issue, please go through the following checklist:
6. Attach the output of `make V=1`.
7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
### Required libraries
Grid requires:
[GMP](https://gmplib.org/),
### Description
This library provides data parallel C++ container classes with internal memory layout
that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
are provided, similar to HPF and cmfortran, and user control is given over the mapping of
array indices to both MPI tasks and SIMD processing elements.
[MPFR](http://www.mpfr.org/)
* Identically shaped arrays can then be processed with perfect data parallelisation.
* Such identically shaped arrays are called conformable arrays.
Bootstrapping Grid downloads the Eigen library and uses it for internal dense matrix (non-QCD) operations.
The transformation is based on the observation that Cartesian array processing involves
identical processing to be performed on different regions of the Cartesian array.
Grid optionally uses:
The library will both geometrically decompose into MPI tasks and across SIMD lanes.
Local vector loops are parallelised with OpenMP pragmas.
[HDF5](https://support.hdfgroup.org/HDF5/)
Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
for most programmers.
[LIME](http://usqcd-software.github.io/c-lime/) for ILDG and SciDAC file format support.
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way).
[FFTW](http://www.fftw.org) either generic version or via the Intel MKL library.
These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
LAPACK either generic version or Intel MKL library.
MPI, OpenMP, and SIMD parallelism are present in the library.
Please see https://arxiv.org/abs/1512.03487 for more detail.
### Quick start
First, start by cloning the repository:
@@ -155,7 +162,6 @@ The following options can be used with the `--enable-comms=` option to target different communication interfaces:
| `none` | no communications |
| `mpi[-auto]` | MPI communications |
| `mpi3[-auto]` | MPI communications using MPI 3 shared memory |
| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
| `shmem ` | Cray SHMEM communications |
For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the information from the MPI wrapper specified in the environment variable `MPICXX` (if not specified, `configure` will scan through a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.
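As an illustration of the `-auto` mechanism on a non-Cray system (this invocation is not in the README; the SIMD target is a placeholder), `configure` picks up flags from the wrapper named in `MPICXX`:
``` bash
export MPICXX=mpicxx
../configure --enable-precision=double \
             --enable-simd=AVX2 \
             --enable-comms=mpi-auto
```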
@@ -173,7 +179,8 @@ The following options can be used with the `--enable-simd=` option to target different SIMD instruction sets:
| `AVXFMA4` | AVX (256 bit) + FMA4 |
| `AVX2` | AVX 2 (256 bit) |
| `AVX512` | AVX 512 bit |
| `QPX` | QPX (256 bit) |
| `NEONv8` | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit) |
| `QPX` | IBM QPX (256 bit) |
Alternatively, some CPU codenames can be directly used:
@@ -196,20 +203,204 @@ The following configuration is recommended for the Intel Knights Landing platform:
../configure --enable-precision=double\
--enable-simd=KNL \
--enable-comms=mpi-auto \
--with-gmp=<path> \
--with-mpfr=<path> \
--enable-mkl \
CXX=icpc MPICXX=mpiicpc
```
The MKL flag enables use of BLAS and FFTW from the Intel Math Kernel Library.
where `<path>` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash
../configure --enable-precision=double\
--enable-simd=KNL \
--enable-comms=mpi \
--with-gmp=<path> \
--with-mpfr=<path> \
--enable-mkl \
CXX=CC CC=cc
```
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
``` bash
--with-gmp=<path> \
--with-mpfr=<path> \
```
where `<path>` is the UNIX prefix where GMP and MPFR are installed.
Knights Landing nodes with two Intel Omni-Path adapters per node
presently perform better with more than one rank per node, using shared memory
for intra-node communication. This is the mpi3 communications implementation.
We recommend four ranks per node for best performance, but the optimum is local-volume dependent.
``` bash
../configure --enable-precision=double\
--enable-simd=KNL \
--enable-comms=mpi3-auto \
--enable-mkl \
CXX=icpc MPICXX=mpiicpc
```
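For example, a hypothetical four-rank, single-node launch in the style of the benchmark invocations later in this README (the `--mpi` geometry, thread count and lattice size are placeholders to be tuned to the local volume):
``` bash
mpirun -n 4 ./benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 1.1.2.2 --threads 16
```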
### Build setup for Intel Haswell Xeon platform
The following configuration is recommended for the Intel Haswell platform:
``` bash
../configure --enable-precision=double\
--enable-simd=AVX2 \
--enable-comms=mpi3-auto \
--enable-mkl \
CXX=icpc MPICXX=mpiicpc
```
The MKL flag enables use of BLAS and FFTW from the Intel Math Kernel Library.
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
``` bash
--with-gmp=<path> \
--with-mpfr=<path> \
```
where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash
../configure --enable-precision=double\
--enable-simd=AVX2 \
--enable-comms=mpi3 \
--enable-mkl \
CXX=CC CC=cc
```
Since dual socket nodes are commonplace, we recommend MPI-3 as the default, with
one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
```
export I_MPI_PIN=1
```
This is the default.
### Build setup for Intel Skylake Xeon platform
The following configuration is recommended for the Intel Skylake platform:
``` bash
../configure --enable-precision=double\
--enable-simd=AVX512 \
--enable-comms=mpi3 \
--enable-mkl \
CXX=mpiicpc
```
The MKL flag enables use of BLAS and FFTW from the Intel Math Kernel Library.
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
``` bash
--with-gmp=<path> \
--with-mpfr=<path> \
```
where `<path>` is the UNIX prefix where GMP and MPFR are installed.
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
``` bash
../configure --enable-precision=double\
--enable-simd=AVX512 \
--enable-comms=mpi3 \
--enable-mkl \
CXX=CC CC=cc
```
Since dual socket nodes are commonplace, we recommend MPI-3 as the default, with
one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
```
export I_MPI_PIN=1
```
This is the default.
#### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping:
mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18
TBA
### Build setup for AMD EPYC / RYZEN
The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips, each with 8 cores.
So even a single-socket node contains a quad-chip module, and dual-socket nodes with 64 cores in total
are common. Each chip within the module exposes a separate NUMA domain, giving four NUMA domains
per socket; we recommend one MPI rank per NUMA domain. MPI-3 is recommended, with four ranks
per socket and 8 threads per rank.
The following configuration is recommended for the AMD EPYC platform:
``` bash
../configure --enable-precision=double\
--enable-simd=AVX2 \
--enable-comms=mpi3 \
CXX=mpicxx
```
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
``` bash
--with-gmp=<path> \
--with-mpfr=<path> \
```
where `<path>` is the UNIX prefix where GMP and MPFR are installed.
Using MPICH and g++ v4.9.2, best performance can be obtained with explicit GOMP_CPU_AFFINITY flags for each MPI rank.
This can be done by invoking MPI through a wrapper script, `omp_bind.sh`.
It is recommended to run 8 MPI ranks on a single dual-socket AMD EPYC, with 8 threads per rank, using MPI3 and
shared memory to communicate within the node:
mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4
where `omp_bind.sh` does the following:
```
#!/bin/bash
numanode=` expr $PMI_RANK % 8 `
basecore=`expr $numanode \* 16`
core0=`expr $basecore + 0 `
core1=`expr $basecore + 2 `
core2=`expr $basecore + 4 `
core3=`expr $basecore + 6 `
core4=`expr $basecore + 8 `
core5=`expr $basecore + 10 `
core6=`expr $basecore + 12 `
core7=`expr $basecore + 14 `
export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY
$@
```
Performance:
#### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping:
mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
TBA
### Build setup for BlueGene/Q
To be written...
### Build setup for ARM Neon
To be written...
### Build setup for laptops, other compilers, non-cluster builds
Many versions of g++ and clang++ work with Grid; building with them merely involves replacing CXX (and MPICXX)
and omitting the `--enable-mkl` flag.
Single node builds are enabled with
```
--enable-comms=none
```
FFTW support that is not in the default search path may then be enabled with
```
--with-fftw=<installpath>
```
BLAS will not be compiled in by default, and Lanczos will default to Eigen diagonalisation.
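Putting these options together, a minimal single-node build might look like the following sketch (the compiler and the FFTW prefix are placeholders):
``` bash
../configure --enable-precision=double \
             --enable-simd=AVX2 \
             --enable-comms=none \
             --with-fftw=<installpath> \
             CXX=clang++
make -j4
```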

TODO

@@ -2,18 +2,20 @@ TODO:
---------------
Large item work list:
1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O
1)- BG/Q port and check ; Andrew says ok.
2)- Christoph's local basis expansion Lanczos
3)- BG/Q port and check
4)- Precision conversion and sort out localConvert <-- partial
- Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
5)- Physical propagator interface
6)- Conserved currents
7)- Multigrid Wilson and DWF, compare to other Multigrid implementations
8)- HDCR resume
--
3a)- RNG I/O in ILDG/SciDAC (minor)
3b)- Precision conversion and sort out localConvert <-- partial/easy
3c)- Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
4)- Physical propagator interface
5)- Conserved currents
6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
7)- HDCR resume
Recent DONE
-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O ; <-- DONE ; bmark cori
-- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
-- GaugeFix into central location <-- DONE
-- Scidac and Ildg metadata handling <-- DONE

benchmarks/Benchmark_ITT.cc (new file)

@@ -0,0 +1,800 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_ITT.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
std::vector<int> L_list;
std::vector<int> Ls_list;
std::vector<double> mflop_list;
double mflop_ref;
double mflop_ref_err;
int NN_global;
struct time_statistics{
double mean;
double err;
double min;
double max;
void statistics(std::vector<double> v){
double sum = std::accumulate(v.begin(), v.end(), 0.0);
mean = sum / v.size();
std::vector<double> diff(v.size());
std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
auto result = std::minmax_element(v.begin(), v.end());
min = *result.first;
max = *result.second;
}
};
void comms_header(){
std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t"
<<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
struct controls {
int Opt;
int CommsOverlap;
Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
// int HugePages;
};
class Benchmark {
public:
static void Decomposition (void ) {
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
std::cout<<GridLogMessage<<"\tvReal : "<<sizeof(vReal )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vReal::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplex : "<<sizeof(vComplex )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplex::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
static void Comms(void)
{
int Nloop=200;
int nmu=0;
int maxlat=32;
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
std::vector<double> t_time(Nloop);
time_statistics timestat;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
comms_header();
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
Grid.ShmBufferFreeAll();
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
int ncomm;
double dbytes;
std::vector<double> times(Nloop);
for(int i=0;i<Nloop;i++){
double start=usecond();
dbytes=0;
ncomm=0;
parallel_for(int dir=0;dir<8;dir++){
double tbytes;
int mu =dir % 4;
if (mpi_layout[mu]>1 ) {
int xmit_to_rank;
int recv_from_rank;
if ( dir == mu ) {
int comm_proc=1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
} else {
int comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
}
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
(void *)&rbuf[dir][0], recv_from_rank,
bytes,dir);
#ifdef GRID_OMP
#pragma omp atomic
#endif
ncomm++;
#ifdef GRID_OMP
#pragma omp atomic
#endif
dbytes+=tbytes;
}
}
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);
// for(int i=0;i<t_time.size();i++){
// std::cout << i<<" "<<t_time[i]<<std::endl;
// }
dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
double rbytes = dbytes*0.5;
double bidibytes = dbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
}
}
return;
}
static void Memory(void)
{
const int Nvec=8;
typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
typedef iVector<vReal,Nvec> Vec;
std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
uint64_t NP;
uint64_t NN;
uint64_t lmax=48;
#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
for(int lat=8;lat<=lmax;lat+=4){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
NP= Grid.RankCount();
NN =Grid.NodeCount();
Vec rn ; random(sRNG,rn);
LatticeVec z(&Grid); z=rn;
LatticeVec x(&Grid); x=rn;
LatticeVec y(&Grid); y=rn;
double a=2.0;
uint64_t Nloop=NLOOP;
double start=usecond();
for(int i=0;i<Nloop;i++){
z=a*x-y;
x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away
y._odata[4]=z._odata[4];
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;
double flops=vol*Nvec*2;// mul,add
double bytes=3.0*vol*Nvec*sizeof(Real);
std::cout<<GridLogMessage<<std::setprecision(3)
<< lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
<< "\t\t"<< bytes/time/NN <<std::endl;
}
};
static double DWF5(int Ls,int L)
{
RealD mass=0.1;
RealD M5 =1.8;
double mflops;
double mflops_best = 0;
double mflops_worst= 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
std::vector<int> local({L,L,L,L});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
std::vector<int> internal;
if ( SHM == 1 ) internal = std::vector<int>({1,1,1,1});
else if ( SHM == 2 ) internal = std::vector<int>({2,1,1,1});
else if ( SHM == 4 ) internal = std::vector<int>({2,2,1,1});
else if ( SHM == 8 ) internal = std::vector<int>({2,2,2,1});
else assert(0);
std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
///////// Source preparation ////////////
LatticeFermion src (sFGrid); random(RNG5,src);
LatticeFermion tmp (sFGrid);
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
LatticeFermion src_e (sFrbGrid);
LatticeFermion src_o (sFrbGrid);
LatticeFermion r_e (sFrbGrid);
LatticeFermion r_o (sFrbGrid);
LatticeFermion r_eo (sFGrid);
LatticeFermion err (sFGrid);
{
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
#if defined(AVX512)
const int num_cases = 6;
std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
const int num_cases = 4;
std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif
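// Legend for fmt: A = inline-asm, U = hand-unrolled, G = generic kernels; S = sequential comms, O = overlapped comms/compute.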
controls Cases [] = {
#ifdef AVX512
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
#endif
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
};
for(int c=0;c<num_cases;c++) {
QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
QCD::WilsonKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 100;
uint64_t ncall = 1000;
double t0=usecond();
sFGrid->Barrier();
for(int i=0;i<nwarm;i++){
sDw.DhopEO(src_o,r_e,DaggerNo);
}
sFGrid->Barrier();
double t1=usecond();
sDw.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
sDw.DhopEO(src_o,r_e,DaggerNo);
t1=usecond();
t_time[i] = t1-t0;
}
sFGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume)/2;
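// 1344 is the flop count per 4-d site used throughout these benchmarks for the Nc=3 Wilson dslash; the factor 1/2 reflects DhopEO acting on a single checkerboard.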
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node "<< mflops/NN<<std::endl;
sDw.Report();
}
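// Robustness is the worst-to-best mflop/s ratio over the kernel/comms variants just timed; 1.0 would mean every variant performed identically.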
double robust = mflops_worst/mflops_best;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage <<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness = "<< robust <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
return mflops_best;
}
static double DWF(int Ls,int L, double & robust)
{
RealD mass=0.1;
RealD M5 =1.8;
double mflops;
double mflops_best = 0;
double mflops_worst= 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
std::vector<int> local({L,L,L,L});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
std::vector<int> internal;
if ( SHM == 1 ) internal = std::vector<int>({1,1,1,1});
else if ( SHM == 2 ) internal = std::vector<int>({2,1,1,1});
else if ( SHM == 4 ) internal = std::vector<int>({2,2,1,1});
else if ( SHM == 8 ) internal = std::vector<int>({2,2,2,1});
else assert(0);
std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
std::cout<<GridLogMessage << "* MPI ranks : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Intranode : "<<GridCmdVectorIntToString(internal)<<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<GridCmdVectorIntToString(nodes)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
///////// Source preparation ////////////
LatticeFermion src (FGrid); random(RNG5,src);
LatticeFermion ref (FGrid);
LatticeFermion tmp (FGrid);
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
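// Builds a reference result with Cshift for the correctness check further down:
//   ref = -1/2 * sum_mu [ (1 - gamma_mu) U_mu(x) src(x+mu) + (1 + gamma_mu) U_mu(x-mu)^dag src(x-mu) ]
// with the 4-d gauge links simply replicated along the fifth dimension.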
{
LatticeGaugeField Umu5d(FGrid);
std::vector<LatticeColourMatrix> U(4,FGrid);
for(int ss=0;ss<Umu._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
}
}
ref = zero;
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
}
for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
LatticeFermion r_o (FrbGrid);
LatticeFermion r_eo (FGrid);
LatticeFermion err (FGrid);
{
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
#if defined(AVX512)
const int num_cases = 6;
std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
const int num_cases = 4;
std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif
controls Cases [] = {
#ifdef AVX512
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
#endif
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
{ QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
};
for(int c=0;c<num_cases;c++) {
QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
QCD::WilsonKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 200;
double t0=usecond();
FGrid->Barrier();
for(int i=0;i<nwarm;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
FGrid->Barrier();
double t1=usecond();
// uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
// if (ncall < 500) ncall = 500;
uint64_t ncall = 1000;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
Dw.ZeroCounters();
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
Dw.DhopEO(src_o,r_e,DaggerNo);
t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume)/2;
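// Same flop accounting as the Ls-vectorised case above: 1344 flops per site, halved for the single checkerboard.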
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
Dw.Report();
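// Correctness check: apply both checkerboard halves, reassemble the full field, and compare against the naive reference constructed above.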
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-ref;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert((norm2(err)<1.0e-4));
}
robust = mflops_worst/mflops_best;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << std::fixed<<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness = "<< robust <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage ;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
return mflops_best;
}
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
#ifdef KNL
LebesgueOrder::Block = std::vector<int>({8,2,2,2});
#else
LebesgueOrder::Block = std::vector<int>({2,2,2,2});
#endif
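// LebesgueOrder::Block presumably sets the cache-blocking factors for the Lebesgue-ordered site traversal; the larger leading block is assumed to be a KNL-specific tuning.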
Benchmark::Decomposition();
int do_memory=1;
int do_comms =1;
int do_su3 =0;
int do_wilson=1;
int do_dwf =1;
if ( do_su3 ) {
// empty for now
}
#if 1
int sel=2;
std::vector<int> L_list({8,12,16,24});
#else
int sel=1;
std::vector<int> L_list({8,12});
#endif
int selm1=sel-1;
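// sel and selm1 pick the two L values whose DWF4 results are averaged into the 'comparison point' reported in the final summary.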
std::vector<double> robust_list;
std::vector<double> wilson;
std::vector<double> dwf4;
std::vector<double> dwf5;
if ( do_wilson ) {
int Ls=1;
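// With Ls=1 the domain-wall benchmark below has the per-site cost of a 4-d Wilson dslash, which is what the summary tables report as 'Wilson'.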
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
double robust;
wilson.push_back(Benchmark::DWF(1,L_list[l],robust));
}
}
int Ls=16;
if ( do_dwf ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
double robust;
double result = Benchmark::DWF(Ls,L_list[l],robust) ;
dwf4.push_back(result);
robust_list.push_back(robust);
}
}
if ( do_dwf ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
dwf5.push_back(Benchmark::DWF5(Ls,L_list[l]));
}
}
if ( do_dwf ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl;
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
int NN=NN_global;
if ( do_memory ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::Memory();
}
if ( do_comms && (NN>1) ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::Comms();
}
if ( do_dwf ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4 \t\t DWF5 " <<std::endl;
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l] /NN<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
std::cout<<std::setprecision(3);
std::cout<<GridLogMessage << " Comparison point robustness: " << robust_list[sel] <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
Grid_finalize();
}


@ -68,7 +68,7 @@ int main (int argc, char ** argv)
int Nloop=100;
int nmu=0;
int maxlat=24;
int maxlat=32;
for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
@ -80,7 +80,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=32;Ls*=2){
for(int Ls=8;Ls<=8;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@ -92,11 +92,16 @@ int main (int argc, char ** argv)
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
for(int mu=0;mu<8;mu++){
xbuf[mu].resize(lat*lat*lat*Ls);
rbuf[mu].resize(lat*lat*lat*Ls);
// std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
}
for(int i=0;i<Nloop;i++){
double start=usecond();
@ -112,7 +117,6 @@ int main (int argc, char ** argv)
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
@ -163,7 +167,7 @@ int main (int argc, char ** argv)
header();
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=32;Ls*=2){
for(int Ls=8;Ls<=8;Ls*=2){
std::vector<int> latt_size ({lat,lat,lat,lat});
@ -172,9 +176,14 @@ int main (int argc, char ** argv)
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
for(int mu=0;mu<8;mu++){
xbuf[mu].resize(lat*lat*lat*Ls);
rbuf[mu].resize(lat*lat*lat*Ls);
// std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
}
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
@ -249,7 +258,7 @@ int main (int argc, char ** argv)
header();
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=32;Ls*=2){
for(int Ls=8;Ls<=8;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@ -299,7 +308,7 @@ int main (int argc, char ** argv)
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
bytes,mu);
comm_proc = mpi_layout[mu]-1;
@ -310,11 +319,11 @@ int main (int argc, char ** argv)
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes);
bytes,mu+4);
}
}
Grid.StencilSendToRecvFromComplete(requests);
Grid.StencilSendToRecvFromComplete(requests,0);
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
@ -346,7 +355,7 @@ int main (int argc, char ** argv)
header();
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=32;Ls*=2){
for(int Ls=8;Ls<=8;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@ -393,8 +402,8 @@ int main (int argc, char ** argv)
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
Grid.StencilSendToRecvFromComplete(requests);
bytes,mu);
Grid.StencilSendToRecvFromComplete(requests,mu);
requests.resize(0);
comm_proc = mpi_layout[mu]-1;
@ -406,8 +415,8 @@ int main (int argc, char ** argv)
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes);
Grid.StencilSendToRecvFromComplete(requests);
bytes,mu+4);
Grid.StencilSendToRecvFromComplete(requests,mu+4);
requests.resize(0);
}
@ -436,5 +445,97 @@ int main (int argc, char ** argv)
}
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
Grid.ShmBufferFreeAll();
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
double dbytes;
for(int i=0;i<Nloop;i++){
double start=usecond();
std::vector<CartesianCommunicator::CommsRequest_t> requests;
dbytes=0;
ncomm=0;
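// One thread per direction (four forward, four backward); each StencilSendToRecvFrom call returns the bytes it moved, accumulated atomically into dbytes.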
parallel_for(int dir=0;dir<8;dir++){
double tbytes;
int mu =dir % 4;
if (mpi_layout[mu]>1 ) {
ncomm++;
int xmit_to_rank;
int recv_from_rank;
if ( dir == mu ) {
int comm_proc=1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
} else {
int comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
}
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
(void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
#pragma omp atomic
dbytes+=tbytes;
}
}
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);
dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
double rbytes = dbytes*0.5;
double bidibytes = dbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
}
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
Grid_finalize();
}


@ -51,7 +51,13 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> latt4 = GridDefaultLatt();
const int Ls=16;
int Ls=16;
for(int i=0;i<argc;i++)
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@ -165,7 +171,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =1000;
int ncall =500;
if (1) {
FGrid->Barrier();
Dw.ZeroCounters();
@ -303,6 +309,7 @@ int main (int argc, char ** argv)
}
assert(sum < 1.0e-4);
if(1){
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
@ -381,7 +388,22 @@ int main (int argc, char ** argv)
}
assert(error<1.0e-4);
}
if(0){
std::cout << "Single cache warm call to sDw.Dhop " <<std::endl;
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
sDw.Dhop(ssrc,sresult,0);
PerformanceCounter Counter(i);
Counter.Start();
sDw.Dhop(ssrc,sresult,0);
Counter.Stop();
Counter.Report();
}
}
}
if (1)
{ // Naive wilson dag implementation
@ -487,9 +509,9 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
//assert(norm2(src_e)<1.0e-4);
//assert(norm2(src_o)<1.0e-4);
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
Grid_finalize();
exit(0);
}


@ -0,0 +1,190 @@
#include <Grid/Grid.h>
#include <sstream>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
template<class d>
struct scal {
d internal;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
typedef typename GparityDomainWallFermionF::FermionField GparityLatticeFermionF;
typedef typename GparityDomainWallFermionD::FermionField GparityLatticeFermionD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int Ls=16;
for(int i=0;i<argc;i++)
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "Ls = " << Ls << std::endl;
std::vector<int> latt4 = GridDefaultLatt();
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
GparityLatticeFermionF src (FGrid); random(RNG5,src);
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
GparityLatticeFermionF result(FGrid); result=zero;
GparityLatticeFermionF ref(FGrid); ref=zero;
GparityLatticeFermionF tmp(FGrid);
GparityLatticeFermionF err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeFieldF Umu(UGrid);
SU3::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* SINGLE/SINGLE"<<std::endl;
GparityDomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =1000;
if (1) {
FGrid->Barrier();
Dw.ZeroCounters();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
Dw.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=2*1344*volume*ncall;
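// 1344 dslash flops per site as in the other benchmarks; the additional factor 2 presumably accounts for the two G-parity flavours.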
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
Dw.Report();
}
std::cout << GridLogMessage<< "* SINGLE/HALF"<<std::endl;
GparityDomainWallFermionFH DwH(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
if (1) {
FGrid->Barrier();
DwH.ZeroCounters();
DwH.Dhop(src,result,0);
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
DwH.Dhop(src,result,0);
__SSC_STOP;
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=2*1344*volume*ncall;
std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
DwH.Report();
}
GridCartesian * UGrid_d = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid_d = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_d);
GridCartesian * FGrid_d = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_d);
GridRedBlackCartesian * FrbGrid_d = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_d);
std::cout << GridLogMessage<< "* DOUBLE/DOUBLE"<<std::endl;
GparityLatticeFermionD src_d(FGrid_d);
precisionChange(src_d,src);
LatticeGaugeFieldD Umu_d(UGrid_d);
precisionChange(Umu_d,Umu);
GparityLatticeFermionD result_d(FGrid_d);
GparityDomainWallFermionD DwD(Umu_d,*FGrid_d,*FrbGrid_d,*UGrid_d,*UrbGrid_d,mass,M5);
if (1) {
FGrid_d->Barrier();
DwD.ZeroCounters();
DwD.Dhop(src_d,result_d,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
DwD.Dhop(src_d,result_d,0);
__SSC_STOP;
}
double t1=usecond();
FGrid_d->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=2*1344*volume*ncall;
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
DwD.Report();
}
Grid_finalize();
}


@ -55,21 +55,21 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
uint64_t lmax=64;
#define NLOOP (100*lmax*lmax*lmax*lmax/vol)
for(int lat=4;lat<=lmax;lat+=4){
uint64_t lmax=96;
#define NLOOP (10*lmax*lmax*lmax*lmax/vol)
for(int lat=8;lat<=lmax;lat+=8){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
uint64_t Nloop=NLOOP;
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
LatticeVec y(&Grid); //random(pRNG,y);
LatticeVec z(&Grid);// random(pRNG,z);
LatticeVec x(&Grid);// random(pRNG,x);
LatticeVec y(&Grid);// random(pRNG,y);
double a=2.0;
@ -83,7 +83,7 @@ int main (int argc, char ** argv)
double time = (stop-start)/Nloop*1000;
double flops=vol*Nvec*2;// mul,add
double bytes=3*vol*Nvec*sizeof(Real);
double bytes=3.0*vol*Nvec*sizeof(Real);
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
}
@ -94,17 +94,17 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=4;lat<=lmax;lat+=4){
for(int lat=8;lat<=lmax;lat+=8){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
LatticeVec y(&Grid); //random(pRNG,y);
LatticeVec z(&Grid);// random(pRNG,z);
LatticeVec x(&Grid);// random(pRNG,x);
LatticeVec y(&Grid);// random(pRNG,y);
double a=2.0;
uint64_t Nloop=NLOOP;
@ -119,7 +119,7 @@ int main (int argc, char ** argv)
double time = (stop-start)/Nloop*1000;
double flops=vol*Nvec*2;// mul,add
double bytes=3*vol*Nvec*sizeof(Real);
double bytes=3.0*vol*Nvec*sizeof(Real);
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
}
@ -129,20 +129,20 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
for(int lat=4;lat<=lmax;lat+=4){
for(int lat=8;lat<=lmax;lat+=8){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
uint64_t Nloop=NLOOP;
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
LatticeVec y(&Grid); //random(pRNG,y);
LatticeVec z(&Grid);// random(pRNG,z);
LatticeVec x(&Grid);// random(pRNG,x);
LatticeVec y(&Grid);// random(pRNG,y);
RealD a=2.0;
@ -154,7 +154,7 @@ int main (int argc, char ** argv)
double stop=usecond();
double time = (stop-start)/Nloop*1000;
double bytes=2*vol*Nvec*sizeof(Real);
double bytes=2.0*vol*Nvec*sizeof(Real);
double flops=vol*Nvec*1;// mul
std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
@ -166,17 +166,17 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=4;lat<=lmax;lat+=4){
for(int lat=8;lat<=lmax;lat+=8){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
uint64_t Nloop=NLOOP;
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
LatticeVec y(&Grid); //random(pRNG,y);
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeVec z(&Grid);// random(pRNG,z);
LatticeVec x(&Grid);// random(pRNG,x);
LatticeVec y(&Grid);// random(pRNG,y);
RealD a=2.0;
Real nn;
double start=usecond();
@ -187,7 +187,7 @@ int main (int argc, char ** argv)
double stop=usecond();
double time = (stop-start)/Nloop*1000;
double bytes=vol*Nvec*sizeof(Real);
double bytes=1.0*vol*Nvec*sizeof(Real);
double flops=vol*Nvec*2;// mul,add
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;


@ -40,7 +40,7 @@ int main (int argc, char ** argv)
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(&Grid);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;


@ -37,12 +37,12 @@ int main (int argc, char ** argv)
Grid_init(&argc,&argv);
#define LMAX (64)
int Nloop=20;
int64_t Nloop=20;
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
int threads = GridThread::GetThreads();
int64_t threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@ -54,16 +54,16 @@ int main (int argc, char ** argv)
for(int lat=2;lat<=LMAX;lat+=2){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeColourMatrix z(&Grid);// random(pRNG,z);
LatticeColourMatrix x(&Grid);// random(pRNG,x);
LatticeColourMatrix y(&Grid);// random(pRNG,y);
LatticeColourMatrix z(&Grid); random(pRNG,z);
LatticeColourMatrix x(&Grid); random(pRNG,x);
LatticeColourMatrix y(&Grid); random(pRNG,y);
double start=usecond();
for(int i=0;i<Nloop;i++){
for(int64_t i=0;i<Nloop;i++){
x=x*y;
}
double stop=usecond();
@ -86,17 +86,17 @@ int main (int argc, char ** argv)
for(int lat=2;lat<=LMAX;lat+=2){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeColourMatrix z(&Grid); //random(pRNG,z);
LatticeColourMatrix x(&Grid); //random(pRNG,x);
LatticeColourMatrix y(&Grid); //random(pRNG,y);
LatticeColourMatrix z(&Grid); random(pRNG,z);
LatticeColourMatrix x(&Grid); random(pRNG,x);
LatticeColourMatrix y(&Grid); random(pRNG,y);
double start=usecond();
for(int i=0;i<Nloop;i++){
for(int64_t i=0;i<Nloop;i++){
z=x*y;
}
double stop=usecond();
@ -117,17 +117,17 @@ int main (int argc, char ** argv)
for(int lat=2;lat<=LMAX;lat+=2){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeColourMatrix z(&Grid); //random(pRNG,z);
LatticeColourMatrix x(&Grid); //random(pRNG,x);
LatticeColourMatrix y(&Grid); //random(pRNG,y);
LatticeColourMatrix z(&Grid); random(pRNG,z);
LatticeColourMatrix x(&Grid); random(pRNG,x);
LatticeColourMatrix y(&Grid); random(pRNG,y);
double start=usecond();
for(int i=0;i<Nloop;i++){
for(int64_t i=0;i<Nloop;i++){
mult(z,x,y);
}
double stop=usecond();
@ -148,17 +148,17 @@ int main (int argc, char ** argv)
for(int lat=2;lat<=LMAX;lat+=2){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeColourMatrix z(&Grid); //random(pRNG,z);
LatticeColourMatrix x(&Grid); //random(pRNG,x);
LatticeColourMatrix y(&Grid); //random(pRNG,y);
LatticeColourMatrix z(&Grid); random(pRNG,z);
LatticeColourMatrix x(&Grid); random(pRNG,x);
LatticeColourMatrix y(&Grid); random(pRNG,y);
double start=usecond();
for(int i=0;i<Nloop;i++){
for(int64_t i=0;i<Nloop;i++){
mac(z,x,y);
}
double stop=usecond();


@ -58,7 +58,7 @@ int main (int argc, char ** argv)
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(&Grid);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;


@ -93,7 +93,7 @@ int main (int argc, char ** argv)
std::cout << latt_size.back() << "\t\t";
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(&Grid);
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
LatticeGaugeField Umu(&Grid); random(pRNG,Umu);


@ -13,6 +13,10 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
################ Get git info
#AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])])
################ Set flags
# do not move!
CXXFLAGS="-O3 $CXXFLAGS"
############### Checks for programs
AC_PROG_CXX
AC_PROG_RANLIB
@ -27,7 +31,6 @@ AX_GXX_VERSION
AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
[version of g++ that will compile the code])
CXXFLAGS="-g $CXXFLAGS"
############### Checks for typedefs, structures, and compiler characteristics
@ -51,9 +54,14 @@ AC_CHECK_HEADERS(malloc/malloc.h)
AC_CHECK_HEADERS(malloc.h)
AC_CHECK_HEADERS(endian.h)
AC_CHECK_HEADERS(execinfo.h)
AC_CHECK_HEADERS(numaif.h)
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
############## Standard libraries
AC_CHECK_LIB([m],[cos])
AC_CHECK_LIB([stdc++],[abort])
############### GMP and MPFR
AC_ARG_WITH([gmp],
[AS_HELP_STRING([--with-gmp=prefix],
@ -186,9 +194,14 @@ Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
AC_SEARCH_LIBS([crc32], [z],
[AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
[have_zlib=true],
[have_zlib=true] [LIBS="${LIBS} -lz"],
[AC_MSG_ERROR(zlib library was not found in your system.)])
AC_SEARCH_LIBS([move_pages], [numa],
[AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
[have_libnuma=true] [LIBS="${LIBS} -lnuma"],
[AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])
AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
[AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
[have_hdf5=true]
@ -241,6 +254,7 @@ case ${ax_cv_cxx_compiler_vendor} in
SIMD_FLAGS='';;
KNL)
AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
AC_DEFINE([KNL],[1],[Knights landing processor])
SIMD_FLAGS='-march=knl';;
GEN)
AC_DEFINE([GEN],[1],[generic vector code])
@ -248,6 +262,9 @@ case ${ax_cv_cxx_compiler_vendor} in
[generic SIMD vector width (in bytes)])
SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
SIMD_FLAGS='';;
NEONv8)
AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
SIMD_FLAGS='-march=armv8-a';;
QPX|BGQ)
AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
SIMD_FLAGS='';;
@ -276,6 +293,7 @@ case ${ax_cv_cxx_compiler_vendor} in
SIMD_FLAGS='';;
KNL)
AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
AC_DEFINE([KNL],[1],[Knights landing processor])
SIMD_FLAGS='-xmic-avx512';;
GEN)
AC_DEFINE([GEN],[1],[generic vector code])
@ -313,8 +331,41 @@ case ${ac_PRECISION} in
double)
AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
;;
*)
AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]);
;;
esac
###################### Shared memory allocation technique under MPI3
AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmget|shmopen|hugetlbfs],
[Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])
case ${ac_SHM} in
shmget)
AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
;;
shmopen)
AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] )
;;
hugetlbfs)
AC_DEFINE([GRID_MPI3_SHMMMAP],[1],[GRID_MPI3_SHMMMAP] )
;;
*)
AC_MSG_ERROR([${ac_SHM} unsupported --enable-shm option]);
;;
esac
###################### Shared base path for SHMMMAP
AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path],
[Select SHM mmap base path for hugetlbfs])],
[ac_SHMPATH=${enable_shmpath}],
[ac_SHMPATH=/var/lib/hugetlbfs/pagesize-2MB/])
AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])
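# Example invocation (assumed): ./configure --enable-shm=hugetlbfs --enable-shmpath=/var/lib/hugetlbfs/pagesize-2MB/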
############### communication type selection
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|mpi3|mpi3-auto|shmem],
[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
@ -324,14 +375,14 @@ case ${ac_COMMS} in
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
comms_type='none'
;;
mpi3l*)
AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
comms_type='mpi3l'
;;
mpi3*)
AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
comms_type='mpi3'
;;
mpit)
AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
comms_type='mpit'
;;
mpi*)
AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
comms_type='mpi'
@ -359,7 +410,7 @@ esac
AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI, [ test "${comms_type}X" == "mpiX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI3, [ test "${comms_type}X" == "mpi3X" ] )
AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
AM_CONDITIONAL(BUILD_COMMS_MPIT, [ test "${comms_type}X" == "mpitX" ] )
AM_CONDITIONAL(BUILD_COMMS_NONE, [ test "${comms_type}X" == "noneX" ])
############### RNG selection
@ -464,6 +515,8 @@ compiler version : ${ax_cv_gxx_version}
SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
Threading : ${ac_openmp}
Communications type : ${comms_type}
Shared memory allocator : ${ac_SHM}
Shared memory mmap path : ${ac_SHMPATH}
Default precision : ${ac_PRECISION}
Software FP16 conversion : ${ac_SFW_FP16}
RNG choice : ${ac_RNG}
@ -497,6 +550,7 @@ AC_CONFIG_FILES(tests/forces/Makefile)
AC_CONFIG_FILES(tests/hadrons/Makefile)
AC_CONFIG_FILES(tests/hmc/Makefile)
AC_CONFIG_FILES(tests/solver/Makefile)
AC_CONFIG_FILES(tests/lanczos/Makefile)
AC_CONFIG_FILES(tests/smearing/Makefile)
AC_CONFIG_FILES(tests/qdpxx/Makefile)
AC_CONFIG_FILES(tests/testu01/Makefile)


@ -41,9 +41,10 @@ using namespace Hadrons;
// constructor /////////////////////////////////////////////////////////////////
Environment::Environment(void)
{
nd_ = GridDefaultLatt().size();
dim_ = GridDefaultLatt();
nd_ = dim_.size();
grid4d_.reset(SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(nd_, vComplex::Nsimd()),
dim_, GridDefaultSimd(nd_, vComplex::Nsimd()),
GridDefaultMpi()));
gridRb4d_.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_.get()));
auto loc = getGrid()->LocalDimensions();
@ -132,6 +133,16 @@ unsigned int Environment::getNd(void) const
return nd_;
}
std::vector<int> Environment::getDim(void) const
{
return dim_;
}
int Environment::getDim(const unsigned int mu) const
{
return dim_[mu];
}
// random number generator /////////////////////////////////////////////////////
void Environment::setSeed(const std::vector<int> &seed)
{
@ -271,6 +282,21 @@ std::string Environment::getModuleType(const std::string name) const
return getModuleType(getModuleAddress(name));
}
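// Module type names are fully qualified, so the enclosing namespace is recovered below as the token between the last two "::" separators.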
std::string Environment::getModuleNamespace(const unsigned int address) const
{
std::string type = getModuleType(address), ns;
auto pos2 = type.rfind("::");
auto pos1 = type.rfind("::", pos2 - 2);
return type.substr(pos1 + 2, pos2 - pos1 - 2);
}
std::string Environment::getModuleNamespace(const std::string name) const
{
return getModuleNamespace(getModuleAddress(name));
}
bool Environment::hasModule(const unsigned int address) const
{
return (address < module_.size());
@ -491,9 +517,16 @@ std::string Environment::getObjectName(const unsigned int address) const
std::string Environment::getObjectType(const unsigned int address) const
{
if (hasRegisteredObject(address))
{
if (object_[address].type)
{
return typeName(object_[address].type);
}
else
{
return "<no type>";
}
}
else if (hasObject(address))
{
HADRON_ERROR("object with address " + std::to_string(address)
@ -532,6 +565,23 @@ Environment::Size Environment::getObjectSize(const std::string name) const
return getObjectSize(getObjectAddress(name));
}
unsigned int Environment::getObjectModule(const unsigned int address) const
{
if (hasObject(address))
{
return object_[address].module;
}
else
{
HADRON_ERROR("no object with address " + std::to_string(address));
}
}
unsigned int Environment::getObjectModule(const std::string name) const
{
return getObjectModule(getObjectAddress(name));
}
unsigned int Environment::getObjectLs(const unsigned int address) const
{
if (hasRegisteredObject(address))


@ -106,6 +106,8 @@ public:
void createGrid(const unsigned int Ls);
GridCartesian * getGrid(const unsigned int Ls = 1) const;
GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const;
std::vector<int> getDim(void) const;
int getDim(const unsigned int mu) const;
unsigned int getNd(void) const;
// random number generator
void setSeed(const std::vector<int> &seed);
@ -131,6 +133,8 @@ public:
std::string getModuleName(const unsigned int address) const;
std::string getModuleType(const unsigned int address) const;
std::string getModuleType(const std::string name) const;
std::string getModuleNamespace(const unsigned int address) const;
std::string getModuleNamespace(const std::string name) const;
bool hasModule(const unsigned int address) const;
bool hasModule(const std::string name) const;
Graph<unsigned int> makeModuleGraph(void) const;
@ -171,6 +175,8 @@ public:
std::string getObjectType(const std::string name) const;
Size getObjectSize(const unsigned int address) const;
Size getObjectSize(const std::string name) const;
unsigned int getObjectModule(const unsigned int address) const;
unsigned int getObjectModule(const std::string name) const;
unsigned int getObjectLs(const unsigned int address) const;
unsigned int getObjectLs(const std::string name) const;
bool hasObject(const unsigned int address) const;
@ -181,6 +187,10 @@ public:
bool hasCreatedObject(const std::string name) const;
bool isObject5d(const unsigned int address) const;
bool isObject5d(const std::string name) const;
template <typename T>
bool isObjectOfType(const unsigned int address) const;
template <typename T>
bool isObjectOfType(const std::string name) const;
Environment::Size getTotalSize(void) const;
void addOwnership(const unsigned int owner,
const unsigned int property);
@ -197,6 +207,7 @@ private:
bool dryRun_{false};
unsigned int traj_, locVol_;
// grids
std::vector<int> dim_;
GridPt grid4d_;
std::map<unsigned int, GridPt> grid5d_;
GridRbPt gridRb4d_;
@ -343,7 +354,7 @@ T * Environment::getObject(const unsigned int address) const
else
{
HADRON_ERROR("object with address " + std::to_string(address) +
" does not have type '" + typeid(T).name() +
" does not have type '" + typeName(&typeid(T)) +
"' (has type '" + getObjectType(address) + "')");
}
}
@ -380,6 +391,37 @@ T * Environment::createLattice(const std::string name)
return createLattice<T>(getObjectAddress(name));
}
template <typename T>
bool Environment::isObjectOfType(const unsigned int address) const
{
if (hasRegisteredObject(address))
{
if (auto h = dynamic_cast<Holder<T> *>(object_[address].data.get()))
{
return true;
}
else
{
return false;
}
}
else if (hasObject(address))
{
HADRON_ERROR("object with address " + std::to_string(address) +
" exists but is not registered");
}
else
{
HADRON_ERROR("no object with address " + std::to_string(address));
}
}
template <typename T>
bool Environment::isObjectOfType(const std::string name) const
{
return isObjectOfType<T>(getObjectAddress(name));
}
END_HADRONS_NAMESPACE
#endif // Hadrons_Environment_hpp_

View File

@ -51,23 +51,43 @@ using Grid::operator<<;
* error with GCC 5 (clang & GCC 6 compile fine without it).
*/
// FIXME: find a way to do that in a more general fashion
#ifndef FIMPL
#define FIMPL WilsonImplR
#endif
#ifndef SIMPL
#define SIMPL ScalarImplCR
#endif
BEGIN_HADRONS_NAMESPACE
// type aliases
#define TYPE_ALIASES(FImpl, suffix)\
#define FERM_TYPE_ALIASES(FImpl, suffix)\
typedef FermionOperator<FImpl> FMat##suffix; \
typedef typename FImpl::FermionField FermionField##suffix; \
typedef typename FImpl::PropagatorField PropagatorField##suffix; \
typedef typename FImpl::SitePropagator SitePropagator##suffix; \
typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;\
typedef std::function<void(FermionField##suffix &, \
typedef std::vector<typename FImpl::SitePropagator::scalar_object> \
SlicedPropagator##suffix;
#define GAUGE_TYPE_ALIASES(FImpl, suffix)\
typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;
#define SCALAR_TYPE_ALIASES(SImpl, suffix)\
typedef typename SImpl::Field ScalarField##suffix;\
typedef typename SImpl::Field PropagatorField##suffix;
#define SOLVER_TYPE_ALIASES(FImpl, suffix)\
typedef std::function<void(FermionField##suffix &,\
const FermionField##suffix &)> SolverFn##suffix;
#define SINK_TYPE_ALIASES(suffix)\
typedef std::function<SlicedPropagator##suffix(const PropagatorField##suffix &)> SinkFn##suffix;
#define FGS_TYPE_ALIASES(FImpl, suffix)\
FERM_TYPE_ALIASES(FImpl, suffix)\
GAUGE_TYPE_ALIASES(FImpl, suffix)\
SOLVER_TYPE_ALIASES(FImpl, suffix)
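The suffix argument of these macros is pasted onto every alias, which is what lets a single module mix several implementations (e.g. FERM_TYPE_ALIASES(FImpl1, 1) and FERM_TYPE_ALIASES(FImpl2, 2) in the contraction modules below) without name clashes. A self-contained toy illustration of the pasting mechanism, using hypothetical dummy types in place of the real Grid implementations:

#include <iostream>

struct DummyImpl
{
    typedef float  FermionField;
    typedef double PropagatorField;
};

// cut-down analogue of FERM_TYPE_ALIASES
#define DEMO_ALIASES(Impl, suffix)\
typedef typename Impl::FermionField    FermionField##suffix;\
typedef typename Impl::PropagatorField PropagatorField##suffix;

template <typename FImpl1, typename FImpl2>
struct DemoModule
{
    DEMO_ALIASES(FImpl1, 1);
    DEMO_ALIASES(FImpl2, 2);
    FermionField1    a; // float
    PropagatorField2 b; // double
};

int main(void)
{
    DemoModule<DummyImpl, DummyImpl> m;
    m.a = 1.f; m.b = 2.;
    std::cout << m.a + m.b << std::endl;
    return 0;
}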
// logger
class HadronsLogger: public Logger
{

View File

@ -1,31 +1,3 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules.hpp
Copyright (C) 2015
Copyright (C) 2016
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Hadrons/Modules/MAction/DWF.hpp>
#include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
#include <Grid/Hadrons/Modules/MContraction/Baryon.hpp>
@ -36,13 +8,18 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
#include <Grid/Hadrons/Modules/MFermion/GaugeProp.hpp>
#include <Grid/Hadrons/Modules/MGauge/Load.hpp>
#include <Grid/Hadrons/Modules/MGauge/Random.hpp>
#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
#include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
#include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp>
#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
#include <Grid/Hadrons/Modules/MSink/Point.hpp>
#include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
#include <Grid/Hadrons/Modules/MSource/Point.hpp>
#include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
#include <Grid/Hadrons/Modules/MSource/Wall.hpp>
#include <Grid/Hadrons/Modules/MSource/Z2.hpp>
#include <Grid/Hadrons/Modules/Quark.hpp>

View File

@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_DWF_hpp_
#define Hadrons_DWF_hpp_
#ifndef Hadrons_MAction_DWF_hpp_
#define Hadrons_MAction_DWF_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -56,7 +56,7 @@ template <typename FImpl>
class TDWF: public Module<DWFPar>
{
public:
TYPE_ALIASES(FImpl,);
FGS_TYPE_ALIASES(FImpl,);
public:
// constructor
TDWF(const std::string name);
@ -137,4 +137,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_DWF_hpp_
#endif // Hadrons_MAction_DWF_hpp_

View File

@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Wilson_hpp_
#define Hadrons_Wilson_hpp_
#ifndef Hadrons_MAction_Wilson_hpp_
#define Hadrons_MAction_Wilson_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -54,7 +54,7 @@ template <typename FImpl>
class TWilson: public Module<WilsonPar>
{
public:
TYPE_ALIASES(FImpl,);
FGS_TYPE_ALIASES(FImpl,);
public:
// constructor
TWilson(const std::string name);

View File

@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Baryon_hpp_
#define Hadrons_Baryon_hpp_
#ifndef Hadrons_MContraction_Baryon_hpp_
#define Hadrons_MContraction_Baryon_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -55,9 +55,9 @@ template <typename FImpl1, typename FImpl2, typename FImpl3>
class TBaryon: public Module<BaryonPar>
{
public:
TYPE_ALIASES(FImpl1, 1);
TYPE_ALIASES(FImpl2, 2);
TYPE_ALIASES(FImpl3, 3);
FERM_TYPE_ALIASES(FImpl1, 1);
FERM_TYPE_ALIASES(FImpl2, 2);
FERM_TYPE_ALIASES(FImpl3, 3);
class Result: Serializable
{
public:
@ -121,11 +121,11 @@ void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)
// FIXME: do contractions
write(writer, "meson", result);
// write(writer, "meson", result);
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_Baryon_hpp_
#endif // Hadrons_MContraction_Baryon_hpp_

View File

@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_DiscLoop_hpp_
#define Hadrons_DiscLoop_hpp_
#ifndef Hadrons_MContraction_DiscLoop_hpp_
#define Hadrons_MContraction_DiscLoop_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -52,7 +52,7 @@ public:
template <typename FImpl>
class TDiscLoop: public Module<DiscLoopPar>
{
TYPE_ALIASES(FImpl,);
FERM_TYPE_ALIASES(FImpl,);
class Result: Serializable
{
public:
@ -141,4 +141,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_DiscLoop_hpp_
#endif // Hadrons_MContraction_DiscLoop_hpp_

View File

@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Gamma3pt_hpp_
#define Hadrons_Gamma3pt_hpp_
#ifndef Hadrons_MContraction_Gamma3pt_hpp_
#define Hadrons_MContraction_Gamma3pt_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -72,9 +72,9 @@ public:
template <typename FImpl1, typename FImpl2, typename FImpl3>
class TGamma3pt: public Module<Gamma3ptPar>
{
TYPE_ALIASES(FImpl1, 1);
TYPE_ALIASES(FImpl2, 2);
TYPE_ALIASES(FImpl3, 3);
FERM_TYPE_ALIASES(FImpl1, 1);
FERM_TYPE_ALIASES(FImpl2, 2);
FERM_TYPE_ALIASES(FImpl3, 3);
class Result: Serializable
{
public:
@ -167,4 +167,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_Gamma3pt_hpp_
#endif // Hadrons_MContraction_Gamma3pt_hpp_

View File

@ -29,8 +29,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Meson_hpp_
#define Hadrons_Meson_hpp_
#ifndef Hadrons_MContraction_Meson_hpp_
#define Hadrons_MContraction_Meson_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -69,7 +69,7 @@ public:
std::string, q1,
std::string, q2,
std::string, gammas,
std::string, mom,
std::string, sink,
std::string, output);
};
@ -77,8 +77,10 @@ template <typename FImpl1, typename FImpl2>
class TMeson: public Module<MesonPar>
{
public:
TYPE_ALIASES(FImpl1, 1);
TYPE_ALIASES(FImpl2, 2);
FERM_TYPE_ALIASES(FImpl1, 1);
FERM_TYPE_ALIASES(FImpl2, 2);
FERM_TYPE_ALIASES(ScalarImplCR, Scalar);
SINK_TYPE_ALIASES(Scalar);
class Result: Serializable
{
public:
@ -115,7 +117,7 @@ TMeson<FImpl1, FImpl2>::TMeson(const std::string name)
template <typename FImpl1, typename FImpl2>
std::vector<std::string> TMeson<FImpl1, FImpl2>::getInput(void)
{
std::vector<std::string> input = {par().q1, par().q2};
std::vector<std::string> input = {par().q1, par().q2, par().sink};
return input;
}
@ -154,6 +156,9 @@ void TMeson<FImpl1, FImpl2>::parseGammaString(std::vector<GammaPair> &gammaList)
// execution ///////////////////////////////////////////////////////////////////
#define mesonConnected(q1, q2, gSnk, gSrc) \
(g5*(gSnk))*(q1)*(adj(gSrc)*g5)*adj(q2)
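In formulas, the macro builds the connected meson contraction whose trace is then accumulated on time slices (either by sliceSum or by the chosen sink function):

\[
C_{\Gamma_{\rm snk}\Gamma_{\rm src}}(t) \;=\; \sum_{\vec x}\,
\mathrm{tr}\Big[(\gamma_5\,\Gamma_{\rm snk})\,q_1(\vec x,t)\,
(\Gamma_{\rm src}^{\dagger}\,\gamma_5)\,q_2^{\dagger}(\vec x,t)\Big].
\]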
template <typename FImpl1, typename FImpl2>
void TMeson<FImpl1, FImpl2>::execute(void)
{
@ -162,44 +167,73 @@ void TMeson<FImpl1, FImpl2>::execute(void)
<< std::endl;
CorrWriter writer(par().output);
PropagatorField1 &q1 = *env().template getObject<PropagatorField1>(par().q1);
PropagatorField2 &q2 = *env().template getObject<PropagatorField2>(par().q2);
LatticeComplex c(env().getGrid());
Gamma g5(Gamma::Algebra::Gamma5);
std::vector<GammaPair> gammaList;
std::vector<TComplex> buf;
std::vector<Result> result;
std::vector<Real> p;
p = strToVec<Real>(par().mom);
LatticeComplex ph(env().getGrid()), coor(env().getGrid());
Complex i(0.0,1.0);
ph = zero;
for(unsigned int mu = 0; mu < env().getNd(); mu++)
{
LatticeCoordinate(coor, mu);
ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu])));
}
ph = exp((Real)(2*M_PI)*i*ph);
Gamma g5(Gamma::Algebra::Gamma5);
std::vector<GammaPair> gammaList;
int nt = env().getDim(Tp);
parseGammaString(gammaList);
result.resize(gammaList.size());
for (unsigned int i = 0; i < result.size(); ++i)
{
result[i].gamma_snk = gammaList[i].first;
result[i].gamma_src = gammaList[i].second;
result[i].corr.resize(nt);
}
if (env().template isObjectOfType<SlicedPropagator1>(par().q1) and
env().template isObjectOfType<SlicedPropagator2>(par().q2))
{
SlicedPropagator1 &q1 = *env().template getObject<SlicedPropagator1>(par().q1);
SlicedPropagator2 &q2 = *env().template getObject<SlicedPropagator2>(par().q2);
LOG(Message) << "(propagator already sinked)" << std::endl;
for (unsigned int i = 0; i < result.size(); ++i)
{
Gamma gSnk(gammaList[i].first);
Gamma gSrc(gammaList[i].second);
c = trace((g5*gSnk)*q1*(adj(gSrc)*g5)*adj(q2))*ph;
sliceSum(c, buf, Tp);
result[i].gamma_snk = gammaList[i].first;
result[i].gamma_src = gammaList[i].second;
result[i].corr.resize(buf.size());
for (unsigned int t = 0; t < buf.size(); ++t)
{
result[i].corr[t] = TensorRemove(trace(mesonConnected(q1[t], q2[t], gSnk, gSrc)));
}
}
}
else
{
PropagatorField1 &q1 = *env().template getObject<PropagatorField1>(par().q1);
PropagatorField2 &q2 = *env().template getObject<PropagatorField2>(par().q2);
LatticeComplex c(env().getGrid());
LOG(Message) << "(using sink '" << par().sink << "')" << std::endl;
for (unsigned int i = 0; i < result.size(); ++i)
{
Gamma gSnk(gammaList[i].first);
Gamma gSrc(gammaList[i].second);
std::string ns;
ns = env().getModuleNamespace(env().getObjectModule(par().sink));
if (ns == "MSource")
{
PropagatorField1 &sink =
*env().template getObject<PropagatorField1>(par().sink);
c = trace(mesonConnected(q1, q2, gSnk, gSrc)*sink);
sliceSum(c, buf, Tp);
}
else if (ns == "MSink")
{
SinkFnScalar &sink = *env().template getObject<SinkFnScalar>(par().sink);
c = trace(mesonConnected(q1, q2, gSnk, gSrc));
buf = sink(c);
}
for (unsigned int t = 0; t < buf.size(); ++t)
{
result[i].corr[t] = TensorRemove(buf[t]);
}
}
}
write(writer, "meson", result);
}
@ -207,4 +241,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_Meson_hpp_
#endif // Hadrons_MContraction_Meson_hpp_

View File

@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_WeakHamiltonian_hpp_
#define Hadrons_WeakHamiltonian_hpp_
#ifndef Hadrons_MContraction_WeakHamiltonian_hpp_
#define Hadrons_MContraction_WeakHamiltonian_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -83,7 +83,7 @@ public:
class T##modname: public Module<WeakHamiltonianPar>\
{\
public:\
TYPE_ALIASES(FIMPL,)\
FERM_TYPE_ALIASES(FIMPL,)\
class Result: Serializable\
{\
public:\
@ -111,4 +111,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WeakHamiltonian_hpp_
#endif // Hadrons_MContraction_WeakHamiltonian_hpp_

View File

@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_WeakHamiltonianEye_hpp_
#define Hadrons_WeakHamiltonianEye_hpp_
#ifndef Hadrons_MContraction_WeakHamiltonianEye_hpp_
#define Hadrons_MContraction_WeakHamiltonianEye_hpp_
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
@ -55,4 +55,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WeakHamiltonianEye_hpp_
#endif // Hadrons_MContraction_WeakHamiltonianEye_hpp_

View File

@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_WeakHamiltonianNonEye_hpp_
#define Hadrons_WeakHamiltonianNonEye_hpp_
#ifndef Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
#define Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
@ -54,4 +54,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WeakHamiltonianNonEye_hpp_
#endif // Hadrons_MContraction_WeakHamiltonianNonEye_hpp_

View File

@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_WeakNeutral4ptDisc_hpp_
#define Hadrons_WeakNeutral4ptDisc_hpp_
#ifndef Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
#define Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
@ -56,4 +56,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WeakNeutral4ptDisc_hpp_
#endif // Hadrons_MContraction_WeakNeutral4ptDisc_hpp_

View File

@ -1,34 +1,5 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/Quark.hpp
Copyright (C) 2015
Copyright (C) 2016
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Quark_hpp_
#define Hadrons_Quark_hpp_
#ifndef Hadrons_MFermion_GaugeProp_hpp_
#define Hadrons_MFermion_GaugeProp_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -37,27 +8,29 @@ See the full license in the file "LICENSE" in the top level distribution directo
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* TQuark *
* GaugeProp *
******************************************************************************/
class QuarkPar: Serializable
BEGIN_MODULE_NAMESPACE(MFermion)
class GaugePropPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(QuarkPar,
GRID_SERIALIZABLE_CLASS_MEMBERS(GaugePropPar,
std::string, source,
std::string, solver);
};
template <typename FImpl>
class TQuark: public Module<QuarkPar>
class TGaugeProp: public Module<GaugePropPar>
{
public:
TYPE_ALIASES(FImpl,);
FGS_TYPE_ALIASES(FImpl,);
public:
// constructor
TQuark(const std::string name);
TGaugeProp(const std::string name);
// destructor
virtual ~TQuark(void) = default;
// dependencies/products
virtual ~TGaugeProp(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
@ -69,20 +42,20 @@ private:
SolverFn *solver_{nullptr};
};
MODULE_REGISTER(Quark, TQuark<FIMPL>);
MODULE_REGISTER_NS(GaugeProp, TGaugeProp<FIMPL>, MFermion);
/******************************************************************************
* TQuark implementation *
* TGaugeProp implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TQuark<FImpl>::TQuark(const std::string name)
: Module(name)
TGaugeProp<FImpl>::TGaugeProp(const std::string name)
: Module<GaugePropPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TQuark<FImpl>::getInput(void)
std::vector<std::string> TGaugeProp<FImpl>::getInput(void)
{
std::vector<std::string> in = {par().source, par().solver};
@ -90,7 +63,7 @@ std::vector<std::string> TQuark<FImpl>::getInput(void)
}
template <typename FImpl>
std::vector<std::string> TQuark<FImpl>::getOutput(void)
std::vector<std::string> TGaugeProp<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName(), getName() + "_5d"};
@ -99,7 +72,7 @@ std::vector<std::string> TQuark<FImpl>::getOutput(void)
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TQuark<FImpl>::setup(void)
void TGaugeProp<FImpl>::setup(void)
{
Ls_ = env().getObjectLs(par().solver);
env().template registerLattice<PropagatorField>(getName());
@ -111,7 +84,7 @@ void TQuark<FImpl>::setup(void)
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TQuark<FImpl>::execute(void)
void TGaugeProp<FImpl>::execute(void)
{
LOG(Message) << "Computing quark propagator '" << getName() << "'"
<< std::endl;
@ -180,6 +153,8 @@ void TQuark<FImpl>::execute(void)
}
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_Quark_hpp_
#endif // Hadrons_MFermion_GaugeProp_hpp_

View File

@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Load_hpp_
#define Hadrons_Load_hpp_
#ifndef Hadrons_MGauge_Load_hpp_
#define Hadrons_MGauge_Load_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -70,4 +70,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_Load_hpp_
#endif // Hadrons_MGauge_Load_hpp_

View File

@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Random_hpp_
#define Hadrons_Random_hpp_
#ifndef Hadrons_MGauge_Random_hpp_
#define Hadrons_MGauge_Random_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -63,4 +63,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_Random_hpp_
#endif // Hadrons_MGauge_Random_hpp_

View File

@ -0,0 +1,88 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MGauge/StochEm.cc
Copyright (C) 2015
Copyright (C) 2016
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MGauge;
/******************************************************************************
* TStochEm implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TStochEm::TStochEm(const std::string name)
: Module<StochEmPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TStochEm::getInput(void)
{
std::vector<std::string> in;
return in;
}
std::vector<std::string> TStochEm::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
void TStochEm::setup(void)
{
if (!env().hasRegisteredObject("_" + getName() + "_weight"))
{
env().registerLattice<EmComp>("_" + getName() + "_weight");
}
env().registerLattice<EmField>(getName());
}
// execution ///////////////////////////////////////////////////////////////////
void TStochEm::execute(void)
{
PhotonR photon(par().gauge, par().zmScheme);
EmField &a = *env().createLattice<EmField>(getName());
EmComp *w;
if (!env().hasCreatedObject("_" + getName() + "_weight"))
{
LOG(Message) << "Caching stochatic EM potential weight (gauge: "
<< par().gauge << ", zero-mode scheme: "
<< par().zmScheme << ")..." << std::endl;
w = env().createLattice<EmComp>("_" + getName() + "_weight");
photon.StochasticWeight(*w);
}
else
{
w = env().getObject<EmComp>("_" + getName() + "_weight");
}
LOG(Message) << "Generating stochatic EM potential..." << std::endl;
photon.StochasticField(a, *env().get4dRng(), *w);
}

View File

@ -0,0 +1,75 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MGauge/StochEm.hpp
Copyright (C) 2015
Copyright (C) 2016
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MGauge_StochEm_hpp_
#define Hadrons_MGauge_StochEm_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* StochEm *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MGauge)
class StochEmPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(StochEmPar,
PhotonR::Gauge, gauge,
PhotonR::ZmScheme, zmScheme);
};
class TStochEm: public Module<StochEmPar>
{
public:
typedef PhotonR::GaugeField EmField;
typedef PhotonR::GaugeLinkField EmComp;
public:
// constructor
TStochEm(const std::string name);
// destructor
virtual ~TStochEm(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(StochEm, TStochEm, MGauge);
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MGauge_StochEm_hpp_

View File

@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Unit_hpp_
#define Hadrons_Unit_hpp_
#ifndef Hadrons_MGauge_Unit_hpp_
#define Hadrons_MGauge_Unit_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -63,4 +63,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_Unit_hpp_
#endif // Hadrons_MGauge_Unit_hpp_

View File

@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_NoiseLoop_hpp_
#define Hadrons_NoiseLoop_hpp_
#ifndef Hadrons_MLoop_NoiseLoop_hpp_
#define Hadrons_MLoop_NoiseLoop_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -65,7 +65,7 @@ template <typename FImpl>
class TNoiseLoop: public Module<NoiseLoopPar>
{
public:
TYPE_ALIASES(FImpl,);
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TNoiseLoop(const std::string name);
@ -129,4 +129,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_NoiseLoop_hpp_
#endif // Hadrons_MLoop_NoiseLoop_hpp_

View File

@ -0,0 +1,226 @@
#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MScalar;
/******************************************************************************
* TChargedProp implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TChargedProp::TChargedProp(const std::string name)
: Module<ChargedPropPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TChargedProp::getInput(void)
{
std::vector<std::string> in = {par().source, par().emField};
return in;
}
std::vector<std::string> TChargedProp::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
void TChargedProp::setup(void)
{
freeMomPropName_ = FREEMOMPROP(par().mass);
phaseName_.clear();
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
phaseName_.push_back("_shiftphase_" + std::to_string(mu));
}
GFSrcName_ = "_" + getName() + "_DinvSrc";
if (!env().hasRegisteredObject(freeMomPropName_))
{
env().registerLattice<ScalarField>(freeMomPropName_);
}
if (!env().hasRegisteredObject(phaseName_[0]))
{
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
env().registerLattice<ScalarField>(phaseName_[mu]);
}
}
if (!env().hasRegisteredObject(GFSrcName_))
{
env().registerLattice<ScalarField>(GFSrcName_);
}
env().registerLattice<ScalarField>(getName());
}
// execution ///////////////////////////////////////////////////////////////////
void TChargedProp::execute(void)
{
// CACHING ANALYTIC EXPRESSIONS
ScalarField &source = *env().getObject<ScalarField>(par().source);
Complex ci(0.0,1.0);
FFT fft(env().getGrid());
// cache free scalar propagator
if (!env().hasCreatedObject(freeMomPropName_))
{
LOG(Message) << "Caching momentum space free scalar propagator"
<< " (mass= " << par().mass << ")..." << std::endl;
freeMomProp_ = env().createLattice<ScalarField>(freeMomPropName_);
SIMPL::MomentumSpacePropagator(*freeMomProp_, par().mass);
}
else
{
freeMomProp_ = env().getObject<ScalarField>(freeMomPropName_);
}
// cache G*F*src
if (!env().hasCreatedObject(GFSrcName_))
{
GFSrc_ = env().createLattice<ScalarField>(GFSrcName_);
fft.FFT_all_dim(*GFSrc_, source, FFT::forward);
*GFSrc_ = (*freeMomProp_)*(*GFSrc_);
}
else
{
GFSrc_ = env().getObject<ScalarField>(GFSrcName_);
}
// cache phases
if (!env().hasCreatedObject(phaseName_[0]))
{
std::vector<int> &l = env().getGrid()->_fdimensions;
LOG(Message) << "Caching shift phases..." << std::endl;
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
Real twoPiL = M_PI*2./l[mu];
phase_.push_back(env().createLattice<ScalarField>(phaseName_[mu]));
LatticeCoordinate(*(phase_[mu]), mu);
*(phase_[mu]) = exp(ci*twoPiL*(*(phase_[mu])));
}
}
else
{
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
phase_.push_back(env().getObject<ScalarField>(phaseName_[mu]));
}
}
// PROPAGATOR CALCULATION
LOG(Message) << "Computing charged scalar propagator"
<< " (mass= " << par().mass
<< ", charge= " << par().charge << ")..." << std::endl;
ScalarField &prop = *env().createLattice<ScalarField>(getName());
ScalarField buf(env().getGrid());
ScalarField &GFSrc = *GFSrc_, &G = *freeMomProp_;
double q = par().charge;
// G*F*Src
prop = GFSrc;
// - q*G*momD1*G*F*Src (momD1 = F*D1*Finv)
buf = GFSrc;
momD1(buf, fft);
buf = G*buf;
prop = prop - q*buf;
// + q^2*G*momD1*G*momD1*G*F*Src (here buf = G*momD1*G*F*Src)
momD1(buf, fft);
prop = prop + q*q*G*buf;
// - q^2*G*momD2*G*F*Src (momD2 = F*D2*Finv)
buf = GFSrc;
momD2(buf, fft);
prop = prop - q*q*G*buf;
// final FT
fft.FFT_all_dim(prop, prop, FFT::backward);
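Collecting the steps above: with G the free momentum-space propagator, S the Fourier-transformed source (so GFSrc = G*S) and momD1, momD2 the vertex insertions defined below, the field assembled here is the charged scalar propagator expanded to second order in the charge q,

\[
\tilde S_q \;=\; \Big[\,\tilde G \;-\; q\,\tilde G\,\tilde D_1\,\tilde G
\;+\; q^2\,\tilde G\,\tilde D_1\,\tilde G\,\tilde D_1\,\tilde G
\;-\; q^2\,\tilde G\,\tilde D_2\,\tilde G \;+\; \mathcal{O}(q^3)\Big]\,\tilde S_{\rm src},
\]

which the final FFT transforms back to position space.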
// OUTPUT IF NECESSARY
if (!par().output.empty())
{
std::string filename = par().output + "." +
std::to_string(env().getTrajectory());
LOG(Message) << "Saving zero-momentum projection to '"
<< filename << "'..." << std::endl;
CorrWriter writer(filename);
std::vector<TComplex> vecBuf;
std::vector<Complex> result;
sliceSum(prop, vecBuf, Tp);
result.resize(vecBuf.size());
for (unsigned int t = 0; t < vecBuf.size(); ++t)
{
result[t] = TensorRemove(vecBuf[t]);
}
write(writer, "charge", q);
write(writer, "prop", result);
}
}
void TChargedProp::momD1(ScalarField &s, FFT &fft)
{
EmField &A = *env().getObject<EmField>(par().emField);
ScalarField buf(env().getGrid()), result(env().getGrid()),
Amu(env().getGrid());
Complex ci(0.0,1.0);
result = zero;
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
Amu = peekLorentz(A, mu);
buf = (*phase_[mu])*s;
fft.FFT_all_dim(buf, buf, FFT::backward);
buf = Amu*buf;
fft.FFT_all_dim(buf, buf, FFT::forward);
result = result - ci*buf;
}
fft.FFT_all_dim(s, s, FFT::backward);
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
Amu = peekLorentz(A, mu);
buf = Amu*s;
fft.FFT_all_dim(buf, buf, FFT::forward);
result = result + ci*adj(*phase_[mu])*buf;
}
s = result;
}
void TChargedProp::momD2(ScalarField &s, FFT &fft)
{
EmField &A = *env().getObject<EmField>(par().emField);
ScalarField buf(env().getGrid()), result(env().getGrid()),
Amu(env().getGrid());
result = zero;
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
Amu = peekLorentz(A, mu);
buf = (*phase_[mu])*s;
fft.FFT_all_dim(buf, buf, FFT::backward);
buf = Amu*Amu*buf;
fft.FFT_all_dim(buf, buf, FFT::forward);
result = result + .5*buf;
}
fft.FFT_all_dim(s, s, FFT::backward);
for (unsigned int mu = 0; mu < env().getNd(); ++mu)
{
Amu = peekLorentz(A, mu);
buf = Amu*Amu*s;
fft.FFT_all_dim(buf, buf, FFT::forward);
result = result + .5*adj(*phase_[mu])*buf;
}
s = result;
}
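For reference, with F the forward FFT (FFT::forward), F^{-1} the backward FFT, and e^{i 2\pi k_\mu/L_\mu} the one-site shift phases cached in execute(), the two insertions act on a momentum-space field s as

\[
\mathrm{momD1}\,s \;=\; -\,i\sum_\mu \mathcal{F}\!\left[A_\mu\,
\mathcal{F}^{-1}\!\big(e^{\,i 2\pi k_\mu/L_\mu}\,s\big)\right]
\;+\; i\sum_\mu e^{-\,i 2\pi k_\mu/L_\mu}\,
\mathcal{F}\!\left[A_\mu\,\mathcal{F}^{-1} s\right],
\]
\[
\mathrm{momD2}\,s \;=\; \tfrac{1}{2}\sum_\mu \mathcal{F}\!\left[A_\mu^{2}\,
\mathcal{F}^{-1}\!\big(e^{\,i 2\pi k_\mu/L_\mu}\,s\big)\right]
\;+\; \tfrac{1}{2}\sum_\mu e^{-\,i 2\pi k_\mu/L_\mu}\,
\mathcal{F}\!\left[A_\mu^{2}\,\mathcal{F}^{-1} s\right],
\]

a direct transcription of the two loops in each function.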

View File

@ -0,0 +1,61 @@
#ifndef Hadrons_MScalar_ChargedProp_hpp_
#define Hadrons_MScalar_ChargedProp_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Charged scalar propagator *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalar)
class ChargedPropPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(ChargedPropPar,
std::string, emField,
std::string, source,
double, mass,
double, charge,
std::string, output);
};
class TChargedProp: public Module<ChargedPropPar>
{
public:
SCALAR_TYPE_ALIASES(SIMPL,);
typedef PhotonR::GaugeField EmField;
typedef PhotonR::GaugeLinkField EmComp;
public:
// constructor
TChargedProp(const std::string name);
// destructor
virtual ~TChargedProp(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
private:
void momD1(ScalarField &s, FFT &fft);
void momD2(ScalarField &s, FFT &fft);
private:
std::string freeMomPropName_, GFSrcName_;
std::vector<std::string> phaseName_;
ScalarField *freeMomProp_, *GFSrc_;
std::vector<ScalarField *> phase_;
EmField *A;
};
MODULE_REGISTER_NS(ChargedProp, TChargedProp, MScalar);
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MScalar_ChargedProp_hpp_

View File

@ -0,0 +1,79 @@
#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
using namespace Grid;
using namespace Hadrons;
using namespace MScalar;
/******************************************************************************
* TFreeProp implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
TFreeProp::TFreeProp(const std::string name)
: Module<FreePropPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
std::vector<std::string> TFreeProp::getInput(void)
{
std::vector<std::string> in = {par().source};
return in;
}
std::vector<std::string> TFreeProp::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
void TFreeProp::setup(void)
{
freeMomPropName_ = FREEMOMPROP(par().mass);
if (!env().hasRegisteredObject(freeMomPropName_))
{
env().registerLattice<ScalarField>(freeMomPropName_);
}
env().registerLattice<ScalarField>(getName());
}
// execution ///////////////////////////////////////////////////////////////////
void TFreeProp::execute(void)
{
ScalarField &prop = *env().createLattice<ScalarField>(getName());
ScalarField &source = *env().getObject<ScalarField>(par().source);
ScalarField *freeMomProp;
if (!env().hasCreatedObject(freeMomPropName_))
{
LOG(Message) << "Caching momentum space free scalar propagator"
<< " (mass= " << par().mass << ")..." << std::endl;
freeMomProp = env().createLattice<ScalarField>(freeMomPropName_);
SIMPL::MomentumSpacePropagator(*freeMomProp, par().mass);
}
else
{
freeMomProp = env().getObject<ScalarField>(freeMomPropName_);
}
LOG(Message) << "Computing free scalar propagator..." << std::endl;
SIMPL::FreePropagator(source, prop, *freeMomProp);
if (!par().output.empty())
{
TextWriter writer(par().output + "." +
std::to_string(env().getTrajectory()));
std::vector<TComplex> buf;
std::vector<Complex> result;
sliceSum(prop, buf, Tp);
result.resize(buf.size());
for (unsigned int t = 0; t < buf.size(); ++t)
{
result[t] = TensorRemove(buf[t]);
}
write(writer, "prop", result);
}
}

View File

@ -0,0 +1,50 @@
#ifndef Hadrons_MScalar_FreeProp_hpp_
#define Hadrons_MScalar_FreeProp_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* FreeProp *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalar)
class FreePropPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(FreePropPar,
std::string, source,
double, mass,
std::string, output);
};
class TFreeProp: public Module<FreePropPar>
{
public:
SCALAR_TYPE_ALIASES(SIMPL,);
public:
// constructor
TFreeProp(const std::string name);
// destructor
virtual ~TFreeProp(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
private:
std::string freeMomPropName_;
};
MODULE_REGISTER_NS(FreeProp, TFreeProp, MScalar);
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MScalar_FreeProp_hpp_

View File

@ -0,0 +1,6 @@
#ifndef Hadrons_Scalar_hpp_
#define Hadrons_Scalar_hpp_
#define FREEMOMPROP(m) "_scalar_mom_prop_" + std::to_string(m)
#endif // Hadrons_Scalar_hpp_
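As a worked example (the mass value is hypothetical), FREEMOMPROP(0.25) expands to "_scalar_mom_prop_" + std::to_string(0.25), i.e. the object name "_scalar_mom_prop_0.250000". Since both FreeProp and ChargedProp build their cache key this way, runs using the same mass share a single cached momentum-space propagator.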

View File

@ -0,0 +1,114 @@
#ifndef Hadrons_MSink_Point_hpp_
#define Hadrons_MSink_Point_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Point *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MSink)
class PointPar: Serializable
{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(PointPar,
std::string, mom);
};
template <typename FImpl>
class TPoint: public Module<PointPar>
{
public:
FERM_TYPE_ALIASES(FImpl,);
SINK_TYPE_ALIASES();
public:
// constructor
TPoint(const std::string name);
// destructor
virtual ~TPoint(void) = default;
// dependency relation
virtual std::vector<std::string> getInput(void);
virtual std::vector<std::string> getOutput(void);
// setup
virtual void setup(void);
// execution
virtual void execute(void);
};
MODULE_REGISTER_NS(Point, TPoint<FIMPL>, MSink);
MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplCR>, MSink);
/******************************************************************************
* TPoint implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TPoint<FImpl>::TPoint(const std::string name)
: Module<PointPar>(name)
{}
// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TPoint<FImpl>::getInput(void)
{
std::vector<std::string> in;
return in;
}
template <typename FImpl>
std::vector<std::string> TPoint<FImpl>::getOutput(void)
{
std::vector<std::string> out = {getName()};
return out;
}
// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TPoint<FImpl>::setup(void)
{
unsigned int size;
size = env().template lattice4dSize<LatticeComplex>();
env().registerObject(getName(), size);
}
// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TPoint<FImpl>::execute(void)
{
std::vector<Real> p = strToVec<Real>(par().mom);
LatticeComplex ph(env().getGrid()), coor(env().getGrid());
Complex i(0.0,1.0);
LOG(Message) << "Setting up point sink function for momentum ["
<< par().mom << "]" << std::endl;
ph = zero;
for(unsigned int mu = 0; mu < env().getNd(); mu++)
{
LatticeCoordinate(coor, mu);
ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
}
ph = exp((Real)(2*M_PI)*i*ph);
auto sink = [ph](const PropagatorField &field)
{
SlicedPropagator res;
PropagatorField tmp = ph*field;
sliceSum(tmp, res, Tp);
return res;
};
env().setObject(getName(), new SinkFn(sink));
}
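The stored sink function therefore performs the usual momentum projection onto time slices,

\[
\Sigma(t) \;=\; \sum_{\vec x}\, e^{\,i\,2\pi \sum_\mu p_\mu x_\mu / L_\mu}\; q(\vec x, t),
\]

with p read from the mom parameter and L_\mu the global lattice extent in direction \mu.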
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MSink_Point_hpp_

View File

@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_RBPrecCG_hpp_
#define Hadrons_RBPrecCG_hpp_
#ifndef Hadrons_MSolver_RBPrecCG_hpp_
#define Hadrons_MSolver_RBPrecCG_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -53,7 +53,7 @@ template <typename FImpl>
class TRBPrecCG: public Module<RBPrecCGPar>
{
public:
TYPE_ALIASES(FImpl,);
FGS_TYPE_ALIASES(FImpl,);
public:
// constructor
TRBPrecCG(const std::string name);
@ -129,4 +129,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_RBPrecCG_hpp_
#endif // Hadrons_MSolver_RBPrecCG_hpp_

View File

@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Point_hpp_
#define Hadrons_Point_hpp_
#ifndef Hadrons_MSource_Point_hpp_
#define Hadrons_MSource_Point_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -63,7 +63,7 @@ template <typename FImpl>
class TPoint: public Module<PointPar>
{
public:
TYPE_ALIASES(FImpl,);
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TPoint(const std::string name);
@ -79,6 +79,7 @@ public:
};
MODULE_REGISTER_NS(Point, TPoint<FIMPL>, MSource);
MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplCR>, MSource);
/******************************************************************************
* TPoint template implementation *
@ -132,4 +133,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_Point_hpp_
#endif // Hadrons_MSource_Point_hpp_

View File

@ -28,8 +28,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_SeqGamma_hpp_
#define Hadrons_SeqGamma_hpp_
#ifndef Hadrons_MSource_SeqGamma_hpp_
#define Hadrons_MSource_SeqGamma_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -72,7 +72,7 @@ template <typename FImpl>
class TSeqGamma: public Module<SeqGammaPar>
{
public:
TYPE_ALIASES(FImpl,);
FGS_TYPE_ALIASES(FImpl,);
public:
// constructor
TSeqGamma(const std::string name);
@ -161,4 +161,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_SeqGamma_hpp_
#endif // Hadrons_MSource_SeqGamma_hpp_

View File

@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_WallSource_hpp_
#define Hadrons_WallSource_hpp_
#ifndef Hadrons_MSource_WallSource_hpp_
#define Hadrons_MSource_WallSource_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -64,7 +64,7 @@ template <typename FImpl>
class TWall: public Module<WallPar>
{
public:
TYPE_ALIASES(FImpl,);
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TWall(const std::string name);
@ -144,4 +144,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_WallSource_hpp_
#endif // Hadrons_MSource_WallSource_hpp_

View File

@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_Z2_hpp_
#define Hadrons_Z2_hpp_
#ifndef Hadrons_MSource_Z2_hpp_
#define Hadrons_MSource_Z2_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -67,7 +67,7 @@ template <typename FImpl>
class TZ2: public Module<Z2Par>
{
public:
TYPE_ALIASES(FImpl,);
FERM_TYPE_ALIASES(FImpl,);
public:
// constructor
TZ2(const std::string name);
@ -83,6 +83,7 @@ public:
};
MODULE_REGISTER_NS(Z2, TZ2<FIMPL>, MSource);
MODULE_REGISTER_NS(ScalarZ2, TZ2<ScalarImplCR>, MSource);
/******************************************************************************
* TZ2 template implementation *
@ -148,4 +149,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_Z2_hpp_
#endif // Hadrons_MSource_Z2_hpp_

View File

@ -1,5 +1,5 @@
#ifndef Hadrons____FILEBASENAME____hpp_
#define Hadrons____FILEBASENAME____hpp_
#ifndef Hadrons____NAMESPACE_______FILEBASENAME____hpp_
#define Hadrons____NAMESPACE_______FILEBASENAME____hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -41,4 +41,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons____FILEBASENAME____hpp_
#endif // Hadrons____NAMESPACE_______FILEBASENAME____hpp_

View File

@ -1,5 +1,5 @@
#ifndef Hadrons____FILEBASENAME____hpp_
#define Hadrons____FILEBASENAME____hpp_
#ifndef Hadrons____NAMESPACE_______FILEBASENAME____hpp_
#define Hadrons____NAMESPACE_______FILEBASENAME____hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
@ -82,4 +82,4 @@ END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons____FILEBASENAME____hpp_
#endif // Hadrons____NAMESPACE_______FILEBASENAME____hpp_

View File

@ -4,7 +4,10 @@ modules_cc =\
Modules/MContraction/WeakNeutral4ptDisc.cc \
Modules/MGauge/Load.cc \
Modules/MGauge/Random.cc \
Modules/MGauge/Unit.cc
Modules/MGauge/StochEm.cc \
Modules/MGauge/Unit.cc \
Modules/MScalar/ChargedProp.cc \
Modules/MScalar/FreeProp.cc
modules_hpp =\
Modules/MAction/DWF.hpp \
@ -17,14 +20,19 @@ modules_hpp =\
Modules/MContraction/WeakHamiltonianEye.hpp \
Modules/MContraction/WeakHamiltonianNonEye.hpp \
Modules/MContraction/WeakNeutral4ptDisc.hpp \
Modules/MFermion/GaugeProp.hpp \
Modules/MGauge/Load.hpp \
Modules/MGauge/Random.hpp \
Modules/MGauge/StochEm.hpp \
Modules/MGauge/Unit.hpp \
Modules/MLoop/NoiseLoop.hpp \
Modules/MScalar/ChargedProp.hpp \
Modules/MScalar/FreeProp.hpp \
Modules/MScalar/Scalar.hpp \
Modules/MSink/Point.hpp \
Modules/MSolver/RBPrecCG.hpp \
Modules/MSource/Point.hpp \
Modules/MSource/SeqGamma.hpp \
Modules/MSource/Wall.hpp \
Modules/MSource/Z2.hpp \
Modules/Quark.hpp
Modules/MSource/Z2.hpp

extras/qed-fvol/Global.cc
View File

@ -0,0 +1,11 @@
#include <qed-fvol/Global.hpp>
using namespace Grid;
using namespace QCD;
using namespace QedFVol;
QedFVolLogger QedFVol::QedFVolLogError(1,"Error");
QedFVolLogger QedFVol::QedFVolLogWarning(1,"Warning");
QedFVolLogger QedFVol::QedFVolLogMessage(1,"Message");
QedFVolLogger QedFVol::QedFVolLogIterative(1,"Iterative");
QedFVolLogger QedFVol::QedFVolLogDebug(1,"Debug");

View File

@ -0,0 +1,42 @@
#ifndef QedFVol_Global_hpp_
#define QedFVol_Global_hpp_
#include <Grid/Grid.h>
#define BEGIN_QEDFVOL_NAMESPACE \
namespace Grid {\
using namespace QCD;\
namespace QedFVol {\
using Grid::operator<<;
#define END_QEDFVOL_NAMESPACE }}
/* the 'using Grid::operator<<;' statement prevents a very nasty compilation
* error with GCC (clang compiles fine without it).
*/
BEGIN_QEDFVOL_NAMESPACE
class QedFVolLogger: public Logger
{
public:
QedFVolLogger(int on, std::string nm): Logger("QedFVol", on, nm,
GridLogColours, "BLACK"){};
};
#define LOG(channel) std::cout << QedFVolLog##channel
#define QEDFVOL_ERROR(msg)\
LOG(Error) << msg << " (" << __FUNCTION__ << " at " << __FILE__ << ":"\
<< __LINE__ << ")" << std::endl;\
abort();
#define DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl;
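For illustration (the variable name is made up), a call DEBUG_VAR(wlA); expands to LOG(Debug) << "wlA" << "= " << (wlA) << std::endl;.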
extern QedFVolLogger QedFVolLogError;
extern QedFVolLogger QedFVolLogWarning;
extern QedFVolLogger QedFVolLogMessage;
extern QedFVolLogger QedFVolLogIterative;
extern QedFVolLogger QedFVolLogDebug;
END_QEDFVOL_NAMESPACE
#endif // QedFVol_Global_hpp_

View File

@ -0,0 +1,9 @@
AM_CXXFLAGS += -I$(top_srcdir)/extras
bin_PROGRAMS = qed-fvol
qed_fvol_SOURCES = \
qed-fvol.cc \
Global.cc
qed_fvol_LDADD = -lGrid

View File

@ -0,0 +1,265 @@
#ifndef QEDFVOL_WILSONLOOPS_H
#define QEDFVOL_WILSONLOOPS_H
#include <Global.hpp>
BEGIN_QEDFVOL_NAMESPACE
template <class Gimpl> class NewWilsonLoops : public Gimpl {
public:
INHERIT_GIMPL_TYPES(Gimpl);
typedef typename Gimpl::GaugeLinkField GaugeMat;
typedef typename Gimpl::GaugeField GaugeLorentz;
//////////////////////////////////////////////////
// directed plaquette oriented in mu,nu plane
//////////////////////////////////////////////////
static void dirPlaquette(GaugeMat &plaq, const std::vector<GaugeMat> &U,
const int mu, const int nu) {
// Annoyingly, must use either scope resolution to find dependent base class,
// or this-> ; there is no "this" in a static method. This forces explicit
// Gimpl scope resolution throughout the usage in this file, and rather
// defeats the purpose of deriving from Gimpl.
plaq = Gimpl::CovShiftBackward(
U[mu], mu, Gimpl::CovShiftBackward(
U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[nu])));
}
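Up to the base point and traversal order fixed by Grid's CovShiftForward/CovShiftBackward conventions, this is the familiar 1x1 plaquette, of which only the (translation-invariant) site-summed trace is used below:

\[
P_{\mu\nu}(x) \;=\; U_\mu(x)\,U_\nu(x+\hat\mu)\,U_\mu^{\dagger}(x+\hat\nu)\,U_\nu^{\dagger}(x).
\]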
//////////////////////////////////////////////////
// trace of directed plaquette oriented in mu,nu plane
//////////////////////////////////////////////////
static void traceDirPlaquette(LatticeComplex &plaq,
const std::vector<GaugeMat> &U, const int mu,
const int nu) {
GaugeMat sp(U[0]._grid);
dirPlaquette(sp, U, mu, nu);
plaq = trace(sp);
}
//////////////////////////////////////////////////
// sum over all planes of plaquette
//////////////////////////////////////////////////
static void sitePlaquette(LatticeComplex &Plaq,
const std::vector<GaugeMat> &U) {
LatticeComplex sitePlaq(U[0]._grid);
Plaq = zero;
for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
for (int nu = 0; nu < mu; nu++) {
traceDirPlaquette(sitePlaq, U, mu, nu);
Plaq = Plaq + sitePlaq;
}
}
}
//////////////////////////////////////////////////
// sum over all x,y,z,t and over all planes of plaquette
//////////////////////////////////////////////////
static Real sumPlaquette(const GaugeLorentz &Umu) {
std::vector<GaugeMat> U(4, Umu._grid);
for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
LatticeComplex Plaq(Umu._grid);
sitePlaquette(Plaq, U);
TComplex Tp = sum(Plaq);
Complex p = TensorRemove(Tp);
return p.real();
}
//////////////////////////////////////////////////
// average over all x,y,z,t and over all planes of plaquette
//////////////////////////////////////////////////
static Real avgPlaquette(const GaugeLorentz &Umu) {
int ndim = Umu._grid->_ndimension;
Real sumplaq = sumPlaquette(Umu);
Real vol = Umu._grid->gSites();
Real faces = (1.0 * ndim * (ndim - 1)) / 2.0;
return sumplaq / vol / faces / Nc; // Nc dependent... FIXME
}
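The quantity returned is thus the plaquette trace averaged over sites, planes and colour,

\[
\langle P\rangle \;=\; \frac{2}{N_c\, V\, d(d-1)}\;
\mathrm{Re}\sum_{x}\sum_{\mu>\nu} \mathrm{tr}\, P_{\mu\nu}(x),
\qquad V = \mathrm{gSites()},\quad d = \mathrm{\_ndimension}.
\]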
//////////////////////////////////////////////////
// Wilson loop of size (R1, R2), oriented in mu,nu plane
//////////////////////////////////////////////////
static void wilsonLoop(GaugeMat &wl, const std::vector<GaugeMat> &U,
const int Rmu, const int Rnu,
const int mu, const int nu) {
wl = U[nu];
for(int i = 0; i < Rnu-1; i++){
wl = Gimpl::CovShiftForward(U[nu], nu, wl);
}
for(int i = 0; i < Rmu; i++){
wl = Gimpl::CovShiftForward(U[mu], mu, wl);
}
for(int i = 0; i < Rnu; i++){
wl = Gimpl::CovShiftBackward(U[nu], nu, wl);
}
for(int i = 0; i < Rmu; i++){
wl = Gimpl::CovShiftBackward(U[mu], mu, wl);
}
}
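Schematically (suppressing the site arguments supplied by the covariant shifts), the product assembled here traces out a closed R_\mu x R_\nu rectangle in the (\mu,\nu) plane, built from the inside out:

\[
W_{\mu\nu}(R_\mu,R_\nu) \;\sim\;
\Big[\textstyle\prod^{R_\mu} U_\mu^{\dagger}\Big]
\Big[\textstyle\prod^{R_\nu} U_\nu^{\dagger}\Big]
\Big[\textstyle\prod^{R_\mu} U_\mu\Big]
\Big[\textstyle\prod^{R_\nu} U_\nu\Big].
\]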
//////////////////////////////////////////////////
// trace of Wilson Loop oriented in mu,nu plane
//////////////////////////////////////////////////
static void traceWilsonLoop(LatticeComplex &wl,
const std::vector<GaugeMat> &U,
const int Rmu, const int Rnu,
const int mu, const int nu) {
GaugeMat sp(U[0]._grid);
wilsonLoop(sp, U, Rmu, Rnu, mu, nu);
wl = trace(sp);
}
//////////////////////////////////////////////////
// sum over all planes of Wilson loop
//////////////////////////////////////////////////
static void siteWilsonLoop(LatticeComplex &Wl,
const std::vector<GaugeMat> &U,
const int R1, const int R2) {
LatticeComplex siteWl(U[0]._grid);
Wl = zero;
for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
for (int nu = 0; nu < mu; nu++) {
traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
Wl = Wl + siteWl;
traceWilsonLoop(siteWl, U, R2, R1, mu, nu);
Wl = Wl + siteWl;
}
}
}
//////////////////////////////////////////////////
// sum over planes of Wilson loop with length R1
// in the time direction
//////////////////////////////////////////////////
static void siteTimelikeWilsonLoop(LatticeComplex &Wl,
const std::vector<GaugeMat> &U,
const int R1, const int R2) {
LatticeComplex siteWl(U[0]._grid);
int ndim = U[0]._grid->_ndimension;
Wl = zero;
for (int nu = 0; nu < ndim - 1; nu++) {
traceWilsonLoop(siteWl, U, R1, R2, ndim-1, nu);
Wl = Wl + siteWl;
}
}
//////////////////////////////////////////////////
// sum Wilson loop over all planes orthogonal to the time direction
//////////////////////////////////////////////////
static void siteSpatialWilsonLoop(LatticeComplex &Wl,
const std::vector<GaugeMat> &U,
const int R1, const int R2) {
LatticeComplex siteWl(U[0]._grid);
Wl = zero;
for (int mu = 1; mu < U[0]._grid->_ndimension - 1; mu++) {
for (int nu = 0; nu < mu; nu++) {
traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
Wl = Wl + siteWl;
traceWilsonLoop(siteWl, U, R2, R1, mu, nu);
Wl = Wl + siteWl;
}
}
}
//////////////////////////////////////////////////
// sum over all x,y,z,t and over all planes of Wilson loop
//////////////////////////////////////////////////
static Real sumWilsonLoop(const GaugeLorentz &Umu,
const int R1, const int R2) {
std::vector<GaugeMat> U(4, Umu._grid);
for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
LatticeComplex Wl(Umu._grid);
siteWilsonLoop(Wl, U, R1, R2);
TComplex Tp = sum(Wl);
Complex p = TensorRemove(Tp);
return p.real();
}
//////////////////////////////////////////////////
// sum over all x,y,z,t and over all planes of timelike Wilson loop
//////////////////////////////////////////////////
static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
const int R1, const int R2) {
std::vector<GaugeMat> U(4, Umu._grid);
for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
LatticeComplex Wl(Umu._grid);
siteTimelikeWilsonLoop(Wl, U, R1, R2);
TComplex Tp = sum(Wl);
Complex p = TensorRemove(Tp);
return p.real();
}
//////////////////////////////////////////////////
// sum over all x,y,z,t and over all planes of spatial Wilson loop
//////////////////////////////////////////////////
static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
const int R1, const int R2) {
std::vector<GaugeMat> U(4, Umu._grid);
for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
LatticeComplex Wl(Umu._grid);
siteSpatialWilsonLoop(Wl, U, R1, R2);
TComplex Tp = sum(Wl);
Complex p = TensorRemove(Tp);
return p.real();
}
//////////////////////////////////////////////////
// average over all x,y,z,t and over all planes of Wilson loop
//////////////////////////////////////////////////
static Real avgWilsonLoop(const GaugeLorentz &Umu,
const int R1, const int R2) {
int ndim = Umu._grid->_ndimension;
Real sumWl = sumWilsonLoop(Umu, R1, R2);
Real vol = Umu._grid->gSites();
Real faces = 1.0 * ndim * (ndim - 1);
return sumWl / vol / faces / Nc; // Nc dependent... FIXME
}
//////////////////////////////////////////////////
// average over all x,y,z,t and over all planes of timelike Wilson loop
//////////////////////////////////////////////////
static Real avgTimelikeWilsonLoop(const GaugeLorentz &Umu,
const int R1, const int R2) {
int ndim = Umu._grid->_ndimension;
Real sumWl = sumTimelikeWilsonLoop(Umu, R1, R2);
Real vol = Umu._grid->gSites();
Real faces = 1.0 * (ndim - 1);
return sumWl / vol / faces / Nc; // Nc dependent... FIXME
}
//////////////////////////////////////////////////
// average over all x,y,z,t and over all planes of spatial Wilson loop
//////////////////////////////////////////////////
static Real avgSpatialWilsonLoop(const GaugeLorentz &Umu,
const int R1, const int R2) {
int ndim = Umu._grid->_ndimension;
Real sumWl = sumSpatialWilsonLoop(Umu, R1, R2);
Real vol = Umu._grid->gSites();
Real faces = 1.0 * (ndim - 1) * (ndim - 2);
return sumWl / vol / faces / Nc; // Nc dependent... FIXME
}
};
END_QEDFVOL_NAMESPACE
#endif // QEDFVOL_WILSONLOOPS_H
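As a cross-check of the avgWilsonLoop normalisation above: the divisor faces = ndim*(ndim-1) is just the number of (mu,nu) plane terms accumulated per site by siteWilsonLoop, i.e. both loop orientations for each of the ndim*(ndim-1)/2 planes. A minimal standalone sketch (plain C++, not part of Grid) that counts them:
#include <iostream>
int main() {
  const int ndim = 4;
  int terms = 0;
  for (int mu = 1; mu < ndim; mu++)
    for (int nu = 0; nu < mu; nu++)
      terms += 2;                                   // (R1,R2) and (R2,R1) orientations
  std::cout << terms << " == " << ndim*(ndim-1) << std::endl;   // prints "12 == 12"
  return 0;
}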

View File

@ -0,0 +1,88 @@
#include <Global.hpp>
#include <WilsonLoops.h>
using namespace Grid;
using namespace QCD;
using namespace QedFVol;
typedef PeriodicGaugeImpl<QedGimplR> QedPeriodicGimplR;
typedef PhotonR::GaugeField EmField;
typedef PhotonR::GaugeLinkField EmComp;
const int NCONFIGS = 10;
const int NWILSON = 10;
int main(int argc, char *argv[])
{
// parse command line
std::string parameterFileName;
if (argc < 2)
{
std::cerr << "usage: " << argv[0] << " <parameter file> [Grid options]";
std::cerr << std::endl;
std::exit(EXIT_FAILURE);
}
parameterFileName = argv[1];
// initialization
Grid_init(&argc, &argv);
QedFVolLogError.Active(GridLogError.isActive());
QedFVolLogWarning.Active(GridLogWarning.isActive());
QedFVolLogMessage.Active(GridLogMessage.isActive());
QedFVolLogIterative.Active(GridLogIterative.isActive());
QedFVolLogDebug.Active(GridLogDebug.isActive());
LOG(Message) << "Grid initialized" << std::endl;
// QED stuff
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(4, vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
GridCartesian grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&grid);
PhotonR photon(PhotonR::Gauge::feynman,
PhotonR::ZmScheme::qedL);
EmField a(&grid);
EmField expA(&grid);
Complex imag_unit(0, 1);
Real wlA;
std::vector<Real> logWlAvg(NWILSON, 0.0), logWlTime(NWILSON, 0.0), logWlSpace(NWILSON, 0.0);
pRNG.SeedRandomDevice();
LOG(Message) << "Wilson loop calculation beginning" << std::endl;
for(int ic = 0; ic < NCONFIGS; ic++){
LOG(Message) << "Configuration " << ic <<std::endl;
photon.StochasticField(a, pRNG);
// Exponentiate photon field
expA = exp(imag_unit*a);
// Calculate Wilson loops
for(int iw=1; iw<=NWILSON; iw++){
wlA = NewWilsonLoops<QedPeriodicGimplR>::avgWilsonLoop(expA, iw, iw) * 3;
logWlAvg[iw-1] -= 2*log(wlA);
wlA = NewWilsonLoops<QedPeriodicGimplR>::avgTimelikeWilsonLoop(expA, iw, iw) * 3;
logWlTime[iw-1] -= 2*log(wlA);
wlA = NewWilsonLoops<QedPeriodicGimplR>::avgSpatialWilsonLoop(expA, iw, iw) * 3;
logWlSpace[iw-1] -= 2*log(wlA);
}
}
LOG(Message) << "Wilson loop calculation completed" << std::endl;
// Calculate Wilson loops
for(int iw=1; iw<=10; iw++){
LOG(Message) << iw << 'x' << iw << " Wilson loop" << std::endl;
LOG(Message) << "-2log(W) average: " << logWlAvg[iw-1]/NCONFIGS << std::endl;
LOG(Message) << "-2log(W) timelike: " << logWlTime[iw-1]/NCONFIGS << std::endl;
LOG(Message) << "-2log(W) spatial: " << logWlSpace[iw-1]/NCONFIGS << std::endl;
}
// epilogue
LOG(Message) << "Grid is finalizing now" << std::endl;
Grid_finalize();
return EXIT_SUCCESS;
}

View File

@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
extra_sources+=communicator/Communicator_base.cc
endif
if BUILD_COMMS_MPI3L
extra_sources+=communicator/Communicator_mpi3_leader.cc
if BUILD_COMMS_MPIT
extra_sources+=communicator/Communicator_mpit.cc
extra_sources+=communicator/Communicator_base.cc
endif

View File

@ -37,6 +37,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/approx/Chebyshev.h>
#include <Grid/algorithms/approx/Remez.h>
#include <Grid/algorithms/approx/MultiShiftFunction.h>
#include <Grid/algorithms/approx/Forecast.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateResidual.h>
@ -44,30 +45,16 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
// Lanczos support
//#include <Grid/algorithms/iterative/MatrixUtils.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/CoarsenedMatrix.h>
#include <Grid/algorithms/FFT.h>
// Eigen/lanczos
// EigCg
// MCR
// Pcg
// Multishift CG
// Hdcg
// GCR
// etc..
// integrator/Leapfrog
// integrator/Omelyan
// integrator/ForceGradient
// montecarlo/hmc
// montecarlo/rhmc
// montecarlo/metropolis
// etc...
#endif

View File

@ -103,29 +103,32 @@ namespace Grid {
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid) :
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid),
subspace(nbasis,_FineGrid)
subspace(nbasis,_FineGrid),
checkerboard(_checkerboard)
{
};
void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid);
std::cout << GridLogMessage <<" Gram-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
std::cout << GridLogMessage <<" Gram-Schmidt pass 2"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
// std::cout << GridLogMessage <<" Gram-Schmidt checking orthogonality"<<std::endl;
// CheckOrthogonal();
}
void CheckOrthogonal(void){
CoarseVector iProj(CoarseGrid);
CoarseVector eProj(CoarseGrid);
Lattice<CComplex> pokey(CoarseGrid);
for(int i=0;i<nbasis;i++){
blockProject(iProj,subspace[i],subspace);
eProj=zero;
for(int ss=0;ss<CoarseGrid->oSites();ss++){
parallel_for(int ss=0;ss<CoarseGrid->oSites();ss++){
eProj._odata[ss](i)=CComplex(1.0);
}
eProj=eProj - iProj;
@ -137,6 +140,7 @@ namespace Grid {
blockProject(CoarseVec,FineVec,subspace);
}
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
FineVec.checkerboard = subspace[0].checkerboard;
blockPromote(CoarseVec,FineVec,subspace);
}
void CreateSubspaceRandom(GridParallelRNG &RNG){
@ -147,6 +151,7 @@ namespace Grid {
Orthogonalise();
}
/*
virtual void CreateSubspaceLanczos(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
{
// Run a Lanczos with sloppy convergence
@ -195,7 +200,7 @@ namespace Grid {
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
}
}
*/
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
RealD scale;

View File

@ -230,6 +230,7 @@ namespace Grid {
// Barrel shift and collect global pencil
std::vector<int> lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) {
PARALLEL_REGION
{
@ -240,7 +241,8 @@ namespace Grid {
for(int idx=0;idx<sgrid->lSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf);
cbuf[dim]+=p*L;
cbuf[dim]+=((pc+p) % processors[dim])*L;
// cbuf[dim]+=p*L;
pokeLocalSite(s,pgbuf,cbuf);
}
}
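// Standalone sketch (not part of Grid) of the barrel-shift offset introduced above:
// with this rank at processor coordinate pc, the slab received on step p of the
// rotation lands at global offset ((pc+p) % processors[dim]) * L in the pencil.
#include <cstdio>
int main() {
  const int L = 4, procs = 3, pc = 1;                    // hypothetical layout
  for (int p = 0; p < procs; p++)
    std::printf("p=%d -> offset %d\n", p, ((pc + p) % procs) * L);  // 4, 8, 0
  return 0;
}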
@ -278,7 +280,6 @@ namespace Grid {
flops+= flops_call*NN;
// writing out result
int pc = processor_coor[dim];
PARALLEL_REGION
{
std::vector<int> clbuf(Nd), cgbuf(Nd);

View File

@ -162,15 +162,10 @@ namespace Grid {
_Mat.M(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
ComplexD dot;
_Mat.M(in,out);
dot= innerProduct(in,out);
n1=real(dot);
dot = innerProduct(out,out);
n2=real(dot);
ComplexD dot= innerProduct(in,out); n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
_Mat.M(in,out);
@ -192,10 +187,10 @@ namespace Grid {
ni=Mpc(in,tmp);
no=MpcDag(tmp,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
MpcDagMpc(in,out,n1,n2);
}
void HermOp(const Field &in, Field &out){
virtual void HermOp(const Field &in, Field &out){
RealD n1,n2;
HermOpAndNorm(in,out,n1,n2);
}
@ -212,7 +207,6 @@ namespace Grid {
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
}
};
template<class Matrix,class Field>
class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
@ -270,7 +264,6 @@ namespace Grid {
return axpy_norm(out,-1.0,tmp,in);
}
};
template<class Matrix,class Field>
class SchurDiagTwoOperator : public SchurOperatorBase<Field> {
protected:
@ -299,6 +292,45 @@ namespace Grid {
return axpy_norm(out,-1.0,tmp,in);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
///////////////////////////////////////////////////////////////////////////////////////////////////
// Staggered use
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class SchurStaggeredOperator : public SchurOperatorBase<Field> {
protected:
Matrix &_Mat;
public:
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
n2 = Mpc(in,out);
ComplexD dot= innerProduct(in,out);
n1 = real(dot);
}
virtual void HermOp(const Field &in, Field &out){
Mpc(in,out);
}
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
_Mat.Meooe(in,tmp);
_Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp);
_Mat.Mooee(in,out);
return axpy_norm(out,-1.0,tmp,out);
}
virtual RealD MpcDag (const Field &in, Field &out){
return Mpc(in,out);
}
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
assert(0);// Never need with staggered
}
};
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
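// Hedged usage sketch: SchurStaggeredOperator presents Mpc = Mooee - Meooe Mooee^{-1} Meooe
// as a Hermitian operator, so it can be handed directly to ConjugateGradient below.
// The function and type names here (solveStaggeredSchur, MyStaggeredMatrix) are
// placeholders for illustration, not Grid classes.
template<class MyStaggeredMatrix, class Field>
void solveStaggeredSchur(MyStaggeredMatrix &D, const Field &src_o, Field &sol_o) {
  SchurStaggeredOperator<MyStaggeredMatrix, Field> HermOp(D);
  ConjugateGradient<Field> CG(1.0e-8, 10000);   // tolerance, max iterations
  CG(HermOp, src_o, sol_o);                     // solves Mpc sol_o = src_o on one checkerboard
}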
/////////////////////////////////////////////////////////////
@ -314,6 +346,14 @@ namespace Grid {
virtual void operator() (const Field &in, Field &out) = 0;
};
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
public:
void operator() (const Field &in, Field &out){
out = in;
};
};
/////////////////////////////////////////////////////////////
// Base classes for Multishift solvers for operators
/////////////////////////////////////////////////////////////
@ -336,6 +376,64 @@ namespace Grid {
};
*/
////////////////////////////////////////////////////////////////////////////////////////////
// Hermitian operator Linear function and operator function
////////////////////////////////////////////////////////////////////////////////////////////
template<class Field>
class HermOpOperatorFunction : public OperatorFunction<Field> {
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Linop.HermOp(in,out);
};
};
template<typename Field>
class PlainHermOp : public LinearFunction<Field> {
public:
LinearOperatorBase<Field> &_Linop;
PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop)
{}
void operator()(const Field& in, Field& out) {
_Linop.HermOp(in,out);
}
};
template<typename Field>
class FunctionHermOp : public LinearFunction<Field> {
public:
OperatorFunction<Field> & _poly;
LinearOperatorBase<Field> &_Linop;
FunctionHermOp(OperatorFunction<Field> & poly,LinearOperatorBase<Field>& linop)
: _poly(poly), _Linop(linop) {};
void operator()(const Field& in, Field& out) {
_poly(_Linop,in,out);
}
};
template<class Field>
class Polynomial : public OperatorFunction<Field> {
private:
std::vector<RealD> Coeffs;
public:
Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Field AtoN(in._grid);
Field Mtmp(in._grid);
AtoN = in;
out = AtoN*Coeffs[0];
for(int n=1;n<Coeffs.size();n++){
Mtmp = AtoN;
Linop.HermOp(Mtmp,AtoN);
out=out+AtoN*Coeffs[n];
}
};
};
}
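The Polynomial class above evaluates out = sum_n Coeffs[n] * A^n * in by repeatedly applying the operator to a running power of A applied to the input. A scalar analogue (standalone, not part of Grid; the coefficients are illustrative only) makes the recurrence explicit:
#include <vector>
#include <iostream>
int main() {
  std::vector<double> Coeffs = {1.0, 2.0, 3.0};   // represents 1 + 2A + 3A^2
  double A = 0.5, in = 1.0;
  double AtoN = in;                               // running A^n * in
  double out  = AtoN * Coeffs[0];
  for (size_t n = 1; n < Coeffs.size(); n++) {
    AtoN = A * AtoN;                              // plays the role of Linop.HermOp(Mtmp,AtoN)
    out += AtoN * Coeffs[n];
  }
  std::cout << out << std::endl;                  // 1 + 2*0.5 + 3*0.25 = 2.75
  return 0;
}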

View File

@ -8,6 +8,7 @@
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <clehner@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -33,41 +34,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {
////////////////////////////////////////////////////////////////////////////////////////////
// Simple general polynomial with user supplied coefficients
////////////////////////////////////////////////////////////////////////////////////////////
template<class Field>
class HermOpOperatorFunction : public OperatorFunction<Field> {
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Linop.HermOp(in,out);
};
};
template<class Field>
class Polynomial : public OperatorFunction<Field> {
private:
std::vector<RealD> Coeffs;
public:
Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Field AtoN(in._grid);
Field Mtmp(in._grid);
AtoN = in;
out = AtoN*Coeffs[0];
// std::cout <<"Poly in " <<norm2(in)<<" size "<< Coeffs.size()<<std::endl;
// std::cout <<"Coeffs[0]= "<<Coeffs[0]<< " 0 " <<norm2(out)<<std::endl;
for(int n=1;n<Coeffs.size();n++){
Mtmp = AtoN;
Linop.HermOp(Mtmp,AtoN);
out=out+AtoN*Coeffs[n];
// std::cout <<"Coeffs "<<n<<"= "<< Coeffs[n]<< " 0 " <<std::endl;
// std::cout << n<<" " <<norm2(out)<<std::endl;
}
};
};
struct ChebyParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams,
RealD, alpha,
RealD, beta,
int, Npoly);
};
////////////////////////////////////////////////////////////////////////////////////////////
// Generic Chebyshev approximations
@ -83,7 +55,9 @@ namespace Grid {
public:
void csv(std::ostream &out){
RealD diff = hi-lo;
for (RealD x=lo-0.2*diff; x<hi+0.2*diff; x+=(hi-lo)/1000) {
RealD delta = (hi-lo)*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) {
delta*=1.1;
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
@ -99,6 +73,7 @@ namespace Grid {
};
Chebyshev(){};
Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);};
Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
@ -193,6 +168,47 @@ namespace Grid {
return sum;
};
RealD approxD(RealD x)
{
RealD Un;
RealD Unm;
RealD Unp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD U0=1;
RealD U1=2*y;
RealD sum;
sum = Coeffs[1]*U0;
sum+= Coeffs[2]*U1*2.0;
Un =U1;
Unm=U0;
for(int i=2;i<order-1;i++){
Unp=2*y*Un-Unm;
Unm=Un;
Un =Unp;
sum+= Un*Coeffs[i+1]*(i+1.0);
}
return sum/(0.5*(hi-lo));
};
RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) {
RealD x = x0;
RealD eps;
int i;
for (i=0;i<maxiter;i++) {
eps = approx(x) - z;
if (fabs(eps / z) < resid)
return x;
x = x - eps / approxD(x);
}
return std::numeric_limits<double>::quiet_NaN();
}
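// Standalone scalar sketch (not part of this header) of the Newton iteration that
// approxInv performs: x <- x - (approx(x) - z)/approxD(x). Here f(x) = x*x stands in
// for the Chebyshev approximation, so the iteration converges to sqrt(z).
#include <cmath>
#include <cstdio>
int main() {
  double z = 2.0, x = 1.0, resid = 1.0e-12;
  for (int i = 0; i < 100; i++) {
    double eps = x*x - z;                    // approx(x) - z
    if (std::fabs(eps / z) < resid) break;
    x = x - eps / (2.0*x);                   // x -= eps / approxD(x)
  }
  std::printf("%.12f\n", x);                 // ~ 1.414213562373 = sqrt(2)
  return 0;
}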
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {

View File

@ -0,0 +1,152 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/approx/Forecast.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef INCLUDED_FORECAST_H
#define INCLUDED_FORECAST_H
namespace Grid {
// Abstract base class.
// Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
// and returns a forecasted solution to the system D*psi = phi (psi).
template<class Matrix, class Field>
class Forecast
{
public:
virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
};
// Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
// used to forecast solutions across poles of the EOFA heatbath.
//
// Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
template<class Matrix, class Field>
class ChronoForecast : public Forecast<Matrix,Field>
{
public:
Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
{
int degree = prev_solns.size();
Field chi(phi); // forecasted solution
// Trivial cases
if(degree == 0){ chi = zero; return chi; }
else if(degree == 1){ return prev_solns[0]; }
RealD dot;
ComplexD xp;
Field r(phi); // residual
Field Mv(phi);
std::vector<Field> v(prev_solns); // orthonormalized previous solutions
std::vector<Field> MdagMv(degree,phi);
// Array to hold the matrix elements
std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
// Solution and source vectors
std::vector<ComplexD> a(degree);
std::vector<ComplexD> b(degree);
// Orthonormalize the vector basis
for(int i=0; i<degree; i++){
v[i] *= 1.0/std::sqrt(norm2(v[i]));
for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
}
// Perform sparse matrix multiplication and construct rhs
for(int i=0; i<degree; i++){
b[i] = innerProduct(v[i],phi);
Mat.M(v[i],Mv);
Mat.Mdag(Mv,MdagMv[i]);
G[i][i] = innerProduct(v[i],MdagMv[i]);
}
// Construct the matrix
for(int j=0; j<degree; j++){
for(int k=j+1; k<degree; k++){
G[j][k] = innerProduct(v[j],MdagMv[k]);
G[k][j] = std::conj(G[j][k]);
}}
// Gauss-Jordan elimination with partial pivoting
for(int i=0; i<degree; i++){
// Perform partial pivoting
int k = i;
for(int j=i+1; j<degree; j++){ if(std::abs(G[j][j]) > std::abs(G[k][k])){ k = j; } }
if(k != i){
xp = b[k];
b[k] = b[i];
b[i] = xp;
for(int j=0; j<degree; j++){
xp = G[k][j];
G[k][j] = G[i][j];
G[i][j] = xp;
}
}
// Convert matrix to upper triangular form
for(int j=i+1; j<degree; j++){
xp = G[j][i]/G[i][i];
b[j] -= xp * b[i];
for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; }
}
}
// Use Gaussian elimination to solve equations and calculate initial guess
chi = zero;
r = phi;
for(int i=degree-1; i>=0; i--){
a[i] = 0.0;
for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
a[i] = (b[i]-a[i])/G[i][i];
chi += a[i]*v[i];
r -= a[i]*MdagMv[i];
}
RealD true_r(0.0);
ComplexD tmp;
for(int i=0; i<degree; i++){
tmp = -b[i];
for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
tmp = std::conj(tmp)*tmp;
true_r += std::sqrt(tmp.real());
}
RealD error = std::sqrt(norm2(r)/norm2(phi));
std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;
return chi;
};
};
}
#endif
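In practice the ChronoForecast above seeds a solve from previously stored solutions: it returns the vector in span(prev_solns) that minimises the residual of D*psi = phi, which the subsequent solver then refines. A hedged usage sketch, where DiracMatrix, FermionField and forecastGuess are placeholder names rather than Grid classes:
template<class DiracMatrix, class FermionField>
void forecastGuess(DiracMatrix &D, const FermionField &phi,
                   const std::vector<FermionField> &prev_solns,
                   FermionField &guess) {
  ChronoForecast<DiracMatrix, FermionField> forecast;
  guess = forecast(D, phi, prev_solns);  // least-squares guess; pass to the solver as initial psi
}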

View File

@ -87,15 +87,22 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
////////////////////////////////////////////////////////////////////////////////////////////////////
sliceInnerProductMatrix(m_rr,R,R,Orthog);
////////////////////////////////////////////////////////////////////////////////////////////////////
// Cholesky from Eigen
// There exists a ldlt that is documented as more stable
////////////////////////////////////////////////////////////////////////////////////////////////////
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// Force manifest hermiticity to avoid rounding-related asymmetry

m_rr = 0.5*(m_rr+m_rr.adjoint());
#if 0
std::cout << " Calling Cholesky ldlt on m_rr " << m_rr <<std::endl;
Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL();
std::cout << " Called Cholesky ldlt on m_rr " << L_ldlt <<std::endl;
auto D_ldlt = m_rr.ldlt().vectorD();
std::cout << " Called Cholesky ldlt on m_rr " << D_ldlt <<std::endl;
#endif
// std::cout << " Calling Cholesky llt on m_rr " <<std::endl;
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// std::cout << " Called Cholesky llt on m_rr " << L <<std::endl;
C = L.adjoint();
Cinv = C.inverse();
////////////////////////////////////////////////////////////////////////////////////////////////////
// Q = R C^{-1}
//
@ -103,7 +110,6 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
//
// NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
////////////////////////////////////////////////////////////////////////////////////////////////////
// FIXME:: make a sliceMulMatrix to avoid zero vector
sliceMulMatrix(Q,Cinv,R,Orthog);
}
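// Standalone Eigen sketch (not part of Grid) of the thin-QR-via-Cholesky step above:
// form the Gram matrix m_rr = R^dag R, Cholesky it, and set Q = R C^{-1} with C = L^dag,
// so that Q^dag Q = 1 and R = Q C.
#include <Eigen/Dense>
#include <iostream>
int main() {
  const int n = 8, k = 3;
  Eigen::MatrixXcd R    = Eigen::MatrixXcd::Random(n, k);
  Eigen::MatrixXcd m_rr = R.adjoint() * R;               // analogue of sliceInnerProductMatrix
  m_rr = 0.5 * (m_rr + m_rr.adjoint());                   // force manifest hermiticity
  Eigen::MatrixXcd L = m_rr.llt().matrixL();
  Eigen::MatrixXcd C = L.adjoint();                       // R = Q C
  Eigen::MatrixXcd Q = R * C.inverse();                   // analogue of sliceMulMatrix
  std::cout << (Q.adjoint()*Q - Eigen::MatrixXcd::Identity(k,k)).norm() << std::endl;  // ~ 0
  return 0;
}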
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -199,7 +205,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
Linop.HermOp(X, AD);
tmp = B - AD;
//std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
//std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
//std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
//std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
//std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
D=Q;
std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
@ -221,12 +232,14 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
MatrixTimer.Start();
Linop.HermOp(D, Z);
MatrixTimer.Stop();
//std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
//4. M = [D^dag Z]^{-1}
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
sliceInnerTimer.Stop();
m_M = m_DZ.inverse();
//std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
//5. X = X + D MC
m_tmp = m_M * m_C;

View File

@ -52,8 +52,8 @@ class ConjugateGradient : public OperatorFunction<Field> {
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv){};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
Field &psi) {
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
psi.checkerboard = src.checkerboard;
conformable(psi, src);
@ -78,12 +78,12 @@ class ConjugateGradient : public OperatorFunction<Field> {
cp = a;
ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: p " << a << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: p " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
@ -92,7 +92,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
return;
}
std::cout << GridLogIterative << std::setprecision(4)
std::cout << GridLogIterative << std::setprecision(8)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;

View File

@ -0,0 +1,256 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
namespace Grid {
template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
Integer ReliableUpdatesPerformed;
bool DoFinalCleanup; //Final DP cleanup, defaults to true
Integer IterationsToCleanup; //Final DP cleanup step iterations
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
GridBase* SinglePrecGrid;
RealD Delta; //reliable update parameter
//Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
LinearOperatorBase<FieldF> *Linop_fallback;
RealD fallback_transition_tol;
ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
Delta(_delta),
Linop_f(_Linop_f),
Linop_d(_Linop_d),
SinglePrecGrid(_sp_grid),
ErrorOnNoConverge(err_on_no_conv),
DoFinalCleanup(true),
Linop_fallback(NULL)
{};
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
Linop_fallback = &_Linop_fallback;
fallback_transition_tol = _fallback_transition_tol;
}
void operator()(const FieldD &src, FieldD &psi) {
LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
bool using_fallback = false;
psi.checkerboard = src.checkerboard;
conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred;
FieldD p(src);
FieldD mmp(src);
FieldD r(src);
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop_d.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
cp = a;
ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: p " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
// Check if guess is really REALLY good :)
if (cp <= rsq) {
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
return;
}
//Single prec initialization
FieldF r_f(SinglePrecGrid);
r_f.checkerboard = r.checkerboard;
precisionChange(r_f, r);
FieldF psi_f(r_f);
psi_f = zero;
FieldF p_f(r_f);
FieldF mmp_f(r_f);
RealD MaxResidSinceLastRelUp = cp; //initial residual
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k = 0;
int l = 0;
for (k = 1; k <= MaxIterations; k++) {
c = cp;
MatrixTimer.Start();
Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
MatrixTimer.Stop();
LinalgTimer.Start();
a = c / d;
b_pred = a * (a * qq - d) / c;
cp = axpy_norm(r_f, -a, mmp_f, r_f);
b = cp / c;
// Fuse these loops ; should be really easy
psi_f = a * p_f + psi_f;
//p_f = p_f * b + r_f;
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
if(cp > MaxResidSinceLastRelUp){
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
MaxResidSinceLastRelUp = cp;
}
// Stopping condition
if (cp <= rsq) {
//Although not written in the paper, I assume that I have to add on the final solution
precisionChange(mmp, psi_f);
psi = psi + mmp;
SolverTimer.Stop();
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
if(DoFinalCleanup){
//Do a final CG to cleanup
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
CG.ErrorOnNoConverge = ErrorOnNoConverge;
CG(Linop_d,src,psi);
IterationsToCleanup = CG.IterationsToComplete;
}
else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
return;
}
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
precisionChange(mmp, psi_f);
psi = psi + mmp;
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
r = src - mmp;
psi_f = zero;
precisionChange(r_f, r);
cp = norm2(r);
MaxResidSinceLastRelUp = cp;
b = cp/c;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
l = l+1;
}
p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
Linop_f_use = Linop_fallback;
using_fallback = true;
}
}
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
}
};
};
#endif
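The trigger logic of the reliable update above is compact enough to show in isolation: the single-precision residual is compared against Delta times the largest residual seen since the last update, and only then is the true double-precision residual recomputed and the accumulators reset. A minimal standalone sketch (not part of Grid) with made-up residual values:
#include <cstdio>
int main() {
  double Delta = 0.1, MaxResidSinceLastRelUp = 1.0;
  double residuals[] = {0.8, 0.3, 0.09, 0.05};            // fake per-iteration cp values
  for (double cp : residuals) {
    if (cp > MaxResidSinceLastRelUp) MaxResidSinceLastRelUp = cp;
    else if (cp < Delta * MaxResidSinceLastRelUp) {
      std::printf("reliable update at cp=%g\n", cp);       // fires once, at cp=0.09
      MaxResidSinceLastRelUp = cp;                          // reset after the update
    }
  }
  return 0;
}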

View File

@ -7,8 +7,9 @@
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Chulwoo Jung
Author: Guido Cossu
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Chulwoo Jung <chulwoo@bnl.gov>
Author: Christoph Lehner <clehner@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -27,108 +28,269 @@ Author: Guido Cossu
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_IRL_H
#define GRID_IRL_H
#ifndef GRID_BIRL_H
#define GRID_BIRL_H
#include <string.h> //memset
//#include <zlib.h>
#include <sys/stat.h>
namespace Grid {
enum IRLdiagonalisation {
IRLdiagonaliseWithDSTEGR,
IRLdiagonaliseWithQR,
IRLdiagonaliseWithEigen
};
////////////////////////////////////////////////////////////////////////////////
// Helper class for sorting the evalues AND evectors by Field
// Use pointer swizzle on vectors
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
// Move following 100 LOC to lattice/Lattice_basis.h
////////////////////////////////////////////////////////
template<class Field>
class SortEigen {
private:
static bool less_lmd(RealD left,RealD right){
return left > right;
}
static bool less_pair(std::pair<RealD,Field const*>& left,
std::pair<RealD,Field const*>& right){
return left.first > (right.first);
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
for(int j=0; j<k; ++j){
auto ip = innerProduct(basis[j],w);
w = w - ip*basis[j];
}
}
public:
void push(std::vector<RealD>& lmd,std::vector<Field>& evec,int N) {
template<class Field>
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
{
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0]._grid;
////////////////////////////////////////////////////////////////////////
// PAB: FIXME: VERY VERY VERY wasteful: takes a copy of the entire vector set.
// : The vector reorder should be done by pointer swizzle somehow
////////////////////////////////////////////////////////////////////////
std::vector<Field> cpy(lmd.size(),evec[0]._grid);
for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
parallel_region
{
std::vector < vobj > B(Nm); // Thread private
std::vector<std::pair<RealD, Field const*> > emod(lmd.size());
parallel_for_internal(int ss=0;ss < grid->oSites();ss++){
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int i=0;i<lmd.size();++i) emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
typename std::vector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
for(int i=0;i<N;++i){
lmd[i]=it->first;
evec[i]=*(it->second);
++it;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis[k]._odata[ss];
}
}
void push(std::vector<RealD>& lmd,int N) {
std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
for(int j=j0; j<j1; ++j){
basis[j]._odata[ss] = B[j];
}
bool saturated(RealD lmd, RealD thrs) {
return fabs(lmd) > fabs(thrs);
}
};
}
}
// Extract a single rotated vector
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0]._grid;
result.checkerboard = basis[0].checkerboard;
parallel_for(int ss=0;ss < grid->oSites();ss++){
vobj B = zero;
for(int k=k0; k<k1; ++k){
B +=Qt(j,k) * basis[k]._odata[ss];
}
result._odata[ss] = B;
}
}
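// Standalone sketch (not part of Grid) of the per-site rotation that basisRotate and
// basisRotateJ perform: new_j = sum_k Qt(j,k) * old_k. One "site" with scalar entries:
#include <Eigen/Dense>
#include <vector>
#include <iostream>
int main() {
  const int Nm = 3;
  std::vector<double> basis = {1.0, 2.0, 3.0};                // Nm basis entries at one site
  Eigen::MatrixXd Qt = Eigen::MatrixXd::Identity(Nm, Nm);
  Qt(0,0) = 0.0; Qt(0,1) = 1.0; Qt(1,0) = 1.0; Qt(1,1) = 0.0; // rotation: swap first two
  std::vector<double> B(Nm, 0.0);
  for (int j = 0; j < Nm; ++j)
    for (int k = 0; k < Nm; ++k)
      B[j] += Qt(j,k) * basis[k];
  for (int j = 0; j < Nm; ++j) std::cout << B[j] << " ";      // prints "2 1 3"
  std::cout << std::endl;
  return 0;
}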
template<class Field>
void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
{
int vlen = idx.size();
assert(vlen>=1);
assert(vlen<=sort_vals.size());
assert(vlen<=_v.size());
for (size_t i=0;i<vlen;i++) {
if (idx[i] != i) {
//////////////////////////////////////
// idx[i] is a table of desired sources giving a permutation.
// Swap v[i] with v[idx[i]].
// Find j>i for which _vnew[j] = _vold[i],
// track the move idx[j] => idx[i]
// track the move idx[i] => i
//////////////////////////////////////
size_t j;
for (j=i;j<idx.size();j++)
if (idx[j]==i)
break;
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
std::swap(_v[i]._odata,_v[idx[i]]._odata); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]);
idx[j] = idx[i];
idx[i] = i;
}
}
}
inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
{
std::vector<int> idx(sort_vals.size());
std::iota(idx.begin(), idx.end(), 0);
// sort indexes based on comparing values in v
std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
});
return idx;
}
template<class Field>
void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
{
std::vector<int> idx = basisSortGetIndex(sort_vals);
if (reverse)
std::reverse(idx.begin(), idx.end());
basisReorderInPlace(_v,sort_vals,idx);
}
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = zero;
assert(_v.size()==eval.size());
int N = (int)_v.size();
for (int i=0;i<N;i++) {
Field& tmp = _v[i];
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
}
}
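// Standalone Eigen sketch (not part of Grid) of what basisDeflate computes: for a
// Hermitian A with eigenpairs (eval[i], v[i]), sum_i v[i] <v[i],src>/eval[i] is the
// inverse applied to src on the deflated subspace (exact here, since all pairs are kept).
#include <Eigen/Dense>
#include <iostream>
int main() {
  Eigen::Matrix2d A; A << 4.0, 1.0,
                          1.0, 3.0;
  Eigen::SelfAdjointEigenSolver<Eigen::Matrix2d> es(A);
  Eigen::Vector2d src(1.0, 2.0), result = Eigen::Vector2d::Zero();
  for (int i = 0; i < 2; ++i) {
    Eigen::Vector2d v = es.eigenvectors().col(i);
    result += v * (v.dot(src) / es.eigenvalues()(i));          // axpy(result, <v,src>/eval, v, result)
  }
  std::cout << (result - A.inverse()*src).norm() << std::endl; // ~ 0
  return 0;
}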
/////////////////////////////////////////////////////////////
// Implicitly restarted lanczos
/////////////////////////////////////////////////////////////
template<class Field> class ImplicitlyRestartedLanczosTester
{
public:
virtual int TestConvergence(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox);
virtual int ReconstructEval(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox);
};
enum IRLdiagonalisation {
IRLdiagonaliseWithDSTEGR,
IRLdiagonaliseWithQR,
IRLdiagonaliseWithEigen
};
template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester<Field>
{
public:
LinearFunction<Field> &_HermOpTest;
ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOpTest) : _HermOpTest(HermOpTest) { };
int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
{
return TestConvergence(j,resid,B,eval,evalMaxApprox);
}
int TestConvergence(int j,RealD eresid,Field &B, RealD &eval,RealD evalMaxApprox)
{
Field v(B);
RealD eval_poly = eval;
// Apply operator
_HermOpTest(B,v);
RealD vnum = real(innerProduct(B,v)); // HermOp.
RealD vden = norm2(B);
RealD vv0 = norm2(v);
eval = vnum/vden;
v -= eval*B;
RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
return conv;
}
};
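// Standalone Eigen sketch (not part of Grid) of the test performed above: take the
// Rayleigh quotient eval = <B,H B>/<B,B>, subtract eval*B from H B, and compare the
// normalised remainder against eresid^2.
#include <Eigen/Dense>
#include <iostream>
int main() {
  Eigen::Matrix2d H; H << 2.0, 0.0,
                          0.0, 5.0;
  Eigen::Vector2d B(1.0, 1.0e-4);                      // close to the eigenvector (1,0)
  Eigen::Vector2d v = H * B;                           // _HermOpTest(B,v)
  double eval = B.dot(v) / B.squaredNorm();            // vnum / vden
  v -= eval * B;
  const double evalMaxApprox = 5.0, eresid = 1.0e-3;
  double vv = v.squaredNorm() / (evalMaxApprox*evalMaxApprox);
  std::cout << eval << "  converged=" << (vv < eresid*eresid) << std::endl;  // eval ~ 2, converged=1
  return 0;
}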
template<class Field>
class ImplicitlyRestartedLanczos {
private:
int MaxIter; // Max iterations
private:
const RealD small = 1.0e-8;
int MaxIter;
int MinRestart; // Minimum number of restarts; only check for convergence after
int Nstop; // Number of evecs checked for convergence
int Nk; // Number of converged sought
// int Np; // Np -- Number of spare vecs in krylov space // == Nm - Nk
int Nm; // Nm -- total number of vectors
RealD eresid;
IRLdiagonalisation diagonalisation;
////////////////////////////////////
// Embedded objects
////////////////////////////////////
SortEigen<Field> _sort;
LinearOperatorBase<Field> &_Linop;
OperatorFunction<Field> &_poly;
int orth_period;
RealD OrthoTime;
RealD eresid, betastp;
////////////////////////////////
// Embedded objects
////////////////////////////////
LinearFunction<Field> &_HermOp;
LinearFunction<Field> &_HermOpTest;
ImplicitlyRestartedLanczosTester<Field> &_Tester;
// Default tester provided (we need a ref to something in default case)
ImplicitlyRestartedLanczosHermOpTester<Field> SimpleTester;
/////////////////////////
// Constructor
/////////////////////////
public:
ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
OperatorFunction<Field> & poly, // polynomial
int _Nstop, // really sought vecs
//////////////////////////////////////////////////////////////////
// PAB:
//////////////////////////////////////////////////////////////////
// Too many options & knobs. Do we really need orth_period
// What is the theoretical basis & guarantees of betastp ?
// Nstop=Nk viable?
// MinRestart avoidable with new convergence test?
// Could cut to HermOp, HermOpTest, Tester, Nk, Nm, resid, maxiter (+diagonalisation)
// HermOpTest could be eliminated if we dropped the Power method for max eval.
// -- also: The eval, eval2, eval2_copy stuff is still unnecessarily unclear
//////////////////////////////////////////////////////////////////
ImplicitlyRestartedLanczos(LinearFunction<Field> & HermOp,
LinearFunction<Field> & HermOpTest,
ImplicitlyRestartedLanczosTester<Field> & Tester,
int _Nstop, // sought vecs
int _Nk, // sought vecs
int _Nm, // total vecs
RealD _eresid, // resid in lmd deficit
int _Nm, // spare vecs
RealD _eresid, // resid in lmd deficit
int _MaxIter, // Max iterations
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen ) :
_Linop(Linop), _poly(poly),
Nstop(_Nstop), Nk(_Nk), Nm(_Nm),
eresid(_eresid), MaxIter(_MaxIter),
diagonalisation(_diagonalisation)
{ };
RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=1, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOpTest), _HermOp(HermOp), _HermOpTest(HermOpTest), _Tester(Tester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
eresid(_eresid), betastp(_betastp),
MaxIter(_MaxIter) , MinRestart(_MinRestart),
orth_period(_orth_period), diagonalisation(_diagonalisation) { };
ImplicitlyRestartedLanczos(LinearFunction<Field> & HermOp,
LinearFunction<Field> & HermOpTest,
int _Nstop, // sought vecs
int _Nk, // sought vecs
int _Nm, // spare vecs
RealD _eresid, // resid in lmd deficit
int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=1, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOpTest), _HermOp(HermOp), _HermOpTest(HermOpTest), _Tester(SimpleTester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
eresid(_eresid), betastp(_betastp),
MaxIter(_MaxIter) , MinRestart(_MinRestart),
orth_period(_orth_period), diagonalisation(_diagonalisation) { };
////////////////////////////////
// Helpers
////////////////////////////////
static RealD normalise(Field& v)
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = sqrt(nn);
@ -136,16 +298,12 @@ public:
return nn;
}
void orthogonalize(Field& w, std::vector<Field>& evec, int k)
void orthogonalize(Field& w, std::vector<Field>& evec,int k)
{
typedef typename Field::scalar_type MyComplex;
MyComplex ip;
for(int j=0; j<k; ++j){
ip = innerProduct(evec[j],w);
w = w - ip * evec[j];
}
OrthoTime-=usecond()/1e6;
basisOrthogonalize(evec,w,k);
normalise(w);
OrthoTime+=usecond()/1e6;
}
/* Rudy Arthur's thesis pp.137
@ -165,184 +323,234 @@ repeat
A V_K = V_K H_K + f_K e_K^†. Extend to an M = K + P step factorization: A V_M = V_M H_M + f_M e_M^†
until convergence
*/
void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv)
void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=true)
{
GridBase *grid = src._grid;
assert(grid == evec[0]._grid);
GridBase *grid = evec[0]._grid;
assert(grid == src._grid);
std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
std::cout << GridLogMessage <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl;
std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
std::cout << GridLogMessage <<" -- seek Nk = " << Nk <<" vectors"<< std::endl;
std::cout << GridLogMessage <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
std::cout << GridLogMessage <<" -- total Nm = " << Nm <<" vectors"<< std::endl;
std::cout << GridLogMessage <<" -- size of eval = " << eval.size() << std::endl;
std::cout << GridLogMessage <<" -- size of evec = " << evec.size() << std::endl;
GridLogIRL.TimingMode(1);
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl;
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL <<" -- seek Nk = " << Nk <<" vectors"<< std::endl;
std::cout << GridLogIRL <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
std::cout << GridLogIRL <<" -- total Nm = " << Nm <<" vectors"<< std::endl;
std::cout << GridLogIRL <<" -- size of eval = " << eval.size() << std::endl;
std::cout << GridLogIRL <<" -- size of evec = " << evec.size() << std::endl;
if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
std::cout << GridLogMessage << "Diagonalisation is DSTEGR "<<std::endl;
std::cout << GridLogIRL << "Diagonalisation is DSTEGR "<<std::endl;
} else if ( diagonalisation == IRLdiagonaliseWithQR ) {
std::cout << GridLogMessage << "Diagonalisation is QR "<<std::endl;
std::cout << GridLogIRL << "Diagonalisation is QR "<<std::endl;
} else if ( diagonalisation == IRLdiagonaliseWithEigen ) {
std::cout << GridLogMessage << "Diagonalisation is Eigen "<<std::endl;
std::cout << GridLogIRL << "Diagonalisation is Eigen "<<std::endl;
}
std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
assert(Nm == evec.size() && Nm == eval.size());
assert(Nm <= evec.size() && Nm <= eval.size());
// quickly get an idea of the largest eigenvalue to more properly normalize the residuum
RealD evalMaxApprox = 0.0;
{
auto src_n = src;
auto tmp = src;
const int _MAX_ITER_IRL_MEVAPP_ = 50;
for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
_HermOpTest(src_n,tmp);
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
if (fabs(evalMaxApprox/na - 1.0) < 0.05)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
src_n = tmp;
}
}
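// Standalone Eigen sketch (not part of Grid) of the power-method estimate above:
// repeatedly apply the operator and track the Rayleigh quotient until it stabilises
// to within 5 percent.
#include <Eigen/Dense>
#include <cmath>
#include <iostream>
int main() {
  Eigen::Matrix2d H; H << 2.0, 1.0,
                          1.0, 3.0;                     // stand-in for _HermOpTest
  Eigen::Vector2d src_n(1.0, 1.0);
  double evalMaxApprox = 0.0;
  for (int i = 0; i < 50; i++) {
    Eigen::Vector2d tmp = H * src_n;
    double na = src_n.dot(tmp) / src_n.squaredNorm();   // Rayleigh quotient
    bool settled = std::fabs(evalMaxApprox/na - 1.0) < 0.05;
    evalMaxApprox = na;
    src_n = tmp;
    if (settled) break;
  }
  std::cout << evalMaxApprox << std::endl;              // ~ 3.6, near the exact 3.618
  return 0;
}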
std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm);
std::vector<RealD> eval2(Nm);
std::vector<RealD> eval2_copy(Nm);
Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm);
std::vector<int> Iconv(Nm);
std::vector<Field> B(Nm,grid); // waste of space replicating
Field f(grid);
Field v(grid);
int k1 = 1;
int k2 = Nk;
RealD beta_k;
Nconv = 0;
RealD beta_k;
// Set initial vector
evec[0] = src;
std::cout << GridLogMessage <<"norm2(src)= " << norm2(src)<<std::endl;
normalise(evec[0]);
std::cout << GridLogMessage <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
// Initial Nk steps
OrthoTime=0.;
for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
std::cout<<GridLogIRL <<"Initial "<< Nk <<"steps done "<<std::endl;
std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
//////////////////////////////////
// Restarting loop begins
//////////////////////////////////
int iter;
for(iter = 0; iter<MaxIter; ++iter){
OrthoTime=0.;
std::cout<< GridLogMessage <<" **********************"<< std::endl;
std::cout<< GridLogMessage <<" Restart iteration = "<< iter << std::endl;
std::cout<< GridLogMessage <<" **********************"<< std::endl;
std::cout<<GridLogIRL <<" running "<<Nm-Nk <<" steps: "<<std::endl;
for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
f *= lme[Nm-1];
std::cout<<GridLogIRL <<" "<<Nm-Nk <<" steps done "<<std::endl;
std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
//////////////////////////////////
// getting eigenvalues
//////////////////////////////////
for(int k=0; k<Nm; ++k){
eval2[k] = eval[k+k1-1];
lme2[k] = lme[k+k1-1];
}
Qt = Eigen::MatrixXd::Identity(Nm,Nm);
diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
std::cout<<GridLogIRL <<" diagonalized "<<std::endl;
//////////////////////////////////
// sorting
_sort.push(eval2,Nm);
//////////////////////////////////
eval2_copy = eval2;
std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater<RealD>());
std::cout<<GridLogIRL <<" evals sorted "<<std::endl;
const int chunk=8;
for(int io=0; io<k2;io+=chunk){
std::cout<<GridLogIRL << "eval "<< std::setw(3) << io ;
for(int ii=0;ii<chunk;ii++){
if ( (io+ii)<k2 )
std::cout<< " "<< std::setw(12)<< eval2[io+ii];
}
std::cout << std::endl;
}
//////////////////////////////////
// Implicitly shifted QR transformations
//////////////////////////////////
Qt = Eigen::MatrixXd::Identity(Nm,Nm);
for(int ip=k2; ip<Nm; ++ip){
// Eigen replacement for qr_decomp ???
qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
QR_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
}
std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;
assert(k2<Nm); assert(k2<Nm); assert(k1>0);
for(int j=k1-1; j<k2+1; ++j){
for(int k=0; k<Nm; ++k){
B[j].checkerboard = evec[k].checkerboard;
B[j] += Qt(j,k) * evec[k];
}
}
for(int j=k1-1; j<k2+1; ++j) evec[j] = B[j];
basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
std::cout<<GridLogIRL <<"basisRotated by Qt"<<std::endl;
////////////////////////////////////////////////////
// Compressed vector f and beta(k2)
////////////////////////////////////////////////////
f *= Qt(k2-1,Nm-1);
f += lme[k2-1] * evec[k2];
beta_k = norm2(f);
beta_k = sqrt(beta_k);
std::cout<< GridLogMessage<<" beta(k) = "<<beta_k<<std::endl;
std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
RealD betar = 1.0/beta_k;
evec[k2] = betar * f;
lme[k2-1] = beta_k;
////////////////////////////////////////////////////
// Convergence test
////////////////////////////////////////////////////
for(int k=0; k<Nm; ++k){
eval2[k] = eval[k];
lme2[k] = lme[k];
}
Qt = Eigen::MatrixXd::Identity(Nm,Nm);
diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
for(int k = 0; k<Nk; ++k) B[k]=0.0;
for(int j = 0; j<Nk; ++j){
for(int k = 0; k<Nk; ++k){
B[j].checkerboard = evec[k].checkerboard;
B[j] += Qt(j,k) * evec[k];
}
}
std::cout<<GridLogIRL <<" Diagonalized "<<std::endl;
Nconv = 0;
for(int i=0; i<Nk; ++i){
if (iter >= MinRestart) {
_Linop.HermOp(B[i],v);
std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl;
RealD vnum = real(innerProduct(B[i],v)); // HermOp.
RealD vden = norm2(B[i]);
eval2[i] = vnum/vden;
v -= eval2[i]*B[i];
RealD vv = norm2(v);
Field B(grid); B.checkerboard = evec[0].checkerboard;
std::cout.precision(13);
std::cout << GridLogMessage << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
std::cout << " |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;
// change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
if((vv<eresid*eresid) && (i == Nconv) ){
Iconv[Nconv] = i;
++Nconv;
// power of two search pattern; not every evalue in eval2 is assessed.
for(int jj = 1; jj<=Nstop; jj*=2){
int j = Nstop-jj;
RealD e = eval2_copy[j]; // Discard the evalue
basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
if( _Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
if ( j > Nconv ) {
Nconv=j+1;
jj=Nstop; // Terminate the scan
}
} // i-loop end
std::cout<< GridLogMessage <<" #modes converged: "<<Nconv<<std::endl;
if( Nconv>=Nstop ){
}
}
// Do evec[0] for good measure
{
int j=0;
RealD e = eval2_copy[0];
basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox);
}
// test if we converged, if so, terminate
std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl;
// if( Nconv>=Nstop || beta_k < betastp){
if( Nconv>=Nstop){
goto converged;
}
} // end of iter loop
std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
std::cout<< GridLogError <<" ImplicitlyRestartedLanczos::calc() NOT converged.";
std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
} else {
std::cout << GridLogIRL << "iter < MinRestart: do not yet test for convergence\n";
} // end of iter loop
}
std::cout<<GridLogError<<"\n NOT converged.\n";
abort();
converged:
// Sorting
eval.resize(Nconv);
evec.resize(Nconv,grid);
for(int i=0; i<Nconv; ++i){
eval[i] = eval2[Iconv[i]];
evec[i] = B[Iconv[i]];
{
Field B(grid); B.checkerboard = evec[0].checkerboard;
basisRotate(evec,Qt,0,Nk,0,Nk,Nm);
std::cout << GridLogIRL << " Rotated basis"<<std::endl;
Nconv=0;
//////////////////////////////////////////////////////////////////////
// Full final convergence test; unconditionally applied
//////////////////////////////////////////////////////////////////////
for(int j = 0; j<=Nk; j++){
B=evec[j];
if( _Tester.ReconstructEval(j,eresid,B,eval2[j],evalMaxApprox) ) {
Nconv++;
}
_sort.push(eval,evec,Nconv);
std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
std::cout << GridLogMessage << "ImplicitlyRestartedLanczos CONVERGED ; Summary :\n";
std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
std::cout << GridLogMessage << " -- Iterations = "<< iter << "\n";
std::cout << GridLogMessage << " -- beta(k) = "<< beta_k << "\n";
std::cout << GridLogMessage << " -- Nconv = "<< Nconv << "\n";
std::cout << GridLogMessage <<"**************************************************************************"<< std::endl;
}
private:
if ( Nconv < Nstop )
std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
eval=eval2;
basisSortInPlace(evec,eval,reverse);
}
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL << "ImplicitlyRestartedLanczos CONVERGED ; Summary :\n";
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL << " -- Iterations = "<< iter << "\n";
std::cout << GridLogIRL << " -- beta(k) = "<< beta_k << "\n";
std::cout << GridLogIRL << " -- Nconv = "<< Nconv << "\n";
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
}
private:
/* Saad PP. 195
1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
2. For k = 1,2,...,m Do:
@ -361,14 +569,18 @@ private:
const RealD tiny = 1.0e-20;
assert( k< Nm );
_poly(_Linop,evec[k],w); // 3. wk := A vk - βk v_{k-1}
GridStopWatch gsw_op,gsw_o;
Field& evec_k = evec[k];
_HermOp(evec_k,w); std::cout<<GridLogIRL << "Poly(HermOp)" <<std::endl;
if(k>0) w -= lme[k-1] * evec[k-1];
ComplexD zalph = innerProduct(evec[k],w); // 4. αk:=(wk,vk)
ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk)
RealD alph = real(zalph);
w = w - alph * evec[k];// 5. wk := wk - αk vk
w = w - alph * evec_k;// 5. wk := wk - αk vk
RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
// 7. vk+1 := wk/βk+1
@ -376,10 +588,16 @@ private:
lmd[k] = alph;
lme[k] = beta;
if ( k > 0 ) orthogonalize(w,evec,k); // orthonormalise
if ( k < Nm-1) evec[k+1] = w;
if (k>0 && k % orth_period == 0) {
orthogonalize(w,evec,k); // orthonormalise
std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
}
if ( beta < tiny ) std::cout << GridLogMessage << " beta is tiny "<<beta<<std::endl;
if(k < Nm-1) evec[k+1] = w;
std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
if ( beta < tiny )
std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
}
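// For orientation, the same recurrence written out schematically on a plain
// dense problem (illustration only; A, V, alpha, beta here are ordinary
// arrays, not Grid types):
//
//   w        = A * V[k];                    // 3. w := A v_k
//   if (k>0)   w -= beta[k-1] * V[k-1];     //    w -= beta_k v_{k-1}
//   alpha[k] = dot(w, V[k]);                // 4. alpha_k := (w, v_k)
//   w       -= alpha[k] * V[k];             // 5. w -= alpha_k v_k
//   beta[k]  = norm(w);  w /= beta[k];      // 6./7. beta_{k+1} := ||w||_2
//   V[k+1]   = w;                           //      v_{k+1} := w / beta_{k+1}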
void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,
@ -404,11 +622,11 @@ private:
}
}
}
///////////////////////////////////////////////////////////////////////////
// File could end here if we settle on Eigen ???
///////////////////////////////////////////////////////////////////////////
void qr_decomp(std::vector<RealD>& lmd, // Nm
///////////////////////////////////////////////////////////////////////////
// File could end here if we settle on Eigen ??? !!!
///////////////////////////////////////////////////////////////////////////
void QR_decomp(std::vector<RealD>& lmd, // Nm
std::vector<RealD>& lme, // Nm
int Nk, int Nm, // Nk, Nm
Eigen::MatrixXd& Qt, // Nm x Nm matrix
@ -575,17 +793,17 @@ void diagonalize_lapack(std::vector<RealD>& lmd,
#endif
}
void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
int Nk, int Nm,
Eigen::MatrixXd & Qt,
GridBase *grid)
{
int Niter = 100*Nm;
{
int QRiter = 100*Nm;
int kmin = 1;
int kmax = Nk;
// (this should be more sophisticated)
for(int iter=0; iter<Niter; ++iter){
for(int iter=0; iter<QRiter; ++iter){
// determination of 2x2 leading submatrix
RealD dsub = lmd[kmax-1]-lmd[kmax-2];
@ -594,7 +812,7 @@ void diagonalize_lapack(std::vector<RealD>& lmd,
// (Dsh: shift)
// transformation
qr_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax); // Nk, Nm
QR_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax); // Nk, Nm
// Convergence criterion (redefinition of kmin and kmax)
for(int j=kmax-1; j>= kmin; --j){
@ -604,7 +822,7 @@ void diagonalize_lapack(std::vector<RealD>& lmd,
goto continued;
}
}
Niter = iter;
QRiter = iter;
return;
continued:
@ -616,10 +834,9 @@ void diagonalize_lapack(std::vector<RealD>& lmd,
}
}
}
std::cout << GridLogError << "[QL method] Error - Too many iterations: "<<Niter<<"\n";
std::cout << GridLogError << "[QL method] Error - Too many iterations: "<<QRiter<<"\n";
abort();
}
};
}
};
}
#endif


@ -0,0 +1,352 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/LocalCoherenceLanczos.h
Copyright (C) 2015
Author: Christoph Lehner <clehner@bnl.gov>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LOCAL_COHERENCE_IRL_H
#define GRID_LOCAL_COHERENCE_IRL_H
namespace Grid {
struct LanczosParams : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
ChebyParams, Cheby,/*Chebyshev*/
int, Nstop, /*Vecs in Lanczos must converge Nstop < Nk < Nm*/
int, Nk, /*Vecs in Lanczos seek converge*/
int, Nm, /*Total vecs in Lanczos include restart*/
RealD, resid, /*residual*/
int, MaxIt,
RealD, betastp, /* ? */
int, MinRes); // Must restart
};
struct LocalCoherenceLanczosParams : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
bool, doFine,
bool, doFineRead,
bool, doCoarse,
bool, doCoarseRead,
LanczosParams, FineParams,
LanczosParams, CoarseParams,
ChebyParams, Smoother,
RealD , coarse_relax_tol,
std::vector<int>, blockSize,
std::string, config,
std::vector < std::complex<double> >, omega,
RealD, mass,
RealD, M5);
};
// Duplicate functionality; ProjectedFunctionHermOp could be used with the trivial function
template<class Fobj,class CComplex,int nbasis>
class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj> FineField;
LinearOperatorBase<FineField> &_Linop;
Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
ProjectedHermOp(LinearOperatorBase<FineField>& linop, Aggregation<Fobj,CComplex,nbasis> &aggregate) :
_Linop(linop),
_Aggregate(aggregate) { };
void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = _Aggregate.FineGrid;
FineField fin(FineGrid);
FineField fout(FineGrid);
_Aggregate.PromoteFromSubspace(in,fin); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
_Aggregate.ProjectToSubspace(out,fout); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
}
};
template<class Fobj,class CComplex,int nbasis>
class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj> FineField;
OperatorFunction<FineField> & _poly;
LinearOperatorBase<FineField> &_Linop;
Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,LinearOperatorBase<FineField>& linop,
Aggregation<Fobj,CComplex,nbasis> &aggregate) :
_poly(poly),
_Linop(linop),
_Aggregate(aggregate) { };
void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = _Aggregate.FineGrid;
FineField fin(FineGrid) ;fin.checkerboard =_Aggregate.checkerboard;
FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard;
_Aggregate.PromoteFromSubspace(in,fin); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
_poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
_Aggregate.ProjectToSubspace(out,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
}
};
template<class Fobj,class CComplex,int nbasis>
class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanczosTester<Lattice<iVector<CComplex,nbasis > > >
{
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj> FineField;
LinearFunction<CoarseField> & _Poly;
OperatorFunction<FineField> & _smoother;
LinearOperatorBase<FineField> &_Linop;
Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
RealD _coarse_relax_tol;
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
OperatorFunction<FineField> &smoother,
LinearOperatorBase<FineField> &Linop,
Aggregation<Fobj,CComplex,nbasis> &Aggregate,
RealD coarse_relax_tol=5.0e3)
: _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol) { };
int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
CoarseField v(B);
RealD eval_poly = eval;
// Apply operator
_Poly(B,v);
RealD vnum = real(innerProduct(B,v)); // HermOp.
RealD vden = norm2(B);
RealD vv0 = norm2(v);
eval = vnum/vden;
v -= eval*B;
RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
return conv;
}
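// In formulae, the test above accepts mode j when, with P the operator applied
// by _Poly,
//
//   lambda = Re<B, P B> / <B, B>                        (stored back into eval)
//   || P B - lambda B ||^2 / evalMaxApprox^2 < eresid^2
//
// i.e. eresid is a residual tolerance relative to the estimated spectral radius.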
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
GridBase *FineGrid = _Aggregate.FineGrid;
int checkerboard = _Aggregate.checkerboard;
FineField fB(FineGrid);fB.checkerboard =checkerboard;
FineField fv(FineGrid);fv.checkerboard =checkerboard;
_Aggregate.PromoteFromSubspace(B,fv);
_smoother(_Linop,fv,fB);
RealD eval_poly = eval;
_Linop.HermOp(fB,fv);
RealD vnum = real(innerProduct(fB,fv)); // HermOp.
RealD vden = norm2(fB);
RealD vv0 = norm2(fv);
eval = vnum/vden;
fv -= eval*fB;
RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
if( (vv<eresid*eresid) ) return 1;
return 0;
}
};
////////////////////////////////////////////
// Make serializable Lanczos params
////////////////////////////////////////////
template<class Fobj,class CComplex,int nbasis>
class LocalCoherenceLanczos
{
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<Fobj> FineField;
protected:
GridBase *_CoarseGrid;
GridBase *_FineGrid;
int _checkerboard;
LinearOperatorBase<FineField> & _FineOp;
// FIXME replace Aggregation with vector of fine; the code reuse is too small for
// the hassle and complexity of cross coupling.
Aggregation<Fobj,CComplex,nbasis> _Aggregate;
std::vector<RealD> evals_fine;
std::vector<RealD> evals_coarse;
std::vector<CoarseField> evec_coarse;
public:
LocalCoherenceLanczos(GridBase *FineGrid,
GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp,
int checkerboard) :
_CoarseGrid(CoarseGrid),
_FineGrid(FineGrid),
_Aggregate(CoarseGrid,FineGrid,checkerboard),
_FineOp(FineOp),
_checkerboard(checkerboard)
{
evals_fine.resize(0);
evals_coarse.resize(0);
};
void Orthogonalise(void ) { _Aggregate.Orthogonalise(); }
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = ::sqrt(nn);
v = v * (1.0/nn);
return nn;
}
void fakeFine(void)
{
int Nk = nbasis;
_Aggregate.subspace.resize(Nk,_FineGrid);
_Aggregate.subspace[0]=1.0;
_Aggregate.subspace[0].checkerboard=_checkerboard;
normalise(_Aggregate.subspace[0]);
PlainHermOp<FineField> Op(_FineOp);
for(int k=1;k<Nk;k++){
_Aggregate.subspace[k].checkerboard=_checkerboard;
Op(_Aggregate.subspace[k-1],_Aggregate.subspace[k]);
normalise(_Aggregate.subspace[k]);
}
}
void testFine(RealD resid)
{
assert(evals_fine.size() == nbasis);
assert(_Aggregate.subspace.size() == nbasis);
PlainHermOp<FineField> Op(_FineOp);
ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
for(int k=0;k<nbasis;k++){
assert(SimpleTester.ReconstructEval(k,resid,_Aggregate.subspace[k],evals_fine[k],1.0)==1);
}
}
void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
{
assert(evals_fine.size() == nbasis);
assert(_Aggregate.subspace.size() == nbasis);
//////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_Aggregate);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
for(int k=0;k<evec_coarse.size();k++){
if ( k < nbasis ) {
assert(ChebySmoothTester.ReconstructEval(k,resid,evec_coarse[k],evals_coarse[k],1.0)==1);
} else {
assert(ChebySmoothTester.ReconstructEval(k,resid*relax,evec_coarse[k],evals_coarse[k],1.0)==1);
}
}
}
void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes)
{
assert(nbasis<=Nm);
Chebyshev<FineField> Cheby(cheby_parms);
FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
PlainHermOp<FineField> Op(_FineOp);
evals_fine.resize(Nm);
_Aggregate.subspace.resize(Nm,_FineGrid);
ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;
int Nconv;
IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false);
// Shrink down to number saved
assert(Nstop>=nbasis);
assert(Nconv>=nbasis);
evals_fine.resize(nbasis);
_Aggregate.subspace.resize(nbasis,_FineGrid);
}
void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
int Nstop, int Nk, int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes)
{
Chebyshev<FineField> Cheby(cheby_op);
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,_Aggregate);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_Aggregate);
//////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
evals_coarse.resize(Nm);
evec_coarse.resize(Nm,_CoarseGrid);
CoarseField src(_CoarseGrid); src=1.0;
ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
int Nconv=0;
IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
assert(Nconv>=Nstop);
evals_coarse.resize(Nstop);
evec_coarse.resize (Nstop,_CoarseGrid);
for (int i=0;i<Nstop;i++){
std::cout << i << " Coarse eval = " << evals_coarse[i] << std::endl;
}
}
};
}
#endif


@ -53,16 +53,119 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
* M psi = eta
***********************
*Odd
* i) (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1} eta_o
* i) D_oo psi_o = L^{-1} eta_o
* eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
*
* Wilson:
* (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1} eta_o
* Stag:
* D_oo psi_o = L^{-1} eta = (eta_o - Moe Mee^{-1} eta_e)
*
* L^{-1} eta = ( 1              0 ) ( eta_e )
*              ( -Moe Mee^{-1}  1 ) ( eta_o )
*
*Even
* ii) Mee psi_e + Meo psi_o = src_e
*
* => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
*
*
* TODO: Other options:
*
* a) change checkerboards for Schur e<->o
*
* Left precon by Moo^-1
* b) Doo^{dag} M_oo^-dag Moo^-1 Doo psi_0 = (D_oo)^dag M_oo^-dag Moo^-1 L^{-1} eta_o
* eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
*
* Right precon by Moo^-1
* c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_0 = M_oo^-dag (D_oo)^dag L^{-1} eta_o
* eta_o' = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
* psi_o = M_oo^-1 phi_o
* TODO: Deflation
*/
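/*
 * Schematic summary of the solvers below (an illustration of the flow, not an
 * interface):
 *
 *   src_o' = eta_o - Moe Mee^{-1} eta_e        // eliminate the even checkerboard
 *   solve    Mhat psi_o = src_o'               // Mhat = Doo (staggered) or (D_oo)^dag D_oo (Wilson-like)
 *   psi_e  = Mee^{-1} ( eta_e - Meo psi_o )    // back-substitute for the even sites
 *
 * followed by an unpreconditioned residual check || M psi - eta || / || eta ||.
 */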
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackStaggeredSolve {
private:
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=0;
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
src_o = tmp; assert(src_o.checkerboard ==Odd);
// _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
// Verify the unprec residual
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackStaggered solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
}
};
template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
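// Usage sketch (illustration only; the field and operator names are example
// choices, not requirements of the wrapper above):
//
//   ConjugateGradient<FermionField> CG(1.0e-8,10000);
//   SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);
//   SchurSolver(StagOp,src,sol);   // StagOp: e.g. an ImprovedStaggeredFermion matrix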
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
@ -76,12 +179,10 @@ namespace Grid {
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver) :
_HermitianRBSolver(HermitianRBSolver)
SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0) : _HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=0;
CBfactorise=cb;
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
@ -141,5 +242,166 @@ namespace Grid {
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagTwoSolve {
private:
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=0;
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
_HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
// Verify the unprec residual
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagTwoMixed {
private:
LinearFunction<Field> & _HermitianRBSolver;
int CBfactorise;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagTwoMixed(LinearFunction<Field> &HermitianRBSolver) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=0;
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
// _HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
_HermitianRBSolver(src_o,tmp); assert(tmp.checkerboard==Odd);
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
// Verify the unprec residual
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
}
};
}
#endif


@ -1,7 +1,5 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
namespace Grid {
@ -11,7 +9,7 @@ int PointerCache::victim;
void *PointerCache::Insert(void *ptr,size_t bytes) {
if (bytes < 4096 ) return NULL;
if (bytes < 4096 ) return ptr;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
@ -63,4 +61,37 @@ void *PointerCache::Lookup(size_t bytes) {
return NULL;
}
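// Diagnostic helper: walks /proc/self/pagemap for the given buffer and counts
// how many 4k pages do not sit inside a contiguous 2MB (512-page) mapping,
// i.e. were not backed by a huge page. Linux only; a no-op elsewhere.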
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
}


@ -64,6 +64,8 @@ namespace Grid {
};
void check_huge_pages(void *Buf,uint64_t BYTES);
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
@ -92,18 +94,34 @@ public:
size_type bytes = __n*sizeof(_Tp);
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
// if ( ptr != NULL )
// std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
//////////////////
// Hack: 2MB alignment; could be made an option, but probably doesn't need configurability
//////////////////
//define GRID_ALLOC_ALIGN (128)
#define GRID_ALLOC_ALIGN (2*1024*1024)
#ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
// std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
// First touch optimise in threaded loop
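// (One write per 4096-byte page from inside the parallel loop: under the
// first-touch policy each page is physically placed on the NUMA node of the
// thread that touches it, spreading the allocation across nodes.)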
uint8_t *cp = (uint8_t *)ptr;
#ifdef GRID_OMP
#pragma omp parallel for
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n * sizeof(_Tp);
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#ifdef HAVE_MM_MALLOC_H
@ -182,10 +200,19 @@ public:
pointer allocate(size_type __n, const void* _p= 0)
{
#ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
#else
_Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
#endif
size_type bytes = __n*sizeof(_Tp);
uint8_t *cp = (uint8_t *)ptr;
if ( ptr ) {
// One touch per 4k page; static OMP schedule so the page-to-thread mapping matches later statically scheduled loops
#pragma omp parallel for schedule(static)
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
}
return ptr;
}
void deallocate(pointer __p, size_type) {


@ -44,11 +44,20 @@ namespace Grid{
class GridBase : public CartesianCommunicator , public GridThread {
public:
int dummy;
// Give Lattice access
template<class object> friend class Lattice;
GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
GridBase(const std::vector<int> & processor_grid,
const CartesianCommunicator &parent,
int &split_rank)
: CartesianCommunicator(processor_grid,parent,split_rank) {};
GridBase(const std::vector<int> & processor_grid,
const CartesianCommunicator &parent)
: CartesianCommunicator(processor_grid,parent,dummy) {};
virtual ~GridBase() = default;
// Physics Grid information.
std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
@ -185,17 +194,18 @@ public:
////////////////////////////////////////////////////////////////
void show_decomposition(){
std::cout << GridLogMessage << "Full Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "Global Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "Local Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "Outer strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "Inner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "iSites : " << _isites << std::endl;
std::cout << GridLogMessage << "oSites : " << _osites << std::endl;
std::cout << GridLogMessage << "lSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "gSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "Nd : " << _ndimension << std::endl;
std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl;
std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl;
std::cout << GridLogMessage << "\toSites : " << _osites << std::endl;
std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl;
}
////////////////////////////////////////////////////////////////
@ -209,9 +219,6 @@ public:
assert(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
}
void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
gidx=0;
int mult=1;


@ -38,7 +38,7 @@ namespace Grid{
class GridCartesian: public GridBase {
public:
int dummy;
virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0;
}
@ -61,10 +61,38 @@ public:
virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
return shift;
}
/////////////////////////////////////////////////////////////////////////
// Constructor takes a parent grid and possibly subdivides communicator.
/////////////////////////////////////////////////////////////////////////
GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid
) : GridBase(processor_grid)
const std::vector<int> &processor_grid,
const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
{
Init(dimensions,simd_layout,processor_grid);
}
GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
{
Init(dimensions,simd_layout,processor_grid);
}
/////////////////////////////////////////////////////////////////////////
// Construct from comm world
/////////////////////////////////////////////////////////////////////////
GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid)
{
Init(dimensions,simd_layout,processor_grid);
}
virtual ~GridCartesian() = default;
void Init(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid)
{
///////////////////////
// Grid information
@ -84,30 +112,36 @@ public:
_fsites = _gsites = _osites = _isites = 1;
for(int d=0;d<_ndimension;d++){
for (int d = 0; d < _ndimension; d++)
{
_fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
//FIXME check for exact division
// Use a reduced simd grid
_ldimensions[d]= _gdimensions[d]/_processors[d]; //local dimensions
_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
_lstart[d] = _processor_coor[d]*_ldimensions[d];
_lend[d] = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
_ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if ( d==0 ) {
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
} else {
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1];
}
else
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
}
@ -118,21 +152,20 @@ public:
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block =1;
int nblock=1;
for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for(int d=0;d<_ndimension;d++){
nblock/=_rdimensions[d];
_slice_block[d] =block;
_slice_stride[d]=_ostride[d]*_rdimensions[d];
_slice_nblock[d]=nblock;
block = block*_rdimensions[d];
for (int d = 0; d < _ndimension; d++)
{
nblock /= _rdimensions[d];
_slice_block[d] = block;
_slice_stride[d] = _ostride[d] * _rdimensions[d];
_slice_nblock[d] = nblock;
block = block * _rdimensions[d];
}
};
};
}
#endif


@ -112,24 +112,59 @@ public:
}
};
GridRedBlackCartesian(const GridBase *base) : GridRedBlackCartesian(base->_fdimensions,base->_simd_layout,base->_processors) {};
////////////////////////////////////////////////////////////
// Create Redblack from original grid; require full grid pointer ?
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base)
{
int dims = base->_ndimension;
std::vector<int> checker_dim_mask(dims,1);
int checker_dim = 0;
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim);
};
GridRedBlackCartesian(const std::vector<int> &dimensions,
////////////////////////////////////////////////////////////
// Create redblack from original grid, with non-trivial checker dim mask
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &checker_dim_mask,
int checker_dim
) : GridBase(base->_processors,*base)
{
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ;
}
virtual ~GridRedBlackCartesian() = default;
#if 0
////////////////////////////////////////////////////////////
// Create redblack grid ;; deprecate these. Should not
// need direct creation of redblack without a full grid to base on
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask,
int checker_dim
) : GridBase(processor_grid)
) : GridBase(processor_grid,*base)
{
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
}
GridRedBlackCartesian(const std::vector<int> &dimensions,
////////////////////////////////////////////////////////////
// Create redblack grid
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid)
const std::vector<int> &processor_grid) : GridBase(processor_grid,*base)
{
std::vector<int> checker_dim_mask(dimensions.size(),1);
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0);
int checker_dim = 0;
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
}
#endif
void Init(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
@ -140,11 +175,11 @@ public:
// Grid information
///////////////////////
_checker_dim = checker_dim;
assert(checker_dim_mask[checker_dim]==1);
assert(checker_dim_mask[checker_dim] == 1);
_ndimension = dimensions.size();
assert(checker_dim_mask.size()==_ndimension);
assert(processor_grid.size()==_ndimension);
assert(simd_layout.size()==_ndimension);
assert(checker_dim_mask.size() == _ndimension);
assert(processor_grid.size() == _ndimension);
assert(simd_layout.size() == _ndimension);
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
@ -159,31 +194,39 @@ public:
_fsites = _gsites = _osites = _isites = 1;
_checker_dim_mask=checker_dim_mask;
_checker_dim_mask = checker_dim_mask;
for(int d=0;d<_ndimension;d++){
for (int d = 0; d < _ndimension; d++)
{
_fdimensions[d] = dimensions[d];
_gdimensions[d] = _fdimensions[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
if (d==_checker_dim) {
_gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard
if (d == _checker_dim)
{
assert((_gdimensions[d] & 0x1) == 0);
_gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
_gsites /= 2;
}
_ldimensions[d] = _gdimensions[d]/_processors[d];
_lstart[d] = _processor_coor[d]*_ldimensions[d];
_lend[d] = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
_ldimensions[d] = _gdimensions[d] / _processors[d];
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
// Use a reduced simd grid
_simd_layout[d] = simd_layout[d];
_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
assert(_rdimensions[d]>0);
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // integer division; exact divisibility asserted below
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
assert(_rdimensions[d] > 0);
// all elements of a simd vector must have same checkerboard.
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
if ( _simd_layout[d]>1 ) {
if ( checker_dim_mask[d] ) {
assert( (_rdimensions[d]&0x1) == 0 );
if (_simd_layout[d] > 1)
{
if (checker_dim_mask[d])
{
assert((_rdimensions[d] & 0x1) == 0);
}
}
@ -191,15 +234,16 @@ public:
_isites *= _simd_layout[d];
// Addressing support
if ( d==0 ) {
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
} else {
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1];
}
else
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
}
////////////////////////////////////////////////////////////////////////////////////////////
@ -209,40 +253,48 @@ public:
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block =1;
int nblock=1;
for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for(int d=0;d<_ndimension;d++){
nblock/=_rdimensions[d];
_slice_block[d] =block;
_slice_stride[d]=_ostride[d]*_rdimensions[d];
_slice_nblock[d]=nblock;
block = block*_rdimensions[d];
for (int d = 0; d < _ndimension; d++)
{
nblock /= _rdimensions[d];
_slice_block[d] = block;
_slice_stride[d] = _ostride[d] * _rdimensions[d];
_slice_nblock[d] = nblock;
block = block * _rdimensions[d];
}
////////////////////////////////////////////////
// Create a checkerboard lookup table
////////////////////////////////////////////////
int rvol = 1;
for(int d=0;d<_ndimension;d++){
rvol=rvol * _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
{
rvol = rvol * _rdimensions[d];
}
_checker_board.resize(rvol);
for(int osite=0;osite<_osites;osite++){
_checker_board[osite] = CheckerBoardFromOindex (osite);
for (int osite = 0; osite < _osites; osite++)
{
_checker_board[osite] = CheckerBoardFromOindex(osite);
}
};
protected:
protected:
virtual int oIndex(std::vector<int> &coor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) {
if( d==_checker_dim ) {
idx+=_ostride[d]*((coor[d]/2)%_rdimensions[d]);
} else {
idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
int idx = 0;
for (int d = 0; d < _ndimension; d++)
{
if (d == _checker_dim)
{
idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
}
else
{
idx += _ostride[d] * (coor[d] % _rdimensions[d]);
}
}
return idx;
@ -250,17 +302,20 @@ protected:
virtual int iIndex(std::vector<int> &lcoor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) {
if( d==_checker_dim ) {
idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
} else {
idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
int idx = 0;
for (int d = 0; d < _ndimension; d++)
{
if (d == _checker_dim)
{
idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
}
else
{
idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
}
}
return idx;
}
};
}
#endif


@ -26,6 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/mman.h>
namespace Grid {
@ -33,8 +37,11 @@ namespace Grid {
// Info that is set up once and independent of the Cartesian layout
///////////////////////////////////////////////////////////////
void * CartesianCommunicator::ShmCommBuf;
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;
CartesianCommunicator::CommunicatorPolicy_t CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
CartesianCommunicator::CommunicatorPolicy_t
CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
int CartesianCommunicator::nCommThreads = -1;
int CartesianCommunicator::Hugepages = 0;
/////////////////////////////////
// Alloc, free shmem region
@ -89,25 +96,175 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
GlobalSumVector((double *)c,2*N);
}
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) || defined (GRID_COMMS_MPI3)
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
{
_ndimension = processors.size();
assert(_ndimension = parent._ndimension);
//////////////////////////////////////////////////////////////////////////////////////////////////////
// split the communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
int Nparent;
MPI_Comm_size(parent.communicator,&Nparent);
int childsize=1;
for(int d=0;d<processors.size();d++) {
childsize *= processors[d];
}
int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent);
std::vector<int> ccoor(_ndimension); // coor within subcommunicator
std::vector<int> scoor(_ndimension); // coor of split within parent
std::vector<int> ssize(_ndimension); // coor of split within parent
for(int d=0;d<_ndimension;d++){
ccoor[d] = parent._processor_coor[d] % processors[d];
scoor[d] = parent._processor_coor[d] / processors[d];
ssize[d] = parent._processors[d] / processors[d];
}
int crank; // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
// MPI uses the reverse lexicographic convention to ours
Lexicographic::IndexFromCoorReversed(ccoor,crank,processors);
Lexicographic::IndexFromCoorReversed(scoor,srank,ssize);
MPI_Comm comm_split;
if ( Nchild > 1 ) {
/*
std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"] ";
for(int d=0;d<parent._processors.size();d++) std::cout << parent._processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" child grid["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << parent._processor_coor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << ccoor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new coor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << parent._processor_coor[d] << " ";
std::cout<<std::endl;
*/
int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
assert(ierr==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Declare victory
//////////////////////////////////////////////////////////////////////////////////////////////////////
/*
std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
*/
} else {
comm_split=parent.communicator;
srank = 0;
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Set up from the new split communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
InitFromMPICommunicator(processors,comm_split);
}
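// Worked example (illustrative numbers only): a parent grid of
// processors = [4,4,2,2] (64 ranks) split with child processors = [2,2,1,1]
// gives childsize = 4 and Nchild = 16. The parent rank at coordinate [3,1,0,1]
// gets ccoor = [1,1,0,0] (its coordinate inside the child grid),
// scoor = [1,0,0,1] (which child block it belongs to) and ssize = [2,2,2,2];
// crank and srank are then the reversed-lexicographic indices of ccoor and scoor.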
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Take an MPI_Comm and self assemble
//////////////////////////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
{
_ndimension = processors.size();
_processor_coor.resize(_ndimension);
/////////////////////////////////
// Count the requested nodes
/////////////////////////////////
_Nprocessors=1;
_processors = processors;
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
std::vector<int> periodic(_ndimension,1);
MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
if ( communicator_base != communicator_world ) {
std::cout << "Cartesian communicator created with a non-world communicator"<<std::endl;
std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
for(int d=0;d<_processors.size();d++){
std::cout << _processor_coor[d]<<" ";
}
std::cout << std::endl;
}
int Size;
MPI_Comm_size(communicator,&Size);
#ifdef GRID_COMMS_MPIT
communicator_halo.resize (2*_ndimension);
for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]);
}
#endif
assert(Size==_Nprocessors);
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
InitFromMPICommunicator(processors,communicator_world);
}
#endif
#if !defined( GRID_COMMS_MPI3)
int CartesianCommunicator::NodeCount(void) { return ProcessorCount();};
int CartesianCommunicator::RankCount(void) { return ProcessorCount();};
#endif
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes, int dir)
{
std::vector<CommsRequest_t> list;
// Discard the "dir"
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
SendToRecvFromComplete(list);
return 2.0*bytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes)
int bytes, int dir)
{
// Discard the "dir"
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
#endif
#if !defined( GRID_COMMS_MPI3)
void CartesianCommunicator::StencilBarrier(void){};
commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
@ -121,8 +278,30 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
return NULL;
}
void CartesianCommunicator::ShmInitGeneric(void){
#if 1
int mmap_flag =0;
#ifdef MAP_ANONYMOUS
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
#endif
#ifdef MAP_ANON
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
#endif
#ifdef MAP_HUGETLB
if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
#endif
ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
if (ShmCommBuf == (void *)MAP_FAILED) {
perror("mmap failed ");
exit(EXIT_FAILURE);
}
#ifdef MADV_HUGEPAGE
if (!Hugepages ) madvise(ShmCommBuf,MAX_MPI_SHM_BYTES,MADV_HUGEPAGE);
#endif
#else
ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
ShmCommBuf=(void *)&ShmBufStorageVector[0];
#endif
bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
}
#endif


@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_MPI3
#include <mpi.h>
#endif
#ifdef GRID_COMMS_MPI3L
#ifdef GRID_COMMS_MPIT
#include <mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM
@ -50,12 +50,24 @@ namespace Grid {
class CartesianCommunicator {
public:
// 65536 ranks per node adequate for now
////////////////////////////////////////////
// Isend/Irecv/Wait, or Sendrecv blocking
////////////////////////////////////////////
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
static CommunicatorPolicy_t CommunicatorPolicy;
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
///////////////////////////////////////////
// Up to 65536 ranks per node adequate for now
// 128MB shared memory for comms, enough for 48^4 local volume comms
// Give external control (command line override?) of this
///////////////////////////////////////////
static const int MAXLOG2RANKSPERNODE = 16;
static uint64_t MAX_MPI_SHM_BYTES;
static int nCommThreads;
// use explicit huge pages
static int Hugepages;
// Communicator should know nothing of the physics grid, only processor grid.
int _Nprocessors; // How many in all
@ -64,14 +76,19 @@ class CartesianCommunicator {
std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension;
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
static MPI_Comm communicator_world;
MPI_Comm communicator;
std::vector<MPI_Comm> communicator_halo;
typedef MPI_Request CommsRequest_t;
#else
typedef int CommsRequest_t;
#endif
////////////////////////////////////////////////////////////////////
// Helper functionality for SHM Windows common to all other impls
////////////////////////////////////////////////////////////////////
@ -117,10 +134,6 @@ class CartesianCommunicator {
/////////////////////////////////
static void * ShmCommBuf;
// Isend/Irecv/Wait, or Sendrecv blocking
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
static CommunicatorPolicy_t CommunicatorPolicy;
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
size_t heap_top;
size_t heap_bytes;
@ -137,9 +150,22 @@ class CartesianCommunicator {
static void Init(int *argc, char ***argv);
////////////////////////////////////////////////
// Constructor of any given grid
// Constructors to sub-divide a parent communicator
// and default to comm world
////////////////////////////////////////////////
CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank);
CartesianCommunicator(const std::vector<int> &pdimensions_in);
virtual ~CartesianCommunicator();
private:
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT)
////////////////////////////////////////////////
// Private initialise from an MPI communicator
// Can use after an MPI_Comm_split, but hidden from user so private
////////////////////////////////////////////////
void InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base);
#endif
public:
////////////////////////////////////////////////////////////////////////////////////////
// Wraps MPI_Cart routines, or implements equivalent on other impls
@ -211,14 +237,21 @@ class CartesianCommunicator {
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
int bytes,int dir);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
void StencilBarrier(void);
////////////////////////////////////////////////////////////
@ -231,6 +264,27 @@ class CartesianCommunicator {
////////////////////////////////////////////////////////////
void Broadcast(int root,void* data, int bytes);
////////////////////////////////////////////////////////////
// All2All down one dimension
////////////////////////////////////////////////////////////
template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
assert(dim>=0);
assert(dim<_ndimension);
int numnode = _processors[dim];
// std::cerr << " AllToAll in.size() "<<in.size()<<std::endl;
// std::cerr << " AllToAll out.size() "<<out.size()<<std::endl;
assert(in.size()==out.size());
uint64_t bytes=sizeof(T);
uint64_t words=in.size()/numnode;
assert(numnode * words == in.size());
assert(words < (1ULL<<32));
AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);
}
void AllToAll(int dim ,void *in,void *out,uint64_t words,uint64_t bytes);
void AllToAll(void *in,void *out,uint64_t words ,uint64_t bytes);
template<class obj> void Broadcast(int root,obj &data)
{
Broadcast(root,(void *)&data,sizeof(data));


@ -53,28 +53,14 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
ShmInitGeneric();
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
CartesianCommunicator::~CartesianCommunicator()
{
_ndimension = processors.size();
std::vector<int> periodic(_ndimension,1);
_Nprocessors=1;
_processors = processors;
_processor_coor.resize(_ndimension);
MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
int Size;
MPI_Comm_size(communicator,&Size);
assert(Size==_Nprocessors);
int MPI_is_finalised;
MPI_Finalized(&MPI_is_finalised);
if (communicator && !MPI_is_finalised)
MPI_Comm_free(&communicator);
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
@ -210,6 +196,36 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
root,
communicator);
assert(ierr==0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
std::vector<int> row(_ndimension,1);
assert(dim>=0 && dim<_ndimension);
// Split the communicator
row[dim] = _processors[dim];
int me;
CartesianCommunicator Comm(row,*this,me);
Comm.AllToAll(in,out,words,bytes);
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
// MPI is a pain and uses "int" count arguments.
// 64*64*64*128*16 == ~500 million elements of data.
// Multiplied by 24*4 bytes per element this is ~5x10^10 bytes >> 2x10^9,
// so a 32-bit count overflows (a Y2K-style bug).
// (Turns up on 32^3 x 64 Gparity too)
MPI_Datatype object;
int iwords;
int ibytes;
iwords = words;
ibytes = bytes;
assert(words == iwords); // safe to cast to int ?
assert(bytes == ibytes); // safe to cast to int ?
MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
MPI_Type_commit(&object);
MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
MPI_Type_free(&object);
}
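// ------------------------------------------------------------------------------------------------
// A hedged usage sketch (not part of this diff): exercising the dimension-wise AllToAll above from a
// Grid test-style program. GridDefaultMpi() supplies the processor grid; the element values and the
// choice of dimension are illustrative only, not taken from the repository.
// ------------------------------------------------------------------------------------------------
#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);

  std::vector<int> mpi = GridDefaultMpi();        // processor grid from the command line, e.g. 2.2.1.1
  CartesianCommunicator comm(mpi);

  int dim   = 0;                                  // dimension to exchange along
  int nproc = mpi[dim];
  std::vector<double> in (nproc, 1.0*comm.RankWorld()); // one word destined for each rank in this row
  std::vector<double> out(nproc, 0.0);
  comm.AllToAll(dim, in, out);                    // out[i] now holds the word contributed by row-rank i

  Grid_finalize();
  return 0;
}
// Each rank in the row along "dim" contributes one word per destination; for larger payloads the
// contiguous MPI datatype built in AllToAll(void*,...) above keeps the per-call counts in int range.
// ------------------------------------------------------------------------------------------------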
///////////////////////////////////////////////////////
// Should only be used prior to Grid Init finished.
@ -230,5 +246,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
assert(ierr==0);
}
}


@ -37,11 +37,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>
//#include <zlib.h>
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#include <zlib.h>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
@ -197,7 +198,46 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
ShmCommBuf = 0;
ShmCommBufs.resize(ShmSize);
#if 1
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs and similar filesystems expose files that can be mmapped as huge pages
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMMMAP
char shm_name [NAME_MAX];
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",GroupRank,r);
//sprintf(shm_name,"/var/lib/hugetlbfs/group/wheel/pagesize-2MB/" "Grid_mpi3_shm_%d_%d",GroupRank,r);
// printf("Opening file %s \n",shm_name);
int fd=open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd == -1) {
printf("open %s failed\n",shm_name);
perror("open hugetlbfs");
exit(0);
}
int mmap_flag = MAP_SHARED ;
#ifdef MAP_POPULATE
mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
#endif
void *ptr = (void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
ShmCommBufs[r] =ptr;
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////
// POSIX shm_open; as far as I know Linux does not allow EXPLICIT huge pages in this case.
// tmpfs (Larry Meadows says) does not support explicit huge pages, and tmpfs backs
// the POSIX shm virtual file system.
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMOPEN
char shm_name [NAME_MAX];
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
@ -211,9 +251,37 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
ftruncate(fd, size);
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
int mmap_flag = MAP_SHARED;
#ifdef MAP_POPULATE
mmap_flag |= MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if (Hugepages) mmap_flag |= MAP_HUGETLB;
#endif
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
if ( ptr == (void * )MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);
// Experimental: try to force the NUMA domain of the shm segment if we have numaif.h
#if 0
//#ifdef HAVE_NUMAIF_H
int status;
int flags=MPOL_MF_MOVE;
#ifdef KNL
int nodes=1; // numa domain == MCDRAM
// Find out if in SNC2,SNC4 mode ?
#else
int nodes=r; // numa domain == MPI ID
#endif
unsigned long count=1;
for(uint64_t page=0;page<size;page+=4096){
void *pages = (void *) ( page + (uint64_t)ptr );
uint64_t *cow_it = (uint64_t *)pages; *cow_it = 1;
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
if (ierr && (page==0)) perror("numa relocate command failed");
}
#endif
ShmCommBufs[r] =ptr;
}
@ -236,23 +304,34 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
ShmCommBufs[r] =ptr;
}
}
#else
#endif
////////////////////////////////////////////////////////////////////////////////////////////
// SHMGET SHMAT and SHM_HUGETLB flag
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMGET
std::vector<int> shmids(ShmSize);
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
key_t key = 0x4545 + r;
if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
key_t key = IPC_PRIVATE;
int flags = IPC_CREAT | SHM_R | SHM_W;
#ifdef SHM_HUGETLB
if (Hugepages) flags|=SHM_HUGETLB;
#endif
if ((shmids[r]= shmget(key,size, flags)) ==-1) {
int errsv = errno;
printf("Errno %d\n",errsv);
printf("key %d\n",key);
printf("size %lld\n",size);
printf("flags %d\n",flags);
perror("shmget");
exit(1);
}
} else {
printf("shmid: 0x%x\n", shmids[r]);
}
}
}
MPI_Barrier(ShmComm);
MPI_Bcast(&shmids[0],ShmSize*sizeof(int),MPI_BYTE,0,ShmComm);
MPI_Barrier(ShmComm);
@ -371,12 +450,27 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
assert(lr!=-1);
Lexicographic::CoorFromIndex(coor,lr,_processors);
}
//////////////////////////////////
// Try to subdivide communicator
//////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent)
: CartesianCommunicator(processors)
{
std::cout << "Attempts to split MPI3 communicators will fail until implemented" <<std::endl;
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
int ierr;
communicator=communicator_world;
_ndimension = processors.size();
communicator_halo.resize (2*_ndimension);
for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]);
}
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
@ -599,13 +693,28 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
}
}
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir)
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
int bytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
@ -624,26 +733,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
gfrom = MPI_UNDEFINED;
#endif
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
}
if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list);
this->StencilSendToRecvFromComplete(list,dir);
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}


@ -0,0 +1,268 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_mpi.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/ActionCore.h>
#include <mpi.h>
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and independent of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;
// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
int provided;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
if ( provided != MPI_THREAD_MULTIPLE ) {
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
}
}
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
ShmInitGeneric();
}
CartesianCommunicator::~CartesianCommunicator()
{
int MPI_is_finalised;
MPI_Finalized(&MPI_is_finalised);
if (communicator && !MPI_is_finalised)
MPI_Comm_free(&communicator);
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
int myrank = _processor;
int ierr;
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
MPI_Request xrq;
MPI_Request rrq;
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
}
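// ------------------------------------------------------------------------------------------------
// A hedged sketch (not part of this diff): forcing the blocking Sendrecv branch above rather than
// the Isend/Irecv pair. SetCommunicatorPolicy and the policy enum are declared in the communicator
// header; the call is global, so it would typically sit in a benchmark's option handling before any
// exchanges are issued:
//
//   CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
//
// With the concurrent policy SendToRecvFromBegin posts non-blocking requests that
// SendToRecvFromComplete waits on; with the sequential policy the exchange completes inside
// MPI_Sendrecv and the completion call has nothing left to wait for.
// ------------------------------------------------------------------------------------------------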
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
}
}
void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator);
assert(ierr==0);
}
///////////////////////////////////////////////////////
// Should only be used prior to Grid Init finished.
// Check for this?
///////////////////////////////////////////////////////
int CartesianCommunicator::RankWorld(void){
int r;
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator_world);
assert(ierr==0);
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir)
{
int myrank = _processor;
int ierr;
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
// Give the CPU to MPI immediately; can use threads to overlap optionally
MPI_Request req[2];
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[commdir],&req[1]);
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[commdir],&req[0]);
list.push_back(req[0]);
list.push_back(req[1]);
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
int nreq=waitall.size();
MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
};
double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir)
{
int myrank = _processor;
int ierr;
// std::cout << " sending on communicator "<<dir<<" " <<communicator_halo.size()<< <std::endl;
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
// Give the CPU to MPI immediately; can use threads to overlap optionally
MPI_Request req[2];
MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[commdir],&req[1]);
MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank ,myrank , communicator_halo[commdir],&req[0]);
MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
return 2.0*bytes;
}
}


@ -38,6 +38,9 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
ShmInitGeneric();
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
: CartesianCommunicator(processors) { srank=0;}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_processors = processors;
@ -53,6 +56,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
}
}
CartesianCommunicator::~CartesianCommunicator(){}
void CartesianCommunicator::GlobalSum(float &){}
void CartesianCommunicator::GlobalSumVector(float *,int N){}
void CartesianCommunicator::GlobalSum(double &){}
@ -95,6 +100,14 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
{
assert(0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
bcopy(in,out,bytes*words);
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
bcopy(in,out,bytes*words);
}
int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){}


@ -75,6 +75,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
ShmInitGeneric();
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent)
: CartesianCommunicator(processors)
{
std::cout << "Attempts to split SHMEM communicators will fail " <<std::endl;
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_ndimension = processors.size();


@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/cshift/Cshift_mpi.h>
#endif
#ifdef GRID_COMMS_MPI3L
#ifdef GRID_COMMS_MPIT
#include <Grid/cshift/Cshift_mpi.h>
#endif

File diff suppressed because it is too large


@ -369,6 +369,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
}
};
/*
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{
int NN = BlockSolverGrid->_ndimension;
@ -387,6 +388,7 @@ inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
}
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
}
*/
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
@ -398,14 +400,15 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Xslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
int nl = SliceGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@ -448,14 +451,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Xslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
int nl = SliceGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl=1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@ -498,18 +501,19 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog];
Lattice<vobj> Lslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
// Lattice<vobj> Lslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
int nl = SliceGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@ -540,7 +544,7 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
vector_typeD rtmp = TensorRemove(tmp);
auto rtmp = TensorRemove(tmp);
mat_thread(i,j) += Reduce(rtmp);
}}
}}
@ -549,6 +553,14 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
mat += mat_thread;
}
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
}


@ -109,8 +109,8 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
coarseData=zero;
// Loop with a cache friendly loop ordering
for(int sf=0;sf<fine->oSites();sf++){
// Loop over coarse sites in parallel, and then loop over the fine sites associated with each coarse site.
parallel_for(int sf=0;sf<fine->oSites();sf++){
int sc;
std::vector<int> coor_c(_ndimension);
@ -119,6 +119,7 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
PARALLEL_CRITICAL
for(int i=0;i<nbasis;i++) {
coarseData._odata[sc](i)=coarseData._odata[sc](i)
@ -139,6 +140,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
GridBase * coarse= coarseA._grid;
fineZ.checkerboard=fineX.checkerboard;
assert(fineX.checkerboard==fineY.checkerboard);
subdivides(coarse,fine); // require they map
conformable(fineX,fineY);
conformable(fineX,fineZ);
@ -180,9 +182,10 @@ template<class vobj,class CComplex>
GridBase *coarse(CoarseInner._grid);
GridBase *fine (fineX._grid);
Lattice<dotp> fine_inner(fine);
Lattice<dotp> fine_inner(fine); fine_inner.checkerboard = fineX.checkerboard;
Lattice<dotp> coarse_inner(coarse);
// Precision promotion?
fine_inner = localInnerProduct(fineX,fineY);
blockSum(coarse_inner,fine_inner);
parallel_for(int ss=0;ss<coarse->oSites();ss++){
@ -193,7 +196,7 @@ template<class vobj,class CComplex>
inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
{
GridBase *coarse = ip._grid;
Lattice<vobj> zz(fineX._grid); zz=zero;
Lattice<vobj> zz(fineX._grid); zz=zero; zz.checkerboard=fineX.checkerboard;
blockInnerProduct(ip,fineX,fineX);
ip = pow(ip,-0.5);
blockZAXPY(fineX,ip,fineX,zz);
@ -216,20 +219,26 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
}
// Turning this around to thread over sc, with an interior loop
// over sf, would thread better
coarseData=zero;
for(int sf=0;sf<fine->oSites();sf++){
parallel_region {
int sc;
std::vector<int> coor_c(_ndimension);
std::vector<int> coor_f(_ndimension);
parallel_for_internal(int sf=0;sf<fine->oSites();sf++){
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
PARALLEL_CRITICAL
coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];
}
}
return;
}
@ -238,7 +247,7 @@ inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vob
{
GridBase * fine = unpicked._grid;
Lattice<vobj> zz(fine);
Lattice<vobj> zz(fine); zz.checkerboard = unpicked.checkerboard;
Lattice<iScalar<vInteger> > fcoor(fine);
zz = zero;
@ -303,12 +312,13 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
}
// Loop with a cache friendly loop ordering
for(int sf=0;sf<fine->oSites();sf++){
parallel_region {
int sc;
std::vector<int> coor_c(_ndimension);
std::vector<int> coor_f(_ndimension);
parallel_for_internal(int sf=0;sf<fine->oSites();sf++){
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
@ -316,7 +326,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
for(int i=0;i<nbasis;i++) {
if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf];
else fineData._odata[sf]=fineData._odata[sf]+coarseData._odata[sc](i)*Basis[i]._odata[sf];
}
}
}
return;
@ -685,5 +695,314 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
}
}
////////////////////////////////////////////////////////////////////////////////
// Communicate between grids
////////////////////////////////////////////////////////////////////////////////
//
// All to all plan
//
// Subvolume on fine grid is v. Vectors a,b,c,d
//
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// SIMPLEST CASE:
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Mesh of nodes (2) ; subdivide into 1 subdivision
//
// Lex ord:
// N0 va0 vb0 N1 va1 vb1
//
// For each dimension do an all to all
//
// full AllToAll(0)
// N0 va0 va1 N1 vb0 vb1
//
// REARRANGE
// N0 va01 N1 vb01
//
// Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract".
// NB: Easiest to programme if keep in lex order.
//
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// SIMPLE CASE:
///////////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Mesh of nodes (2x2) ; subdivide to 1x1 subdivisions
//
// Lex ord:
// N0 va0 vb0 vc0 vd0 N1 va1 vb1 vc1 vd1
// N2 va2 vb2 vc2 vd2 N3 va3 vb3 vc3 vd3
//
// Ratio = full[dim] / split[dim]
//
// For each dimension do an all to all; get Nvec -> Nvec / ratio
// Ldim -> Ldim * ratio
// LocalVol -> LocalVol * ratio
// full AllToAll(0)
// N0 va0 vb0 va1 vb1 N1 vc0 vd0 vc1 vd1
// N2 va2 vb2 va3 vb3 N3 vc2 vd2 vc3 vd3
//
// REARRANGE
// N0 va01 vb01 N1 vc01 vd01
// N2 va23 vb23 N3 vc23 vd23
//
// full AllToAll(1) // Not what is wanted. FIXME
// N0 va01 va23 N1 vc01 vc23
// N2 vb01 vb23 N3 vd01 vd23
//
// REARRANGE
// N0 va0123 N1 vc0123
// N2 vb0123 N3 vd0123
//
// Must also rearrange data to get into the NEW lex order of grid at each stage. Some kind of "insert/extract".
// NB: Easiest to programme if keep in lex order.
//
/////////////////////////////////////////////////////////
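// ------------------------------------------------------------------------------------------------
// A standalone sketch (not part of this diff) of the "SIMPLEST CASE" above: two ranks, two vectors
// va,vb and a 1-D local volume of four sites, worked on a single node with plain std::vector so the
// index arithmetic alldata[rsite] = tmpdata[lsite + r*rmul + v*vmul] can be checked in isolation.
// Values are tagged as vector*100 + global_site for readability; everything here is illustrative.
#include <cstdio>
#include <vector>

int main(void)
{
  const int ratio = 2, lsites = 4, nvector = 2;

  // Rank 0 before the exchange: [ va0 | vb0 ] stored as alldata[v*lsites+site].
  std::vector<int> alldata(lsites*nvector), tmpdata(lsites*nvector);
  for(int v=0;v<nvector;v++)
    for(int s=0;s<lsites;s++)
      alldata[v*lsites+s] = v*100 + s;

  // Emulate AllToAll(0): rank 0 keeps its first half (va0) and receives the partner's first half
  // (va1, i.e. global sites 4..7 of vector a); vb0 and vb1 would land on rank 1.
  for(int s=0;s<lsites;s++){
    tmpdata[s]        = 0*100 + s;          // va0 from this rank
    tmpdata[lsites+s] = 0*100 + (lsites+s); // va1 from the partner rank
  }

  // Rearrange to lexicographic order of the doubled local volume (1-D, so the coordinate
  // arithmetic of Lexicographic::IndexFromCoor reduces to rsite = lsite + r*lsites).
  int nvec = nvector/ratio;                 // vectors per node after the exchange
  int rmul = nvec*lsites, vmul = lsites;
  for(int v=0;v<nvec;v++)
  for(int lsite=0;lsite<lsites;lsite++)
  for(int r=0;r<ratio;r++){
    int rsite = lsite + r*lsites + v*(lsites*ratio);
    alldata[rsite] = tmpdata[lsite + r*rmul + v*vmul];
  }

  // Prints "site s -> s": vector a is now contiguous in lexicographic order on this node (N0 va01).
  for(int s=0;s<lsites*ratio;s++) printf("site %d -> %d\n",s,alldata[s]);
  return 0;
}
// ------------------------------------------------------------------------------------------------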
template<class Vobj>
void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
{
typedef typename Vobj::scalar_object Sobj;
int full_vecs = full.size();
assert(full_vecs>=1);
GridBase * full_grid = full[0]._grid;
GridBase *split_grid = split._grid;
int ndim = full_grid->_ndimension;
int full_nproc = full_grid->_Nprocessors;
int split_nproc =split_grid->_Nprocessors;
////////////////////////////////
// Checkerboard management
////////////////////////////////
int cb = full[0].checkerboard;
split.checkerboard = cb;
//////////////////////////////
// Checks
//////////////////////////////
assert(full_grid->_ndimension==split_grid->_ndimension);
for(int n=0;n<full_vecs;n++){
assert(full[n].checkerboard == cb);
for(int d=0;d<ndim;d++){
assert(full[n]._grid->_gdimensions[d]==split._grid->_gdimensions[d]);
assert(full[n]._grid->_fdimensions[d]==split._grid->_fdimensions[d]);
}
}
int nvector =full_nproc/split_nproc;
assert(nvector*split_nproc==full_nproc);
assert(nvector == full_vecs);
std::vector<int> ratio(ndim);
for(int d=0;d<ndim;d++){
ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
}
uint64_t lsites = full_grid->lSites();
uint64_t sz = lsites * nvector;
std::vector<Sobj> tmpdata(sz);
std::vector<Sobj> alldata(sz);
std::vector<Sobj> scalardata(lsites);
for(int v=0;v<nvector;v++){
unvectorizeToLexOrdArray(scalardata,full[v]);
parallel_for(int site=0;site<lsites;site++){
alldata[v*lsites+site] = scalardata[site];
}
}
int nvec = nvector; // Counts down to 1 as we collapse dims
std::vector<int> ldims = full_grid->_ldimensions;
std::vector<int> lcoor(ndim);
for(int d=ndim-1;d>=0;d--){
if ( ratio[d] != 1 ) {
full_grid ->AllToAll(d,alldata,tmpdata);
// std::cout << GridLogMessage << "Grid_split: dim " <<d<<" ratio "<<ratio[d]<<" nvec "<<nvec<<" procs "<<split_grid->_processors[d]<<std::endl;
// for(int v=0;v<nvec;v++){
// std::cout << "Grid_split: alldata["<<v<<"] " << alldata[v] <<std::endl;
// std::cout << "Grid_split: tmpdata["<<v<<"] " << tmpdata[v] <<std::endl;
// }
//////////////////////////////////////////
//Local volume for this dimension is expanded by ratio of processor extents
// Number of vectors is decreased by same factor
// Rearrange to lexico for bigger volume
//////////////////////////////////////////
nvec /= ratio[d];
auto rdims = ldims; rdims[d] *= ratio[d];
auto rsites= lsites*ratio[d];
for(int v=0;v<nvec;v++){
// For loop over each site within old subvol
for(int lsite=0;lsite<lsites;lsite++){
Lexicographic::CoorFromIndex(lcoor, lsite, ldims);
for(int r=0;r<ratio[d];r++){ // ratio*nvec terms
auto rcoor = lcoor; rcoor[d] += r*ldims[d];
int rsite; Lexicographic::IndexFromCoor(rcoor, rsite, rdims);
rsite += v * rsites;
int rmul=nvec*lsites;
int vmul= lsites;
alldata[rsite] = tmpdata[lsite+r*rmul+v*vmul];
// if ( lsite==0 ) {
// std::cout << "Grid_split: grow alldata["<<rsite<<"] " << alldata[rsite] << " <- tmpdata["<< lsite+r*rmul+v*vmul<<"] "<<tmpdata[lsite+r*rmul+v*vmul] <<std::endl;
// }
}
}
}
ldims[d]*= ratio[d];
lsites *= ratio[d];
if ( split_grid->_processors[d] > 1 ) {
tmpdata = alldata;
split_grid->AllToAll(d,tmpdata,alldata);
}
}
}
vectorizeFromLexOrdArray(alldata,split);
}
template<class Vobj>
void Grid_split(Lattice<Vobj> &full,Lattice<Vobj> & split)
{
int nvector = full._grid->_Nprocessors / split._grid->_Nprocessors;
std::vector<Lattice<Vobj> > full_v(nvector,full._grid);
for(int n=0;n<nvector;n++){
full_v[n] = full;
}
Grid_split(full_v,split);
}
template<class Vobj>
void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
{
typedef typename Vobj::scalar_object Sobj;
int full_vecs = full.size();
assert(full_vecs>=1);
GridBase * full_grid = full[0]._grid;
GridBase *split_grid = split._grid;
int ndim = full_grid->_ndimension;
int full_nproc = full_grid->_Nprocessors;
int split_nproc =split_grid->_Nprocessors;
////////////////////////////////
// Checkerboard management
////////////////////////////////
int cb = full[0].checkerboard;
split.checkerboard = cb;
//////////////////////////////
// Checks
//////////////////////////////
assert(full_grid->_ndimension==split_grid->_ndimension);
for(int n=0;n<full_vecs;n++){
assert(full[n].checkerboard == cb);
for(int d=0;d<ndim;d++){
assert(full[n]._grid->_gdimensions[d]==split._grid->_gdimensions[d]);
assert(full[n]._grid->_fdimensions[d]==split._grid->_fdimensions[d]);
}
}
int nvector =full_nproc/split_nproc;
assert(nvector*split_nproc==full_nproc);
assert(nvector == full_vecs);
std::vector<int> ratio(ndim);
for(int d=0;d<ndim;d++){
ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
}
uint64_t lsites = full_grid->lSites();
uint64_t sz = lsites * nvector;
std::vector<Sobj> tmpdata(sz);
std::vector<Sobj> alldata(sz);
std::vector<Sobj> scalardata(lsites);
unvectorizeToLexOrdArray(alldata,split);
/////////////////////////////////////////////////////////////////
// Start from split grid and work towards full grid
/////////////////////////////////////////////////////////////////
std::vector<int> lcoor(ndim);
std::vector<int> rcoor(ndim);
int nvec = 1;
lsites = split_grid->lSites();
std::vector<int> ldims = split_grid->_ldimensions;
// for(int d=ndim-1;d>=0;d--){
for(int d=0;d<ndim;d++){
if ( ratio[d] != 1 ) {
if ( split_grid->_processors[d] > 1 ) {
tmpdata = alldata;
split_grid->AllToAll(d,tmpdata,alldata);
}
//////////////////////////////////////////
//Local volume for this dimension is expanded by ratio of processor extents
// Number of vectors is decreased by same factor
// Rearrange to lexico for bigger volume
//////////////////////////////////////////
auto rsites= lsites/ratio[d];
auto rdims = ldims; rdims[d]/=ratio[d];
for(int v=0;v<nvec;v++){
// rsite, rcoor --> smaller local volume
// lsite, lcoor --> bigger original (single node?) volume
// For loop over each site within smaller subvol
for(int rsite=0;rsite<rsites;rsite++){
Lexicographic::CoorFromIndex(rcoor, rsite, rdims);
int lsite;
for(int r=0;r<ratio[d];r++){
lcoor = rcoor; lcoor[d] += r*rdims[d];
Lexicographic::IndexFromCoor(lcoor, lsite, ldims); lsite += v * lsites;
int rmul=nvec*rsites;
int vmul= rsites;
tmpdata[rsite+r*rmul+v*vmul]=alldata[lsite];
}
}
}
nvec *= ratio[d];
ldims[d]=rdims[d];
lsites =rsites;
full_grid ->AllToAll(d,tmpdata,alldata);
}
}
lsites = full_grid->lSites();
for(int v=0;v<nvector;v++){
assert(v<full.size());
parallel_for(int site=0;site<lsites;site++){
scalardata[site] = alldata[v*lsites+site];
}
vectorizeFromLexOrdArray(scalardata,full[v]);
}
}
}
#endif
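// ------------------------------------------------------------------------------------------------
// A hedged usage sketch (not part of this diff). FullGrid and SplitGrid are assumed to describe the
// same global volume, with SplitGrid laid out on fewer MPI ranks per right-hand side (their
// construction follows the split-grid tests and is not shown here); nrhs is the processor ratio
// full_nproc/split_nproc.
//
//   std::vector<LatticeFermion> rhs(nrhs, FullGrid);   // nrhs fields on the full processor grid
//   LatticeFermion              rhs_split(SplitGrid);  // one field per split communicator
//   Grid_split  (rhs, rhs_split);                      // scatter: each split communicator gets one rhs
//   // ... solve independently within each split communicator ...
//   Grid_unsplit(rhs, rhs_split);                      // gather the per-communicator results back
// ------------------------------------------------------------------------------------------------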


@ -50,7 +50,7 @@ namespace Grid {
return (status==0) ? res.get() : name ;
}
GridStopWatch Logger::StopWatch;
GridStopWatch Logger::GlobalStopWatch;
int Logger::timestamp;
std::ostream Logger::devnull(0);
@ -59,13 +59,15 @@ void GridLogTimestamp(int on){
}
Colours GridLogColours(0);
GridLogger GridLogError(1, "Error", GridLogColours, "RED");
GridLogger GridLogIRL (1, "IRL" , GridLogColours, "NORMAL");
GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
GridLogger GridLogError (1, "Error" , GridLogColours, "RED");
GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
GridLogger GridLogDebug(1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogIterative(1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator(1, "Integrator", GridLogColours, "BLUE");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogError.Active(0);
@ -95,7 +97,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
////////////////////////////////////////////////////////////
void Grid_quiesce_nodes(void) {
int me = 0;
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
MPI_Comm_rank(MPI_COMM_WORLD, &me);
#endif
#ifdef GRID_COMMS_SHMEM


@ -85,12 +85,15 @@ class Logger {
protected:
Colours &Painter;
int active;
int timing_mode;
static int timestamp;
std::string name, topName;
std::string COLOUR;
public:
static GridStopWatch StopWatch;
static GridStopWatch GlobalStopWatch;
GridStopWatch LocalStopWatch;
GridStopWatch *StopWatch;
static std::ostream devnull;
std::string background() {return Painter.colour["NORMAL"];}
@ -101,22 +104,38 @@ public:
name(nm),
topName(topNm),
Painter(col_class),
COLOUR(col) {} ;
timing_mode(0),
COLOUR(col)
{
StopWatch = & GlobalStopWatch;
};
void Active(int on) {active = on;};
int isActive(void) {return active;};
static void Timestamp(int on) {timestamp = on;};
void Reset(void) {
StopWatch->Reset();
StopWatch->Start();
}
void TimingMode(int on) {
timing_mode = on;
if(on) {
StopWatch = &LocalStopWatch;
Reset();
}
}
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
if ( log.active ) {
stream << log.background()<< std::setw(8) << std::left << log.topName << log.background()<< " : ";
stream << log.colour() << std::setw(10) << std::left << log.name << log.background() << " : ";
stream << log.background()<< std::left << log.topName << log.background()<< " : ";
stream << log.colour() << std::left << log.name << log.background() << " : ";
if ( log.timestamp ) {
StopWatch.Stop();
GridTime now = StopWatch.Elapsed();
StopWatch.Start();
stream << log.evidence()<< now << log.background() << " : " ;
log.StopWatch->Stop();
GridTime now = log.StopWatch->Elapsed();
if ( log.timing_mode==1 ) log.StopWatch->Reset();
log.StopWatch->Start();
stream << log.evidence()<< std::setw(6)<<now << log.background() << " : " ;
}
stream << log.colour();
return stream;
@ -135,6 +154,8 @@ public:
void GridLogConfigure(std::vector<std::string> &logstreams);
extern GridLogger GridLogIRL;
extern GridLogger GridLogSolver;
extern GridLogger GridLogError;
extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
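// ------------------------------------------------------------------------------------------------
// A hedged usage sketch (not part of this diff): switching the IRL channel to per-channel timing,
// so each message is stamped from the channel's own stopwatch rather than the shared global one:
//
//   GridLogIRL.TimingMode(1);                                  // use the channel-local stopwatch
//   std::cout << GridLogIRL << "Lanczos iteration start" << std::endl;
//
// With timing_mode set, the stopwatch is reset after every message, so each line reports the
// interval since the previous message on that channel.
// ------------------------------------------------------------------------------------------------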


@ -29,7 +29,7 @@
#ifndef GRID_BINARY_IO_H
#define GRID_BINARY_IO_H
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
#define USE_MPI_IO
#else
#undef USE_MPI_IO
@ -99,34 +99,38 @@ class BinaryIO {
NerscChecksum(grid,scalardata,nersc_csum);
}
template<class fobj> static inline void NerscChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &nersc_csum)
template <class fobj>
static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
{
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);
uint64_t lsites =grid->lSites();
if (fbuf.size()==1) {
lsites=1;
}
#pragma omp parallel
uint64_t lsites = grid->lSites();
if (fbuf.size() == 1)
{
uint32_t nersc_csum_thr=0;
lsites = 1;
}
#pragma omp for
for(uint64_t local_site=0;local_site<lsites;local_site++){
uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
for(uint64_t j=0;j<size32;j++){
nersc_csum_thr=nersc_csum_thr+site_buf[j];
#pragma omp parallel
{
uint32_t nersc_csum_thr = 0;
#pragma omp for
for (uint64_t local_site = 0; local_site < lsites; local_site++)
{
uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
for (uint64_t j = 0; j < size32; j++)
{
nersc_csum_thr = nersc_csum_thr + site_buf[j];
}
}
#pragma omp critical
#pragma omp critical
{
nersc_csum += nersc_csum_thr;
}
}
}
template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
{
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
@ -257,7 +261,7 @@ class BinaryIO {
GridBase *grid,
std::vector<fobj> &iodata,
std::string file,
int offset,
Integer offset,
const std::string &format, int control,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
@ -352,7 +356,7 @@ class BinaryIO {
if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
#ifdef USE_MPI_IO
std::cout<< GridLogMessage<< "MPI read I/O "<< file<< std::endl;
std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0);
ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0);
@ -363,16 +367,20 @@ class BinaryIO {
assert(0);
#endif
} else {
std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : "
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl;
std::ifstream fin;
fin.open(file,std::ios::binary|std::ios::in);
if ( control & BINARYIO_MASTER_APPEND ) {
fin.seekg(-sizeof(fobj),fin.end);
} else {
fin.seekg(offset+myrank*lsites*sizeof(fobj));
fin.open(file, std::ios::binary | std::ios::in);
if (control & BINARYIO_MASTER_APPEND)
{
fin.seekg(-sizeof(fobj), fin.end);
}
fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0);
else
{
fin.seekg(offset + myrank * lsites * sizeof(fobj));
}
fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
assert(fin.fail() == 0);
fin.close();
}
timer.Stop();
@ -405,10 +413,30 @@ class BinaryIO {
timer.Start();
if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
#ifdef USE_MPI_IO
std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0);
std::cout << GridLogMessage <<"IOobject: MPI write I/O " << file << std::endl;
ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
// std::cout << GridLogMessage << "Checking for errors" << std::endl;
if (ierr != MPI_SUCCESS)
{
char error_string[BUFSIZ];
int length_of_error_string, error_class;
MPI_Error_class(ierr, &error_class);
MPI_Error_string(error_class, error_string, &length_of_error_string);
fprintf(stderr, "%3d: %s\n", myrank, error_string);
MPI_Error_string(ierr, error_string, &length_of_error_string);
fprintf(stderr, "%3d: %s\n", myrank, error_string);
MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
}
std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl;
ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
assert(ierr == 0);
std::cout << GridLogDebug << "MPI read I/O write all " << file << std::endl;
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
assert(ierr == 0);
MPI_File_close(&fh);
MPI_Type_free(&fileArray);
MPI_Type_free(&localArray);
@ -416,15 +444,51 @@ class BinaryIO {
assert(0);
#endif
} else {
std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : "
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
if ( control & BINARYIO_MASTER_APPEND ) {
fout.seekp(0,fout.end);
} else {
fout.seekp(offset+myrank*lsites*sizeof(fobj));
std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl;
std::ofstream fout;
fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
try {
fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
} catch (const std::fstream::failure& exc) {
std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
#ifdef USE_MPI_IO
MPI_Abort(MPI_COMM_WORLD,1);
#else
exit(1);
#endif
}
if ( control & BINARYIO_MASTER_APPEND ) {
try {
fout.seekp(0,fout.end);
} catch (const std::fstream::failure& exc) {
std::cout << "Exception in seeking file end " << file << std::endl;
}
} else {
try {
fout.seekp(offset+myrank*lsites*sizeof(fobj));
} catch (const std::fstream::failure& exc) {
std::cout << "Exception in seeking file " << file <<" offset "<< offset << std::endl;
}
}
try {
fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
}
catch (const std::fstream::failure& exc) {
std::cout << "Exception in writing file " << file << std::endl;
std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
#ifdef USE_MPI_IO
MPI_Abort(MPI_COMM_WORLD,1);
#else
exit(1);
#endif
}
fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0);
fout.close();
}
timer.Stop();
@ -442,12 +506,15 @@ class BinaryIO {
//////////////////////////////////////////////////////////////////////////////
// Safety check
//////////////////////////////////////////////////////////////////////////////
// if the data size is 1 we do not want to sum over the MPI ranks
if (iodata.size() != 1){
grid->Barrier();
grid->GlobalSum(nersc_csum);
grid->GlobalXOR(scidac_csuma);
grid->GlobalXOR(scidac_csumb);
grid->Barrier();
}
}
/////////////////////////////////////////////////////////////////////////////
// Read a Lattice of object
@ -456,7 +523,7 @@ class BinaryIO {
static inline void readLatticeObject(Lattice<vobj> &Umu,
std::string file,
munger munge,
int offset,
Integer offset,
const std::string &format,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
@ -493,7 +560,7 @@ class BinaryIO {
static inline void writeLatticeObject(Lattice<vobj> &Umu,
std::string file,
munger munge,
int offset,
Integer offset,
const std::string &format,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
@ -530,7 +597,7 @@ class BinaryIO {
static inline void readRNG(GridSerialRNG &serial,
GridParallelRNG &parallel,
std::string file,
int offset,
Integer offset,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
@ -546,9 +613,9 @@ class BinaryIO {
int gsites = grid->gSites();
int lsites = grid->lSites();
uint32_t nersc_csum_tmp;
uint32_t scidac_csuma_tmp;
uint32_t scidac_csumb_tmp;
uint32_t nersc_csum_tmp = 0;
uint32_t scidac_csuma_tmp = 0;
uint32_t scidac_csumb_tmp = 0;
GridStopWatch timer;
@ -592,7 +659,7 @@ class BinaryIO {
static inline void writeRNG(GridSerialRNG &serial,
GridParallelRNG &parallel,
std::string file,
int offset,
Integer offset,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)


@ -84,10 +84,6 @@ namespace QCD {
stream << "GRID_";
stream << ScidacWordMnemonic<stype>();
// std::cout << " Lorentz N/S/V/M : " << _LorentzN<<" "<<_LorentzScalar<<"/"<<_LorentzVector<<"/"<<_LorentzMatrix<<std::endl;
// std::cout << " Spin N/S/V/M : " << _SpinN <<" "<<_SpinScalar <<"/"<<_SpinVector <<"/"<<_SpinMatrix<<std::endl;
// std::cout << " Colour N/S/V/M : " << _ColourN <<" "<<_ColourScalar <<"/"<<_ColourVector <<"/"<<_ColourMatrix<<std::endl;
if ( _LorentzVector ) stream << "_LorentzVector"<<_LorentzN;
if ( _LorentzMatrix ) stream << "_LorentzMatrix"<<_LorentzN;
@ -151,7 +147,7 @@ namespace QCD {
_scidacRecord = sr;
std::cout << GridLogMessage << "Build SciDAC datatype " <<sr.datatype<<std::endl;
// std::cout << GridLogMessage << "Build SciDAC datatype " <<sr.datatype<<std::endl;
}
///////////////////////////////////////////////////////
@ -182,7 +178,7 @@ class GridLimeReader : public BinaryIO {
/////////////////////////////////////////////
// Open the file
/////////////////////////////////////////////
void open(std::string &_filename)
void open(const std::string &_filename)
{
filename= _filename;
File = fopen(filename.c_str(), "r");
@ -210,24 +206,38 @@ class GridLimeReader : public BinaryIO {
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
std::cout << GridLogMessage << limeReaderType(LimeR) <<std::endl;
uint64_t file_bytes =limeReaderBytes(LimeR);
if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
// std::cout << GridLogMessage << limeReaderType(LimeR) << " "<< file_bytes <<" bytes "<<std::endl;
// std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
off_t offset= ftell(File);
// std::cout << GridLogMessage<< " readLimeLatticeBinaryObject matches ! " <<std::endl;
uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
// std::cout << "R sizeof(sobj)= " <<sizeof(sobj)<<std::endl;
// std::cout << "R Gsites " <<field._grid->_gsites<<std::endl;
// std::cout << "R Payload expected " <<PayloadSize<<std::endl;
// std::cout << "R file size " <<file_bytes <<std::endl;
assert(PayloadSize == file_bytes);// Must match or user error
uint64_t offset= ftello(File);
// std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::readLatticeObject< sobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
/////////////////////////////////////////////
// Insist checksum is next record
/////////////////////////////////////////////
readLimeObject(scidacChecksum_,std::string("scidacChecksum"),record_name);
readLimeObject(scidacChecksum_,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
/////////////////////////////////////////////
// Verify checksums
/////////////////////////////////////////////
scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
return;
}
}
@ -242,11 +252,16 @@ class GridLimeReader : public BinaryIO {
// should this be a do while; can we miss a first record??
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
// std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
// std::cout << GridLogMessage<< " readLimeObject matches ! " << record_name <<std::endl;
std::vector<char> xmlc(nbytes+1,'\0');
limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
// std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl;
XmlReader RD(&xmlc[0],"");
read(RD,object_name,object);
return;
@ -261,13 +276,14 @@ class GridLimeWriter : public BinaryIO {
public:
///////////////////////////////////////////////////
// FIXME: format for RNG? Now just binary out instead
// FIXME: collective calls or not ?
// : must know if I am the I/O boss
///////////////////////////////////////////////////
FILE *File;
LimeWriter *LimeW;
std::string filename;
void open(std::string &_filename) {
void open(const std::string &_filename) {
filename= _filename;
File = fopen(filename.c_str(), "w");
LimeW = limeCreateWriter(File); assert(LimeW != NULL );
@ -302,14 +318,18 @@ class GridLimeWriter : public BinaryIO {
write(WR,object_name,object);
xmlstring = WR.XmlString();
}
// std::cout << "WriteLimeObject" << record_name <<std::endl;
uint64_t nbytes = xmlstring.size();
// std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
int err;
LimeRecordHeader *h = limeCreateHeader(MB, ME,(char *)record_name.c_str(), nbytes); assert(h!= NULL);
LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes);
assert(h!= NULL);
err=limeWriteRecordHeader(h, LimeW); assert(err>=0);
err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
err=limeWriterCloseRecord(LimeW); assert(err>=0);
limeDestroyHeader(h);
// std::cout << " File offset is now"<<ftello(File) << std::endl;
}
////////////////////////////////////////////
// Write a generic lattice field and csum
@ -326,6 +346,10 @@ class GridLimeWriter : public BinaryIO {
uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
createLimeRecordHeader(record_name, 0, 0, PayloadSize);
// std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl;
// std::cout << "W Gsites " <<field._grid->_gsites<<std::endl;
// std::cout << "W Payload expected " <<PayloadSize<<std::endl;
////////////////////////////////////////////////////////////////////
// NB: FILE and iostream are jointly writing disjoint sequences in the
// same file through different file handles (integer units).
@ -333,17 +357,20 @@ class GridLimeWriter : public BinaryIO {
// These are both buffered, so why I think this code is right is as follows.
//
// i) write record header to FILE *File, telegraphing the size.
// ii) ftell reads the offset from FILE *File .
// ii) ftello reads the offset from FILE *File .
// iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
// Closes iostream and flushes.
// iv) fseek on FILE * to end of this disjoint section.
// v) Continue writing scidac record.
////////////////////////////////////////////////////////////////////
off_t offset = ftell(File);
uint64_t offset = ftello(File);
// std::cout << " Writing to offset "<<offset << std::endl;
std::string format = getFormatString<vobj>();
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
// fseek(File,0,SEEK_END); offset = ftello(File);std::cout << " offset now "<<offset << std::endl;
err=limeWriterCloseRecord(LimeW); assert(err>=0);
////////////////////////////////////////
// Write checksum element, propagating forward from the BinaryIO
// Always pair a checksum with a binary object, and close message
@ -353,8 +380,8 @@ class GridLimeWriter : public BinaryIO {
std::stringstream streamb; streamb << std::hex << scidac_csumb;
checksum.suma= streama.str();
checksum.sumb= streamb.str();
std::cout << GridLogMessage<<" writing scidac checksums "<<std::hex<<scidac_csuma<<"/"<<scidac_csumb<<std::dec<<std::endl;
writeLimeObject(0,1,checksum,std::string("scidacChecksum" ),std::string(SCIDAC_CHECKSUM));
// std::cout << GridLogMessage<<" writing scidac checksums "<<std::hex<<scidac_csuma<<"/"<<scidac_csumb<<std::dec<<std::endl;
writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
}
};
@ -374,8 +401,6 @@ class ScidacWriter : public GridLimeWriter {
template <class vobj, class userRecord>
void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord)
{
typedef typename vobj::scalar_object sobj;
uint64_t nbytes;
GridBase * grid = field._grid;
////////////////////////////////////////
@ -397,6 +422,66 @@ class ScidacWriter : public GridLimeWriter {
}
};
class ScidacReader : public GridLimeReader {
public:
template<class SerialisableUserFile>
void readScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
{
scidacFile _scidacFile(grid);
readLimeObject(_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
readLimeObject(_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
}
////////////////////////////////////////////////
// Write generic lattice field in scidac format
////////////////////////////////////////////////
template <class vobj, class userRecord>
void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord)
{
typedef typename vobj::scalar_object sobj;
GridBase * grid = field._grid;
////////////////////////////////////////
// fill the Grid header
////////////////////////////////////////
FieldMetaData header;
scidacRecord _scidacRecord;
scidacFile _scidacFile;
//////////////////////////////////////////////
// Fill the Lime file record by record
//////////////////////////////////////////////
readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
}
void skipPastBinaryRecord(void) {
std::string rec_name(ILDG_BINARY_DATA);
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
return;
}
}
}
void skipPastObjectRecord(std::string rec_name) {
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
return;
}
}
}
void skipScidacFieldRecord() {
skipPastObjectRecord(std::string(GRID_FORMAT));
skipPastObjectRecord(std::string(SCIDAC_RECORD_XML));
skipPastObjectRecord(std::string(SCIDAC_PRIVATE_RECORD_XML));
skipPastBinaryRecord();
}
};
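A usage note (not part of this diff): a minimal round-trip sketch with the classes above, assuming the open()/close() members inherited from GridLimeWriter/GridLimeReader and the emptyUserRecord helper introduced further down in this changeset; the field type and file name are purely illustrative, and for a multi-record file one would also pair writeScidacFileRecord/readScidacFileRecord for the file-level header.
  // Sketch only: write one lattice field plus a (trivial) user record, then read it back.
  std::string file("field.scidac");
  emptyUserRecord record;
  LatticeComplex field(grid);              // any Lattice<vobj> should work here
  ScidacWriter WR;
  WR.open(file);                           // assumed GridLimeWriter::open()
  WR.writeScidacFieldRecord(field, record);
  WR.close();
  ScidacReader RD;
  RD.open(file);                           // assumed GridLimeReader::open()
  RD.readScidacFieldRecord(field, record);
  RD.close();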
class IldgWriter : public ScidacWriter {
public:
@ -425,8 +510,6 @@ class IldgWriter : public ScidacWriter {
typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj;
uint64_t nbytes;
////////////////////////////////////////
// fill the Grid header
////////////////////////////////////////
@ -557,7 +640,7 @@ class IldgReader : public GridLimeReader {
// Copy out the string
std::vector<char> xmlc(nbytes+1,'\0');
limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
std::cout << GridLogMessage<< "Non binary record :" <<limeReaderType(LimeR) <<std::endl; //<<"\n"<<(&xmlc[0])<<std::endl;
// std::cout << GridLogMessage<< "Non binary record :" <<limeReaderType(LimeR) <<std::endl; //<<"\n"<<(&xmlc[0])<<std::endl;
//////////////////////////////////
// ILDG format record
@ -601,7 +684,7 @@ class IldgReader : public GridLimeReader {
std::string xmls(&xmlc[0]);
// is it a USQCD info field
if ( xmls.find(std::string("usqcdInfo")) != std::string::npos ) {
std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl;
// std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl;
XmlReader RD(&xmlc[0],"");
read(RD,"usqcdInfo",usqcdInfo_);
found_usqcdInfo = 1;
@ -619,8 +702,7 @@ class IldgReader : public GridLimeReader {
// Binary data
/////////////////////////////////
std::cout << GridLogMessage << "ILDG Binary record found : " ILDG_BINARY_DATA << std::endl;
off_t offset= ftell(File);
uint64_t offset= ftello(File);
if ( format == std::string("IEEE64BIG") ) {
GaugeSimpleMunger<dobj, sobj> munge;
BinaryIO::readLatticeObject< vobj, dobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);

View File

@ -64,6 +64,11 @@ namespace Grid {
// file compatibility, so should be correct to assume the undocumented but de facto file structure.
/////////////////////////////////////////////////////////////////////////////////
struct emptyUserRecord : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(emptyUserRecord,int,dummy);
emptyUserRecord() { dummy=0; };
};
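For comparison, a user-defined record follows the same pattern; the struct and member names below are illustrative only and do not appear in this changeset.
  // Sketch only: a serialisable per-field record (e.g. for labelled eigenvector files).
  struct EvecRecord : Serializable {
    GRID_SERIALIZABLE_CLASS_MEMBERS(EvecRecord,
                                    int,    index,
                                    double, eval);
    EvecRecord() { index = 0; eval = 0.0; }
  };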
////////////////////////
// Scidac private file xml
// <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>

View File

@ -85,6 +85,9 @@ namespace Grid {
nd=4;
dimension.resize(4);
boundary.resize(4);
scidac_checksuma=0;
scidac_checksumb=0;
checksum=0;
}
};
@ -104,6 +107,7 @@ namespace Grid {
header.nd = nd;
header.dimension.resize(nd);
header.boundary.resize(nd);
header.data_start = 0;
for(int d=0;d<nd;d++) {
header.dimension[d] = grid->_fdimensions[d];
}

View File

@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
// 4
#ifdef AVX512
#ifdef KNL
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
{ PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },

View File

@ -0,0 +1,100 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/AbstractEOFAFermion.h
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_ABSTRACT_EOFA_FERMION_H
#define GRID_QCD_ABSTRACT_EOFA_FERMION_H
#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
namespace Grid {
namespace QCD {
// DJM: Abstract base class for EOFA fermion types.
// Defines layout of additional EOFA-specific parameters and operators.
// Used to construct EOFA pseudofermion actions that are agnostic to
// Shamir / Mobius / etc., and to ensure that no one can construct an EOFA
// pseudofermion action with a non-EOFA fermion type.
template<class Impl>
class AbstractEOFAFermion : public CayleyFermion5D<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
public:
// Fermion operator: D(mq1) + shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm}
RealD mq1;
RealD mq2;
RealD mq3;
RealD shift;
int pm;
RealD alpha; // Mobius scale
RealD k; // EOFA normalization constant
virtual void Instantiatable(void) = 0;
// EOFA-specific operations
// Force user to implement in derived classes
virtual void Omega (const FermionField& in, FermionField& out, int sign, int dag) = 0;
virtual void Dtilde (const FermionField& in, FermionField& out) = 0;
virtual void DtildeInv(const FermionField& in, FermionField& out) = 0;
// Implement derivatives in base class:
// for EOFA both DWF and Mobius just need d(Dw)/dU
virtual void MDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
this->DhopDeriv(mat, U, V, dag);
};
virtual void MoeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
this->DhopDerivOE(mat, U, V, dag);
};
virtual void MeoDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
this->DhopDerivEO(mat, U, V, dag);
};
// Recompute 5D coefficients for different value of shift constant
// (needed for heatbath loop over poles)
virtual void RefreshShiftCoefficients(RealD new_shift) = 0;
// Constructors
AbstractEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int _pm,
RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams())
: CayleyFermion5D<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid,
_mq1, _M5, p), mq1(_mq1), mq2(_mq2), mq3(_mq3), shift(_shift), pm(_pm)
{
int Ls = this->Ls;
this->alpha = _b + _c;
this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
};
};
}}
#endif
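For reference, the normalisation constant assembled in the constructor above is, writing \alpha = b + c,
\[
k \;=\; \frac{\alpha\,(m_{q3}-m_{q2})\,(\alpha+1)^{2L_s}}
             {\big[(\alpha+1)^{L_s}+m_{q2}(\alpha-1)^{L_s}\big]\,\big[(\alpha+1)^{L_s}+m_{q3}(\alpha-1)^{L_s}\big]}\,.
\]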

View File

@ -77,7 +77,6 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
}
}
template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
{
this->Report();
@ -119,7 +118,6 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
MooeeInvTime=0;
}
template<class Impl>
void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
{
@ -414,7 +412,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
for(int i=0; i < Ls; i++){
as[i] = 1.0;
omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
// assert(fabs(omega[i])>0.0);
assert(omega[i]!=Coeff_t(0.0));
bs[i] = 0.5*(bpc/omega[i] + bmc);
cs[i] = 0.5*(bpc/omega[i] - bmc);
}
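In the notation of this routine (elsewhere in SetCoefficientsInternal, bpc and bmc hold b+c and b-c), the loop above sets
\[
\omega_i = \gamma_i\, z_{\rm hi},\qquad
b_i = \tfrac{1}{2}\Big(\tfrac{b+c}{\omega_i} + (b-c)\Big),\qquad
c_i = \tfrac{1}{2}\Big(\tfrac{b+c}{\omega_i} - (b-c)\Big),
\]
which is why the new assert on \omega_i \neq 0 guards the division.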
@ -429,7 +427,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
for(int i=0;i<Ls;i++){
bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);
// assert(fabs(bee[i])>0.0);
assert(bee[i]!=Coeff_t(0.0));
cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
beo[i]=as[i]*bs[i];
ceo[i]=-as[i]*cs[i];
@ -456,10 +454,16 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
if ( i < Ls-1 ) {
assert(bee[i]!=Coeff_t(0.0));
assert(bee[0]!=Coeff_t(0.0));
lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
leem[i]=mass*cee[Ls-1]/bee[0];
for(int j=0;j<i;j++) leem[i]*= aee[j]/bee[j+1];
for(int j=0;j<i;j++) {
assert(bee[j+1]!=Coeff_t(0.0));
leem[i]*= aee[j]/bee[j+1];
}
uee[i] =-aee[i]/bee[i]; // up-diag entry on the ith row
@ -478,7 +482,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
{
Coeff_t delta_d=mass*cee[Ls-1];
for(int j=0;j<Ls-1;j++) {
// assert(fabs(bee[j])>0.0);
assert(bee[j] != Coeff_t(0.0));
delta_d *= cee[j]/bee[j];
}
dee[Ls-1] += delta_d;

View File

@ -179,9 +179,9 @@ namespace Grid {
double MooeeInvTime;
protected:
void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
};
}

View File

@ -0,0 +1,438 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid_Eigen_Dense.h>
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
namespace Grid {
namespace QCD {
template<class Impl>
DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mq1, RealD _mq2, RealD _mq3,
RealD _shift, int _pm, RealD _M5, const ImplParams &p) :
AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
_shift, _pm, _M5, 1.0, 0.0, p)
{
RealD eps = 1.0;
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);
assert(zdata->n == this->Ls);
std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
this->SetCoefficientsTanh(zdata, 1.0, 0.0);
Approx::zolotarev_free(zdata);
}
/***************************************************************
* Additional EOFA operators only called outside the inverter.
* Since speed is not essential, simple axpby-style
* implementations should be fine.
***************************************************************/
template<class Impl>
void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
{
int Ls = this->Ls;
Din = zero;
if((sign == 1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, Ls-1, 0); }
else if((sign == -1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
else if((sign == 1 ) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, Ls-1); }
else if((sign == -1) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
}
// This is just the identity for DWF
template<class Impl>
void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi){ chi = psi; }
// This is just the identity for DWF
template<class Impl>
void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi){ chi = psi; }
/*****************************************************************************************************/
template<class Impl>
RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
FermionField Din(psi._grid);
this->Meooe5D(psi, Din);
this->DW(Din, chi, DaggerNo);
axpby(chi, 1.0, 1.0, chi, psi);
this->M5D(psi, chi);
return(norm2(chi));
}
template<class Impl>
RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
FermionField Din(psi._grid);
this->DW(psi, Din, DaggerYes);
this->MeooeDag5D(Din, chi);
this->M5Ddag(psi, chi);
axpby(chi, 1.0, 1.0, chi, psi);
return(norm2(chi));
}
/********************************************************************
* Performance critical fermion operators called inside the inverter
********************************************************************/
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
int pm = this->pm;
RealD shift = this->shift;
RealD mq1 = this->mq1;
RealD mq2 = this->mq2;
RealD mq3 = this->mq3;
// coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
Coeff_t shiftp(0.0), shiftm(0.0);
if(shift != 0.0){
if(pm == 1){ shiftp = shift*(mq3-mq2); }
else{ shiftm = -shift*(mq3-mq2); }
}
std::vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;
#if(0)
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
for(int i=0; i<diag.size(); ++i){
std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
}
for(int i=0; i<upper.size(); ++i){
std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
}
for(int i=0; i<lower.size(); ++i){
std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
}
#endif
this->M5D(psi, chi, chi, lower, diag, upper);
}
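Reading off the M5D kernels added later in this changeset (the cache and ssp variants), the coefficients assembled here enter the fifth-dimension stencil
\[
\chi_s \;=\; {\rm diag}_s\,\phi_s \;+\; {\rm upper}_s\,P_-\,\psi_{(s+1)\bmod L_s}
\;+\; {\rm lower}_s\,P_+\,\psi_{(s-1)\bmod L_s},
\]
so the only departures from the standard domain-wall M5 are the boundary couplings upper[Ls-1] = mq1 + shiftm and lower[0] = mq1 + shiftp.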
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
int pm = this->pm;
RealD shift = this->shift;
RealD mq1 = this->mq1;
RealD mq2 = this->mq2;
RealD mq3 = this->mq3;
// coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
Coeff_t shiftp(0.0), shiftm(0.0);
if(shift != 0.0){
if(pm == 1){ shiftp = shift*(mq3-mq2); }
else{ shiftm = -shift*(mq3-mq2); }
}
std::vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;
#if(0)
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
for(int i=0; i<diag.size(); ++i){
std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
}
for(int i=0; i<upper.size(); ++i){
std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
}
for(int i=0; i<lower.size(); ++i){
std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
}
#endif
this->M5Ddag(psi, chi, chi, lower, diag, upper);
}
// half checkerboard operations
template<class Impl>
void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
std::vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){
upper[s] = -this->cee[s];
lower[s] = -this->cee[s];
}
upper[Ls-1] = this->dm;
lower[0] = this->dp;
this->M5D(psi, psi, chi, lower, diag, upper);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
std::vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){
upper[s] = -this->cee[s];
lower[s] = -this->cee[s];
}
upper[Ls-1] = this->dp;
lower[0] = this->dm;
this->M5Ddag(psi, psi, chi, lower, diag, upper);
}
/****************************************************************************************/
//Zolo
template<class Impl>
void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
{
int Ls = this->Ls;
int pm = this->pm;
RealD mq1 = this->mq1;
RealD mq2 = this->mq2;
RealD mq3 = this->mq3;
RealD shift = this->shift;
////////////////////////////////////////////////////////
// Constants for the preconditioned matrix Cayley form
////////////////////////////////////////////////////////
this->bs.resize(Ls);
this->cs.resize(Ls);
this->aee.resize(Ls);
this->aeo.resize(Ls);
this->bee.resize(Ls);
this->beo.resize(Ls);
this->cee.resize(Ls);
this->ceo.resize(Ls);
for(int i=0; i<Ls; ++i){
this->bee[i] = 4.0 - this->M5 + 1.0;
this->cee[i] = 1.0;
}
for(int i=0; i<Ls; ++i){
this->aee[i] = this->cee[i];
this->bs[i] = this->beo[i] = 1.0;
this->cs[i] = this->ceo[i] = 0.0;
}
//////////////////////////////////////////
// EOFA shift terms
//////////////////////////////////////////
if(pm == 1){
this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
this->dm = mq1*this->cee[Ls-1];
} else if(this->pm == -1) {
this->dp = mq1*this->cee[0];
this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
} else {
this->dp = mq1*this->cee[0];
this->dm = mq1*this->cee[Ls-1];
}
//////////////////////////////////////////
// LDU decomposition of eeoo
//////////////////////////////////////////
this->dee.resize(Ls+1);
this->lee.resize(Ls);
this->leem.resize(Ls);
this->uee.resize(Ls);
this->ueem.resize(Ls);
for(int i=0; i<Ls; ++i){
if(i < Ls-1){
this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
this->leem[i] = this->dm/this->bee[i];
for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
this->dee[i] = this->bee[i];
this->uee[i] = -this->aee[i]/this->bee[i]; // up-diag entry on the ith row
this->ueem[i] = this->dp / this->bee[0];
for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
} else {
this->lee[i] = 0.0;
this->leem[i] = 0.0;
this->uee[i] = 0.0;
this->ueem[i] = 0.0;
}
}
{
Coeff_t delta_d = 1.0 / this->bee[0];
for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
}
int inv = 1;
this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
}
// Recompute Cayley-form coefficients for different shift
template<class Impl>
void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
{
this->shift = new_shift;
Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
this->SetCoefficientsTanh(zdata, 1.0, 0.0);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
{
int Ls = this->Ls;
GridBase* grid = this->FermionRedBlackGrid();
int LLs = grid->_rdimensions[0];
if(LLs == Ls){ return; } // Not vectorised in 5th direction
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
for(int s=0; s<Ls; s++){
Pplus(s,s) = this->bee[s];
Pminus(s,s) = this->bee[s];
}
for(int s=0; s<Ls-1; s++){
Pminus(s,s+1) = -this->cee[s];
}
for(int s=0; s<Ls-1; s++){
Pplus(s+1,s) = -this->cee[s+1];
}
Pplus (0,Ls-1) = this->dp;
Pminus(Ls-1,0) = this->dm;
Eigen::MatrixXcd PplusMat ;
Eigen::MatrixXcd PminusMat;
#if(0)
std::cout << GridLogMessage << "Pplus:" << std::endl;
for(int s=0; s<Ls; ++s){
for(int ss=0; ss<Ls; ++ss){
std::cout << Pplus(s,ss) << "\t";
}
std::cout << std::endl;
}
std::cout << GridLogMessage << "Pminus:" << std::endl;
for(int s=0; s<Ls; ++s){
for(int ss=0; ss<Ls; ++ss){
std::cout << Pminus(s,ss) << "\t";
}
std::cout << std::endl;
}
#endif
if(inv) {
PplusMat = Pplus.inverse();
PminusMat = Pminus.inverse();
} else {
PplusMat = Pplus;
PminusMat = Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
typedef typename SiteHalfSpinor::scalar_type scalar_type;
const int Nsimd = Simd::Nsimd();
Matp.resize(Ls*LLs);
Matm.resize(Ls*LLs);
for(int s2=0; s2<Ls; s2++){
for(int s1=0; s1<LLs; s1++){
int istride = LLs;
int ostride = 1;
Simd Vp;
Simd Vm;
scalar_type *sp = (scalar_type*) &Vp;
scalar_type *sm = (scalar_type*) &Vm;
for(int l=0; l<Nsimd; l++){
if(switcheroo<Coeff_t>::iscomplex()) {
sp[l] = PplusMat (l*istride+s1*ostride,s2);
sm[l] = PminusMat(l*istride+s1*ostride,s2);
} else {
// if real
scalar_type tmp;
tmp = PplusMat (l*istride+s1*ostride,s2);
sp[l] = scalar_type(tmp.real(),tmp.real());
tmp = PminusMat(l*istride+s1*ostride,s2);
sm[l] = scalar_type(tmp.real(),tmp.real());
}
}
Matp[LLs*s2+s1] = Vp;
Matm[LLs*s2+s1] = Vm;
}}
}
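As a concrete illustration (shown here for L_s = 4), the two dense matrices built above, with b_s = bee[s] and c_s = cee[s], are
\[
P_+ = \begin{pmatrix} b_0 & 0 & 0 & d_p\\ -c_1 & b_1 & 0 & 0\\ 0 & -c_2 & b_2 & 0\\ 0 & 0 & -c_3 & b_3 \end{pmatrix},
\qquad
P_- = \begin{pmatrix} b_0 & -c_0 & 0 & 0\\ 0 & b_1 & -c_1 & 0\\ 0 & 0 & b_2 & -c_2\\ d_m & 0 & 0 & b_3 \end{pmatrix};
\]
the routine then inverts and/or daggers them as requested and repacks the entries one SIMD lane at a time.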
FermOpTemplateInstantiate(DomainWallEOFAFermion);
GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
}}

View File

@ -0,0 +1,115 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.h
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
#define GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
#include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>
namespace Grid {
namespace QCD {
template<class Impl>
class DomainWallEOFAFermion : public AbstractEOFAFermion<Impl>
{
public:
INHERIT_IMPL_TYPES(Impl);
public:
// Modified (0,Ls-1) and (Ls-1,0) elements of Mooee
// for red-black preconditioned Shamir EOFA
Coeff_t dm;
Coeff_t dp;
virtual void Instantiatable(void) {};
// EOFA-specific operations
virtual void Omega (const FermionField& in, FermionField& out, int sign, int dag);
virtual void Dtilde (const FermionField& in, FermionField& out);
virtual void DtildeInv (const FermionField& in, FermionField& out);
// override multiply
virtual RealD M (const FermionField& in, FermionField& out);
virtual RealD Mdag (const FermionField& in, FermionField& out);
// half checkerboard operations
virtual void Mooee (const FermionField& in, FermionField& out);
virtual void MooeeDag (const FermionField& in, FermionField& out);
virtual void MooeeInv (const FermionField& in, FermionField& out);
virtual void MooeeInvDag(const FermionField& in, FermionField& out);
virtual void M5D (const FermionField& psi, FermionField& chi);
virtual void M5Ddag (const FermionField& psi, FermionField& chi);
/////////////////////////////////////////////////////
// Instantiate different versions depending on Impl
/////////////////////////////////////////////////////
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
virtual void RefreshShiftCoefficients(RealD new_shift);
// Constructors
DomainWallEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
RealD _M5, const ImplParams& p=ImplParams());
protected:
void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
};
}}
#define INSTANTIATE_DPERP_DWF_EOFA(A)\
template void DomainWallEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
template void DomainWallEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
template void DomainWallEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
template void DomainWallEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi);
#undef DOMAIN_WALL_EOFA_DPERP_DENSE
#define DOMAIN_WALL_EOFA_DPERP_CACHE
#undef DOMAIN_WALL_EOFA_DPERP_LINALG
#define DOMAIN_WALL_EOFA_DPERP_VEC
#endif
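A minimal construction sketch matching the constructor declared above; the grid and gauge-field names (UGrid, FGrid, ...) are placeholders assumed to exist already, and the parameter values are purely illustrative.
  // Sketch only: instantiate the Shamir (b=1, c=0) EOFA operator on existing grids.
  LatticeGaugeField Umu(UGrid);
  RealD mq1 = 0.01, mq2 = 0.5, mq3 = 1.0, shift = 0.0, M5 = 1.8;
  int   pm  = +1;                          // sign of the EOFA shift term
  DomainWallEOFAFermion<WilsonImplR> Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid,
                                         mq1, mq2, mq3, shift, pm, M5);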

View File

@ -0,0 +1,248 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
namespace Grid {
namespace QCD {
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
// Pminus forwards
// Pplus backwards..
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
int Ls = this->Ls;
GridBase* grid = psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
for(int s=0; s<Ls; s++){
auto tmp = psi._odata[0];
if(s==0) {
spProj5m(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+Ls-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else if(s==(Ls-1)) {
spProj5m(tmp, psi._odata[ss+0]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else {
spProj5m(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
}
}
}
this->M5Dtime += usecond();
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
int Ls = this->Ls;
GridBase* grid = psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
auto tmp = psi._odata[0];
for(int s=0; s<Ls; s++){
if(s==0) {
spProj5p(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+Ls-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else if(s==(Ls-1)) {
spProj5p(tmp, psi._odata[ss+0]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else {
spProj5p(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
}
}
}
this->M5Dtime += usecond();
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
GridBase* grid = psi._grid;
int Ls = this->Ls;
chi.checkerboard = psi.checkerboard;
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
auto tmp1 = psi._odata[0];
auto tmp2 = psi._odata[0];
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
// Apply (L^{\prime})^{-1}
chi[ss] = psi[ss]; // chi[0]=psi[0]
for(int s=1; s<Ls; s++){
spProj5p(tmp1, chi[ss+s-1]);
chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
}
// L_m^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
spProj5m(tmp1, chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
}
// U_m^{-1} D^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
spProj5p(tmp1, chi[ss+Ls-1]);
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
}
spProj5m(tmp2, chi[ss+Ls-1]);
chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
// Apply U^{-1}
for(int s=Ls-2; s>=0; s--){
spProj5m(tmp1, chi[ss+s+1]);
chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
}
}
this->MooeeInvTime += usecond();
}
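Reading off the loop structure above, the routine applies, step by step,
\[
\chi \;=\; U^{-1}\,\big(U_m^{-1} D^{-1}\big)\,L_m^{-1}\,(L')^{-1}\,\psi ,
\]
i.e. the inverse of the bordered LDU factorisation of M_{ee} whose entries (lee, leem, dee, uee, ueem) were prepared in SetCoefficientsInternal.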
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
GridBase* grid = psi._grid;
int Ls = this->Ls;
assert(psi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
std::vector<Coeff_t> ueec(Ls);
std::vector<Coeff_t> deec(Ls+1);
std::vector<Coeff_t> leec(Ls);
std::vector<Coeff_t> ueemc(Ls);
std::vector<Coeff_t> leemc(Ls);
for(int s=0; s<ueec.size(); s++){
ueec[s] = conjugate(this->uee[s]);
deec[s] = conjugate(this->dee[s]);
leec[s] = conjugate(this->lee[s]);
ueemc[s] = conjugate(this->ueem[s]);
leemc[s] = conjugate(this->leem[s]);
}
deec[Ls] = conjugate(this->dee[Ls]);
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
auto tmp1 = psi._odata[0];
auto tmp2 = psi._odata[0];
// Apply (U^{\prime})^{-dagger}
chi[ss] = psi[ss];
for(int s=1; s<Ls; s++){
spProj5m(tmp1, chi[ss+s-1]);
chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
}
// U_m^{-\dagger}
for(int s=0; s<Ls-1; s++){
spProj5p(tmp1, chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
}
// L_m^{-\dagger} D^{-dagger}
for(int s=0; s<Ls-1; s++){
spProj5m(tmp1, chi[ss+Ls-1]);
chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
}
spProj5p(tmp2, chi[ss+Ls-1]);
chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
// Apply L^{-dagger}
for(int s=Ls-2; s>=0; s--){
spProj5p(tmp1, chi[ss+s+1]);
chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
}
}
this->MooeeInvTime += usecond();
}
#ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
#endif
}}

View File

@ -0,0 +1,159 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid_Eigen_Dense.h>
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
namespace Grid {
namespace QCD {
/*
* Dense matrix versions of routines
*/
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
{
int Ls = this->Ls;
int LLs = psi._grid->_rdimensions[0];
int vol = psi._grid->oSites()/LLs;
chi.checkerboard = psi.checkerboard;
assert(Ls==LLs);
Eigen::MatrixXd Pplus = Eigen::MatrixXd::Zero(Ls,Ls);
Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
for(int s=0;s<Ls;s++){
Pplus(s,s) = this->bee[s];
Pminus(s,s) = this->bee[s];
}
for(int s=0; s<Ls-1; s++){
Pminus(s,s+1) = -this->cee[s];
}
for(int s=0; s<Ls-1; s++){
Pplus(s+1,s) = -this->cee[s+1];
}
Pplus (0,Ls-1) = this->dp;
Pminus(Ls-1,0) = this->dm;
Eigen::MatrixXd PplusMat ;
Eigen::MatrixXd PminusMat;
if(inv) {
PplusMat = Pplus.inverse();
PminusMat = Pminus.inverse();
} else {
PplusMat = Pplus;
PminusMat = Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
// For the non-vectorised s-direction this is simple
for(auto site=0; site<vol; site++){
SiteSpinor SiteChi;
SiteHalfSpinor SitePplus;
SiteHalfSpinor SitePminus;
for(int s1=0; s1<Ls; s1++){
SiteChi = zero;
for(int s2=0; s2<Ls; s2++){
int lex2 = s2 + Ls*site;
if(PplusMat(s1,s2) != 0.0){
spProj5p(SitePplus,psi[lex2]);
accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
}
if(PminusMat(s1,s2) != 0.0){
spProj5m(SitePminus, psi[lex2]);
accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
}
}
chi[s1+Ls*site] = SiteChi*0.5;
}
}
}
#ifdef DOMAIN_WALL_EOFA_DPERP_DENSE
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
template void DomainWallEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
template void DomainWallEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
#endif
}}

View File

@ -0,0 +1,168 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
namespace Grid {
namespace QCD {
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
// Pminus forwards
// Pplus backwards
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(s==0) {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
} else if (s==(Ls-1)) {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
} else {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
}
}
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(s==0) {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
} else if (s==(Ls-1)) {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
} else {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
}
}
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
Coeff_t one(1.0);
Coeff_t czero(0.0);
chi.checkerboard = psi.checkerboard;
int Ls = this->Ls;
FermionField tmp(psi._grid);
// Apply (L^{\prime})^{-1}
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
for(int s=1; s<Ls; s++){
axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
}
// L_m^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
}
// U_m^{-1} D^{-1}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls], chi, s, Ls-1);
}
axpby_ssp_pminus(tmp, czero, chi, one/this->dee[Ls-1], chi, Ls-1, Ls-1);
axpby_ssp_pplus(chi, one, tmp, one/this->dee[Ls], chi, Ls-1, Ls-1);
// Apply U^{-1}
for(int s=Ls-2; s>=0; s--){
axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1); // chi[Ls]
}
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
Coeff_t one(1.0);
Coeff_t czero(0.0);
chi.checkerboard = psi.checkerboard;
int Ls = this->Ls;
FermionField tmp(psi._grid);
// Apply (U^{\prime})^{-dagger}
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
for(int s=1; s<Ls; s++){
axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
}
// U_m^{-\dagger}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
}
// L_m^{-\dagger} D^{-dagger}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
}
axpby_ssp_pminus(tmp, czero, chi, one/conjugate(this->dee[Ls-1]), chi, Ls-1, Ls-1);
axpby_ssp_pplus(chi, one, tmp, one/conjugate(this->dee[Ls]), chi, Ls-1, Ls-1);
// Apply L^{-dagger}
for(int s=Ls-2; s>=0; s--){
axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1); // chi[Ls]
}
}
#ifdef DOMAIN_WALL_EOFA_DPERP_LINALG
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
#endif
}}

View File

@ -0,0 +1,605 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
namespace Grid {
namespace QCD {
/*
* Dense matrix versions of routines
*/
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
GridBase* grid = psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
const int nsimd = Simd::Nsimd();
Vector<iSinglet<Simd> > u(LLs);
Vector<iSinglet<Simd> > l(LLs);
Vector<iSinglet<Simd> > d(LLs);
assert(Ls/LLs == nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type* u_p = (scalar_type*) &u[0];
scalar_type* l_p = (scalar_type*) &l[0];
scalar_type* d_p = (scalar_type*) &d[0];
for(int o=0;o<LLs;o++){ // outer
for(int i=0;i<nsimd;i++){ //inner
int s = o + i*LLs;
int ss = o*nsimd + i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
}}
this->M5Dcalls++;
this->M5Dtime -= usecond();
assert(Nc == 3);
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0; v<LLs; v++){
int vp = (v+1)%LLs;
int vm = (v+LLs-1)%LLs;
spProj5m(hp, psi[ss+vp]);
spProj5p(hm, psi[ss+vm]);
if (vp <= v){ rotate(hp, hp, 1); }
if (vm >= v){ rotate(hm, hm, nsimd-1); }
hp = 0.5*hp;
hm = 0.5*hm;
spRecon5m(fp, hp);
spRecon5p(fm, hm);
chi[ss+v] = d[v]*phi[ss+v];
chi[ss+v] = chi[ss+v] + u[v]*fp;
chi[ss+v] = chi[ss+v] + l[v]*fm;
}
#else
for(int v=0; v<LLs; v++){
vprefetch(psi[ss+v+LLs]);
int vp = (v==LLs-1) ? 0 : v+1;
int vm = (v==0) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(2)(0);
Simd hp_01 = psi[ss+vp]()(2)(1);
Simd hp_02 = psi[ss+vp]()(2)(2);
Simd hp_10 = psi[ss+vp]()(3)(0);
Simd hp_11 = psi[ss+vp]()(3)(1);
Simd hp_12 = psi[ss+vp]()(3)(2);
Simd hm_00 = psi[ss+vm]()(0)(0);
Simd hm_01 = psi[ss+vm]()(0)(1);
Simd hm_02 = psi[ss+vm]()(0)(2);
Simd hm_10 = psi[ss+vm]()(1)(0);
Simd hm_11 = psi[ss+vm]()(1)(1);
Simd hm_12 = psi[ss+vm]()(1)(2);
if(vp <= v){
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if(vm >= v){
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
// Can force these to real arithmetic and save 2x.
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
vstream(chi[ss+v]()(0)(0), p_00);
vstream(chi[ss+v]()(0)(1), p_01);
vstream(chi[ss+v]()(0)(2), p_02);
vstream(chi[ss+v]()(1)(0), p_10);
vstream(chi[ss+v]()(1)(1), p_11);
vstream(chi[ss+v]()(1)(2), p_12);
vstream(chi[ss+v]()(2)(0), p_20);
vstream(chi[ss+v]()(2)(1), p_21);
vstream(chi[ss+v]()(2)(2), p_22);
vstream(chi[ss+v]()(3)(0), p_30);
vstream(chi[ss+v]()(3)(1), p_31);
vstream(chi[ss+v]()(3)(2), p_32);
}
#endif
}
this->M5Dtime += usecond();
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
GridBase* grid = psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
int nsimd = Simd::Nsimd();
Vector<iSinglet<Simd> > u(LLs);
Vector<iSinglet<Simd> > l(LLs);
Vector<iSinglet<Simd> > d(LLs);
assert(Ls/LLs == nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type* u_p = (scalar_type*) &u[0];
scalar_type* l_p = (scalar_type*) &l[0];
scalar_type* d_p = (scalar_type*) &d[0];
for(int o=0; o<LLs; o++){ // outer
for(int i=0; i<nsimd; i++){ //inner
int s = o + i*LLs;
int ss = o*nsimd + i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
}}
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0; v<LLs; v++){
int vp = (v+1)%LLs;
int vm = (v+LLs-1)%LLs;
spProj5p(hp, psi[ss+vp]);
spProj5m(hm, psi[ss+vm]);
if(vp <= v){ rotate(hp, hp, 1); }
if(vm >= v){ rotate(hm, hm, nsimd-1); }
hp = hp*0.5;
hm = hm*0.5;
spRecon5p(fp, hp);
spRecon5m(fm, hm);
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
chi[ss+v] = chi[ss+v] +l[v]*fm;
}
#else
for(int v=0; v<LLs; v++){
vprefetch(psi[ss+v+LLs]);
int vp = (v == LLs-1) ? 0 : v+1;
int vm = (v == 0 ) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(0)(0);
Simd hp_01 = psi[ss+vp]()(0)(1);
Simd hp_02 = psi[ss+vp]()(0)(2);
Simd hp_10 = psi[ss+vp]()(1)(0);
Simd hp_11 = psi[ss+vp]()(1)(1);
Simd hp_12 = psi[ss+vp]()(1)(2);
Simd hm_00 = psi[ss+vm]()(2)(0);
Simd hm_01 = psi[ss+vm]()(2)(1);
Simd hm_02 = psi[ss+vm]()(2)(2);
Simd hm_10 = psi[ss+vm]()(3)(0);
Simd hm_11 = psi[ss+vm]()(3)(1);
Simd hm_12 = psi[ss+vm]()(3)(2);
if (vp <= v){
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if(vm >= v){
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
vstream(chi[ss+v]()(0)(0), p_00);
vstream(chi[ss+v]()(0)(1), p_01);
vstream(chi[ss+v]()(0)(2), p_02);
vstream(chi[ss+v]()(1)(0), p_10);
vstream(chi[ss+v]()(1)(1), p_11);
vstream(chi[ss+v]()(1)(2), p_12);
vstream(chi[ss+v]()(2)(0), p_20);
vstream(chi[ss+v]()(2)(1), p_21);
vstream(chi[ss+v]()(2)(2), p_22);
vstream(chi[ss+v]()(3)(0), p_30);
vstream(chi[ss+v]()(3)(1), p_31);
vstream(chi[ss+v]()(3)(2), p_32);
}
#endif
}
this->M5Dtime += usecond();
}
#ifdef AVX512
#include<simd/Intel512common.h>
#include<simd/Intel512avx.h>
#include<simd/Intel512single.h>
#endif
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
{
#ifndef AVX512
{
SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM;
SiteHalfSpinor SiteChiP;
SiteHalfSpinor SiteChiM;
// Ls*Ls * 2 * 12 * vol flops
for(int s1=0; s1<LLs; s1++){
for(int s2=0; s2<LLs; s2++){
for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
int s = s2 + l*LLs;
int lex = s2 + LLs*site;
if( s2==0 && l==0 ){
SiteChiP=zero;
SiteChiM=zero;
}
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
}}
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
}}
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
}}
}}
{
int lex = s1 + LLs*site;
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
}}
}
}
}
#else
{
// pointers
// MASK_REGS;
#define Chi_00 %%zmm1
#define Chi_01 %%zmm2
#define Chi_02 %%zmm3
#define Chi_10 %%zmm4
#define Chi_11 %%zmm5
#define Chi_12 %%zmm6
#define Chi_20 %%zmm7
#define Chi_21 %%zmm8
#define Chi_22 %%zmm9
#define Chi_30 %%zmm10
#define Chi_31 %%zmm11
#define Chi_32 %%zmm12
#define BCAST0 %%zmm13
#define BCAST1 %%zmm14
#define BCAST2 %%zmm15
#define BCAST3 %%zmm16
#define BCAST4 %%zmm17
#define BCAST5 %%zmm18
#define BCAST6 %%zmm19
#define BCAST7 %%zmm20
#define BCAST8 %%zmm21
#define BCAST9 %%zmm22
#define BCAST10 %%zmm23
#define BCAST11 %%zmm24
int incr = LLs*LLs*sizeof(iSinglet<Simd>);
for(int s1=0; s1<LLs; s1++){
for(int s2=0; s2<LLs; s2++){
int lex = s2 + LLs*site;
uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
uint64_t a2 = (uint64_t) &psi[lex];
for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
if((s2+l)==0) {
asm(
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
VBCASTCDUP(0,%2,BCAST0)
VBCASTCDUP(1,%2,BCAST1)
VBCASTCDUP(2,%2,BCAST2)
VBCASTCDUP(3,%2,BCAST3)
VBCASTCDUP(4,%2,BCAST4) VMULMEM(0,%0,BCAST0,Chi_00)
VBCASTCDUP(5,%2,BCAST5) VMULMEM(0,%0,BCAST1,Chi_01)
VBCASTCDUP(6,%2,BCAST6) VMULMEM(0,%0,BCAST2,Chi_02)
VBCASTCDUP(7,%2,BCAST7) VMULMEM(0,%0,BCAST3,Chi_10)
VBCASTCDUP(8,%2,BCAST8) VMULMEM(0,%0,BCAST4,Chi_11)
VBCASTCDUP(9,%2,BCAST9) VMULMEM(0,%0,BCAST5,Chi_12)
VBCASTCDUP(10,%2,BCAST10) VMULMEM(0,%1,BCAST6,Chi_20)
VBCASTCDUP(11,%2,BCAST11) VMULMEM(0,%1,BCAST7,Chi_21)
VMULMEM(0,%1,BCAST8,Chi_22)
VMULMEM(0,%1,BCAST9,Chi_30)
VMULMEM(0,%1,BCAST10,Chi_31)
VMULMEM(0,%1,BCAST11,Chi_32)
: : "r" (a0), "r" (a1), "r" (a2) );
} else {
asm(
VBCASTCDUP(0,%2,BCAST0) VMADDMEM(0,%0,BCAST0,Chi_00)
VBCASTCDUP(1,%2,BCAST1) VMADDMEM(0,%0,BCAST1,Chi_01)
VBCASTCDUP(2,%2,BCAST2) VMADDMEM(0,%0,BCAST2,Chi_02)
VBCASTCDUP(3,%2,BCAST3) VMADDMEM(0,%0,BCAST3,Chi_10)
VBCASTCDUP(4,%2,BCAST4) VMADDMEM(0,%0,BCAST4,Chi_11)
VBCASTCDUP(5,%2,BCAST5) VMADDMEM(0,%0,BCAST5,Chi_12)
VBCASTCDUP(6,%2,BCAST6) VMADDMEM(0,%1,BCAST6,Chi_20)
VBCASTCDUP(7,%2,BCAST7) VMADDMEM(0,%1,BCAST7,Chi_21)
VBCASTCDUP(8,%2,BCAST8) VMADDMEM(0,%1,BCAST8,Chi_22)
VBCASTCDUP(9,%2,BCAST9) VMADDMEM(0,%1,BCAST9,Chi_30)
VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
: : "r" (a0), "r" (a1), "r" (a2) );
}
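// Advance to the next SIMD lane: the coefficient tables hold one LLs x LLs block per
// lane (cf. Matp[LLs*s+s1] with s = s2 + l*LLs in the generic path), while psi's
// scalars for successive lanes sit one scalar_type apart.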
a0 = a0 + incr;
a1 = a1 + incr;
a2 = a2 + sizeof(Simd::scalar_type);
}
}
{
int lexa = s1+LLs*site;
asm (
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
}
}
}
#undef Chi_00
#undef Chi_01
#undef Chi_02
#undef Chi_10
#undef Chi_11
#undef Chi_12
#undef Chi_20
#undef Chi_21
#undef Chi_22
#undef Chi_30
#undef Chi_31
#undef Chi_32
#undef BCAST0
#undef BCAST1
#undef BCAST2
#undef BCAST3
#undef BCAST4
#undef BCAST5
#undef BCAST6
#undef BCAST7
#undef BCAST8
#undef BCAST9
#undef BCAST10
#undef BCAST11
#endif
};
// Z-mobius version
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
{
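// Complex (zMobius) coefficients are not supported by the EOFA assembly kernels;
// fail loudly rather than fall through to the real-coefficient path.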
std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
exit(-1);
};
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
{
int Ls = this->Ls;
int LLs = psi._grid->_rdimensions[0];
int vol = psi._grid->oSites()/LLs;
chi.checkerboard = psi.checkerboard;
Vector<iSinglet<Simd> > Matp;
Vector<iSinglet<Simd> > Matm;
Vector<iSinglet<Simd> > *_Matp;
Vector<iSinglet<Simd> > *_Matm;
// MooeeInternalCompute(dag,inv,Matp,Matm);
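// The inverse application reuses the precomputed member tables; the forward
// (inv==0) application builds its coefficient tables on the fly below.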
if(inv && dag){
_Matp = &this->MatpInvDag;
_Matm = &this->MatmInvDag;
}
if(inv && (!dag)){
_Matp = &this->MatpInv;
_Matm = &this->MatmInv;
}
if(!inv){
MooeeInternalCompute(dag, inv, Matp, Matm);
_Matp = &Matp;
_Matm = &Matm;
}
assert(_Matp->size() == Ls*LLs);
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
if(switcheroo<Coeff_t>::iscomplex()){
parallel_for(auto site=0; site<vol; site++){
MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
}
} else {
parallel_for(auto site=0; site<vol; site++){
MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
}
}
this->MooeeInvTime += usecond();
}
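The dispatcher above is what the even-odd preconditioning entry points are expected to funnel into. A hedged sketch of the calling pattern (DaggerNo/InverseYes are Grid's existing flags; the wrapper body is reconstructed here for illustration rather than copied from the non-vectorised source):

template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
  // dag=0, inv=1: selects the precomputed MatpInv/MatmInv tables above.
  this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
}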
#ifdef DOMAIN_WALL_EOFA_DPERP_VEC
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
#endif
}}


@@ -38,6 +38,8 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
// - ContinuedFractionFermion5D.cc
// - WilsonFermion.cc
// - WilsonKernels.cc
// - DomainWallEOFAFermion.cc
// - MobiusEOFAFermion.cc
//
// The explicit instantiation is only avoidable if we move this source to headers and end up with include/parse/recompile
// for EVERY .cc file. This define centralises the list and restores global push of impl cases
@@ -55,8 +57,9 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
#include <Grid/qcd/action/fermion/CayleyFermion5D.h> // Cayley types
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
#include <Grid/qcd/action/fermion/MobiusFermion.h>
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
#include <Grid/qcd/action/fermion/ZMobiusFermion.h>
#include <Grid/qcd/action/fermion/SchurDiagTwoKappa.h>
#include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
@@ -113,6 +116,14 @@ typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;
typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
typedef MobiusFermion<WilsonImplD> MobiusFermionD;
@@ -121,6 +132,14 @@ typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;
typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
@@ -138,6 +157,14 @@ typedef DomainWallFermion<DomainWallVec5dImplRL> DomainWallFermionVec5dRL;
typedef DomainWallFermion<DomainWallVec5dImplFH> DomainWallFermionVec5dFH;
typedef DomainWallFermion<DomainWallVec5dImplDF> DomainWallFermionVec5dDF;
typedef DomainWallEOFAFermion<DomainWallVec5dImplR> DomainWallEOFAFermionVec5dR;
typedef DomainWallEOFAFermion<DomainWallVec5dImplF> DomainWallEOFAFermionVec5dF;
typedef DomainWallEOFAFermion<DomainWallVec5dImplD> DomainWallEOFAFermionVec5dD;
typedef DomainWallEOFAFermion<DomainWallVec5dImplRL> DomainWallEOFAFermionVec5dRL;
typedef DomainWallEOFAFermion<DomainWallVec5dImplFH> DomainWallEOFAFermionVec5dFH;
typedef DomainWallEOFAFermion<DomainWallVec5dImplDF> DomainWallEOFAFermionVec5dDF;
typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
@@ -146,6 +173,14 @@ typedef MobiusFermion<DomainWallVec5dImplRL> MobiusFermionVec5dRL;
typedef MobiusFermion<DomainWallVec5dImplFH> MobiusFermionVec5dFH;
typedef MobiusFermion<DomainWallVec5dImplDF> MobiusFermionVec5dDF;
typedef MobiusEOFAFermion<DomainWallVec5dImplR> MobiusEOFAFermionVec5dR;
typedef MobiusEOFAFermion<DomainWallVec5dImplF> MobiusEOFAFermionVec5dF;
typedef MobiusEOFAFermion<DomainWallVec5dImplD> MobiusEOFAFermionVec5dD;
typedef MobiusEOFAFermion<DomainWallVec5dImplRL> MobiusEOFAFermionVec5dRL;
typedef MobiusEOFAFermion<DomainWallVec5dImplFH> MobiusEOFAFermionVec5dFH;
typedef MobiusEOFAFermion<DomainWallVec5dImplDF> MobiusEOFAFermionVec5dDF;
typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
@@ -206,6 +241,14 @@ typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;
typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
@@ -222,6 +265,14 @@ typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;
typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
@@ -237,4 +288,11 @@ typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermion
}}
////////////////////
// Scalar QED actions
// TODO: this needs to move to another header after rename to Fermion.h
////////////////////
#include <Grid/qcd/action/scalar/Scalar.h>
#include <Grid/qcd/action/gauge/Photon.h>
#endif


@@ -538,6 +538,12 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent
}
template <class ref>
inline void loadLinkElement(Simd &reg, ref &memory) {
reg = memory;
}
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{
conformable(Uds._grid,GaugeGrid);

Some files were not shown because too many files have changed in this diff Show More