1
0
mirror of https://github.com/paboyle/Grid.git synced 2026-05-13 13:44:30 +01:00

Compare commits

..

386 Commits

Author SHA1 Message Date
Chulwoo Jung cfa0576ffd Getting rid of one more non-auto View, comms overlap in Laplace operator 2024-02-25 22:37:48 -05:00
Chulwoo Jung fe98e9f555 Fixing Laplace flopcount Minor cleanup 2024-02-13 12:06:08 -05:00
Chulwoo Jung 948d16fb06 Laplace benchmark added 2024-02-12 21:23:36 -05:00
Chulwoo Jung 58fbcaa399 Checking in before cleaning up 2024-02-12 21:10:21 -05:00
Chulwoo Jung 9ad6836b0f Mixed precision for Laplace. Main program with Metric 2024-02-08 17:13:10 -05:00
Chulwoo Jung 026eb8a695 Wilson RMHMC main program 2023-12-12 15:34:03 -05:00
Chulwoo Jung 076580c232 Recovering mixed precision CG for Laplace
Checking in to move to aurora
2023-12-12 15:32:00 -05:00
Chulwoo Jung 7af6022a2a Added midMD checkpointing (for lattice only for now) 2023-12-04 20:05:41 -05:00
Chulwoo Jung 982a60536c Checking in before forking 2023-11-22 16:33:15 -05:00
Chulwoo Jung dc36d272ce Gauge RMHMC conserving dH 2023-11-21 13:48:51 -05:00
Chulwoo Jung 515ff6bf62 Added Laplacian metric, Gauge OpenBC 2023-11-09 21:42:46 -05:00
Peter Boyle 6d0c2de399 Deprecate teh PVC directory and make a PVC-OEM generic PVC target with
no queueing system dependency -- just interactive scripts
2023-10-03 17:04:20 +00:00
Peter Boyle 7786ea9921 Bug fix in script 2023-10-03 09:58:44 -07:00
Peter Boyle d93eac7b1c Performance regressed and is OK in icpx 2023.2 2023-10-03 15:53:14 +00:00
Peter Boyle afc316f501 Rename headers 2023-10-02 16:25:11 -04:00
Peter Boyle f14bfd5c1b Relocate sub includes 2023-10-02 16:23:38 -04:00
Peter Boyle c5f1420dea Merge remote-tracking branch 'LupoA/develop' into LupoA-develop 2023-10-02 16:22:35 -04:00
Peter Boyle 018e6da872 Merge pull request #440 from giltirn/feature/paddedcellgauge
Feature/paddedcellgauge
2023-10-02 10:00:42 -04:00
Peter Boyle b77bccfac2 Merge pull request #444 from mmphys/feature/docX
Update doc complete list of Macports needed to build Grid on a fresh Mac
2023-10-02 09:57:11 -04:00
Peter Boyle 80359e0d49 Bland SYCL compile 2023-09-26 13:20:27 -07:00
Peter Boyle 3d437c5cc4 Making SYCL happy 2023-09-26 13:19:42 -07:00
Peter Boyle b8a7004365 Partial fraction test 2023-08-14 15:17:03 -04:00
Michael Marshall bd56c95a6f Update documentation with complete list of Macports needed to build Grid on a fresh Mac 2023-07-14 13:50:06 +01:00
Peter Boyle 994512048e Merge pull request #439 from felixerben/bugfix/IRL_convergence
Bugfix/irl convergence
2023-07-12 16:32:26 -04:00
chillenzer dbd8bb49dc Merge pull request #32 from LupoA/sp2n/develop
Sp2n/develop
2023-07-04 15:23:43 +00:00
Julian Lenz 3a29af0ce4 Fixed linker error 2023-07-04 16:08:44 +01:00
Julian Lenz f7b79cdd45 Added test for ProjectSpn 2023-07-03 18:00:32 +01:00
Alessandro Lupo 075b9d22d0 adjoint rep implemented as 2indx symmetric 2023-07-02 13:58:31 +01:00
Alessandro Lupo b92428f05f better test 2023-07-02 13:34:03 +01:00
Alessandro Lupo 34b11864b6 prettiest tests 2023-07-02 13:25:57 +01:00
Christopher Kelly 1dfaa08afb The stencils for the staple and rect-staple padded cell implementations are now created and stored by workspace classes that allow for reuse providing the grids remain consistent
The workspaces are now used by the plaq+rectangle gauge action resulting in a further 2x performance improvement as measured on a 16^4 local volume for 2 nodes (16 ranks) of Crusher
2023-06-28 15:11:24 -04:00
Christopher Kelly f44dce390f Implemented acclerator-optimized versions of localCopyRegion and insertSliceLocal to speed up padding
Fixed const correctness on PaddedCell methods
Fixed compile issues on Crusher
Added timing breakdowns for PaddedCell::Expand and the padded implementations of the staples, visible under --log Performance
Optimized kernel for StaplePadded
Test_iwasaki_action_newstaple now repeats the calculation 10 times and reports average timings
2023-06-27 14:58:10 -04:00
Christopher Kelly bb71e9a96a Added PaddedCell and GeneralisedLocalStencil header includes to standard base headers
Moved versions of the padded-cell implementations of staple and rect-staple from test code to WilsonLoops header
Added StapleAndRectStapleAll which is now called by the plaq+rectangle action class. Under the hood it uses the padded cell implementations with maximal reuse of the padded gauge links
2023-06-27 11:23:30 -04:00
ferben 78bae9417c returning Nstop vectors even if not all meet true convergence criterion 2023-06-27 14:38:19 +01:00
ferben dd170ead01 whitespace 2023-06-27 11:37:01 +01:00
ferben 014704856f do one more iteration if not all vectors converged 2023-06-27 11:33:30 +01:00
Christopher Kelly 6f6844ccf1 Added new StapleAll and RectStapleAll functions that return the staples for all mu as an array
Modified plaq+rectangle gauge actions to use the above
Added a test code to confirm the above changes
2023-06-26 15:48:47 -04:00
Christopher Kelly 4c6613d72c Modified RectStapleDouble and RectStapleOptimised to use Gauge-BC respecting CshiftLink
Added test code tests/debug/Test_optimized_staple_gaugebc demonstrating equivalence of above to RectStapleUnoptimised for cconj gauge BCs
Removed optimized staple only being used for periodic gauge BCs; it is now always used
2023-06-26 10:20:23 -04:00
Peter Boyle ee92e08edb Merge pull request #435 from fjosw/fix/warnings_in_WilsonKernelsImplementation
Unused variable in WilsonKernelsImplementation
2023-06-23 11:47:19 -04:00
Peter Boyle c1dcee9328 Merge pull request #437 from fjosw/fix/stencil_debug
Added GridLogDebug to BuildSurfaceList debug message
2023-06-23 11:47:00 -04:00
Alessandro Lupo 559257bbe9 better documentation and filelist names 2023-06-23 16:16:48 +01:00
Peter Boyle 6b150961fe Better script 2023-06-23 18:09:25 +03:00
Alessandro Lupo cff1f8d3b8 rm unused variables and formatting 2023-06-23 16:04:18 +01:00
Alessandro Lupo f27d2083cd adjustments in SUn and Sp2n impl 2023-06-23 15:34:08 +01:00
Christopher Kelly 36cc9c524f Threaded the constructor of GeneralLocalStencil 2023-06-23 09:57:38 -04:00
Alessandro Lupo 2822487450 rm unncessary line 2023-06-23 14:55:23 +01:00
Alessandro Lupo e07fafe46a minor adjustments to twoindex 2023-06-23 12:18:04 +01:00
Alessandro Lupo 063d290bd8 missing function 2023-06-23 11:11:20 +01:00
Alessandro Lupo 4e6194d92a Avoid code duplication in ProjectSUn 2023-06-23 11:03:50 +01:00
Alessandro Lupo de30c4e22a minor improvements 2023-06-23 10:49:41 +01:00
Peter Boyle 5bafcaedfa Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-06-22 19:59:45 +03:00
Peter Boyle bfeceae708 FTHMC 2023-06-22 12:58:18 -04:00
Peter Boyle eacb66591f Config command 2023-06-22 19:56:40 +03:00
Peter Boyle fadaa85626 Update 2023-06-22 19:56:27 +03:00
Peter Boyle 02a5b0d786 Updating run during testing 2023-06-22 19:52:46 +03:00
Peter Boyle 0e2141442a Dennis says broken 2023-06-22 19:19:51 +03:00
Peter Boyle 769eb0eecb Precision coverage 2023-06-22 19:19:20 +03:00
Christopher Kelly 4241c7d4a3 Imported coalescedReadGeneralPermute GPU implementation from Christoph
Fixed bug in padded staple code where extract was being called on the result before the GPU view was closed
Fixed compile issue with pointer cast in padded staple code
Added timing summaries of padded staple code and timing breakdown of staple implementation to Test_padded_cell_staple
2023-06-21 16:01:01 -04:00
Christopher Kelly 7b11075102 The user can now specify the implementation of Cshift used by the PaddedCell class through a virtual base class API. Implementations for default (regular Cshift) and for gauge links (which respects the gauge BCs)
Fixed const-correctness for PaddedCell and ConjugateGimpl::setDirections
Modified test code for padded-cell implementation of staple, rect-staple to use cconj BCs
2023-06-20 17:09:56 -04:00
Christopher Kelly abc658dca5 Added coalescedReadGeneralPermute CPU implementation based on Christoph's GPT code
In a test code, implemented a padded-cell version of the staple and rectangular-staple calculation
2023-06-20 16:14:25 -04:00
Alessandro Lupo 2372275b2c Merge pull request #36 from LupoA/sp2n/gpu-bugfix
Sp2n/gpu bugfix [close #30]
2023-06-20 13:46:00 +01:00
chillenzer ef736e8aa4 Merge pull request #35 from LupoA/sp2n/enableSp
consistent enable sp config flag
2023-06-20 10:41:09 +00:00
Julian Lenz 5e539e2d54 Forgot some follow-ups on changed signature 2023-06-18 12:37:51 +01:00
Julian Lenz 96773f5254 Apparently forgot to remove one Lattice version 2023-06-18 12:21:39 +01:00
Alessandro Lupo d80df09f3b consistent enable sp config flag 2023-06-16 19:16:46 +01:00
Julian Lenz 621e612c30 Fix non-zero ret on device bug 2023-06-16 16:27:49 +01:00
Julian Lenz 8c3792721b ClangFormat 2023-06-16 15:58:23 +01:00
Julian Lenz c95bbd3948 Remove accelerated lattice version 2023-06-16 15:50:26 +01:00
Julian Lenz e28ab7a732 Re-included instantiations for symmetric 2Index AS Sp 2023-06-16 14:20:37 +01:00
Alessandro Lupo c797cbe737 deal with post-merge trauma 2023-06-16 14:20:37 +01:00
Alessandro Lupo e09dfbf1c2 definetely the right merge upstream/develop 2023-06-16 14:19:46 +01:00
fjosw 85e35c4da1 fix: added GridLogDebug to BuildSurfaceList debug message. 2023-06-16 10:31:16 +01:00
Peter Boyle d72e914cf0 Profiling temporary code until optimised 2023-06-15 10:43:04 -04:00
Peter Boyle 3b5254e2d5 Optional checkpoint smeared configs for FTHMC 2023-06-15 10:43:04 -04:00
Peter Boyle f1c358b596 Additional tests 2023-06-15 10:43:04 -04:00
Peter Boyle c0ef210265 Hot start should be properly Hot 2023-06-15 10:43:04 -04:00
Peter Boyle e3e1cc1962 Ta project 2023-06-15 10:43:04 -04:00
Peter Boyle 723eadbb5c Keep methods virtual 2023-06-15 10:43:04 -04:00
Peter Boyle e24637ec1e Clean up 2023-06-15 10:43:04 -04:00
Peter Boyle 8b01ff4ce7 Integrator over to smeared force structure 2023-06-15 10:43:04 -04:00
Peter Boyle 588197c487 Smeared action virtual class 2023-06-15 10:43:04 -04:00
Julian Lenz 116d90b0ee First attempt on #30 2023-06-15 15:09:37 +01:00
Julian Lenz b0646ca187 Remove some unused variables 2023-06-15 15:09:09 +01:00
Peter Boyle 1352bad2e4 Sunspot compile 2023-06-15 11:22:46 +00:00
chillenzer 4895ff260e Merge pull request #28 from LupoA/sp2n/config
compile sp2n fermion impl only if declared at config time
2023-06-09 13:07:48 +00:00
Alessandro Lupo 470d93006a compile sp2n fermion impl only if declared at config time 2023-06-07 12:53:33 +01:00
chillenzer 2f3d03f188 Merge pull request #27 from LupoA/sp2n/documentation
documentation for gaugegroup and sp2n
2023-06-01 16:42:27 +00:00
Alessandro Lupo 8db7c23bee improve documentation 2023-06-01 17:39:10 +01:00
chillenzer 69dc5172dc Merge pull request #26 from LupoA/sp2n/irreps
Sp2n/irreps
2023-06-01 16:28:15 +00:00
Julian Lenz fd72eb6546 Merge branch 'sp2n/algorithm' into sp2n/irreps 2023-06-01 17:24:01 +01:00
Peter Boyle ffd7301649 Updated masked / fthmc smeared config container 2023-06-01 06:23:02 -04:00
Peter Boyle d2a8494044 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-06-01 06:22:33 -04:00
Peter Boyle 0982e0d19b Jacobian action wrapper for FTHMC 2023-06-01 06:15:08 -04:00
Peter Boyle 3badbfc3c1 Refactor the Action and Smeared gauge configuration containers. Add first pass at FTHMC action 2023-06-01 06:14:28 -04:00
Peter Boyle 5465961e30 New test for FTHMC portion 2023-06-01 06:14:04 -04:00
fjosw 477b794bc5 fix: unused variable removed. 2023-05-29 14:08:53 +01:00
Peter Boyle 4835fd1a87 HIP stream synch 2023-05-27 17:58:22 +03:00
Peter Boyle 6533c25814 Lumi 2023-05-27 16:13:32 +03:00
Alessandro Lupo b405767569 make private methods private 2023-05-26 17:02:16 +01:00
Alessandro Lupo fe88a0c12f cleaner twoindex class, cleaner tests 2023-05-26 16:55:30 +01:00
Alessandro Lupo e61a9ed2b4 partial revert 2023-05-26 13:54:26 +01:00
Alessandro Lupo de8daa3824 group is SUn by default 2023-05-26 13:44:41 +01:00
Alessandro Lupo 3a50fb29cb directly call sp helper 2023-05-26 13:28:47 +01:00
Alessandro Lupo 6647d2656f rm unnecessary specialisation 2023-05-26 12:27:22 +01:00
Alessandro Lupo a6f4dbeb6d remove redundant template parameter 2023-05-26 12:13:40 +01:00
Alessandro Lupo 92a282f2d8 Merge pull request #24 from LupoA/sp2n/fix_static_assert_symmetric
Move static_assert inside of function
2023-05-26 11:13:50 +01:00
Alessandro Lupo ca2fd9fc7b documentation for gaugegroup and sp2n 2023-05-25 18:40:54 +01:00
Alessandro Lupo be1a4f5860 implement TwoIndexSymm for sp2n 2023-05-22 17:21:03 +01:00
Peter Boyle 1b2914ec09 FT-HMC smearing, derivative chain rule, log det and force first pass. 2023-05-22 10:21:37 -04:00
Peter Boyle 519f795066 Header not liked by gcc on mac? puzzling 2023-05-22 10:21:12 -04:00
Alessandro Lupo 5897b93dd4 debug tests, fix dimension 2023-05-22 13:42:21 +01:00
Alessandro Lupo af091e0881 DimensionHelper for 2index irreps 2023-05-21 16:56:06 +01:00
Alessandro Lupo 3c1e5e9517 Merge pull request #25 from LupoA/sp2n/unify_representations
Sp2n/unify representations [close #3]
2023-05-21 14:55:27 +01:00
Alessandro Lupo 85b2cb7a8a changing some hardcoded SUn lines 2023-05-21 14:50:28 +01:00
Peter Boyle 4240ad5ca8 Preparing for FTHMC 2023-05-19 21:21:55 -04:00
Peter Boyle d418347d86 public for convenience to see rho params 2023-05-19 21:21:05 -04:00
Peter Boyle 29a4bfe5e5 Clean up 2023-05-19 21:20:45 -04:00
Peter Boyle 9955bf9daf Regresses to Qlat 2023-05-19 17:32:13 -04:00
Julian Lenz b8bdc2eefb Unified two index representations 2023-05-18 18:36:29 +01:00
Julian Lenz 0078826ff1 Move static_assert inside of function 2023-05-18 18:14:53 +01:00
Julian Lenz e855c41772 Unified spfundamental.h with fundamental.h 2023-05-18 18:11:20 +01:00
chillenzer d169c275b6 Merge pull request #22 from LupoA/sp2n/unify_twoindex
Unify TwoIndex
2023-05-18 14:55:02 +00:00
Julian Lenz a5125e23f4 Typo 2023-05-18 15:41:35 +01:00
Julian Lenz 7b83c80757 Merge branch 'sp2n/unify_twoindex' of github.com:LupoA/Grid into sp2n/unify_twoindex 2023-05-18 15:36:14 +01:00
Julian Lenz e41821e206 Disable two index symmetric 2023-05-18 15:29:55 +01:00
Alessandro Lupo 5a75ab15a2 typo in 2S dim 2023-05-17 20:47:57 +01:00
Alessandro Lupo 932c783fbf 2AS for every Nc! 2023-05-17 20:22:05 +01:00
Julian Lenz 55f9cce577 Revert "Added automated HMC test for Nc=4"
This reverts commit eee27b8b30.
2023-05-17 09:17:48 +01:00
Alessandro Lupo b3533ca847 correct tests (failing) 2023-05-16 17:43:52 +01:00
Alessandro Lupo fd2a637010 test 2index 2023-05-16 14:10:39 +01:00
Julian Lenz eee27b8b30 Added automated HMC test for Nc=4 2023-05-15 18:37:33 +01:00
Julian Lenz 8522352aa3 ClangFormat 2023-05-15 18:36:05 +01:00
Alessandro Lupo 3beb8f4091 fixing typo, getting pre-changes physics 2023-05-15 16:00:15 +01:00
Alessandro Lupo 12a706e9b1 de-hardcode the number of generators 2023-05-15 15:48:21 +01:00
Alessandro Lupo 170aa7df01 fix (dimension to be improved) 2023-05-15 15:20:18 +01:00
Julian Lenz e8ad1fef53 Unify TwoIndex 2023-05-12 14:35:50 +01:00
Peter Boyle 876c8f4478 Nodes on padded cell 2023-05-11 12:35:49 -04:00
Peter Boyle 9c8750f261 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-05-11 12:29:09 -04:00
Peter Boyle 91efd08179 Option for Qlat generator basis 2023-05-11 12:27:45 -04:00
Peter Boyle 9953511b65 Mac compile 2023-05-11 12:27:29 -04:00
Peter Boyle 025fa9991a For FTHMC 2023-05-11 12:26:14 -04:00
Peter Boyle e8c60c355b Padded cell code 2023-05-11 12:25:50 -04:00
Peter Boyle 6c9c7f9d85 Permute fix 2023-05-11 12:24:21 -04:00
Peter Boyle f534523ede Debug 2023-05-11 12:23:11 -04:00
Peter Boyle 1b8a834beb Debug 2023-05-11 12:22:24 -04:00
Alessandro Lupo aa9df63a05 rename group projections based on determinants 2023-05-10 14:50:52 +01:00
chillenzer 3953312a93 Merge pull request #20 from LupoA/sp2n/unify_gaugeimpltypes
Sp2n/unify gaugeimpltypes
2023-05-03 15:17:10 +00:00
Julian Lenz 6e62f4f616 ClangFormat 2023-05-03 16:15:12 +01:00
Julian Lenz 6a7bdca53b Take over additional algebra tests from Alessandro 2023-05-03 16:02:02 +01:00
Julian Lenz c7fba9aace Take over additional group tests from Alessandro 2023-05-03 16:01:48 +01:00
Julian Lenz ac6c7cb8d6 Merge in Alessandro's changes [test fails] 2023-05-03 02:53:03 +01:00
Julian Lenz c5924833a1 ClangFormat 2023-05-03 02:39:36 +01:00
Julian Lenz ac0a74be0d Taken care of algebra tests 2023-05-03 02:32:42 +01:00
Julian Lenz 42b0e1125d Naming and argument types 2023-05-03 01:51:46 +01:00
Julian Lenz 339c4fda79 Extracted is_element_of Sp2n 2023-05-02 15:44:34 +01:00
Alessandro Lupo 9b85bf9402 better projection test 2023-05-02 15:42:20 +01:00
Alessandro Lupo 86b02c3cd8 cleaning up requested by Julian 2023-05-02 13:31:17 +01:00
Alessandro Lupo 7b3b7093fa cleaning up requested by Ed 2023-05-02 12:50:57 +01:00
Alessandro Lupo 881b08a465 Correct implementation of SpTa 2023-04-27 18:17:06 +01:00
Julian Lenz 3ee5444c69 Remove commented out stuff 2023-04-21 08:08:18 +01:00
Julian Lenz 5e28fe56d2 Remove code duplication: Iterating through vectors 2023-04-21 08:08:06 +01:00
Peter Boyle 3aa43e6065 Debug info 2023-04-20 14:21:13 -04:00
Peter Boyle 78ac4044ff HMC 2023-04-20 13:28:07 -04:00
Peter Boyle 119c3db47f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-04-18 15:13:16 -04:00
Peter Boyle 21bbdb8fc2 Crusher 2023-04-18 15:11:16 -04:00
Alessandro Lupo 5aabe074fe Rename Sympl* to Sp* 2023-04-18 11:50:20 +01:00
Peter Boyle 739bd7572c Example code 2023-04-17 21:51:55 +00:00
Peter Boyle 074627a5bd Pass file descriptors through AF_UNIX for level_zero 2023-04-17 21:50:52 +00:00
Peter Boyle 6a23b2c599 Drop UVM 2023-04-17 21:49:58 +00:00
Alessandro Lupo dace904c10 fix typo 2023-04-14 18:06:18 +01:00
Alessandro Lupo be98d26610 small change I missed in previous commit 2023-04-13 17:48:43 +01:00
Peter Boyle bd891fb3f5 tests to compile 2023-04-12 18:32:44 -04:00
Peter Boyle 3984265851 Merge pull request #432 from paboyle/hotfix/nvcc-warnings
Unused statements generating warnings removed
2023-04-12 16:59:02 -04:00
Peter Boyle 45361d188f Merge pull request #427 from fjosw/feat/bug_report_issue_template
Feat/bug report issue template
2023-04-12 16:58:41 -04:00
Peter Boyle 80c9d77e02 Merge pull request #433 from paboyle/hotfix/virtual-dtor
Virtual destructor for LinearOperator
2023-04-12 16:56:18 -04:00
Peter Boyle 3aff64dddb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-04-11 12:19:15 -07:00
Peter Boyle b4f2ca81ff Copy queue and compute queue same as better concurrency 2023-04-11 12:18:21 -07:00
Peter Boyle d1dea5f840 New driver 2023-04-11 12:16:52 -07:00
Peter Boyle 54f8b84d16 Fence 2023-04-11 12:16:08 -07:00
Peter Boyle da503fef0e Name change on barrier routine 2023-04-11 12:14:04 -07:00
Peter Boyle 4a6802098a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-04-07 15:43:28 -04:00
Peter Boyle f9b41a84d2 Trajectory runs to completion on Crusher within wall clock time 2023-04-07 15:42:45 -04:00
portelli 5d7e0d18b9 virtual destructor for LinearOperator 2023-04-07 14:30:38 +01:00
portelli 9e64387933 mores unused statements removed 2023-04-07 14:27:18 +01:00
portelli 983b681d46 unused statement cleaning 2023-04-07 14:12:02 +01:00
portelli 4072408b6f Update README.md 2023-04-07 11:45:28 +01:00
portelli bd76b47fbf Update CI badge in README 2023-04-07 11:44:48 +01:00
Alessandro Lupo 178376f24b minor stylistic changes 2023-04-06 12:08:17 +01:00
portelli 18ce23aa75 Fix NEON SIMD 2023-04-06 11:30:48 +01:00
chillenzer 6a0eb466ee Merge pull request #19 from LupoA/refactoring_sp2n
refactoring sp2n
2023-04-05 10:50:58 +00:00
Peter Boyle ffa7fe0cc2 Merge branch 'feature/dirichlet' into develop 2023-04-04 23:13:52 -04:00
Peter Boyle 6b979f0a69 Dirichlet improvements that I failed to commit 2023-04-04 23:13:17 -04:00
Alessandro Lupo 4ea29b8f0f Template group into GaugeImplTypes. Closing #2 2023-04-04 17:49:28 +01:00
Alessandro Lupo 778291230a expand ProjecOnGaugeGroup, change ProjectOnSp2nAlgebra into SpTa, fixing some of its issues 2023-04-04 17:48:13 +01:00
Peter Boyle 86dac5ff4f Better printing 2023-04-04 07:42:19 -07:00
Peter Boyle 4a382fad3f Use distinct SYCL queue for copies 2023-04-04 07:41:41 -07:00
Peter Boyle cc753670d9 Barrier elimination, surface list build 2023-04-04 07:39:14 -07:00
Peter Boyle cc9d88ea1c Fence changes and EXT kernel loop cout reduction 2023-04-04 07:37:23 -07:00
Peter Boyle b281b0166e Put the barrier in the subroutine 2023-04-04 07:36:03 -07:00
Peter Boyle 6a21f694ff Apply barrier in Gather kernel sequence.
Could place before comms, or in Gather, but decided to insist Gather means Gather is done
2023-04-04 07:33:24 -07:00
Peter Boyle fc4db5e963 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-04-03 18:26:11 -04:00
Peter Boyle 6252ffaf76 No unified 2023-04-03 18:25:22 -04:00
Alessandro Lupo 026e736dfa Projection on algebra can now be templated. Fix #12 2023-04-03 16:31:19 +01:00
Alessandro Lupo 4275b3f431 Fix typo and remove unnecessary lines 2023-04-03 12:01:52 +01:00
Peter Boyle af64c1c6b6 Had managed to drop the accelerator_barrier() in the Wilson Compressor gather 2023-03-30 17:34:44 -04:00
Peter Boyle 866f48391a Temporary fix for develop incorrect results 2023-03-30 17:10:13 -04:00
Peter Boyle a4df527d74 Merge pull request #428 from mmphys/bugfix/comm_none
Fixes for --enable-comms=none
2023-03-30 08:38:14 -04:00
Michael Marshall 5764d21161 Fixes for --enable-comms=none 2023-03-30 10:15:28 +01:00
Peter Boyle 496d04cd85 Weaken the Fence 2023-03-29 18:58:51 -04:00
Peter Boyle 10e6d7c6ce Merge branch 'feature/dirichlet' into develop 2023-03-29 16:26:47 -04:00
Peter Boyle c42e25e5b8 Dirichlet remove 2023-03-29 16:25:52 -04:00
Peter Boyle a00ae981e0 Fence propagation from SYCL 2023-03-29 15:00:40 -04:00
Peter Boyle 58e020b62a Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-29 14:37:40 -04:00
Peter Boyle a7e1aceeca Compile fix on Nvidia 2023-03-29 14:36:50 -04:00
Peter Boyle 7212432f43 More careful fencing 2023-03-28 20:10:22 -07:00
Peter Boyle 4a261fab30 Changes premerge to develop 2023-03-28 20:04:21 -07:00
Peter Boyle 6af97069b9 Preparing for close of feature/dirichlet
Initial code change review complete
2023-03-28 13:39:44 -07:00
Peter Boyle 5068413cdb Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-28 08:35:38 -07:00
Peter Boyle 71c6960eea Commet 2023-03-28 08:34:24 -07:00
Peter Boyle ddf6d5c9e3 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-28 11:33:05 -04:00
fjosw 39214702f6 feat: indentation fixed. 2023-03-28 16:30:34 +02:00
fjosw 3e4614c63a feat: draft for bug-report issue template added. 2023-03-28 16:24:35 +02:00
Peter Boyle 900e01f49b Temporary 2023-03-27 21:35:06 -07:00
Peter Boyle 2376156fbc Merge branch 'develop' into feature/dirichlet 2023-03-27 21:33:50 -07:00
Peter Boyle 3f2fd49db4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-03-27 17:29:54 -07:00
Peter Boyle 0efa107cb6 Script update 2023-03-27 17:29:43 -07:00
Peter Boyle 8feedb4f6f Include files moved 2023-03-27 17:29:21 -07:00
Peter Boyle 05e562e3d7 Move the copy synch out to stencil and do one per call instead of one per packet 2023-03-27 17:28:38 -07:00
Peter Boyle dd3bbb8fa2 MOve the synchronise out to the stencil so one call instead of one call per packet 2023-03-27 17:27:45 -07:00
Peter Boyle 2fbcf13c46 SYCL fix 2023-03-27 14:25:14 -07:00
Peter Boyle 4ea48ef0c4 Merge pull request #419 from lehner/feature/gpt
Separate rankSum from sum
2023-03-24 15:42:16 -04:00
Peter Boyle 5c85774ee3 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-24 15:40:57 -04:00
Peter Boyle d8a9a745d8 stream synchronise 2023-03-24 15:40:30 -04:00
Peter Boyle dcf172da3b Merge pull request #415 from paboyle/feature/block_lanczos22
Feature/block lanczos22
2023-03-24 12:08:16 -04:00
Peter Boyle d57ed25071 Merge branch 'feature/dirichlet' into feature/block_lanczos22 2023-03-24 12:08:09 -04:00
Peter Boyle 546be724e7 Merge pull request #421 from UniOfLeicester/feature/accel_Copy_plane
Populate the Cshift_table in the GPU
2023-03-24 12:04:06 -04:00
Peter Boyle 8a1b9073f9 Mshift update 2023-03-23 15:39:30 -04:00
Peter Boyle 1a7114d4b9 Temporary algorithm while sorting out mixed prec 2023-03-23 15:38:35 -04:00
Peter Boyle 3f385f717c Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet
Conflicts:
	systems/PVC/benchmarks/run-2tile-mpi.sh
	systems/PVC/config-command
2023-03-23 14:52:53 -04:00
Peter Boyle 481bbaf1fc Interface to query memory use 2023-03-23 12:55:31 -04:00
Peter Boyle 281488611a WriteDiscard on construct 2023-03-23 10:28:50 -04:00
Peter Boyle c180a52518 Merge branch 'feature/dirichlet' of https://www.github.com/paboyle/Grid into feature/dirichlet 2023-03-23 10:28:01 -04:00
Peter Boyle 90130e25e9 TODO list 2023-03-23 10:27:02 -04:00
Peter Boyle 23298acb81 Merge pull request #424 from giltirn/feature/dirichlet-precchange
Precision change implementation
2023-03-22 23:04:52 -04:00
Peter Boyle 52384e34cf Discard on construct 2023-03-22 19:40:32 -04:00
Peter Boyle d0bb033ea2 Device resident GPU block buffer instead of UVM as hit likely UVM
bug. Code worked on CUDA 11.4 but fails on later drivers (certainly 530.30.02, but need to
find the perlmutter driver version).
2023-03-22 19:07:32 -04:00
Peter Boyle c6621806ca Compiling on laptop and running 2023-03-21 17:27:09 -04:00
Peter Boyle 0b6f0f6d2f Merge branch 'feature/dirichlet' of https://www.github.com/paboyle/Grid into feature/dirichlet 2023-03-21 16:06:55 -04:00
Peter Boyle b5b759df73 Merge branch 'develop' into feature/dirichlet 2023-03-21 16:05:46 -04:00
Peter Boyle 7db8dd7a95 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-21 16:04:27 -04:00
Peter Boyle 8b43be39c0 Config command 2023-03-21 16:00:52 -04:00
Peter Boyle f17f879206 Test update 2023-03-21 15:59:29 -04:00
Peter Boyle 68428fceab Integrator update 2023-03-21 15:58:49 -04:00
Peter Boyle 4135f2dcd1 Compressor 2023-03-21 15:41:41 -04:00
Peter Boyle c5bdf61215 AUdit fix 2023-03-21 15:38:39 -04:00
Peter Boyle 88e218e8ee Stencil updates 2023-03-21 15:37:58 -04:00
Peter Boyle 0f2b786436 Vector -> vector 2023-03-21 15:36:11 -04:00
Peter Boyle e1c326558a COmms improvements 2023-03-21 08:53:56 -07:00
Peter Boyle bae0f8ea99 Merge pull request #425 from rrhodgson/feature/CacheLogging
Huge Cache
2023-03-21 08:59:08 -04:00
Peter Boyle bbbcd36ae5 Merge pull request #426 from rrhodgson/feature/LCDeflation
Batched Local Coherence Tools
2023-03-21 08:58:40 -04:00
Peter Boyle 39c0815d9e WriteDiscard 2023-03-21 08:57:29 -04:00
Alessandro Lupo 1b8176e2c0 fix code duplication 2023-03-17 14:58:00 +00:00
Alessandro Lupo cbc053c3db Revert "projection on Sp2n algebra, to be used instead of Ta"
This reverts commit ba7f9d7b70.
2023-03-17 11:36:58 +00:00
Alessandro Lupo cdf3f6ef6e Merge branch 'refactoring_sp2n' of https://github.com/LupoA/Grid into refactoring_sp2n 2023-03-15 15:59:50 +00:00
Alessandro Lupo ba7f9d7b70 projection on Sp2n algebra, to be used instead of Ta 2023-03-15 15:55:12 +00:00
Peter Boyle a997d24743 Remove nofma 2023-03-14 12:10:31 -07:00
Peter Boyle 861e5d7f4c SYCL version update. Why do they keep making incompatible changes 2023-03-14 12:10:02 -07:00
Peter Boyle 14cc142a14 Warning remove 2023-03-14 12:09:26 -07:00
Peter Boyle f36b87deb5 syscall fix 2023-03-14 12:09:00 -07:00
Peter Boyle eeb6e0a6e3 Renable cache blocking and efficient UPI type SHM comms 2023-03-14 09:10:27 -07:00
Peter Boyle cad5b187dd Cleanup 2023-03-14 09:08:16 -07:00
Peter Boyle 87697eb07e SHared compile 2023-03-14 09:07:36 -07:00
Alessandro Lupo 371fd123fb consequence of iSUnMatrix being no longer a member of the SU class 2023-03-14 10:47:07 +00:00
Alessandro Lupo d6ff644aab Towards the day all tests compile 2023-03-14 10:43:25 +00:00
Julian Lenz 29586f6b5e Deactivate some tests for Nc!=3 2023-03-13 08:17:14 +00:00
Alessandro Lupo fd057c838f add ProjectOnGaugeGroup and ProjectGn to allow future templating in GaugeImplTypes 2023-03-10 12:10:46 +00:00
Alessandro Lupo f51222086c Move functions from GaugeGroup to group specific implementations 2023-03-09 16:22:20 +00:00
rhodgson a3e935c902 Batched block project/promote size checks 2023-02-27 11:38:16 +00:00
rhodgson 7731c7db8e Add huge cache type and allow Ncache==0 2023-02-26 14:15:28 +00:00
rhodgson ff97340324 Expose cached bytes 2023-02-26 12:22:45 +00:00
Christopher Kelly 83d86943db Fixed compile bug in MemoryManagerShared caused by Audit function not being passed a string 2023-02-23 13:09:45 -05:00
Christopher Kelly e82cf1d311 Further prec-change improvements
Mixed prec CG algorithm has been modified to precompute precision change workspaces

As the original Test_dwf_mixedcg_prec has been coopted to do a performance stability and reproducibility test, requiring the single-prec CG to be run 200 times, I have created a new version of Test_dwf_mixedcg_prec in the solver subdirectory that just does the mixed vs double CG test
2023-02-23 09:45:29 -05:00
Christopher Kelly 1db58a8acc Precision change improvements
Added a new, much faster implementation of precision change that uses (optionally) a precomputed workspace containing pointer offsets that is device resident, such that all lattice copying occurs only on the device and no host<->device transfer is required, other than the pointer table. It also avoids the need to unpack and repack the fields using explicit lane copying. When this new precisionChange is called without a workspace, one will be computed on-the-fly; however it is still considerably faster than the original implementation.

In the special case of using double2 and when the Grids are the same, calls to the new precisionChange will automatically use precisionChangeFast, such that there is a single API call for all precision changes.

Reliable update and mixed-prec multishift have been modified to precompute precision change workspaces

Renamed the original precisionChange as precisionChangeOrig

Fixed incorrect pointer offset bug in copyLane

Added a test and a benchmark for precisionChange

Added a test for reliable update CG
2023-02-21 10:52:42 -05:00
rhodgson 920a51438d Added batched Mixed precision CG 2023-02-14 17:04:13 +00:00
rhodgson be528b6d27 Add batched block project/promote functions 2023-02-14 14:37:10 +00:00
Alessandro Lupo f73691ec47 Merge pull request #18 from nickforce989/sp2n/newbranch
Sp2n/newbranch
2023-02-13 10:22:27 +01:00
Peter Boyle ccd21f96ff Plaquette agreeing and moving to final form (slowly) need to optimise 2023-02-01 22:57:44 -05:00
Peter Boyle 4b90cb8888 First cut passes combining padded cell with general stencil towards fast plaquette and staggered force 2023-02-01 22:14:10 -05:00
Niccolo Forzano 7ebda3e9ec Merge commit 'b10e1b7bc8bec809f874e9e48a3ccc7b2619c9d1' into sp2n/newbranch 2023-01-19 12:10:18 +00:00
Niccolo Forzano b10e1b7bc8 Fixed files giving zero force computation on GPU, issue #8 2023-01-18 18:04:47 +00:00
Peter Boyle 796abfad80 Merge pull request #422 from fjosw/fix/NVCC_DIAG_PRAGMA_SUPPORT
Disable diagnostic pragma warnings for CUDA 12+
2023-01-17 09:34:49 -05:00
fjosw ad0270ac8c fix: diagnostic pragma warnings fixed for CUDA 12+ 2023-01-12 12:36:30 +00:00
Makis Kappas 7d62f1d6d2 Populate the Cshift_table in the GPU
Cshift is allocated in Unified memory and used
in the LambdaApply kernels but also populated
from the host. This creates a lot of Unified HtoD
and DtoH mem operations and has a negative effect
in performance. With this commit we populate the
Cshift table in the device with the
populate_Cshift_table() kernel.
2023-01-11 21:26:25 +00:00
Christoph Lehner 458c943987 merged upstream 2022-12-31 11:16:21 +02:00
Christoph Lehner 88015b0858 Split sum in rankSum and GlobalSum 2022-12-26 10:01:32 +01:00
Peter Boyle 4ca1bf7cca Added gauge invariance test 2022-12-21 07:23:16 -05:00
Peter Boyle 2ff868f7a5 CPU open doesn't need to free space 2022-12-20 05:10:23 -05:00
Peter Boyle ede02b6883 Memory manager debug Felix case 2022-12-20 05:10:23 -05:00
Peter Boyle 1822ced302 Bug fix 2022-12-20 05:10:23 -05:00
Peter Boyle 37ba32776f More logging 2022-12-20 05:10:23 -05:00
Peter Boyle 99b3697b03 More loggin 2022-12-20 05:10:23 -05:00
Peter Boyle 43a45ec97b SSC_START 2022-12-20 05:10:23 -05:00
Peter Boyle b00a4142e5 A=A fix 2022-12-20 05:10:23 -05:00
Peter Boyle 3791bc527b Logging pulled in from dirichlet branch 2022-12-20 05:10:23 -05:00
Alessandro Lupo d7dea44ce7 Merge pull request #17 from chillenzer/unify_gauge_groups
Fix compilation error in nvcc (closes #15)
2022-12-19 16:24:03 +00:00
Peter Boyle d8c29f5fcf Updated FFT test for PETSc 2022-12-18 12:05:00 -05:00
Julian Lenz 37b6b82869 Fix file extensions 2022-12-18 16:12:56 +00:00
Julian Lenz 92ad5b8f74 Compiler error fix: NVCC requires names for templ. par. 2022-12-18 15:50:19 +00:00
Peter Boyle 281f8101fe Matt FFT test 2022-12-17 20:35:33 -05:00
Peter Boyle 472ed2dd5c Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2022-12-17 20:17:09 -05:00
Peter Boyle 4f85672674 Simpler test for PETSc 2022-12-17 20:16:11 -05:00
Peter Boyle dc747c54be Merge branch 'develop' into feature/dirichlet
Conflicts:
	Grid/qcd/action/fermion/WilsonCompressor.h
	Grid/stencil/Stencil.h
2022-12-13 08:24:58 -05:00
Peter Boyle 07acfe89f2 Merge pull request #417 from rrhodgson/feature/fermtoprop
Feature/fermtoprop
2022-12-06 12:45:03 -05:00
rhodgson 40234f531f FermToProp accelerator_for -> thread_for 2022-12-06 17:34:51 +00:00
rhodgson d49694f38f PropToFerm fix 2022-12-06 15:48:54 +00:00
Alessandro Lupo 8c80f1c168 Merge pull request #14 from chillenzer/unify_gauge_groups
Unify gauge groups (closes #5)
2022-12-01 17:35:46 +00:00
Chulwoo Jung dc6a38f177 Minor cleanup 2022-11-30 17:13:12 -05:00
Chulwoo Jung 82c1ecf60f Block lanczos added 2022-11-30 16:08:40 -05:00
Peter Boyle 97a098636d FermToProp 2022-11-30 15:36:35 -05:00
Peter Boyle e13930c8b2 Faster fermtoprop case 2022-11-30 15:11:29 -05:00
Julian Lenz 0af7d5a793 Rename Grid/qcd/utils/<Group>_impl.h -> Grid/qcd/utils/<Group>.h 2022-11-30 17:12:00 +00:00
Julian Lenz 505fa49983 Renamed SUn.h -> GaugeGroup.h 2022-11-30 17:09:48 +00:00
Julian Lenz 7bcf33def9 Removed Sp2n.h 2022-11-30 16:59:46 +00:00
Julian Lenz a13820656a Removed iSUnMatrix, etc. 2022-11-30 15:09:03 +00:00
Julian Lenz fa71b46a41 Hide nsp 2022-11-30 14:44:23 +00:00
Julian Lenz b8b3ae6ac1 Make helper functions private 2022-11-30 13:29:14 +00:00
Julian Lenz 55c008da21 Removed forward declaration 2022-11-30 13:12:21 +00:00
Julian Lenz 2507606bd0 With function overloading (still dirty). 2022-11-30 12:54:36 +00:00
Julian Lenz 7c2ad4f8c8 Attempt with SFINAE (failed) 2022-11-30 11:57:39 +00:00
Julian Lenz 54c8025aad Remove unnecessary pwd in scripts/filelist 2022-11-28 17:50:38 +00:00
Julian Lenz 921e23e83c Separated out everything SU specific 2022-11-28 17:47:50 +00:00
Julian Lenz 6e750ecb0e Remove apparently forgotten file 2022-11-28 16:33:46 +00:00
Julian Lenz b8f1f5d2a3 Introduce GaugeGroup 2022-11-25 17:45:32 +00:00
Julian Lenz 9273f2937c Autoformat google style 2022-11-25 17:44:08 +00:00
Julian Lenz 1aa28b47ae Add existing test to check 2022-11-25 17:40:40 +00:00
Julian Lenz 629cb2987a Fix typo in Makefile.am 2022-11-25 17:40:21 +00:00
Julian Lenz 03235d6368 Fixed type in configure.ac 2022-11-25 16:57:40 +00:00
Alessandro Lupo 22064c7e4c Fixing #11 2022-11-25 13:10:29 +00:00
Alessandro Lupo 2de03e5172 Revert "Revert "Fixing issue #11: consistent use of ncolour and nsp""
This reverts commit 3af4929dda.
2022-11-23 19:40:28 +00:00
Alessandro Lupo 3af4929dda Revert "Fixing issue #11: consistent use of ncolour and nsp"
This reverts commit 1ba429345b.
2022-11-23 19:34:59 +00:00
Alessandro Lupo 1ba429345b Fixing issue #11: consistent use of ncolour and nsp 2022-11-23 18:45:01 +00:00
Peter Boyle 0655dab466 Open MP on host enabled 2022-11-08 13:38:54 -08:00
Peter Boyle 7f097bcc28 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-11-08 13:23:40 -08:00
Peter Boyle 5c75aa5008 Device mem 2022-11-08 13:22:57 -08:00
Peter Boyle 1873101362 PVC 2022-11-08 13:22:45 -08:00
Peter Boyle 63fd1dfa62 Config on PVC 2022-11-08 13:22:09 -08:00
Peter Boyle bd68861b28 SYCL sum 2022-11-08 12:49:26 -08:00
Peter Boyle 82e959f66c SYCL reduction 2022-11-08 12:45:25 -08:00
Peter Boyle 62e52de06d Merge pull request #414 from fjosw/feat/eCloverGPU
Compact Exponential Cloverterm on GPU
2022-11-01 09:15:44 -04:00
fjosw 184adeedb8 feat: renamed open_boundaries to fixedBoundaries 2022-10-26 12:53:46 +01:00
fjosw 5fa6a8b96d docs: CompactClover debug info generalized. 2022-10-26 12:41:14 +01:00
fjosw a2a879b668 docs: CompactClover Debug Info improved. 2022-10-25 17:20:42 +01:00
fjosw 9317d893b2 docs: details about inversion of CompactClover term added. 2022-10-25 17:10:06 +01:00
fjosw 86075fdd45 feat: MassTerm and ExponentiateClover merged into InstantiateClover 2022-10-25 17:05:34 +01:00
fjosw b36442e263 feat: CloverHelpers::InvertClover implemented which handles the
inversion of the Clover term depending on clover type and the boundary
conditions.
2022-10-25 16:57:01 +01:00
fjosw 513d797ea6 fix: signature of CompactWilsonCloverHelpers::Exponentiate fixed. 2022-10-25 16:17:22 +01:00
fjosw 9e4835a3e3 feat: changed CompactWilsonExpClover exponentiation to Taylor expansion
with Horner scheme.
2022-10-25 15:19:43 +01:00
Peter Boyle 477ebf24f4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-10-04 11:19:43 -07:00
Peter Boyle 0d5639f707 Run script update 2022-10-04 11:13:41 -07:00
Peter Boyle 413312f9a9 Benchmark the halo construction.
THe bye counts are out and should be doubled for SIMD directions
2022-10-04 11:12:59 -07:00
Peter Boyle 03508448f8 Remove verbose 2022-10-04 11:12:15 -07:00
Peter Boyle e1e5c75023 Stencil gather improvements - SVM was running slow and used for a pointer array that wasn't needed to be in SVM 2022-10-04 11:11:10 -07:00
Peter Boyle 9296299b61 Better commenting 2022-10-04 11:10:34 -07:00
Alessandro Lupo 88bdd4344b 2indx antisymm representation of sp2n 2021-11-04 18:27:35 +00:00
Alessandro Lupo 4044536eea add projection on sp2n algebra 2021-10-26 10:20:44 +01:00
Alessandro Lupo 4d8ae6221c fix projection 2021-10-22 10:44:54 +01:00
Alessandro Lupo 4e31e4e094 Better tests 2021-10-13 15:07:23 +01:00
Alessandro Lupo 0d6674e489 hot start for sp2n 2021-10-12 18:53:54 +01:00
Alessandro Lupo b145fd4f5b necessary to merge 2021-10-12 17:08:46 +01:00
Alessandro Lupo 8a5b794f25 necessary change to merge with upstrm 2021-10-12 16:04:03 +01:00
Alessandro Lupo 291e80f88a sp2n as config option 2021-10-12 16:00:32 +01:00
Alessandro Lupo 1ace5850ae first hmc 2021-10-12 16:00:32 +01:00
Alessandro Lupo 283f14b7c1 fix sp2n projection 2021-10-12 16:00:32 +01:00
Alessandro Lupo 1d6e708083 tests! 2021-10-12 16:00:32 +01:00
Alessandro Lupo 89457e25e3 sp fermion instantiation 2021-10-12 16:00:32 +01:00
Alessandro Lupo 7e3b298d3d project on sp2n 2021-10-12 16:00:32 +01:00
Alessandro Lupo 7ff3e5eed4 gauge and fermion implementation for sp2n 2021-10-12 16:00:32 +01:00
Alessandro Lupo 19eb51cf41 sp2n generators 2021-10-12 15:53:33 +01:00
Alessandro Lupo 470d4dcc6d sp2n as config option 2021-10-12 15:47:56 +01:00
Alessandro Lupo ed03bfd555 first hmc 2021-10-12 12:16:47 +01:00
Alessandro Lupo 8c0fbcccae fix sp2n projection 2021-10-12 12:12:16 +01:00
Alessandro Lupo d4866157fe tests! 2021-10-12 09:06:15 +01:00
Alessandro Lupo b6496b6cb5 sp fermion instantiation 2021-10-11 16:32:10 +01:00
Alessandro Lupo 4f5fe57920 project on sp2n 2021-10-11 16:28:15 +01:00
Alessandro Lupo 11fb943b1e gauge and fermion implementation for sp2n 2021-10-11 16:21:25 +01:00
Alessandro Lupo 046a23121e sp2n generators 2021-10-05 15:51:22 +01:00
225 changed files with 17594 additions and 2584 deletions
+54
View File
@@ -0,0 +1,54 @@
name: Bug report
description: Report a bug.
title: "<insert title>"
labels: [bug]
body:
- type: markdown
attributes:
value: >
Thank you for taking the time to file a bug report.
Please check that the code is pointing to the HEAD of develop
or any commit in master which is tagged with a version number.
- type: textarea
attributes:
label: "Describe the issue:"
description: >
Describe the issue and any previous attempt to solve it.
validations:
required: true
- type: textarea
attributes:
label: "Code example:"
description: >
If relevant, show how to reproduce the issue using a minimal working
example.
placeholder: |
<< your code here >>
render: shell
validations:
required: false
- type: textarea
attributes:
label: "Target platform:"
description: >
Give a description of the target platform (CPU, network, compiler).
Please give the full CPU part description, using for example
`cat /proc/cpuinfo | grep 'model name' | uniq` (Linux)
or `sysctl machdep.cpu.brand_string` (macOS) and the full output
the `--version` option of your compiler.
validations:
required: true
- type: textarea
attributes:
label: "Configure options:"
description: >
Please give the exact configure command used and attach
`config.log`, `grid.config.summary` and the output of `make V=1`.
render: shell
validations:
required: true
+1 -1
View File
@@ -45,7 +45,7 @@ directory
//disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register"
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
//disables nvcc specific warning in json.hpp
#pragma nv_diag_suppress unsigned_compare_with_zero
#pragma nv_diag_suppress cast_to_qualified_type
+1 -1
View File
@@ -14,7 +14,7 @@
/* NVCC save and restore compile environment*/
#ifdef __NVCC__
#pragma push
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress code_is_unreachable
#else
#pragma diag_suppress code_is_unreachable
+4
View File
@@ -66,6 +66,10 @@ if BUILD_FERMION_REPS
extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES)
endif
if BUILD_SP
extra_sources+=$(SP_FERMION_FILES)
extra_sources+=$(SP_TWOIND_FERMION_FILES)
endif
lib_LIBRARIES = libGrid.a
+1
View File
@@ -55,6 +55,7 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
+48
View File
@@ -460,6 +460,53 @@ class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Fi
}
};
template<class Matrix,class Field>
class QuadLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
RealD a0,a1,a2;
QuadLinearOperator(Matrix &Mat): _Mat(Mat),a0(0.),a1(0.),a2(1.) {};
QuadLinearOperator(Matrix &Mat, RealD _a0,RealD _a1,RealD _a2): _Mat(Mat),a0(_a0),a1(_a1),a2(_a2) {};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
assert(0);
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
_Mat.MdirAll(in,out);
}
void HermOp (const Field &in, Field &out){
// _Mat.M(in,out);
Field tmp1(in.Grid());
// Linop.HermOpAndNorm(psi, mmp, d, b);
_Mat.M(in,tmp1);
_Mat.M(tmp1,out);
out *= a2;
axpy(out, a1, tmp1, out);
axpy(out, a0, in, out);
// d=real(innerProduct(psi,mmp));
// b=norm2(mmp);
}
void AdjOp (const Field &in, Field &out){
assert(0);
_Mat.M(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
ComplexD dot= innerProduct(in,out); n1=real(dot);
n2=norm2(out);
}
void Op(const Field &in, Field &out){
assert(0);
_Mat.M(in,out);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi
@@ -542,6 +589,7 @@ public:
(*this)(in[i], out[i]);
}
}
virtual ~LinearFunction(){};
};
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
+7 -5
View File
@@ -36,11 +36,12 @@ NAMESPACE_BEGIN(Grid);
// Abstract base class.
// Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
// and returns a forecasted solution to the system D*psi = phi (psi).
template<class Matrix, class Field>
// Changing to operator
template<class LinearOperatorBase, class Field>
class Forecast
{
public:
virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
virtual Field operator()(LinearOperatorBase &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
};
// Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
@@ -54,13 +55,13 @@ public:
Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
{
int degree = prev_solns.size();
std::cout << GridLogMessage << "ChronoForecast: degree= " << degree << std::endl;
Field chi(phi); // forecasted solution
// Trivial cases
if(degree == 0){ chi = Zero(); return chi; }
else if(degree == 1){ return prev_solns[0]; }
// RealD dot;
ComplexD xp;
Field r(phi); // residual
Field Mv(phi);
@@ -83,8 +84,9 @@ public:
// Perform sparse matrix multiplication and construct rhs
for(int i=0; i<degree; i++){
b[i] = innerProduct(v[i],phi);
Mat.M(v[i],Mv);
Mat.Mdag(Mv,MdagMv[i]);
// Mat.M(v[i],Mv);
// Mat.Mdag(Mv,MdagMv[i]);
Mat.HermOp(v[i],MdagMv[i]);
G[i][i] = innerProduct(v[i],MdagMv[i]);
}
@@ -191,7 +191,7 @@ public:
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
@@ -108,7 +108,10 @@ NAMESPACE_BEGIN(Grid);
GridStopWatch PrecChangeTimer;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid);
precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid);
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d);
@@ -123,7 +126,7 @@ NAMESPACE_BEGIN(Grid);
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start();
precisionChange(src_f, src_d);
precisionChange(src_f, src_d, pc_wk_dp_to_sp);
PrecChangeTimer.Stop();
sol_f = Zero();
@@ -142,7 +145,7 @@ NAMESPACE_BEGIN(Grid);
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f);
precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);
@@ -0,0 +1,213 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
Copyright (C) 2015
Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
NAMESPACE_BEGIN(Grid);
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
Integer MaxPatchupIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
bool updateResidual;
MixedPrecisionConjugateGradientBatched(RealD tol,
Integer maxinnerit,
Integer maxouterit,
Integer maxpatchit,
GridBase* _sp_grid,
LinearOperatorBase<FieldF> &_Linop_f,
LinearOperatorBase<FieldD> &_Linop_d,
bool _updateResidual=true) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
void useGuesser(LinearFunction<FieldF> &g){
guesser = &g;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::vector<FieldD> srcs_d_in{src_d_in};
std::vector<FieldD> sols_d{sol_d};
(*this)(srcs_d_in,sols_d);
sol_d = sols_d[0];
}
void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
assert(src_d_in.size() == sol_d.size());
int NBatch = src_d_in.size();
std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
Integer TotalOuterIterations = 0; //Number of restarts
std::vector<Integer> TotalInnerIterations(NBatch,0); //Number of inner CG iterations
std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
GridStopWatch TotalTimer;
TotalTimer.Start();
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
int cb = src_d_in[0].Checkerboard();
std::vector<RealD> src_norm;
std::vector<RealD> norm;
std::vector<RealD> stop;
GridBase* DoublePrecGrid = src_d_in[0].Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
std::vector<FieldD> src_d;
std::vector<FieldF> src_f;
std::vector<FieldF> sol_f;
for (int i=0; i<NBatch; i++) {
sol_d[i].Checkerboard() = cb;
src_norm.push_back(norm2(src_d_in[i]));
norm.push_back(0.);
stop.push_back(src_norm[i] * Tolerance*Tolerance);
src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
src_f.push_back(SinglePrecGrid);
src_f[i].Checkerboard() = cb;
sol_f.push_back(SinglePrecGrid);
sol_f[i].Checkerboard() = cb;
}
RealD inner_tol = InnerTolerance;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
bool allConverged = true;
for (int i=0; i<NBatch; i++) {
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d[i], tmp_d);
norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
PrecChangeTimer.Start();
precisionChange(src_f[i], src_d[i]);
PrecChangeTimer.Stop();
sol_f[i] = Zero();
if(norm[i] > OuterLoopNormMult * stop[i]) {
allConverged = false;
}
}
if (allConverged) break;
if (updateResidual) {
RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
CG_f.Tolerance = inner_tol;
}
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL) {
(*guesser)(src_f, sol_f);
}
for (int i=0; i<NBatch; i++) {
//Inner CG
InnerCGtimer.Start();
CG_f(Linop_f, src_f[i], sol_f[i]);
InnerCGtimer.Stop();
TotalInnerIterations[i] += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f[i]);
PrecChangeTimer.Stop();
axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
}
}
//Final trial CG
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
for (int i=0; i<NBatch; i++) {
ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
CG_d(Linop_d, src_d_in[i], sol_d[i]);
TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
}
TotalTimer.Stop();
std::cout << GridLogMessage << std::endl;
for (int i=0; i<NBatch; i++) {
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
}
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif
@@ -0,0 +1,373 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
//Every update_freq iterations the residual is corrected in double precision.
//For safety the a final regular CG is applied to clean up if necessary
//PB Pure single, then double fixup
template<class FieldD, class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction<FieldD>,
public OperatorFunction<FieldD>
{
public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
int ReliableUpdateFreq; //number of iterations between reliable updates
GridBase* SinglePrecGrid; //Grid for single-precision fields
LinearOperatorBase<FieldF> &Linop_f; //single precision
ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq) :
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(20000)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<FieldD> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup");
GridBase *DoublePrecGrid = src_d.Grid();
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
//Double precision search directions
FieldD p_d(DoublePrecGrid);
std::vector<FieldF> ps_f (nshift, SinglePrecGrid);// Search directions (single precision)
std::vector<FieldF> psi_f(nshift, SinglePrecGrid);// solutions (single precision)
FieldD tmp_d(DoublePrecGrid);
FieldD r_d(DoublePrecGrid);
FieldF r_f(SinglePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD rsqf[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF p_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_d);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi_d[s] = Zero();
psi_f[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
// ps_d[s] = src_d;
precisionChange(ps_f[s],src_d);
}
// r and p for primary
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0]
precisionChange(p_f,p_d);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChange(tmp_d,mmp_f);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
// assert(norm2(tmp_d)< 1.0e-4);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_d,b,mmp_d,r_d);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
precisionChange(psi_f[s],psi_d[s]);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterationsMshift;k++){
a = c /cp;
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChange(r_f, r_d);
PrecChangeTimer.Stop();
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps_f[s],a,ps_f[s],r_f);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps_f[s],z[s][iz],as,r_f,ps_f[s]);
}
}
}
AXPYTimer.Stop();
cp=c;
PrecChangeTimer.Start();
precisionChange(p_f, p_d); //get back single prec search direction for linop
PrecChangeTimer.Stop();
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChange(mmp_d, mmp_f); // From Float to Double
PrecChangeTimer.Stop();
d=real(innerProduct(p_d,mmp_d));
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
//Update single precision solutions
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
int ss = s;
if( (!converged[s]) ) {
axpy(psi_f[ss],-bs[s]*alpha[s],ps_f[s],psi_f[ss]);
}
}
c = axpy_norm(r_d,b,mmp_d,r_d);
AXPYTimer.Stop();
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsqf[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged || k == MaxIterationsMshift-1){
SolverTimer.Stop();
for(int s=0;s<nshift;s++){
precisionChange(psi_d[s],psi_f[s]);
}
if ( all_converged ){
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Checking solutions"<<std::endl;
} else {
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Not all shifts have converged iteration "<<k<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
axpy(r_d,-alpha[s],src_d,tmp_d);
RealD rn = norm2(r_d);
RealD cn = norm2(src_d);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
if(rn >= rsq[s]){
CleanupTimer.Start();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: performing cleanup step for shift " << s << std::endl;
//Setup linear operators for final cleanup
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
cg(src_d, psi_d[s]);
TrueResidualShift[s] = cg.TrueResidual;
CleanupTimer.Stop();
}
}
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"<<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
IterationsToComplete = k;
return;
}
}
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);
@@ -81,6 +81,7 @@ public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
@@ -95,9 +96,9 @@ public:
ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq
) :
MaxIterations(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
int _ReliableUpdateFreq) :
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(20000)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
@@ -130,6 +131,9 @@ public:
GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
GridBase *DoublePrecGrid = src_d.Grid();
precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid);
precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid);
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
@@ -200,14 +204,14 @@ public:
r_d = p_d;
//MdagM+m[0]
precisionChangeFast(p_f,p_d);
precisionChange(p_f, p_d, pc_wk_d_to_s);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChangeFast(tmp_d,mmp_f);
precisionChange(tmp_d, mmp_f, pc_wk_s_to_d);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
// assert(norm2(tmp_d)< 1.0e-4);
assert(norm2(tmp_d)< 1.0);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
@@ -244,7 +248,7 @@ public:
// Iteration loop
int k;
for (k=1;k<=MaxIterations;k++){
for (k=1;k<=MaxIterationsMshift;k++){
a = c /cp;
AXPYTimer.Start();
@@ -263,7 +267,7 @@ public:
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChangeFast(p_f, p_d); //get back single prec search direction for linop
precisionChange(p_f, p_d, pc_wk_d_to_s); //get back single prec search direction for linop
PrecChangeTimer.Stop();
cp=c;
@@ -272,7 +276,7 @@ public:
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChangeFast(mmp_d, mmp_f); // From Float to Double
precisionChange(mmp_d, mmp_f, pc_wk_s_to_d); // From Float to Double
PrecChangeTimer.Stop();
AXPYTimer.Start();
@@ -350,12 +354,17 @@ public:
}
}
if ( all_converged ){
if ( all_converged || k == MaxIterationsMshift-1){
SolverTimer.Stop();
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
if ( all_converged ){
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
} else {
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Not all shifts have converged iteration "<<k<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
@@ -396,12 +405,10 @@ public:
return;
}
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
assert(0);
}
};
@@ -48,7 +48,7 @@ public:
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
GridBase* SinglePrecGrid;
RealD Delta; //reliable update parameter
RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update
//Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
LinearOperatorBase<FieldF> *Linop_fallback;
@@ -65,7 +65,9 @@ public:
ErrorOnNoConverge(err_on_no_conv),
DoFinalCleanup(true),
Linop_fallback(NULL)
{};
{
assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1");
};
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
Linop_fallback = &_Linop_fallback;
@@ -116,9 +118,12 @@ public:
}
//Single prec initialization
precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid);
precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid());
FieldF r_f(SinglePrecGrid);
r_f.Checkerboard() = r.Checkerboard();
precisionChange(r_f, r);
precisionChange(r_f, r, pc_wk_dp_to_sp);
FieldF psi_f(r_f);
psi_f = Zero();
@@ -134,7 +139,8 @@ public:
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
GridStopWatch PrecChangeTimer;
SolverTimer.Start();
int k = 0;
int l = 0;
@@ -173,7 +179,9 @@ public:
// Stopping condition
if (cp <= rsq) {
//Although not written in the paper, I assume that I have to add on the final solution
precisionChange(mmp, psi_f);
PrecChangeTimer.Start();
precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
PrecChangeTimer.Stop();
psi = psi + mmp;
@@ -194,7 +202,10 @@ public:
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tPrecChange " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tPrecChange avg time " << PrecChangeTimer.Elapsed()/(2*l+1) <<std::endl;
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
@@ -214,14 +225,21 @@ public:
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
precisionChange(mmp, psi_f);
PrecChangeTimer.Start();
precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
PrecChangeTimer.Stop();
psi = psi + mmp;
MatrixTimer.Start();
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
MatrixTimer.Stop();
r = src - mmp;
psi_f = Zero();
precisionChange(r_f, r);
PrecChangeTimer.Start();
precisionChange(r_f, r, pc_wk_dp_to_sp);
PrecChangeTimer.Stop();
cp = norm2(r);
MaxResidSinceLastRelUp = cp;
File diff suppressed because it is too large Load Diff
@@ -419,14 +419,15 @@ until convergence
}
}
if ( Nconv < Nstop )
if ( Nconv < Nstop ) {
std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
std::cout << GridLogIRL << "returning Nstop vectors, the last "<< Nstop-Nconv << "of which might meet convergence criterion only approximately" <<std::endl;
}
eval=eval2;
//Keep only converged
eval.resize(Nconv);// Nstop?
evec.resize(Nconv,grid);// Nstop?
eval.resize(Nstop);// was Nconv
evec.resize(Nstop,grid);// was Nconv
basisSortInPlace(evec,eval,reverse);
}
+37 -13
View File
@@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid);
/*Allocation types, saying which pointer cache should be used*/
#define Cpu (0)
#define CpuSmall (1)
#define Acc (2)
#define AccSmall (3)
#define Shared (4)
#define SharedSmall (5)
#define CpuHuge (1)
#define CpuSmall (2)
#define Acc (3)
#define AccHuge (4)
#define AccSmall (5)
#define Shared (6)
#define SharedHuge (7)
#define SharedSmall (8)
#undef GRID_MM_VERBOSE
uint64_t total_shared;
uint64_t total_device;
@@ -35,12 +38,15 @@ void MemoryManager::PrintBytes(void)
}
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
uint64_t MemoryManager::HostCacheBytes() { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pooiniter caches
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
@@ -170,6 +176,16 @@ void MemoryManager::Init(void)
}
}
str= getenv("GRID_ALLOC_NCACHE_HUGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuHuge]=Nc;
Ncache[AccHuge]=Nc;
Ncache[SharedHuge]=Nc;
}
}
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) {
Nc = atoi(str);
@@ -190,7 +206,9 @@ void MemoryManager::InitMessage(void) {
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
#ifdef ALLOCATION_CACHE
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
#endif
#ifdef GRID_UVM
@@ -222,8 +240,11 @@ void MemoryManager::InitMessage(void) {
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small;
int cache;
if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
else cache = type;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
#else
return ptr;
@@ -232,11 +253,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
if (ncache == 0) return ptr;
void * ret = NULL;
int v = -1;
@@ -271,8 +293,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small;
int cache;
if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
else cache = type;
return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
#else
return NULL;
@@ -281,7 +306,6 @@ void *MemoryManager::Lookup(size_t bytes,int type)
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
+41 -3
View File
@@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#define GRID_ALLOC_SMALL_LIMIT (4096)
#define GRID_ALLOC_HUGE_LIMIT (2147483648)
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
@@ -70,6 +71,21 @@ enum ViewMode {
CpuWriteDiscard = 0x10 // same for now
};
struct MemoryStatus {
uint64_t DeviceBytes;
uint64_t DeviceLRUBytes;
uint64_t DeviceMaxBytes;
uint64_t HostToDeviceBytes;
uint64_t DeviceToHostBytes;
uint64_t HostToDeviceXfer;
uint64_t DeviceToHostXfer;
uint64_t DeviceEvictions;
uint64_t DeviceDestroy;
uint64_t DeviceAllocCacheBytes;
uint64_t HostAllocCacheBytes;
};
class MemoryManager {
private:
@@ -83,7 +99,7 @@ private:
} AllocationCacheEntry;
static const int NallocCacheMax=128;
static const int NallocType=6;
static const int NallocType=9;
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType];
static int Ncache[NallocType];
@@ -97,8 +113,8 @@ private:
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
static void PrintBytes(void);
public:
static void PrintBytes(void);
static void Audit(std::string s);
static void Init(void);
static void InitMessage(void);
@@ -119,7 +135,28 @@ private:
static uint64_t DeviceToHostBytes;
static uint64_t HostToDeviceXfer;
static uint64_t DeviceToHostXfer;
static uint64_t DeviceEvictions;
static uint64_t DeviceDestroy;
static uint64_t DeviceCacheBytes();
static uint64_t HostCacheBytes();
static MemoryStatus GetFootprint(void) {
MemoryStatus stat;
stat.DeviceBytes = DeviceBytes;
stat.DeviceLRUBytes = DeviceLRUBytes;
stat.DeviceMaxBytes = DeviceMaxBytes;
stat.HostToDeviceBytes = HostToDeviceBytes;
stat.DeviceToHostBytes = DeviceToHostBytes;
stat.HostToDeviceXfer = HostToDeviceXfer;
stat.DeviceToHostXfer = DeviceToHostXfer;
stat.DeviceEvictions = DeviceEvictions;
stat.DeviceDestroy = DeviceDestroy;
stat.DeviceAllocCacheBytes = DeviceCacheBytes();
stat.HostAllocCacheBytes = HostCacheBytes();
return stat;
};
private:
#ifndef GRID_UVM
//////////////////////////////////////////////////////////////////////
@@ -176,6 +213,7 @@ private:
public:
static void Print(void);
static void PrintAll(void);
static void PrintState( void* CpuPtr);
static int isOpen (void* CpuPtr);
static void ViewClose(void* CpuPtr,ViewMode mode);
+89 -30
View File
@@ -28,6 +28,8 @@ uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
uint64_t MemoryManager::DeviceEvictions;
uint64_t MemoryManager::DeviceDestroy;
////////////////////////////////////
// Priority ordering for unlocked entries
@@ -115,8 +117,10 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceDestroy++;
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
AccCache.AccPtr=(uint64_t) NULL;
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
@@ -126,8 +130,14 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////////////////////
// Make CPU consistent, remove from Accelerator, remove entry
// Cannot be locked. If allocated must be in LRU pool.
// Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry
// Cannot be acclocked. If allocated must be in LRU pool.
//
// Nov 2022... Felix issue: Allocating two CpuPtrs, can have an entry in LRU-q with CPUlock.
// and require to evict the AccPtr copy. Eviction was a mistake in CpuViewOpen
// but there is a weakness where CpuLock entries are attempted for erase
// Take these OUT LRU queue when CPU locked?
// Cannot take out the table as cpuLock data is important.
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
@@ -139,15 +149,17 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
if(AccCache.state==AccDirty) {
Flush(AccCache);
}
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
AccCache.AccPtr=(uint64_t)NULL;
AccCache.state=CpuDirty; // CPU primary now
DeviceBytes -=AccCache.bytes;
dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
// uint64_t CpuPtr = AccCache.CpuPtr;
DeviceEvictions++;
// EntryErase(CpuPtr);
}
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{
@@ -221,13 +233,16 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
assert(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
assert(LRU.size()>0);
uint64_t victim = LRU.back();
uint64_t victim = LRU.back(); // From the LRU
auto AccCacheIterator = EntryLookup(victim);
auto & AccCache = AccCacheIterator->second;
Evict(AccCache);
} else {
return;
}
}
}
@@ -322,7 +337,8 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(0);
}
// If view is opened on device remove from LRU
assert(AccCache.accLock>0);
// If view is opened on device must remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
dprintf("AccCache entry removed from LRU \n");
@@ -388,9 +404,10 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
// CPU doesn't need to free space
// if (!AccCache.AccPtr) {
// EvictVictims(bytes);
// }
assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache.accLock==0); // Programming error
@@ -444,20 +461,28 @@ void MemoryManager::NotifyDeletion(void *_ptr)
void MemoryManager::Print(void)
{
PrintBytes();
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "Memory Manager " << std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl;
std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl;
std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl;
std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Memory Manager " << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << DeviceBytes << " bytes allocated on device " << std::endl;
std::cout << GridLogMessage << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
std::cout << GridLogMessage << DeviceMaxBytes<< " bytes max on device " << std::endl;
std::cout << GridLogMessage << HostToDeviceXfer << " transfers to device " << std::endl;
std::cout << GridLogMessage << DeviceToHostXfer << " transfers from device " << std::endl;
std::cout << GridLogMessage << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
std::cout << GridLogMessage << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
std::cout << GridLogMessage << DeviceEvictions << " Evictions from device " << std::endl;
std::cout << GridLogMessage << DeviceDestroy << " Destroyed vectors on device " << std::endl;
std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
}
void MemoryManager::PrintAll(void)
{
Print();
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
@@ -467,13 +492,13 @@ void MemoryManager::Print(void)
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
}
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
};
int MemoryManager::isOpen (void* _CpuPtr)
@@ -489,6 +514,24 @@ int MemoryManager::isOpen (void* _CpuPtr)
}
void MemoryManager::Audit(std::string s)
{
uint64_t CpuBytes=0;
uint64_t AccBytes=0;
uint64_t LruBytes1=0;
uint64_t LruBytes2=0;
uint64_t LruCnt=0;
std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
for(auto it=LRU.begin();it!=LRU.end();it++){
uint64_t cpuPtr = *it;
assert(EntryPresent(cpuPtr));
auto AccCacheIterator = EntryLookup(cpuPtr);
auto & AccCache = AccCacheIterator->second;
LruBytes2+=AccCache.bytes;
assert(AccCache.LRU_valid==1);
assert(AccCache.LRU_entry==it);
}
std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
@@ -498,7 +541,14 @@ void MemoryManager::Audit(std::string s)
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
if ( AccCache.cpuLock || AccCache.accLock ) {
CpuBytes+=AccCache.bytes;
if( AccCache.AccPtr ) AccBytes+=AccCache.bytes;
if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes;
if( AccCache.LRU_valid ) LruCnt++;
if ( AccCache.cpuLock || AccCache.accLock ) {
assert(AccCache.LRU_valid==0);
std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t cpuLock " << AccCache.cpuLock
@@ -509,6 +559,15 @@ void MemoryManager::Audit(std::string s)
assert( AccCache.cpuLock== 0 ) ;
assert( AccCache.accLock== 0 ) ;
}
std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
assert(LruBytes1==LruBytes2);
assert(LruBytes1==DeviceLRUBytes);
std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
assert(AccBytes==DeviceBytes);
std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
assert(LruCnt == LRU.size());
std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
}
void MemoryManager::PrintState(void* _CpuPtr)
@@ -526,8 +585,8 @@ void MemoryManager::PrintState(void* _CpuPtr)
if ( AccCache.state==EvictNext) str = std::string("EvictNext");
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
std::cout << GridLogMessage << "\tx"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\tx"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
+4 -1
View File
@@ -12,8 +12,10 @@ uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
uint64_t MemoryManager::DeviceEvictions;
uint64_t MemoryManager::DeviceDestroy;
void MemoryManager::Audit(void){};
void MemoryManager::Audit(std::string s){};
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
@@ -22,6 +24,7 @@ void MemoryManager::PrintState(void* CpuPtr)
std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
};
void MemoryManager::Print(void){};
void MemoryManager::PrintAll(void){};
void MemoryManager::NotifyDeletion(void *ptr){};
NAMESPACE_END(Grid);
-3
View File
@@ -400,9 +400,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
acceleratorCopySynchronise();
StencilBarrier();// Synch shared memory on a single nodes
int nreq=list.size();
if (nreq==0) return;
+1 -1
View File
@@ -128,7 +128,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return 2.0*bytes;
return xbytes+rbytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
+53
View File
@@ -91,6 +91,59 @@ void *SharedMemory::ShmBufferSelf(void)
//std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
return ShmCommBufs[ShmRank];
}
static inline int divides(int a,int b)
{
return ( b == ( (b/a)*a ) );
}
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
////////////////////////////////////////////////////////////////
// Allow user to configure through environment variable
////////////////////////////////////////////////////////////////
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
return;
}
////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now
////////////////////////////////////////////////////////////////
int ndimension = WorldDims.size();
ShmDims=Coordinate(ndimension,1);
std::vector<int> primes({2,3,5});
int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) {
int p;
for(p=0;p<primes.size();p++) {
int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime;
ShmDims[dim]*=prime;
last_dim = dim;
break;
}
}
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension;
}
}
NAMESPACE_END(Grid);
+158 -75
View File
@@ -27,6 +27,8 @@ Author: Christoph Lehner <christoph@lhnr.de>
*************************************************************************************/
/* END LEGAL */
#define Mheader "SharedMemoryMpi: "
#include <Grid/GridCore.h>
#include <pwd.h>
@@ -36,12 +38,120 @@ Author: Christoph Lehner <christoph@lhnr.de>
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
#ifdef GRID_SYCl
#ifdef GRID_SYCL
#define GRID_SYCL_LEVEL_ZERO_IPC
#include <syscall.h>
#define SHM_SOCKETS
#endif
#include <sys/socket.h>
#include <sys/un.h>
NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: "
#ifdef SHM_SOCKETS
/*
* Barbaric extra intranode communication route in case we need sockets to pass FDs
* Forced by level_zero not being nicely designed
*/
static int sock;
static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
static char sock_path[256];
class UnixSockets {
public:
static void Open(int rank)
{
int errnum;
sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
unlink(sa_un.sun_path);
if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
perror("bind failure");
exit(EXIT_FAILURE);
}
}
static int RecvFileDescriptor(void)
{
int n;
int fd;
char buf[1];
struct iovec iov;
struct msghdr msg;
struct cmsghdr *cmsg;
char cms[CMSG_SPACE(sizeof(int))];
iov.iov_base = buf;
iov.iov_len = 1;
memset(&msg, 0, sizeof msg);
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = (caddr_t)cms;
msg.msg_controllen = sizeof cms;
if((n=recvmsg(sock, &msg, 0)) < 0) {
perror("recvmsg failed");
return -1;
}
if(n == 0){
perror("recvmsg returned 0");
return -1;
}
cmsg = CMSG_FIRSTHDR(&msg);
memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
return fd;
}
static void SendFileDescriptor(int fildes,int xmit_to_rank)
{
struct msghdr msg;
struct iovec iov;
struct cmsghdr *cmsg = NULL;
char ctrl[CMSG_SPACE(sizeof(int))];
char data = ' ';
memset(&msg, 0, sizeof(struct msghdr));
memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
iov.iov_base = &data;
iov.iov_len = sizeof(data);
sprintf(sock_path,sock_path_fmt,xmit_to_rank);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
msg.msg_name = (void *)&sa_un;
msg.msg_namelen = sizeof(sa_un);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_controllen = CMSG_SPACE(sizeof(int));
msg.msg_control = ctrl;
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
*((int *) CMSG_DATA(cmsg)) = fildes;
sendmsg(sock, &msg, 0);
};
};
#endif
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
@@ -64,8 +174,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
if ( WorldRank == 0) {
std::cout << header " World communicator of size " <<WorldSize << std::endl;
std::cout << header " Node communicator of size " <<WorldShmSize << std::endl;
std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;
std::cout << Mheader " Node communicator of size " <<WorldShmSize << std::endl;
}
// WorldShmComm, WorldShmSize, WorldShmRank
@@ -168,59 +278,7 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
else OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
}
static inline int divides(int a,int b)
{
return ( b == ( (b/a)*a ) );
}
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
////////////////////////////////////////////////////////////////
// Allow user to configure through environment variable
////////////////////////////////////////////////////////////////
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
return;
}
////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now
////////////////////////////////////////////////////////////////
int ndimension = WorldDims.size();
ShmDims=Coordinate(ndimension,1);
std::vector<int> primes({2,3,5});
int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) {
int p;
for(p=0;p<primes.size();p++) {
int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime;
ShmDims[dim]*=prime;
last_dim = dim;
break;
}
}
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension;
}
}
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{
////////////////////////////////////////////////////////////////
@@ -394,7 +452,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
#ifdef GRID_MPI3_SHMGET
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
@@ -479,7 +537,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
exit(EXIT_FAILURE);
}
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
SharedMemoryZero(ShmCommBuf,bytes);
@@ -522,7 +580,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
@@ -530,8 +588,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef SHM_SOCKETS
UnixSockets::Open(WorldShmRank);
#endif
for(int r=0;r<WorldShmSize;r++){
MPI_Barrier(WorldShmComm);
#ifndef GRID_MPI3_SHM_NONE
//////////////////////////////////////////////////
// If it is me, pass around the IPC access key
@@ -539,24 +602,32 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void * thisBuf = ShmCommBuf;
if(!Stencil_force_mpi) {
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; } clone_mem_t;
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle;
clone_mem_t handle;
if ( r==WorldShmRank ) {
auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
if ( err != ZE_RESULT_SUCCESS ) {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
}
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid();
memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle));
#ifdef SHM_SOCKETS
for(int rr=0;rr<WorldShmSize;rr++){
if(rr!=r){
UnixSockets::SendFileDescriptor(handle.fd,rr);
}
}
#endif
}
#endif
#ifdef GRID_CUDA
@@ -584,6 +655,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Share this IPC handle across the Shm Comm
//////////////////////////////////////////////////
{
MPI_Barrier(WorldShmComm);
int ierr=MPI_Bcast(&handle,
sizeof(handle),
MPI_BYTE,
@@ -599,6 +671,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
if ( r!=WorldShmRank ) {
thisBuf = nullptr;
int myfd;
#ifdef SHM_SOCKETS
myfd=UnixSockets::RecvFileDescriptor();
#else
std::cout<<"mapping seeking remote pid/fd "
<<handle.pid<<"/"
<<handle.fd<<std::endl;
@@ -606,16 +682,22 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
int myfd = syscall(438,pidfd,handle.fd,0);
std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
myfd = syscall(438,pidfd,handle.fd,0);
int err_t = errno;
if (myfd < 0) {
fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr);
perror("pidfd_getfd failed ");
assert(0);
}
#endif
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
if ( err != ZE_RESULT_SUCCESS ) {
std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
@@ -650,6 +732,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#else
WorldShmCommBufs[r] = ShmCommBuf;
#endif
MPI_Barrier(WorldShmComm);
}
_ShmAllocBytes=bytes;
@@ -661,7 +744,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -698,7 +781,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
// std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
@@ -708,7 +791,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -755,7 +838,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm);
+40
View File
@@ -297,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
}
}
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
template <typename T>
T iDivUp(T a, T b) // Round a / b to nearest higher integer value
{ return (a % b != 0) ? (a / b + 1) : (a / b); }
template <typename T>
__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx >= e1*e2) return;
int n, b, o;
n = idx / e2;
b = idx % e2;
o = n*stride + b;
vector[2*idx + 0] = lo + o;
vector[2*idx + 1] = ro + o;
}
#endif
//////////////////////////////////////////////////////
// local to node block strided copies
//////////////////////////////////////////////////////
@@ -321,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int ent=0;
if(cbmask == 0x3 ){
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
ent = e1*e2;
dim3 blockSize(acceleratorThreads());
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
accelerator_barrier();
#else
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
#endif
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
@@ -377,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
int ent=0;
if ( cbmask == 0x3 ) {
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
ent = e1*e2;
dim3 blockSize(acceleratorThreads());
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
accelerator_barrier();
#else
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
#endif
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
+1
View File
@@ -47,3 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>
#include <Grid/lattice/Lattice_crc.h>
#include <Grid/lattice/PaddedCell.h>
+4
View File
@@ -345,7 +345,9 @@ GridUnopClass(UnaryNot, Not(a));
GridUnopClass(UnaryTrace, trace(a));
GridUnopClass(UnaryTranspose, transpose(a));
GridUnopClass(UnaryTa, Ta(a));
GridUnopClass(UnarySpTa, SpTa(a));
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
GridUnopClass(UnaryProjectOnSpGroup, ProjectOnSpGroup(a));
GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a));
@@ -456,7 +458,9 @@ GRID_DEF_UNOP(operator!, UnaryNot);
GRID_DEF_UNOP(trace, UnaryTrace);
GRID_DEF_UNOP(transpose, UnaryTranspose);
GRID_DEF_UNOP(Ta, UnaryTa);
GRID_DEF_UNOP(SpTa, UnarySpTa);
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
GRID_DEF_UNOP(ProjectOnSpGroup, UnaryProjectOnSpGroup);
GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the
+2 -2
View File
@@ -291,8 +291,8 @@ public:
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
auto me = View(AcceleratorWriteDiscard);
auto him= r.View(AcceleratorRead);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
@@ -306,8 +306,8 @@ public:
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard();
conformable(*this,r);
auto me = View(AcceleratorWriteDiscard);
auto him= r.View(AcceleratorRead);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
+28 -15
View File
@@ -28,6 +28,9 @@ Author: Christoph Lehner <christoph@lhnr.de>
#if defined(GRID_CUDA)||defined(GRID_HIP)
#include <Grid/lattice/Lattice_reduction_gpu.h>
#endif
#if defined(GRID_SYCL)
#include <Grid/lattice/Lattice_reduction_sycl.h>
#endif
NAMESPACE_BEGIN(Grid);
@@ -124,7 +127,7 @@ inline Double max(const Double *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
return sum_gpu(arg,osites);
#else
return sum_cpu(arg,osites);
@@ -133,7 +136,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
return sumD_gpu(arg,osites);
#else
return sumD_cpu(arg,osites);
@@ -142,7 +145,7 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
return sumD_gpu_large(arg,osites);
#else
return sumD_cpu(arg,osites);
@@ -150,33 +153,44 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_gpu(&arg_v[0],osites);
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
autoView( arg_v, arg, AcceleratorRead);
return sum_gpu(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_cpu(&arg_v[0],osites);
return sum_cpu(&arg_v[0],osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
auto ssum = rankSum(arg);
arg.Grid()->GlobalSum(ssum);
return ssum;
}
template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_gpu_large(&arg_v[0],osites);
return sum_gpu_large(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_cpu(&arg_v[0],osites);
return sum_cpu(&arg_v[0],osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
auto ssum = rankSumLarge(arg);
arg.Grid()->GlobalSum(ssum);
return ssum;
}
@@ -232,11 +246,10 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
// This code could read coalesce
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
+16 -4
View File
@@ -211,13 +211,25 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
assert(ok);
Integer smemSize = numThreads * sizeof(sobj);
// Move out of UVM
// Turns out I had messed up the synchronise after move to compute stream
// as running this on the default stream fools the synchronise
#undef UVM_BLOCK_BUFFER
#ifndef UVM_BLOCK_BUFFER
commVector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
#else
Vector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
auto result = buffer_v[0];
result = *buffer_v;
#endif
return result;
}
+125
View File
@@ -0,0 +1,125 @@
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD;
sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
sobj identity; zeroit(identity);
sobj ret ;
Integer nsimd= vobj::Nsimd();
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{osites},
Reduction,
[=] (cl::sycl::id<1> item, auto &sum) {
auto osite = item[0];
sum +=Reduce(lat[osite]);
});
});
theGridAccelerator->wait();
ret = mysum[0];
free(mysum,*theGridAccelerator);
sobjD dret; convertType(dret,ret);
return dret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
{
return sumD_gpu_tensor(lat,osites);
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
{
return sumD_gpu_large(lat,osites);
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
return sumD_gpu_large(lat,osites);
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu(lat,osites);
return result;
}
template <class vobj>
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu_large(lat,osites);
return result;
}
NAMESPACE_END(Grid);
/*
template<class Double> Double svm_reduce(Double *vec,uint64_t L)
{
Double sumResult; zeroit(sumResult);
Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
Double identity; zeroit(identity);
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{L},
Reduction,
[=] (cl::sycl::id<1> index, auto &sum) {
sum +=vec[index];
});
});
theGridAccelerator->wait();
Double ret = d_sum[0];
free(d_sum,*theGridAccelerator);
std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
return ret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_type scalar;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobjD;
sobjD ret;
scalarD *ret_p = (scalarD *)&ret;
const int nsimd = vobj::Nsimd();
const int words = sizeof(vobj)/sizeof(vector);
Vector<scalar> buffer(osites*nsimd);
scalar *buf = &buffer[0];
vector *dat = (vector *)lat;
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
});
//Precision change at this point is to late to gain precision
ret_p[w] = svm_reduce(buf,nsimd*osites);
}
return ret;
}
*/
+1
View File
@@ -440,6 +440,7 @@ public:
_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
assert(rank == _grid->ThisRank() );
int l_idx=generator_idx(o_idx,i_idx);
+59
View File
@@ -66,6 +66,65 @@ inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<
return ret;
};
template<int N, class Vec>
Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
{
GridBase *grid=Umu.Grid();
auto lvol = grid->lSites();
Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid);
typedef typename Vec::scalar_type scalar;
autoView(Umu_v,Umu,CpuRead);
autoView(ret_v,ret,CpuWrite);
thread_for(site,lvol,{
Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
iScalar<iScalar<iMatrix<scalar, N> > > Us;
peekLocalSite(Us, Umu_v, lcoor);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
scalar tmp= Us()()(i,j);
ComplexD ztmp(real(tmp),imag(tmp));
EigenU(i,j)=ztmp;
}}
ComplexD detD = EigenU.determinant();
typename Vec::scalar_type det(detD.real(),detD.imag());
pokeLocalSite(det,ret_v,lcoor);
});
return ret;
}
template<int N>
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
{
GridBase *grid=Umu.Grid();
auto lvol = grid->lSites();
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > ret(grid);
autoView(Umu_v,Umu,CpuRead);
autoView(ret_v,ret,CpuWrite);
thread_for(site,lvol,{
Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
iScalar<iScalar<iMatrix<ComplexD, N> > > Us;
iScalar<iScalar<iMatrix<ComplexD, N> > > Ui;
peekLocalSite(Us, Umu_v, lcoor);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
EigenU(i,j) = Us()()(i,j);
}}
Eigen::MatrixXcd EigenUinv = EigenU.inverse();
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
Ui()()(i,j) = EigenUinv(i,j);
}}
pokeLocalSite(Ui,ret_v,lcoor);
});
return ret;
}
NAMESPACE_END(Grid);
#endif
+316 -15
View File
@@ -288,7 +288,36 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
}
}
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
const std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = fineData.size();
assert(coarseData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse= coarseData[0].Grid();
Lattice<iScalar<CComplex>> ip(coarse);
std::vector<Lattice<vobj>> fineDataCopy = fineData;
autoView(ip_, ip, AcceleratorWrite);
for(int v=0;v<nbasis;v++) {
for (int k=0; k<NBatch; k++) {
autoView( coarseData_ , coarseData[k], AcceleratorWrite);
blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
// improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis>
ip=-ip;
blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);
}
}
}
template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ,
@@ -590,6 +619,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
}
#endif
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = coarseData.size();
assert(fineData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse = coarseData[0].Grid();
for (int k=0; k<NBatch; k++)
fineData[k]=Zero();
for (int i=0;i<nbasis;i++) {
for (int k=0; k<NBatch; k++) {
Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
}
}
}
// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
// Simd layouts need not match since we use peek/poke Local
template<class vobj,class vvobj>
@@ -648,8 +697,68 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
for(int d=0;d<nd;d++){
assert(Fg->_processors[d] == Tg->_processors[d]);
}
// the above should guarantee that the operations are local
#if 1
size_t nsite = 1;
for(int i=0;i<nd;i++) nsite *= RegionSize[i];
size_t tbytes = 4*nsite*sizeof(int);
int *table = (int*)malloc(tbytes);
thread_for(idx, nsite, {
Coordinate from_coor, to_coor;
size_t rem = idx;
for(int i=0;i<nd;i++){
size_t base_i = rem % RegionSize[i]; rem /= RegionSize[i];
from_coor[i] = base_i + FromLowerLeft[i];
to_coor[i] = base_i + ToLowerLeft[i];
}
int foidx = Fg->oIndex(from_coor);
int fiidx = Fg->iIndex(from_coor);
int toidx = Tg->oIndex(to_coor);
int tiidx = Tg->iIndex(to_coor);
int* tt = table + 4*idx;
tt[0] = foidx;
tt[1] = fiidx;
tt[2] = toidx;
tt[3] = tiidx;
});
int* table_d = (int*)acceleratorAllocDevice(tbytes);
acceleratorCopyToDevice(table,table_d,tbytes);
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
autoView(from_v,From,AcceleratorRead);
autoView(to_v,To,AcceleratorWrite);
accelerator_for(idx,nsite,1,{
static const int words=sizeof(vobj)/sizeof(vector_type);
int* tt = table_d + 4*idx;
int from_oidx = *tt++;
int from_lane = *tt++;
int to_oidx = *tt++;
int to_lane = *tt;
const vector_type* from = (const vector_type *)&from_v[from_oidx];
vector_type* to = (vector_type *)&to_v[to_oidx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
putlane(to[w], stmp, to_lane);
}
});
acceleratorFreeDevice(table_d);
free(table);
#else
Coordinate ldf = Fg->_ldimensions;
Coordinate rdf = Fg->_rdimensions;
Coordinate isf = Fg->_istride;
@@ -658,9 +767,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride;
autoView( t_v , To, AcceleratorWrite);
autoView( f_v , From, AcceleratorRead);
accelerator_for(idx,Fg->lSites(),1,{
autoView( t_v , To, CpuWrite);
autoView( f_v , From, CpuRead);
thread_for(idx,Fg->lSites(),{
sobj s;
Coordinate Fcoor(nd);
Coordinate Tcoor(nd);
@@ -673,17 +782,24 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
}
if (in_region) {
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
vector_type * fp = (vector_type *)&f_v[odx_f];
vector_type * tp = (vector_type *)&t_v[odx_t];
#if 0
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to
scalar_type * fp = (scalar_type *)&f_v[odx_f];
scalar_type * tp = (scalar_type *)&t_v[odx_t];
for(int w=0;w<words;w++){
tp[w].putlane(fp[w].getlane(idx_f),idx_t);
}
#else
peekLocalSite(s,f_v,Fcoor);
pokeLocalSite(s,t_v,Tcoor);
#endif
}
});
#endif
}
@@ -776,6 +892,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
}
//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
template<class vobj>
void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{
@@ -792,11 +910,70 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
}
#if 1
size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
size_t tbytes = 4*nsite*sizeof(int);
int *table = (int*)malloc(tbytes);
thread_for(idx,nsite,{
Coordinate lcoor(nl);
Coordinate hcoor(nh);
lcoor[orthog] = slice_lo;
hcoor[orthog] = slice_hi;
size_t rem = idx;
for(int mu=0;mu<nl;mu++){
if(mu != orthog){
int xmu = rem % lg->LocalDimensions()[mu]; rem /= lg->LocalDimensions()[mu];
lcoor[mu] = hcoor[mu] = xmu;
}
}
int loidx = lg->oIndex(lcoor);
int liidx = lg->iIndex(lcoor);
int hoidx = hg->oIndex(hcoor);
int hiidx = hg->iIndex(hcoor);
int* tt = table + 4*idx;
tt[0] = loidx;
tt[1] = liidx;
tt[2] = hoidx;
tt[3] = hiidx;
});
int* table_d = (int*)acceleratorAllocDevice(tbytes);
acceleratorCopyToDevice(table,table_d,tbytes);
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
autoView(lowDim_v,lowDim,AcceleratorRead);
autoView(higherDim_v,higherDim,AcceleratorWrite);
accelerator_for(idx,nsite,1,{
static const int words=sizeof(vobj)/sizeof(vector_type);
int* tt = table_d + 4*idx;
int from_oidx = *tt++;
int from_lane = *tt++;
int to_oidx = *tt++;
int to_lane = *tt;
const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
vector_type* to = (vector_type *)&higherDim_v[to_oidx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
putlane(to[w], stmp, to_lane);
}
});
acceleratorFreeDevice(table_d);
free(table);
#else
// the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite);
@@ -812,6 +989,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
pokeLocalSite(s,higherDimv,hcoor);
}
});
#endif
}
@@ -1080,6 +1258,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
});
}
//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field)
template<class VobjOut, class VobjIn>
void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{
@@ -1097,9 +1276,9 @@ void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
precisionChange(vout,vin,N);
});
}
//Convert a Lattice from one precision to another
//Convert a Lattice from one precision to another (original, slow implementation)
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{
assert(out.Grid()->Nd() == in.Grid()->Nd());
for(int d=0;d<out.Grid()->Nd();d++){
@@ -1145,6 +1324,128 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
});
}
//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
class precisionChangeWorkspace{
std::pair<Integer,Integer>* fmap_device; //device pointer
//maintain grids for checking
GridBase* _out_grid;
GridBase* _in_grid;
public:
precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
//Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
assert(out_grid->Nd() == in_grid->Nd());
for(int d=0;d<out_grid->Nd();d++){
assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
}
int Nsimd_out = out_grid->Nsimd();
std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
for(int lane=0; lane < out_grid->Nsimd(); lane++)
out_grid->iCoorFromIindex(out_icorrs[lane], lane);
std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocorr;
out_grid->oCoorFromOindex(out_ocorr, out_oidx);
Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
//int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
//Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
//Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
int in_oidx = 0, in_lane = 0;
for(int d=0;d<in_grid->_ndimension;d++){
in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
}
fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
}
});
//Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes);
}
//Prevent moving or copying
precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
void checkGrids(GridBase* out, GridBase* in) const{
conformable(out, _out_grid);
conformable(in, _in_grid);
}
~precisionChangeWorkspace(){
acceleratorFreeDevice(fmap_device);
}
};
//We would like to use precisionChangeFast when possible. However usage of this requires the Grids to be the same (runtime check)
//*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery
template<class VobjOut, class VobjIn>
auto _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){
if(out.Grid() == in.Grid()){
precisionChangeFast(out,in);
return 1;
}else{
return 0;
}
}
template<class VobjOut, class VobjIn>
int _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, long dummy){ //note long here is intentional; it means the above is preferred if available
return 0;
}
//Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace
//which contains the mapping data.
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
if(_precisionChangeFastWrap(out,in,0)) return;
static_assert( std::is_same<typename VobjOut::scalar_typeD, typename VobjIn::scalar_typeD>::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
out.Checkerboard() = in.Checkerboard();
constexpr int Nsimd_out = VobjOut::Nsimd();
workspace.checkGrids(out.Grid(),in.Grid());
std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
//Do the copy/precision change
autoView( out_v , out, AcceleratorWrite);
autoView( in_v , in, AcceleratorRead);
accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
int in_oidx = fmap_osite[out_lane].first;
int in_lane = fmap_osite[out_lane].second;
copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
}
});
}
//Convert a Lattice from one precision to another. Much faster than original implementation but slower than precisionChangeFast
//or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
if(_precisionChangeFastWrap(out,in,0)) return;
precisionChangeWorkspace workspace(out.Grid(), in.Grid());
precisionChange(out, in, workspace);
}
////////////////////////////////////////////////////////////////////////////////
// Communicate between grids
////////////////////////////////////////////////////////////////////////////////
+174
View File
@@ -0,0 +1,174 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/PaddedCell.h
Copyright (C) 2019
Author: Peter Boyle pboyle@bnl.gov
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include<Grid/cshift/Cshift.h>
NAMESPACE_BEGIN(Grid);
//Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions
template<typename vobj>
struct CshiftImplBase{
virtual Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const = 0;
virtual ~CshiftImplBase(){}
};
template<typename vobj>
struct CshiftImplDefault: public CshiftImplBase<vobj>{
Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); }
};
template<typename Gimpl>
struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::vector_object>{
typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
};
class PaddedCell {
public:
GridCartesian * unpadded_grid;
int dims;
int depth;
std::vector<GridCartesian *> grids;
~PaddedCell()
{
DeleteGrids();
}
PaddedCell(int _depth,GridCartesian *_grid)
{
unpadded_grid = _grid;
depth=_depth;
dims=_grid->Nd();
AllocateGrids();
Coordinate local =unpadded_grid->LocalDimensions();
for(int d=0;d<dims;d++){
assert(local[d]>=depth);
}
}
void DeleteGrids(void)
{
for(int d=0;d<grids.size();d++){
delete grids[d];
}
grids.resize(0);
};
void AllocateGrids(void)
{
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate simd =unpadded_grid->_simd_layout;
Coordinate processors=unpadded_grid->_processors;
Coordinate plocal =unpadded_grid->LocalDimensions();
Coordinate global(dims);
// expand up one dim at a time
for(int d=0;d<dims;d++){
plocal[d] += 2*depth;
for(int d=0;d<dims;d++){
global[d] = plocal[d]*processors[d];
}
grids.push_back(new GridCartesian(global,simd,processors));
}
};
template<class vobj>
inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
{
Lattice<vobj> out(unpadded_grid);
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate fll(dims,depth); // depends on the MPI spread
Coordinate tll(dims,0); // depends on the MPI spread
localCopyRegion(in,out,fll,tll,local);
return out;
}
template<class vobj>
inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
GridBase *old_grid = in.Grid();
int dims = old_grid->Nd();
Lattice<vobj> tmp = in;
for(int d=0;d<dims;d++){
tmp = Expand(d,tmp,cshift); // rvalue && assignment
}
return tmp;
}
// expand up one dim at a time
template<class vobj>
inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
GridBase *old_grid = in.Grid();
GridCartesian *new_grid = grids[dim];//These are new grids
Lattice<vobj> padded(new_grid);
Lattice<vobj> shifted(old_grid);
Coordinate local =old_grid->LocalDimensions();
Coordinate plocal =new_grid->LocalDimensions();
if(dim==0) conformable(old_grid,unpadded_grid);
else conformable(old_grid,grids[dim-1]);
std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
double tins=0, tshift=0;
// Middle bit
double t = usecond();
for(int x=0;x<local[dim];x++){
InsertSliceLocal(in,padded,x,depth+x,dim);
}
tins += usecond() - t;
// High bit
t = usecond();
shifted = cshift.Cshift(in,dim,depth);
tshift += usecond() - t;
t=usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
}
tins += usecond() - t;
// Low bit
t = usecond();
shifted = cshift.Cshift(in,dim,-depth);
tshift += usecond() - t;
t = usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,x,x,dim);
}
tins += usecond() - t;
std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
return padded;
}
};
NAMESPACE_END(Grid);
+6
View File
@@ -30,6 +30,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_PERFCOUNT_H
#define GRID_PERFCOUNT_H
#ifndef __SSC_START
#define __SSC_START
#define __SSC_STOP
#endif
#include <sys/time.h>
#include <ctime>
#include <chrono>
+1 -1
View File
@@ -16,7 +16,7 @@
#ifdef __NVCC__
#pragma push
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#else
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
+37
View File
@@ -104,6 +104,7 @@ template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iSca
template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
template<typename vtype> using iLorentzComplex = iVector<iScalar<iScalar<vtype> >, Nd > ;
template<typename vtype> using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
@@ -178,6 +179,15 @@ typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
typedef iLorentzColourMatrix<vComplexD2> vLorentzColourMatrixD2;
// LorentzComplex
typedef iLorentzComplex<Complex > LorentzComplex;
typedef iLorentzComplex<ComplexF > LorentzComplexF;
typedef iLorentzComplex<ComplexD > LorentzComplexD;
typedef iLorentzComplex<vComplex > vLorentzComplex;
typedef iLorentzComplex<vComplexF> vLorentzComplexF;
typedef iLorentzComplex<vComplexD> vLorentzComplexD;
// DoubleStored gauge field
typedef iDoubleStoredColourMatrix<Complex > DoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
@@ -307,6 +317,10 @@ typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
typedef Lattice<vLorentzColourMatrixD2> LatticeLorentzColourMatrixD2;
typedef Lattice<vLorentzComplex> LatticeLorentzComplex;
typedef Lattice<vLorentzComplexF> LatticeLorentzComplexF;
typedef Lattice<vLorentzComplexD> LatticeLorentzComplexD;
// DoubleStored gauge field
typedef Lattice<vDoubleStoredColourMatrix> LatticeDoubleStoredColourMatrix;
typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
@@ -507,9 +521,20 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
// Fermion <-> propagator assignements
//////////////////////////////////////////////
//template <class Prop, class Ferm>
#define FAST_FERM_TO_PROP
template <class Fimpl>
void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
{
#ifdef FAST_FERM_TO_PROP
autoView(p_v,p,CpuWrite);
autoView(f_v,f,CpuRead);
thread_for(idx,p_v.oSites(),{
for(int ss = 0; ss < Ns; ++ss) {
for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
}}
});
#else
for(int j = 0; j < Ns; ++j)
{
auto pjs = peekSpin(p, j, s);
@@ -521,12 +546,23 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio
}
pokeSpin(p, pjs, j, s);
}
#endif
}
//template <class Prop, class Ferm>
template <class Fimpl>
void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
{
#ifdef FAST_FERM_TO_PROP
autoView(p_v,p,CpuRead);
autoView(f_v,f,CpuWrite);
thread_for(idx,p_v.oSites(),{
for(int ss = 0; ss < Ns; ++ss) {
for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
}}
});
#else
for(int j = 0; j < Ns; ++j)
{
auto pjs = peekSpin(p, j, s);
@@ -538,6 +574,7 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato
}
pokeSpin(f, fj, j);
}
#endif
}
//////////////////////////////////////////////
+43 -1
View File
@@ -34,10 +34,24 @@ directory
NAMESPACE_BEGIN(Grid);
///////////////////////////////////
// Smart configuration base class
///////////////////////////////////
template< class Field >
class ConfigurationBase
{
public:
ConfigurationBase() {}
virtual ~ConfigurationBase() {}
virtual void set_Field(Field& U) =0;
virtual void smeared_force(Field&) = 0;
virtual Field& get_SmearedU() =0;
virtual Field &get_U(bool smeared = false) = 0;
};
template <class GaugeField >
class Action
{
public:
bool is_smeared = false;
RealD deriv_norm_sum;
@@ -77,11 +91,39 @@ public:
void refresh_timer_stop(void) { refresh_us+=usecond(); }
void S_timer_start(void) { S_us-=usecond(); }
void S_timer_stop(void) { S_us+=usecond(); }
/////////////////////////////
// Heatbath?
/////////////////////////////
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
/////////////////////////////////////////////////////////////
// virtual smeared interface through configuration container
/////////////////////////////////////////////////////////////
virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
{
refresh(U.get_U(is_smeared),sRNG,pRNG);
}
virtual RealD S(ConfigurationBase<GaugeField>& U)
{
return S(U.get_U(is_smeared));
}
virtual RealD Sinitial(ConfigurationBase<GaugeField>& U)
{
return Sinitial(U.get_U(is_smeared));
}
virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
{
deriv(U.get_U(is_smeared),dSdU);
if ( is_smeared ) {
U.smeared_force(dSdU);
}
}
///////////////////////////////
// Logging
///////////////////////////////
virtual std::string action_name() = 0; // return the action name
virtual std::string LogParameters() = 0; // prints action parameters
virtual ~Action(){}
+3
View File
@@ -30,6 +30,8 @@ directory
#ifndef QCD_ACTION_CORE
#define QCD_ACTION_CORE
#include <Grid/qcd/action/gauge/GaugeImplementations.h>
#include <Grid/qcd/action/ActionBase.h>
NAMESPACE_CHECK(ActionBase);
#include <Grid/qcd/action/ActionSet.h>
@@ -65,6 +67,7 @@ NAMESPACE_CHECK(Scalar);
#include <Grid/qcd/utils/Metric.h>
NAMESPACE_CHECK(Metric);
#include <Grid/qcd/utils/CovariantLaplacian.h>
#include <Grid/qcd/utils/CovariantLaplacianRat.h>
NAMESPACE_CHECK(CovariantLaplacian);
+13
View File
@@ -65,6 +65,19 @@ struct WilsonImplParams {
}
};
struct GaugeImplParams {
// bool overlapCommsCompute;
// AcceleratorVector<Real,Nd> twist_n_2pi_L;
AcceleratorVector<Complex,Nd> boundary_phases;
GaugeImplParams() {
boundary_phases.resize(Nd, 1.0);
// twist_n_2pi_L.resize(Nd, 0.0);
};
GaugeImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi) {
// twist_n_2pi_L.resize(Nd, 0.0);
}
};
struct StaggeredImplParams {
Coordinate dirichlet; // Blocksize of dirichlet BCs
int partialDirichlet;
+59 -161
View File
@@ -205,15 +205,18 @@ public:
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
static void MassTerm(CloverField& Clover, RealD diag_mass) {
static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
Clover += diag_mass;
}
static void Exponentiate_Clover(CloverDiagonalField& Diagonal,
CloverTriangleField& Triangle,
RealD csw_t, RealD diag_mass) {
static void InvertClover(CloverField& InvClover,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverDiagonalField& diagonalInv,
CloverTriangleField& triangleInv,
bool fixedBoundaries) {
// Do nothing
CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
}
// TODO: implement Cmunu for better performances with compact layout, but don't do it
@@ -238,9 +241,17 @@ public:
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
static void MassTerm(CloverField& Clover, RealD diag_mass) {
// do nothing!
// mass term is multiplied to exp(Clover) below
// Can this be avoided?
static void IdentityTimesC(const CloverField& in, RealD c) {
int DimRep = Impl::Dimension;
autoView(in_v, in, AcceleratorWrite);
accelerator_for(ss, in.Grid()->oSites(), 1, {
for (int sa=0; sa<Ns; sa++)
for (int ca=0; ca<DimRep; ca++)
in_v[ss]()(sa,sa)(ca,ca) = c;
});
}
static int getNMAX(RealD prec, RealD R) {
@@ -255,175 +266,62 @@ public:
return NMAX;
}
static int getNMAX(Lattice<iImplCloverDiagonal<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplCloverDiagonal<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
static void ExponentiateHermitean6by6(const iMatrix<ComplexD,6> &arg, const RealD& alpha, const std::vector<RealD>& cN, const int Niter, iMatrix<ComplexD,6>& dest){
static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
typedef iMatrix<ComplexD,6> mat;
GridBase* grid = Clover.Grid();
CloverField ExpClover(grid);
RealD qn[6];
RealD qnold[6];
RealD p[5];
RealD trA2, trA3, trA4;
int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
mat A2, A3, A4, A5;
A2 = alpha * alpha * arg * arg;
A3 = alpha * arg * A2;
A4 = A2 * A2;
A5 = A2 * A3;
Clover *= (1.0/diag_mass);
trA2 = toReal( trace(A2) );
trA3 = toReal( trace(A3) );
trA4 = toReal( trace(A4));
p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0;
p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0;
p[2] = toReal( trace(A4)) / 4.0 - 0.125 * trA2 * trA2;
p[3] = trA3 / 3.0;
p[4] = 0.5 * trA2;
qnold[0] = cN[Niter];
qnold[1] = 0.0;
qnold[2] = 0.0;
qnold[3] = 0.0;
qnold[4] = 0.0;
qnold[5] = 0.0;
for(int i = Niter-1; i >= 0; i--)
{
qn[0] = p[0] * qnold[5] + cN[i];
qn[1] = p[1] * qnold[5] + qnold[0];
qn[2] = p[2] * qnold[5] + qnold[1];
qn[3] = p[3] * qnold[5] + qnold[2];
qn[4] = p[4] * qnold[5] + qnold[3];
qn[5] = qnold[4];
qnold[0] = qn[0];
qnold[1] = qn[1];
qnold[2] = qn[2];
qnold[3] = qn[3];
qnold[4] = qn[4];
qnold[5] = qn[5];
}
mat unit(1.0);
dest = (qn[0] * unit + qn[1] * alpha * arg + qn[2] * A2 + qn[3] * A3 + qn[4] * A4 + qn[5] * A5);
}
static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) {
GridBase* grid = Diagonal.Grid();
int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass);
//
// Implementation completely in Daniel's layout
//
// Taylor expansion with Cayley-Hamilton recursion
// underlying Horner scheme as above
// Taylor expansion, slow but generic
// Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
// qN = cN
// qn = cn + qn+1 X
std::vector<RealD> cn(NMAX+1);
cn[0] = 1.0;
for (int i=1; i<=NMAX; i++){
for (int i=1; i<=NMAX; i++)
cn[i] = cn[i-1] / RealD(i);
}
// Taken over from Daniel's implementation
conformable(Diagonal, Triangle);
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * Clover + cn[i];
long lsites = grid->lSites();
{
typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
typedef iMatrix<ComplexD,6> mat;
// prepare inverse
CloverInv = (-1.0)*Clover;
autoView(diagonal_v, Diagonal, CpuRead);
autoView(triangle_v, Triangle, CpuRead);
autoView(diagonalExp_v, Diagonal, CpuWrite);
autoView(triangleExp_v, Triangle, CpuWrite);
Clover = ExpClover * diag_mass;
thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * CloverInv + cn[i];
mat srcCloverOpUL(0.0); // upper left block
mat srcCloverOpLR(0.0); // lower right block
mat ExpCloverOp;
CloverInv = ExpClover * (1.0/diag_mass);
scalar_object_diagonal diagonal_tmp = Zero();
scalar_object_diagonal diagonal_exp_tmp = Zero();
scalar_object_triangle triangle_tmp = Zero();
scalar_object_triangle triangle_exp_tmp = Zero();
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
peekLocalSite(triangle_tmp, triangle_v, lcoor);
int block;
block = 0;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
}
else{
srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
}
}
}
block = 1;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
}
else{
srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
}
}
}
// exp(Clover)
ExponentiateHermitean6by6(srcCloverOpUL,1.0/diag_mass,cn,NMAX,ExpCloverOp);
block = 0;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
}
else if(i < j){
triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
}
}
}
ExponentiateHermitean6by6(srcCloverOpLR,1.0/diag_mass,cn,NMAX,ExpCloverOp);
block = 1;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
}
else if(i < j){
triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
}
}
}
pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor);
pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor);
});
}
Diagonal *= diag_mass;
Triangle *= diag_mass;
}
static void InvertClover(CloverField& InvClover,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverDiagonalField& diagonalInv,
CloverTriangleField& triangleInv,
bool fixedBoundaries) {
if (fixedBoundaries)
{
CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
}
else
{
CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv);
}
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
assert(0);
@@ -225,7 +225,7 @@ public:
RealD csw_t;
RealD cF;
bool open_boundaries;
bool fixedBoundaries;
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
+10
View File
@@ -126,6 +126,16 @@ typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermi
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;
// Sp(2n)
typedef WilsonFermion<SpWilsonImplF> SpWilsonFermionF;
typedef WilsonFermion<SpWilsonImplD> SpWilsonFermionD;
typedef WilsonFermion<SpWilsonTwoIndexAntiSymmetricImplF> SpWilsonTwoIndexAntiSymmetricFermionF;
typedef WilsonFermion<SpWilsonTwoIndexAntiSymmetricImplD> SpWilsonTwoIndexAntiSymmetricFermionD;
typedef WilsonFermion<SpWilsonTwoIndexSymmetricImplF> SpWilsonTwoIndexSymmetricFermionF;
typedef WilsonFermion<SpWilsonTwoIndexSymmetricImplD> SpWilsonTwoIndexSymmetricFermionD;
// Twisted mass fermion
typedef WilsonTMFermion<WilsonImplD2> WilsonTMFermionD2;
typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
+23 -20
View File
@@ -36,7 +36,7 @@ NAMESPACE_BEGIN(Grid);
// Wilson compressor will need FaceGather policies for:
// Periodic, Dirichlet, and partial Dirichlet for DWF
///////////////////////////////////////////////////////////////
const int dwf_compressor_depth=1;
const int dwf_compressor_depth=2;
#define DWF_COMPRESS
class FaceGatherPartialDWF
{
@@ -110,7 +110,7 @@ public:
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
Vector<cobj *> pointers,int dimension,int plane,int cbmask,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
GridBase *Grid = rhs.Grid();
@@ -209,7 +209,7 @@ public:
}
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
Vector<cobj *> pointers,int dimension,int plane,int cbmask,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
// std::cout << " face gather exch DWF partial "<<partial <<std::endl;
@@ -320,7 +320,7 @@ public:
typedef decltype(coalescedRead(in0)) sobj;
typedef decltype(coalescedRead(out0)) hsobj;
unsigned int Nsimd = vobj::Nsimd();
constexpr unsigned int Nsimd = vobj::Nsimd();
unsigned int mask = Nsimd >> (type + 1);
int lane = acceleratorSIMTlane(Nsimd);
int j0 = lane &(~mask); // inner coor zero
@@ -484,27 +484,30 @@ public:
int dag = compress.dag;
int face_idx=0;
#define vet_same_node(a,b) \
{ auto tmp = b; }
if ( dag ) {
assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XpCompress,Xp,face_idx));
vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YpCompress,Yp,face_idx));
vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TpCompress,Tp,face_idx));
vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XmCompress,Xm,face_idx));
vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YmCompress,Ym,face_idx));
vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TmCompress,Tm,face_idx));
} else {
assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XmCompress,Xp,face_idx));
vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YmCompress,Yp,face_idx));
vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TmCompress,Tp,face_idx));
vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XpCompress,Xm,face_idx));
vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YpCompress,Ym,face_idx));
vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TpCompress,Tm,face_idx));
}
this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier();
}
};
+17 -1
View File
@@ -261,6 +261,22 @@ typedef WilsonImpl<vComplex, TwoIndexAntiSymmetricRepresentation, CoeffReal > W
typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD; // Double
//sp 2n
typedef WilsonImpl<vComplex, SpFundamentalRepresentation, CoeffReal > SpWilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, SpFundamentalRepresentation, CoeffReal > SpWilsonImplF; // Float
typedef WilsonImpl<vComplexD, SpFundamentalRepresentation, CoeffReal > SpWilsonImplD; // Double
typedef WilsonImpl<vComplex, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplD; // Double
typedef WilsonImpl<vComplex, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplD; // Double
typedef WilsonImpl<vComplex, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplR; // Real.. whichever prec // adj = 2indx symmetric for Sp(2N)
typedef WilsonImpl<vComplexF, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplF; // Float // adj = 2indx symmetric for Sp(2N)
typedef WilsonImpl<vComplexD, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplD; // Double // adj = 2indx symmetric for Sp(2N)
NAMESPACE_END(Grid);
-7
View File
@@ -52,13 +52,6 @@ public:
typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
public:
#ifdef GRID_SYCL
#define SYCL_HACK
#endif
#ifdef SYCL_HACK
static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf,
int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
#endif
static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
@@ -48,7 +48,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
, csw_r(_csw_r)
, csw_t(_csw_t)
, cF(_cF)
, open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
, fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
, Diagonal(&Fgrid), Triangle(&Fgrid)
, DiagonalEven(&Hgrid), TriangleEven(&Hgrid)
, DiagonalOdd(&Hgrid), TriangleOdd(&Hgrid)
@@ -67,7 +67,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
csw_r /= clover_anisotropy.xi_0;
ImportGauge(_Umu);
if (open_boundaries) {
if (fixedBoundaries) {
this->BoundaryMaskEven.Checkerboard() = Even;
this->BoundaryMaskOdd.Checkerboard() = Odd;
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
@@ -77,31 +77,31 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
WilsonBase::Dhop(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopOE(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopEO(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
WilsonBase::DhopDir(in, out, dir, disp);
if(this->open_boundaries) ApplyBoundaryMask(out);
if(this->fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
WilsonBase::DhopDirAll(in, out);
if(this->open_boundaries) {
if(this->fixedBoundaries) {
for(auto& o : out) ApplyBoundaryMask(o);
}
}
@@ -112,7 +112,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in,
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
Mooee(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
@@ -121,19 +121,19 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& i
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
MooeeDag(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
WilsonBase::Meooe(in, out);
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
WilsonBase::MeooeDag(in, out);
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
@@ -147,7 +147,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField&
} else {
MooeeInternal(in, out, Diagonal, Triangle);
}
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
@@ -166,7 +166,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionFiel
} else {
MooeeInternal(in, out, DiagonalInv, TriangleInv);
}
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
@@ -186,7 +186,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
assert(!open_boundaries); // TODO check for changes required for open bc
assert(!fixedBoundaries); // TODO check for changes required for open bc
// NOTE: code copied from original clover term
conformable(X.Grid(), Y.Grid());
@@ -305,6 +305,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
GridBase* grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
CloverField TmpOriginal(grid);
CloverField TmpInverse(grid);
// Compute the field strength terms mu>nu
double t2 = usecond();
@@ -324,24 +325,27 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
// Handle mass term based on clover policy
CloverHelpers::MassTerm(TmpOriginal, this->diag_mass);
// Convert the data layout of the clover term
// Instantiate the clover term
// - In case of the standard clover the mass term is added
// - In case of the exponential clover the clover term is exponentiated
double t4 = usecond();
CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
// Convert the data layout of the clover term
double t5 = usecond();
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
// Exponentiate the clover (nothing happens in case of the standard clover)
double t5 = usecond();
CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass);
// Possible modify the boundary values
// Modify the clover term at the temporal boundaries in case of open boundary conditions
double t6 = usecond();
if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
// Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions)
// Invert the Clover term
// In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
// in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
// TODO: For now this inversion is explictly done on the CPU
double t7 = usecond();
CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
// Fill the remaining clover fields
double t8 = usecond();
@@ -362,10 +366,10 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "convert = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "exponentiation = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "boundaries = " << (t7 - t6) / 1e6 << std::endl;
std::cout << GridLogDebug << "inversions = " << (t8 - t7) / 1e6 << std::endl;
std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl;
std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
}
@@ -63,6 +63,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
_tmp(&FiveDimRedBlackGrid),
Dirichlet(0)
{
Stencil.lo = &Lebesgue;
StencilEven.lo = &LebesgueEvenOdd;
StencilOdd.lo = &LebesgueEvenOdd;
// some assertions
assert(FiveDimGrid._ndimension==5);
assert(FourDimGrid._ndimension==4);
@@ -328,8 +332,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
/////////////////////////////
{
GRID_TRACE("Gather");
st.HaloExchangeOptGather(in,compressor);
accelerator_barrier();
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
}
std::vector<std::vector<CommsRequest_t> > requests;
@@ -60,6 +60,9 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
_tmp(&Hgrid),
anisotropyCoeff(anis)
{
Stencil.lo = &Lebesgue;
StencilEven.lo = &LebesgueEvenOdd;
StencilOdd.lo = &LebesgueEvenOdd;
// Allocate the required comms buffer
ImportGauge(_Umu);
if (anisotropyCoeff.isAnisotropic){
@@ -423,21 +423,33 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
#define KERNEL_CALL_EXT(A) \
const uint64_t NN = Nsite*Ls; \
const uint64_t sz = st.surface_list.size(); \
auto ptr = &st.surface_list[0]; \
accelerator_forNB( ss, sz, Simd::Nsimd(), { \
int sF = ptr[ss]; \
int sU = ss/Ls; \
int sU = sF/Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
});
}); \
accelerator_barrier();
#define ASM_CALL(A) \
thread_for( ss, Nsite, { \
thread_for( sss, Nsite, { \
int ss = st.lo->Reorder(sss); \
int sU = ss; \
int sF = ss*Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
});
#define ASM_CALL_SLICE(A) \
auto grid = in.Grid() ; \
int nt = grid->LocalDimensions()[4]; \
int nxyz = Nsite/nt ; \
for(int t=0;t<nt;t++){ \
thread_for( sss, nxyz, { \
int ss = t*nxyz+sss; \
int sU = ss; \
int sF = ss*Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
});}
template <class Impl>
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
@@ -451,11 +463,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
#ifdef SYCL_HACK
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; }
#else
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
#endif
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
#endif
@@ -466,8 +474,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
#endif
} else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
// dependent on result of merge
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
#endif
@@ -490,21 +500,20 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
#endif
acceleratorFenceComputeStream();
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
#endif
} else if( exterior ) {
// Dependent on result of merge
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteDagExt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
#endif
acceleratorFenceComputeStream();
}
assert(0 && " Kernel optimisation case not covered ");
}
@@ -0,0 +1 @@
#define IMPLEMENTATION SpWilsonImplD
@@ -0,0 +1 @@
../WilsonCloverFermionInstantiation.cc.master
@@ -0,0 +1 @@
../WilsonFermionInstantiation.cc.master
@@ -0,0 +1 @@
../WilsonTMFermionInstantiation.cc.master
@@ -0,0 +1 @@
#define IMPLEMENTATION SpWilsonImplF
@@ -0,0 +1 @@
../WilsonCloverFermionInstantiation.cc.master
@@ -0,0 +1 @@
#define IMPLEMENTATION SpWilsonTwoIndexAntiSymmetricImplD
@@ -0,0 +1 @@
../WilsonCloverFermionInstantiation.cc.master
@@ -0,0 +1 @@
#define IMPLEMENTATION SpWilsonTwoIndexAntiSymmetricImplF
@@ -0,0 +1 @@
../WilsonCloverFermionInstantiation.cc.master
@@ -0,0 +1 @@
../WilsonFermionInstantiation.cc.master
@@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1 @@
../WilsonTMFermionInstantiation.cc.master
@@ -0,0 +1 @@
#define IMPLEMENTATION SpWilsonTwoIndexSymmetricImplD
@@ -0,0 +1 @@
../WilsonCloverFermionInstantiation.cc.master
@@ -0,0 +1 @@
../WilsonFermionInstantiation.cc.master
@@ -0,0 +1 @@
../WilsonKernelsInstantiation.cc.master
@@ -0,0 +1 @@
../WilsonTMFermionInstantiation.cc.master
@@ -0,0 +1 @@
#define IMPLEMENTATION SpWilsonTwoIndexSymmetricImplF
@@ -1 +0,0 @@
../CayleyFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
../ContinuedFractionFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
../DomainWallEOFAFermionInstantiation.cc.master
@@ -1 +0,0 @@
../MobiusEOFAFermionInstantiation.cc.master
@@ -1 +0,0 @@
../PartialFractionFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
../WilsonFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
#define IMPLEMENTATION WilsonImplD2
@@ -1 +0,0 @@
../CayleyFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
../ContinuedFractionFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
../DomainWallEOFAFermionInstantiation.cc.master
@@ -1 +0,0 @@
../MobiusEOFAFermionInstantiation.cc.master
@@ -1 +0,0 @@
../PartialFractionFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
../WilsonFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
#define IMPLEMENTATION ZWilsonImplD2
@@ -10,12 +10,18 @@ WILSON_IMPL_LIST=" \
WilsonImplF \
WilsonImplD \
WilsonImplD2 \
SpWilsonImplF \
SpWilsonImplD \
WilsonAdjImplF \
WilsonAdjImplD \
WilsonTwoIndexSymmetricImplF \
WilsonTwoIndexSymmetricImplD \
WilsonTwoIndexAntiSymmetricImplF \
WilsonTwoIndexAntiSymmetricImplD \
SpWilsonTwoIndexAntiSymmetricImplF \
SpWilsonTwoIndexAntiSymmetricImplD \
SpWilsonTwoIndexSymmetricImplF \
SpWilsonTwoIndexSymmetricImplD \
GparityWilsonImplF \
GparityWilsonImplD "
+3
View File
@@ -39,6 +39,9 @@ NAMESPACE_BEGIN(Grid);
typedef WilsonGaugeAction<PeriodicGimplR> WilsonGaugeActionR;
typedef WilsonGaugeAction<PeriodicGimplF> WilsonGaugeActionF;
typedef WilsonGaugeAction<PeriodicGimplD> WilsonGaugeActionD;
typedef WilsonGaugeAction<SpPeriodicGimplR> SpWilsonGaugeActionR;
typedef WilsonGaugeAction<SpPeriodicGimplF> SpWilsonGaugeActionF;
typedef WilsonGaugeAction<SpPeriodicGimplD> SpWilsonGaugeActionD;
typedef PlaqPlusRectangleAction<PeriodicGimplR> PlaqPlusRectangleActionR;
typedef PlaqPlusRectangleAction<PeriodicGimplF> PlaqPlusRectangleActionF;
typedef PlaqPlusRectangleAction<PeriodicGimplD> PlaqPlusRectangleActionD;
+21 -9
View File
@@ -32,7 +32,7 @@ directory
NAMESPACE_BEGIN(Grid);
#define CPS_MD_TIME
#undef CPS_MD_TIME
#ifdef CPS_MD_TIME
#define HMC_MOMENTUM_DENOMINATOR (2.0)
@@ -61,7 +61,7 @@ NAMESPACE_BEGIN(Grid);
typedef typename Impl::Field Field;
// hardcodes the exponential approximation in the template
template <class S, int Nrepresentation = Nc, int Nexp = 12 > class GaugeImplTypes {
template <class S, int Nrepresentation = Nc, int Nexp = 12, class Group = SU<Nc> > class GaugeImplTypes {
public:
typedef S Simd;
typedef typename Simd::scalar_type scalar_type;
@@ -78,8 +78,6 @@ public:
typedef Lattice<SiteLink> LinkField;
typedef Lattice<SiteField> Field;
typedef SU<Nrepresentation> Group;
// Guido: we can probably separate the types from the HMC functions
// this will create 2 kind of implementations
// probably confusing the users
@@ -119,6 +117,7 @@ public:
//
LinkField Pmu(P.Grid());
Pmu = Zero();
for (int mu = 0; mu < Nd; mu++) {
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR) ;
@@ -126,8 +125,12 @@ public:
PokeIndex<LorentzIndex>(P, Pmu, mu);
}
}
static inline Field projectForce(Field &P) { return Ta(P); }
static inline Field projectForce(Field &P) {
Field ret(P.Grid());
Group::taProj(P, ret);
return ret;
}
static inline void update_field(Field& P, Field& U, double ep){
//static std::chrono::duration<double> diff;
@@ -137,14 +140,15 @@ public:
autoView(P_v,P,AcceleratorRead);
accelerator_for(ss, P.Grid()->oSites(),1,{
for (int mu = 0; mu < Nd; mu++) {
U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu));
U_v[ss](mu) = Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu);
U_v[ss](mu) = Group::ProjectOnGeneralGroup(U_v[ss](mu));
}
});
//auto end = std::chrono::high_resolution_clock::now();
// diff += end - start;
// std::cout << "Time to exponentiate matrix " << diff.count() << " s\n";
}
static inline RealD FieldSquareNorm(Field& U){
LatticeComplex Hloc(U.Grid());
Hloc = Zero();
@@ -157,7 +161,7 @@ public:
}
static inline void Project(Field &U) {
ProjectSUn(U);
Group::ProjectOnSpecialGroup(U);
}
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
@@ -171,6 +175,7 @@ public:
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
Group::ColdConfiguration(pRNG, U);
}
};
@@ -178,10 +183,17 @@ typedef GaugeImplTypes<vComplex, Nc> GimplTypesR;
typedef GaugeImplTypes<vComplexF, Nc> GimplTypesF;
typedef GaugeImplTypes<vComplexD, Nc> GimplTypesD;
typedef GaugeImplTypes<vComplex, Nc, 12, Sp<Nc> > SpGimplTypesR;
typedef GaugeImplTypes<vComplexF, Nc, 12, Sp<Nc> > SpGimplTypesF;
typedef GaugeImplTypes<vComplexD, Nc, 12, Sp<Nc> > SpGimplTypesD;
typedef GaugeImplTypes<vComplex, SU<Nc>::AdjointDimension> GimplAdjointTypesR;
typedef GaugeImplTypes<vComplexF, SU<Nc>::AdjointDimension> GimplAdjointTypesF;
typedef GaugeImplTypes<vComplexD, SU<Nc>::AdjointDimension> GimplAdjointTypesD;
NAMESPACE_END(Grid);
#endif // GRID_GAUGE_IMPL_TYPES_H
+6 -1
View File
@@ -176,7 +176,7 @@ public:
return PeriodicBC::CshiftLink(Link,mu,shift);
}
static inline void setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
static inline void setDirections(const std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
static inline std::vector<int> getDirections(void) { return _conjDirs; }
static inline bool isPeriodicGaugeField(void) { return false; }
};
@@ -193,6 +193,11 @@ typedef ConjugateGaugeImpl<GimplTypesR> ConjugateGimplR; // Real.. whichever pre
typedef ConjugateGaugeImpl<GimplTypesF> ConjugateGimplF; // Float
typedef ConjugateGaugeImpl<GimplTypesD> ConjugateGimplD; // Double
typedef PeriodicGaugeImpl<SpGimplTypesR> SpPeriodicGimplR; // Real.. whichever prec
typedef PeriodicGaugeImpl<SpGimplTypesF> SpPeriodicGimplF; // Float
typedef PeriodicGaugeImpl<SpGimplTypesD> SpPeriodicGimplD; // Double
NAMESPACE_END(Grid);
#endif
@@ -43,7 +43,7 @@ public:
private:
RealD c_plaq;
RealD c_rect;
typename WilsonLoops<Gimpl>::StapleAndRectStapleAllWorkspace workspace;
public:
PlaqPlusRectangleAction(RealD b,RealD c): c_plaq(b),c_rect(c){};
@@ -79,27 +79,18 @@ public:
GridBase *grid = Umu.Grid();
std::vector<GaugeLinkField> U (Nd,grid);
std::vector<GaugeLinkField> U2(Nd,grid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
WilsonLoops<Gimpl>::RectStapleDouble(U2[mu],U[mu],mu);
}
std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);
GaugeLinkField dSdU_mu(grid);
GaugeLinkField staple(grid);
for (int mu=0; mu < Nd; mu++){
// Staple in direction mu
WilsonLoops<Gimpl>::Staple(staple,Umu,mu);
dSdU_mu = Ta(U[mu]*staple)*factor_p;
WilsonLoops<Gimpl>::RectStaple(Umu,staple,U2,U,mu);
dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r;
dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}
+48 -6
View File
@@ -42,9 +42,13 @@ template <class Gimpl>
class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
public:
INHERIT_GIMPL_TYPES(Gimpl);
typedef GaugeImplParams ImplParams;
ImplParams Params;
/////////////////////////// constructors
explicit WilsonGaugeAction(RealD beta_):beta(beta_){};
explicit WilsonGaugeAction(RealD beta_,
const ImplParams &p = ImplParams()
):beta(beta_),Params(p){};
virtual std::string action_name() {return "WilsonGaugeAction";}
@@ -56,14 +60,53 @@ public:
virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){}; // noop as no pseudoferms
// Umu<->U maximally confusing
virtual void boundary(const GaugeField &Umu, GaugeField &Ub){
typedef typename Simd::scalar_type scalar_type;
assert(Params.boundary_phases.size() == Nd);
GridBase *GaugeGrid=Umu.Grid();
GaugeLinkField U(GaugeGrid);
GaugeLinkField tmp(GaugeGrid);
Lattice<iScalar<vInteger> > coor(GaugeGrid);
for (int mu = 0; mu < Nd; mu++) {
////////// boundary phase /////////////
auto pha = Params.boundary_phases[mu];
scalar_type phase( real(pha),imag(pha) );
std::cout<< GridLogIterative << "[WilsonGaugeAction] boundary "<<mu<<" "<<phase<< std::endl;
int L = GaugeGrid->GlobalDimensions()[mu];
int Lmu = L - 1;
LatticeCoordinate(coor, mu);
U = PeekIndex<LorentzIndex>(Umu, mu);
tmp = where(coor == Lmu, phase * U, U);
PokeIndex<LorentzIndex>(Ub, tmp, mu);
// PokeIndex<LorentzIndex>(Ub, U, mu);
// PokeIndex<LorentzIndex>(Umu, tmp, mu);
}
};
virtual RealD S(const GaugeField &U) {
RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
RealD vol = U.Grid()->gSites();
GaugeField Ub(U.Grid());
this->boundary(U,Ub);
static RealD lastG=0.;
RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(Ub);
RealD vol = Ub.Grid()->gSites();
RealD action = beta * (1.0 - plaq) * (Nd * (Nd - 1.0)) * vol * 0.5;
std::cout << GridLogMessage << "[WilsonGaugeAction] dH: " << action-lastG << std::endl;
RealD plaq_o = WilsonLoops<Gimpl>::avgPlaquette(U);
RealD action_o = beta * (1.0 - plaq_o) * (Nd * (Nd - 1.0)) * vol * 0.5;
std::cout << GridLogMessage << "[WilsonGaugeAction] U: " << action_o <<" Ub: "<< action << std::endl;
lastG=action;
return action;
};
virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
GaugeField Ub(U.Grid());
this->boundary(U,Ub);
// not optimal implementation FIXME
// extend Ta to include Lorentz indexes
@@ -73,10 +116,9 @@ public:
GaugeLinkField dSdU_mu(U.Grid());
for (int mu = 0; mu < Nd; mu++) {
Umu = PeekIndex<LorentzIndex>(U, mu);
Umu = PeekIndex<LorentzIndex>(Ub, mu);
// Staple in direction mu
WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
WilsonLoops<Gimpl>::Staple(dSdU_mu, Ub, mu);
dSdU_mu = Ta(Umu * dSdU_mu) * factor;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);

Some files were not shown because too many files have changed in this diff Show More